From 661dd1381e1d0365a4b3c3033630758005963331 Mon Sep 17 00:00:00 2001
From: Ken Lautner
Date: Fri, 27 Oct 2023 13:38:10 -0700
Subject: [PATCH] Shrink the crypto module: drop the AVX assembly paths and
 temporarily stub out SM3, MD5, the AES cipher API, and DH

---
 .../Library/BaseCryptLib/BaseCryptLib.inf     |    6 +-
 .../Library/BaseCryptLib/PeiCryptLib.inf      |    2 +-
 .../Library/BaseCryptLib/SmmCryptLib.inf      |    2 +-
 .../OpensslGen/IA32-GCC/crypto/sha/sha1-586.S | 1180 ----
 .../IA32-GCC/crypto/sha/sha256-586.S          | 2248 +-------
 .../IA32-MSFT/crypto/sha/sha1-586.nasm        | 1173 ----
 .../IA32-MSFT/crypto/sha/sha256-586.nasm      | 2248 +-------
 .../X64-GCC/crypto/aes/aes-x86_64.s           |    2 +-
 .../X64-GCC/crypto/aes/aesni-mb-x86_64.s      | 1031 +---
 .../X64-GCC/crypto/aes/aesni-sha1-x86_64.s    | 1352 +----
 .../X64-GCC/crypto/aes/aesni-sha256-x86_64.s  | 4378 +-------
 .../X64-GCC/crypto/aes/aesni-x86_64.s         |    2 +-
 .../X64-GCC/crypto/aes/bsaes-x86_64.s         |    2 +-
 .../X64-GCC/crypto/aes/vpaes-x86_64.s         |    2 +-
 .../OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s  | 1751 +----
 .../X64-GCC/crypto/bn/rsaz-avx512.s           |  873 +--
 .../X64-GCC/crypto/bn/rsaz-x86_64.s           |  666 +--
 .../X64-GCC/crypto/bn/x86_64-gf2m.s           |    2 +-
 .../X64-GCC/crypto/bn/x86_64-mont.s           |  382 +-
 .../X64-GCC/crypto/bn/x86_64-mont5.s          | 1352 +----
 .../X64-GCC/crypto/ec/ecp_nistz256-x86_64.s   | 2539 +-------
 .../X64-GCC/crypto/ec/x25519-x86_64.s         |  392 +-
 .../X64-GCC/crypto/md5/md5-x86_64.s           |    2 +-
 .../X64-GCC/crypto/modes/aesni-gcm-x86_64.s   |  786 +--
 .../X64-GCC/crypto/modes/ghash-x86_64.s       |  477 +-
 .../X64-GCC/crypto/sha/keccak1600-x86_64.s    |    2 +-
 .../X64-GCC/crypto/sha/sha1-mb-x86_64.s       | 4341 +--------
 .../X64-GCC/crypto/sha/sha1-x86_64.s          | 2831 +---------
 .../X64-GCC/crypto/sha/sha256-mb-x86_64.s     | 4698 +---------
 .../X64-GCC/crypto/sha/sha256-x86_64.s        | 2371 +-------
 .../X64-GCC/crypto/sha/sha512-x86_64.s        | 3662 +------
 .../OpensslGen/X64-GCC/crypto/x86_64cpuid.s   |    2 +-
 .../X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm  | 1107 ----
 .../crypto/aes/aesni-sha1-x86_64.nasm         | 1386 +----
 .../crypto/aes/aesni-sha256-x86_64.nasm       | 4640 ----
 .../X64-MSFT/crypto/bn/rsaz-avx2.nasm         | 1975 +------
 .../X64-MSFT/crypto/bn/rsaz-avx512.nasm       | 1019 +---
 .../X64-MSFT/crypto/bn/rsaz-x86_64.nasm       |  664 ---
 .../X64-MSFT/crypto/bn/x86_64-mont.nasm       |  402 --
 .../X64-MSFT/crypto/bn/x86_64-mont5.nasm      | 1395 ----
 .../crypto/ec/ecp_nistz256-x86_64.nasm        | 2646 +-------
 .../X64-MSFT/crypto/ec/x25519-x86_64.nasm     |  486 +-
 .../crypto/modes/aesni-gcm-x86_64.nasm        |  965 +---
 .../X64-MSFT/crypto/modes/ghash-x86_64.nasm   |  515 +-
 .../X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm   | 4471 ---
 .../X64-MSFT/crypto/sha/sha1-x86_64.nasm      | 2892 ----
 .../X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm | 4828 ----
 .../X64-MSFT/crypto/sha/sha256-x86_64.nasm    | 2408 ----
 .../X64-MSFT/crypto/sha/sha512-x86_64.nasm    | 3901 +----
 .../include/openssl/configuration-ec.h        |    6 +
 .../include/openssl/configuration-noec.h      |    6 +
 CryptoPkg/Library/OpensslLib/OpensslLib.inf   |   19 -
 .../Library/OpensslLib/OpensslLibAccel.inf    |   38 -
 .../Library/OpensslLib/OpensslLibCrypto.inf   |   19 -
 .../Library/OpensslLib/OpensslLibFull.inf     |   19 -
 .../OpensslLib/OpensslLibFullAccel.inf        |   38 -
 .../OpensslLib/OpensslLibFullAccelTest.inf    | 1473 +++++
 .../OpensslLib/OpensslStub/CipherNull.c       |   14 +
 .../Library/OpensslLib/OpensslStub/Md5Null.c  |   73 +
 .../Library/OpensslLib/OpensslStub/Sm3Null.c  |   47 +
 .../Library/OpensslLib/OpensslStub/uefiprov.c |   18 +-
 CryptoPkg/Library/OpensslLib/configure.py     |    2 +
 62 files changed, 2357 insertions(+), 75872 deletions(-)
 create mode
100644 CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf create mode 100644 CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c create mode 100644 CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c create mode 100644 CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c diff --git a/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf index 39395b71b84..34a17d84bdd 100644 --- a/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf +++ b/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf @@ -34,7 +34,7 @@ Hash/CryptSha1.c Hash/CryptSha256.c Hash/CryptSha512.c - Hash/CryptSm3.c + Hash/CryptSm3Null.c ## Temp change Hash/CryptSha3.c Hash/CryptXkcp.c Hash/CryptCShake256.c @@ -42,7 +42,7 @@ Hash/CryptDispatchApDxe.c Hmac/CryptHmac.c Kdf/CryptHkdf.c - Cipher/CryptAes.c + Cipher/CryptAesNull.c ## Temp change Cipher/CryptAeadAesGcm.c Pk/CryptRsaBasic.c Pk/CryptRsaExt.c @@ -52,7 +52,7 @@ Pk/CryptPkcs7VerifyCommon.c Pk/CryptPkcs7VerifyBase.c Pk/CryptPkcs7VerifyEku.c - Pk/CryptDh.c + Pk/CryptDhNull.c Pk/CryptX509.c Pk/CryptAuthenticode.c Pk/CryptTs.c diff --git a/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf index 7ae3c55de3a..840706acc80 100644 --- a/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf +++ b/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf @@ -38,7 +38,7 @@ Hash/CryptMd5Null.c ## MS_CHANGE_162948 MSChange - Remove support for deprecated crypto. Hash/CryptSha1.c Hash/CryptSha256.c - Hash/CryptSm3.c + Hash/CryptSm3Null.c Hash/CryptSha512.c Hash/CryptSha3.c Hash/CryptXkcp.c diff --git a/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf index f84290db371..fb16f52bca6 100644 --- a/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf +++ b/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf @@ -36,7 +36,7 @@ Hash/CryptMd5Null.c ## MS_CHANGE_162948 - MSChange - Remove support for deprecated crypto. 
Hash/CryptSha1.c Hash/CryptSha256.c - Hash/CryptSm3.c + Hash/CryptSm3Null.c Hash/CryptSha512.c Hash/CryptSha3.c Hash/CryptXkcp.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S index 9cfe5a46603..f7bae3560ec 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S @@ -27,11 +27,6 @@ sha1_block_data_order: jz .L001x86 testl $536870912,%ecx jnz .Lshaext_shortcut - andl $268435456,%edx - andl $1073741824,%eax - orl %edx,%eax - cmpl $1342177280,%eax - je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: @@ -2799,1181 +2794,6 @@ _sha1_block_data_order_ssse3: popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 -.type _sha1_block_data_order_avx,@function -.align 16 -_sha1_block_data_order_avx: - #ifdef __CET__ - -.byte 243,15,30,251 - #endif - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L008pic_point -.L008pic_point: - popl %ebp - leal .LK_XX_XX-.L008pic_point(%ebp),%ebp -.Lavx_shortcut: - vzeroall - vmovdqa (%ebp),%xmm7 - vmovdqa 16(%ebp),%xmm0 - vmovdqa 32(%ebp),%xmm1 - vmovdqa 48(%ebp),%xmm2 - vmovdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - vmovdqa %xmm0,112(%esp) - vmovdqa %xmm1,128(%esp) - vmovdqa %xmm2,144(%esp) - shll $6,%edx - vmovdqa %xmm7,160(%esp) - addl %ebp,%edx - vmovdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - vmovdqu -64(%ebp),%xmm0 - vmovdqu -48(%ebp),%xmm1 - vmovdqu -32(%ebp),%xmm2 - vmovdqu -16(%ebp),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vmovdqa %xmm7,96(%esp) - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm7,%xmm0,%xmm4 - vpaddd %xmm7,%xmm1,%xmm5 - vpaddd %xmm7,%xmm2,%xmm6 - vmovdqa %xmm4,(%esp) - movl %ecx,%ebp - vmovdqa %xmm5,16(%esp) - xorl %edx,%ebp - vmovdqa %xmm6,32(%esp) - andl %ebp,%esi - jmp .L009loop -.align 16 -.L009loop: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%ebp - addl (%esp),%edi - vpaddd %xmm3,%xmm7,%xmm7 - vmovdqa %xmm0,64(%esp) - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%edi - vpxor %xmm2,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vmovdqa %xmm7,48(%esp) - movl %edi,%esi - addl 4(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%edi,%edi - addl %ebp,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm6 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm0 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrld $30,%xmm0,%xmm7 - vpor %xmm6,%xmm4,%xmm4 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - vpslld $2,%xmm0,%xmm0 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vpxor %xmm7,%xmm4,%xmm4 - movl %ecx,%esi - addl 12(%esp),%ebx - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpxor %xmm0,%xmm4,%xmm4 - addl %ebp,%ebx - andl %edx,%esi - vmovdqa 96(%esp),%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%ebp - addl 16(%esp),%eax - 
vpaddd %xmm4,%xmm0,%xmm0 - vmovdqa %xmm1,80(%esp) - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vmovdqa %xmm0,(%esp) - movl %eax,%esi - addl 20(%esp),%edi - vpxor %xmm7,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %ebp,%edi - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm7 - xorl %ecx,%ebx - addl %eax,%edi - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm1 - vpaddd %xmm5,%xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm0 - vpor %xmm7,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpxor %xmm0,%xmm5,%xmm5 - movl %edx,%esi - addl 28(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpxor %xmm1,%xmm5,%xmm5 - addl %ebp,%ecx - andl %edi,%esi - vmovdqa 112(%esp),%xmm1 - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%ebp - addl 32(%esp),%ebx - vpaddd %xmm5,%xmm1,%xmm1 - vmovdqa %xmm2,96(%esp) - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - vpxor %xmm2,%xmm6,%xmm6 - xorl %edi,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%ecx,%ecx - xorl %edi,%ebp - vmovdqa %xmm1,16(%esp) - movl %ebx,%esi - addl 36(%esp),%eax - vpxor %xmm0,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - addl %ebp,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm2 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm1 - vpor %xmm0,%xmm6,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - vmovdqa 64(%esp),%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vpxor %xmm1,%xmm6,%xmm6 - movl %edi,%esi - addl 44(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpxor %xmm2,%xmm6,%xmm6 - addl %ebp,%edx - andl %eax,%esi - vmovdqa 112(%esp),%xmm2 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%ebp - addl 48(%esp),%ecx - vpaddd %xmm6,%xmm2,%xmm2 - vmovdqa %xmm3,64(%esp) - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm1 - addl %esi,%ecx - andl %edi,%ebp - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%edi - addl %edx,%ecx - vpxor %xmm5,%xmm1,%xmm1 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vmovdqa %xmm2,32(%esp) - movl %ecx,%esi - addl 52(%esp),%ebx - vpxor %xmm1,%xmm7,%xmm7 - xorl %edi,%edx - shldl $5,%ecx,%ecx - addl %ebp,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm1 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpslldq $12,%xmm7,%xmm3 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm2 - vpor %xmm1,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - vmovdqa 80(%esp),%xmm1 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vpxor %xmm2,%xmm7,%xmm7 - movl %eax,%esi - addl 60(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpxor %xmm3,%xmm7,%xmm7 - addl %ebp,%edi - andl %ebx,%esi - vmovdqa 112(%esp),%xmm3 - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl 
%edi,%ebp - addl (%esp),%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,80(%esp) - xorl %ebx,%eax - shldl $5,%edi,%edi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - addl %esi,%edx - andl %eax,%ebp - vpxor %xmm2,%xmm0,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - vpor %xmm2,%xmm0,%xmm0 - xorl %edi,%edx - shldl $5,%ecx,%ecx - vmovdqa 96(%esp),%xmm2 - addl %esi,%ebx - andl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm3,%xmm1,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm3,%xmm1,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - vmovdqa 64(%esp),%xmm3 - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - vmovdqa 128(%esp),%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm4,%xmm2,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vpor %xmm4,%xmm2,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - vmovdqa 80(%esp),%xmm4 - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - vmovdqa 96(%esp),%xmm5 - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl 
%ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpalignr $8,%xmm2,%xmm3,%xmm6 - vpxor %xmm0,%xmm4,%xmm4 - addl (%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - vmovdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - vmovdqa %xmm7,%xmm0 - vpaddd %xmm3,%xmm7,%xmm7 - shrdl $7,%edi,%edi - addl %edx,%ecx - vpxor %xmm6,%xmm4,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm6 - vmovdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm6,%xmm4,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - vmovdqa 64(%esp),%xmm6 - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpalignr $8,%xmm3,%xmm4,%xmm7 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - vpxor %xmm6,%xmm5,%xmm5 - vmovdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - vmovdqa %xmm0,%xmm1 - vpaddd %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - addl %edi,%edx - vpxor %xmm7,%xmm5,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm7 - vmovdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm7,%xmm5,%xmm5 - addl 28(%esp),%eax - vmovdqa 80(%esp),%xmm7 - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - vmovdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - vmovdqa %xmm1,%xmm2 - vpaddd %xmm5,%xmm1,%xmm1 - shldl $5,%eax,%eax - addl %esi,%edi - vpxor %xmm0,%xmm6,%xmm6 - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - vpsrld $30,%xmm6,%xmm0 - vmovdqa %xmm1,16(%esp) - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - vpor %xmm0,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%edi,%edi - vmovdqa 96(%esp),%xmm0 - movl %edx,%ebp - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm1 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - vmovdqa 144(%esp),%xmm3 - vpaddd %xmm6,%xmm2,%xmm2 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - vpsrld $30,%xmm7,%xmm1 - vmovdqa %xmm2,32(%esp) - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%ebp - shldl $5,%eax,%eax 
- addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - vpor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vmovdqa 64(%esp),%xmm1 - movl %edi,%ebp - xorl %ebx,%esi - shldl $5,%edi,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - addl (%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm2,%xmm0,%xmm0 - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - vpor %xmm2,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vmovdqa 80(%esp),%xmm2 - movl %eax,%ebp - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%edi,%edi - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm3,%xmm1,%xmm1 - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - vpor %xmm3,%xmm1,%xmm1 - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vmovdqa 96(%esp),%xmm3 - movl %ebx,%ebp - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - vmovdqa %xmm5,%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shldl $5,%edi,%edi - addl %esi,%edx - vpxor %xmm4,%xmm2,%xmm2 - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - vpor %xmm4,%xmm2,%xmm2 - xorl %eax,%edi - shrdl $7,%edx,%edx - vmovdqa 64(%esp),%xmm4 - movl %ecx,%ebp - xorl %edi,%esi - shldl $5,%ecx,%ecx 
- addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl (%esp),%eax - vpaddd %xmm3,%xmm7,%xmm7 - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm7,48(%esp) - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je .L010done - vmovdqa 160(%esp),%xmm7 - vmovdqa 176(%esp),%xmm6 - vmovdqu (%ebp),%xmm0 - vmovdqu 16(%ebp),%xmm1 - vmovdqu 32(%ebp),%xmm2 - vmovdqu 48(%ebp),%xmm3 - addl $64,%ebp - vpshufb %xmm6,%xmm0,%xmm0 - movl %ebp,196(%esp) - vmovdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpaddd %xmm7,%xmm0,%xmm4 - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,(%esp) - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%ebp - shldl $5,%edx,%edx - vpaddd %xmm7,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vmovdqa %xmm5,16(%esp) - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %edi,%ebp - shldl $5,%edi,%edi - vpaddd %xmm7,%xmm2,%xmm6 - addl %esi,%edx 
- xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vmovdqa %xmm6,32(%esp) - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,%ebx - movl %ecx,8(%ebp) - xorl %edx,%ebx - movl %edx,12(%ebp) - movl %edi,16(%ebp) - movl %esi,%ebp - andl %ebx,%esi - movl %ebp,%ebx - jmp .L009loop -.align 16 -.L010done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroall - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S index 9253ab18d0d..5f515ac8835 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S @@ -44,13 +44,12 @@ sha256_block_data_order: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx - je .L005AVX testl $512,%ebx - jnz 
.L006SSSE3 + jnz .L005SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L007unrolled + jae .L006unrolled jmp .L002loop .align 16 .L002loop: @@ -122,7 +121,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00800_15: +.L00700_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -160,11 +159,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00800_15 + jne .L00700_15 movl 156(%esp),%ecx - jmp .L00916_63 + jmp .L00816_63 .align 16 -.L00916_63: +.L00816_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -219,7 +218,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00916_63 + jne .L00816_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -263,7 +262,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L007unrolled: +.L006unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -280,9 +279,9 @@ sha256_block_data_order: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L010grand_loop + jmp .L009grand_loop .align 16 -.L010grand_loop: +.L009grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -3162,7 +3161,7 @@ sha256_block_data_order: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L010grand_loop + jb .L009grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -3181,9 +3180,9 @@ sha256_block_data_order: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L011loop_shaext + jmp .L010loop_shaext .align 16 -.L011loop_shaext: +.L010loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -3353,7 +3352,7 @@ sha256_block_data_order: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L011loop_shaext + jnz .L010loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -3368,7 +3367,7 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L006SSSE3: +.L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -3387,9 +3386,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L012grand_ssse3 + jmp .L011grand_ssse3 .align 16 -.L012grand_ssse3: +.L011grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3412,9 +3411,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L013ssse3_00_47 + jmp .L012ssse3_00_47 .align 16 -.L013ssse3_00_47: +.L012ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4057,7 +4056,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L013ssse3_00_47 + jne .L012ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4571,2218 +4570,13 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L012grand_ssse3 + jb .L011grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.align 32 -.L005AVX: - andl $264,%edx - cmpl $264,%edx - je .L014AVX_BMI - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp .L015grand_avx -.align 32 -.L015grand_avx: - vmovdqu (%edi),%xmm0 - vmovdqu 
16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp .L016avx_00_47 -.align 16 -.L016avx_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm0,%xmm0 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm0,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd (%ebp),%xmm0,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl 
%edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm1,%xmm1 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm1,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld 
$3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm2,%xmm2 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm2,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - 
addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm3,%xmm3 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm3,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L016avx_00_47 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx 
- movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi 
- shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax 
- addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L015grand_avx - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 -.L014AVX_BMI: - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp .L017grand_avx_bmi -.align 32 -.L017grand_avx_bmi: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp .L018avx_bmi_00_47 -.align 16 -.L018avx_bmi_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl 
%ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - vpaddd %xmm4,%xmm0,%xmm0 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm0,%xmm0 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm0,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - vpaddd (%ebp),%xmm0,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - vpaddd %xmm4,%xmm1,%xmm1 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 
28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm1,%xmm1 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm1,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm0,%xmm1,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - vpaddd %xmm4,%xmm2,%xmm2 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm2,%xmm2 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm2,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 
12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - vpaddd %xmm4,%xmm3,%xmm3 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm3,%xmm3 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm3,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L018avx_bmi_00_47 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - rorxl 
$25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl 
%esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 
(%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L017grand_avx_bmi - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm index 0d644acce05..ab7e9f25eb3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm @@ -29,11 +29,6 @@ L$000pic_point: jz NEAR L$001x86 test ecx,536870912 jnz NEAR L$shaext_shortcut - and edx,268435456 - and eax,1073741824 - or eax,edx - cmp eax,1342177280 - je NEAR L$avx_shortcut jmp NEAR L$ssse3_shortcut align 16 L$001x86: @@ -2786,1174 +2781,6 @@ L$007done: pop ebx pop ebp ret -align 16 -__sha1_block_data_order_avx: - push ebp - push ebx - push esi - push edi - call L$008pic_point -L$008pic_point: - pop ebp - lea ebp,[(L$K_XX_XX-L$008pic_point)+ebp] -L$avx_shortcut: - vzeroall - vmovdqa xmm7,[ebp] - vmovdqa xmm0,[16+ebp] - vmovdqa xmm1,[32+ebp] - vmovdqa xmm2,[48+ebp] - vmovdqa xmm6,[64+ebp] - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov edx,DWORD [28+esp] - mov esi,esp - sub esp,208 - and esp,-64 - vmovdqa [112+esp],xmm0 - vmovdqa [128+esp],xmm1 - vmovdqa [144+esp],xmm2 - shl edx,6 - vmovdqa [160+esp],xmm7 - add edx,ebp - vmovdqa [176+esp],xmm6 - add ebp,64 - mov DWORD [192+esp],edi - mov DWORD [196+esp],ebp - mov DWORD [200+esp],edx - mov DWORD [204+esp],esi - mov eax,DWORD [edi] - mov ebx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov edx,DWORD [12+edi] - mov edi,DWORD [16+edi] - mov esi,ebx - vmovdqu xmm0,[ebp-64] - vmovdqu xmm1,[ebp-48] - vmovdqu xmm2,[ebp-32] - vmovdqu xmm3,[ebp-16] - vpshufb xmm0,xmm0,xmm6 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vmovdqa [96+esp],xmm7 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm7 - vpaddd xmm5,xmm1,xmm7 - 
vpaddd xmm6,xmm2,xmm7 - vmovdqa [esp],xmm4 - mov ebp,ecx - vmovdqa [16+esp],xmm5 - xor ebp,edx - vmovdqa [32+esp],xmm6 - and esi,ebp - jmp NEAR L$009loop -align 16 -L$009loop: - shrd ebx,ebx,2 - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov ebp,eax - add edi,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 - vmovdqa [64+esp],xmm0 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm6,xmm3,4 - add edi,esi - and ebp,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add edi,eax - vpxor xmm6,xmm6,xmm2 - shrd eax,eax,7 - xor ebp,ecx - vmovdqa [48+esp],xmm7 - mov esi,edi - add edx,DWORD [4+esp] - vpxor xmm4,xmm4,xmm6 - xor eax,ebx - shld edi,edi,5 - add edx,ebp - and esi,eax - vpsrld xmm6,xmm4,31 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpslldq xmm0,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov ebp,edx - add ecx,DWORD [8+esp] - xor edi,eax - shld edx,edx,5 - vpsrld xmm7,xmm0,30 - vpor xmm4,xmm4,xmm6 - add ecx,esi - and ebp,edi - xor edi,eax - add ecx,edx - vpslld xmm0,xmm0,2 - shrd edx,edx,7 - xor ebp,eax - vpxor xmm4,xmm4,xmm7 - mov esi,ecx - add ebx,DWORD [12+esp] - xor edx,edi - shld ecx,ecx,5 - vpxor xmm4,xmm4,xmm0 - add ebx,ebp - and esi,edx - vmovdqa xmm0,[96+esp] - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpalignr xmm5,xmm2,xmm1,8 - mov ebp,ebx - add eax,DWORD [16+esp] - vpaddd xmm0,xmm0,xmm4 - vmovdqa [80+esp],xmm1 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm7,xmm4,4 - add eax,esi - and ebp,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm7,xmm7,xmm3 - shrd ebx,ebx,7 - xor ebp,edx - vmovdqa [esp],xmm0 - mov esi,eax - add edi,DWORD [20+esp] - vpxor xmm5,xmm5,xmm7 - xor ebx,ecx - shld eax,eax,5 - add edi,ebp - and esi,ebx - vpsrld xmm7,xmm5,31 - xor ebx,ecx - add edi,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm1,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov ebp,edi - add edx,DWORD [24+esp] - xor eax,ebx - shld edi,edi,5 - vpsrld xmm0,xmm1,30 - vpor xmm5,xmm5,xmm7 - add edx,esi - and ebp,eax - xor eax,ebx - add edx,edi - vpslld xmm1,xmm1,2 - shrd edi,edi,7 - xor ebp,ebx - vpxor xmm5,xmm5,xmm0 - mov esi,edx - add ecx,DWORD [28+esp] - xor edi,eax - shld edx,edx,5 - vpxor xmm5,xmm5,xmm1 - add ecx,ebp - and esi,edi - vmovdqa xmm1,[112+esp] - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov ebp,ecx - add ebx,DWORD [32+esp] - vpaddd xmm1,xmm1,xmm5 - vmovdqa [96+esp],xmm2 - xor edx,edi - shld ecx,ecx,5 - vpsrldq xmm0,xmm5,4 - add ebx,esi - and ebp,edx - vpxor xmm6,xmm6,xmm2 - xor edx,edi - add ebx,ecx - vpxor xmm0,xmm0,xmm4 - shrd ecx,ecx,7 - xor ebp,edi - vmovdqa [16+esp],xmm1 - mov esi,ebx - add eax,DWORD [36+esp] - vpxor xmm6,xmm6,xmm0 - xor ecx,edx - shld ebx,ebx,5 - add eax,ebp - and esi,ecx - vpsrld xmm0,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - xor esi,edx - vpslldq xmm2,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov ebp,eax - add edi,DWORD [40+esp] - xor ebx,ecx - shld eax,eax,5 - vpsrld xmm1,xmm2,30 - vpor xmm6,xmm6,xmm0 - add edi,esi - and ebp,ebx - xor ebx,ecx - add edi,eax - vpslld xmm2,xmm2,2 - vmovdqa xmm0,[64+esp] - shrd eax,eax,7 - xor ebp,ecx - vpxor xmm6,xmm6,xmm1 - mov esi,edi - add edx,DWORD [44+esp] - xor eax,ebx - shld edi,edi,5 - vpxor xmm6,xmm6,xmm2 - add edx,ebp - and esi,eax - vmovdqa xmm2,[112+esp] - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov ebp,edx - add ecx,DWORD [48+esp] - vpaddd xmm2,xmm2,xmm6 - vmovdqa [64+esp],xmm3 - xor edi,eax - shld edx,edx,5 - vpsrldq xmm1,xmm6,4 - add ecx,esi - and ebp,edi - vpxor xmm7,xmm7,xmm3 - xor edi,eax - add ecx,edx - vpxor 
xmm1,xmm1,xmm5 - shrd edx,edx,7 - xor ebp,eax - vmovdqa [32+esp],xmm2 - mov esi,ecx - add ebx,DWORD [52+esp] - vpxor xmm7,xmm7,xmm1 - xor edx,edi - shld ecx,ecx,5 - add ebx,ebp - and esi,edx - vpsrld xmm1,xmm7,31 - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpslldq xmm3,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov ebp,ebx - add eax,DWORD [56+esp] - xor ecx,edx - shld ebx,ebx,5 - vpsrld xmm2,xmm3,30 - vpor xmm7,xmm7,xmm1 - add eax,esi - and ebp,ecx - xor ecx,edx - add eax,ebx - vpslld xmm3,xmm3,2 - vmovdqa xmm1,[80+esp] - shrd ebx,ebx,7 - xor ebp,edx - vpxor xmm7,xmm7,xmm2 - mov esi,eax - add edi,DWORD [60+esp] - xor ebx,ecx - shld eax,eax,5 - vpxor xmm7,xmm7,xmm3 - add edi,ebp - and esi,ebx - vmovdqa xmm3,[112+esp] - xor ebx,ecx - add edi,eax - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov ebp,edi - add edx,DWORD [esp] - vpxor xmm0,xmm0,xmm1 - vmovdqa [80+esp],xmm4 - xor eax,ebx - shld edi,edi,5 - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - add edx,esi - and ebp,eax - vpxor xmm0,xmm0,xmm2 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor ebp,ebx - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - mov esi,edx - add ecx,DWORD [4+esp] - xor edi,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,ebp - and esi,edi - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD [8+esp] - vpor xmm0,xmm0,xmm2 - xor edx,edi - shld ecx,ecx,5 - vmovdqa xmm2,[96+esp] - add ebx,esi - and ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [12+esp] - xor ebp,edi - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add edi,DWORD [16+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - vmovdqa [96+esp],xmm5 - add edi,esi - xor ebp,ecx - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm1,xmm1,xmm3 - add edx,DWORD [20+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm1,xmm1,2 - add ecx,DWORD [24+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm1,xmm1,xmm3 - add ebx,DWORD [28+esp] - xor ebp,edi - vmovdqa xmm3,[64+esp] - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD [32+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - vmovdqa [64+esp],xmm6 - add eax,esi - xor ebp,edx - vmovdqa xmm6,[128+esp] - vpaddd xmm5,xmm5,xmm1 - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm4 - add edi,DWORD [36+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpslld xmm2,xmm2,2 - add edx,DWORD [40+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vpor xmm2,xmm2,xmm4 - add ecx,DWORD [44+esp] - xor ebp,eax - vmovdqa xmm4,[80+esp] - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD [48+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [80+esp],xmm7 - add ebx,esi - xor ebp,edi - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm5 - add eax,DWORD [52+esp] - xor 
ebp,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add edi,DWORD [56+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - vpor xmm3,xmm3,xmm5 - add edx,DWORD [60+esp] - xor ebp,ebx - vmovdqa xmm5,[96+esp] - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpalignr xmm6,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD [esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - vpxor xmm4,xmm4,xmm5 - vmovdqa [96+esp],xmm0 - add ecx,esi - xor ebp,eax - vmovdqa xmm0,xmm7 - vpaddd xmm7,xmm7,xmm3 - shrd edi,edi,7 - add ecx,edx - vpxor xmm4,xmm4,xmm6 - add ebx,DWORD [4+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm6,xmm4,30 - vmovdqa [48+esp],xmm7 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD [8+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm6 - add edi,DWORD [12+esp] - xor ebp,ecx - vmovdqa xmm6,[64+esp] - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpalignr xmm7,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD [16+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - vpxor xmm5,xmm5,xmm6 - vmovdqa [64+esp],xmm1 - add edx,esi - xor ebp,ebx - vmovdqa xmm1,xmm0 - vpaddd xmm0,xmm0,xmm4 - shrd eax,eax,7 - add edx,edi - vpxor xmm5,xmm5,xmm7 - add ecx,DWORD [20+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm7,xmm5,30 - vmovdqa [esp],xmm0 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD [24+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm7 - add eax,DWORD [28+esp] - vmovdqa xmm7,[80+esp] - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm0,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add edi,DWORD [32+esp] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - vmovdqa [80+esp],xmm2 - mov ebp,eax - xor esi,ecx - vmovdqa xmm2,xmm1 - vpaddd xmm1,xmm1,xmm5 - shld eax,eax,5 - add edi,esi - vpxor xmm6,xmm6,xmm0 - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [36+esp] - vpsrld xmm0,xmm6,30 - vmovdqa [16+esp],xmm1 - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - vpslld xmm6,xmm6,2 - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [40+esp] - and esi,eax - vpor xmm6,xmm6,xmm0 - xor eax,ebx - shrd edi,edi,7 - vmovdqa xmm0,[96+esp] - mov ebp,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [44+esp] - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - vpalignr xmm1,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD [48+esp] - and esi,edx - xor edx,edi - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - vmovdqa [96+esp],xmm3 - mov ebp,ebx - xor esi,edx - vmovdqa xmm3,[144+esp] - vpaddd xmm2,xmm2,xmm6 - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm1 - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [52+esp] - vpsrld xmm1,xmm7,30 - vmovdqa [32+esp],xmm2 - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor ebp,ecx - shld eax,eax,5 - add 
edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [56+esp] - and esi,ebx - vpor xmm7,xmm7,xmm1 - xor ebx,ecx - shrd eax,eax,7 - vmovdqa xmm1,[64+esp] - mov ebp,edi - xor esi,ebx - shld edi,edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [60+esp] - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD [esp] - and esi,edi - xor edi,eax - shrd edx,edx,7 - vpxor xmm0,xmm0,xmm1 - vmovdqa [64+esp],xmm4 - mov ebp,ecx - xor esi,edi - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm2 - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [4+esp] - vpsrld xmm2,xmm0,30 - vmovdqa [48+esp],xmm3 - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [8+esp] - and esi,ecx - vpor xmm0,xmm0,xmm2 - xor ecx,edx - shrd ebx,ebx,7 - vmovdqa xmm2,[80+esp] - mov ebp,eax - xor esi,ecx - shld eax,eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD [12+esp] - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD [16+esp] - and esi,eax - xor eax,ebx - shrd edi,edi,7 - vpxor xmm1,xmm1,xmm2 - vmovdqa [80+esp],xmm5 - mov ebp,edx - xor esi,eax - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm3 - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [20+esp] - vpsrld xmm3,xmm1,30 - vmovdqa [esp],xmm4 - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - vpslld xmm1,xmm1,2 - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [24+esp] - and esi,edx - vpor xmm1,xmm1,xmm3 - xor edx,edi - shrd ecx,ecx,7 - vmovdqa xmm3,[96+esp] - mov ebp,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD [28+esp] - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD [32+esp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - vmovdqa [96+esp],xmm6 - mov ebp,edi - xor esi,ebx - vmovdqa xmm6,xmm5 - vpaddd xmm5,xmm5,xmm1 - shld edi,edi,5 - add edx,esi - vpxor xmm2,xmm2,xmm4 - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD [36+esp] - vpsrld xmm4,xmm2,30 - vmovdqa [16+esp],xmm5 - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD [40+esp] - and esi,edi - vpor xmm2,xmm2,xmm4 - xor edi,eax - shrd edx,edx,7 - vmovdqa xmm4,[64+esp] - mov ebp,ecx - xor esi,edi - shld ecx,ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD [44+esp] - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add edi,DWORD [48+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa [64+esp],xmm7 - add edi,esi - xor ebp,ecx - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd 
ebx,ebx,7 - add edi,eax - vpxor xmm3,xmm3,xmm5 - add edx,DWORD [52+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm5,xmm3,30 - vmovdqa [32+esp],xmm6 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm3,xmm3,2 - add ecx,DWORD [56+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm3,xmm3,xmm5 - add ebx,DWORD [60+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [esp] - vpaddd xmm7,xmm7,xmm3 - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa [48+esp],xmm7 - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [4+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [8+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [12+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - mov ebp,DWORD [196+esp] - cmp ebp,DWORD [200+esp] - je NEAR L$010done - vmovdqa xmm7,[160+esp] - vmovdqa xmm6,[176+esp] - vmovdqu xmm0,[ebp] - vmovdqu xmm1,[16+ebp] - vmovdqu xmm2,[32+ebp] - vmovdqu xmm3,[48+ebp] - add ebp,64 - vpshufb xmm0,xmm0,xmm6 - mov DWORD [196+esp],ebp - vmovdqa [96+esp],xmm7 - add ebx,DWORD [16+esp] - xor esi,edi - vpshufb xmm1,xmm1,xmm6 - mov ebp,ecx - shld ecx,ecx,5 - vpaddd xmm4,xmm0,xmm7 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vmovdqa [esp],xmm4 - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - vpshufb xmm2,xmm2,xmm6 - mov ebp,edx - shld edx,edx,5 - vpaddd xmm5,xmm1,xmm7 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vmovdqa [16+esp],xmm5 - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm6 - mov ebp,edi - shld edi,edi,5 - vpaddd xmm6,xmm2,xmm7 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vmovdqa [32+esp],xmm6 - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov ebx,ecx - mov DWORD [8+ebp],ecx - xor ebx,edx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - mov ebp,esi - and esi,ebx - mov ebx,ebp - jmp NEAR L$009loop -align 16 -L$010done: - add ebx,DWORD [16+esp] - xor esi,edi - 
mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [20+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [24+esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [28+esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [32+esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [36+esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [40+esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD [44+esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD [48+esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD [52+esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD [56+esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD [60+esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - vzeroall - mov ebp,DWORD [192+esp] - add eax,DWORD [ebp] - mov esp,DWORD [204+esp] - add esi,DWORD [4+ebp] - add ecx,DWORD [8+ebp] - mov DWORD [ebp],eax - add edx,DWORD [12+ebp] - mov DWORD [4+ebp],esi - add edi,DWORD [16+ebp] - mov DWORD [8+ebp],ecx - mov DWORD [12+ebp],edx - mov DWORD [16+ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret align 64 L$K_XX_XX: dd 1518500249,1518500249,1518500249,1518500249 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm index 7d8398c7d37..922c31604a2 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm @@ -46,13 +46,12 @@ L$000pic_point: or ecx,ebx and ecx,1342177280 cmp ecx,1342177280 - je NEAR L$005AVX test ebx,512 - jnz NEAR L$006SSSE3 + jnz NEAR L$005SSSE3 L$003no_xmm: sub eax,edi cmp eax,256 - jae NEAR L$007unrolled + jae NEAR L$006unrolled jmp NEAR L$002loop align 16 L$002loop: @@ -124,7 +123,7 @@ L$002loop: mov DWORD [28+esp],ecx mov DWORD [32+esp],edi align 16 -L$00800_15: +L$00700_15: mov ecx,edx mov esi,DWORD [24+esp] ror ecx,14 @@ -162,11 +161,11 @@ L$00800_15: add ebp,4 add eax,ebx cmp esi,3248222580 - jne NEAR L$00800_15 + jne NEAR L$00700_15 mov ecx,DWORD [156+esp] - jmp NEAR L$00916_63 + jmp NEAR L$00816_63 align 16 -L$00916_63: +L$00816_63: mov ebx,ecx mov esi,DWORD [104+esp] ror ecx,11 @@ -221,7 +220,7 @@ L$00916_63: add ebp,4 add eax,ebx cmp esi,3329325298 - jne NEAR L$00916_63 + jne NEAR L$00816_63 mov esi,DWORD [356+esp] mov ebx,DWORD [8+esp] mov ecx,DWORD [16+esp] @@ -265,7 +264,7 @@ db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 db 62,0 align 16 -L$007unrolled: +L$006unrolled: lea esp,[esp-96] mov eax,DWORD [esi] mov ebp,DWORD [4+esi] @@ -282,9 +281,9 @@ L$007unrolled: mov DWORD [20+esp],ebx mov DWORD [24+esp],ecx mov DWORD [28+esp],esi - jmp NEAR 
L$010grand_loop + jmp NEAR L$009grand_loop align 16 -L$010grand_loop: +L$009grand_loop: mov ebx,DWORD [edi] mov ecx,DWORD [4+edi] bswap ebx @@ -3164,7 +3163,7 @@ L$010grand_loop: mov DWORD [24+esp],ebx mov DWORD [28+esp],ecx cmp edi,DWORD [104+esp] - jb NEAR L$010grand_loop + jb NEAR L$009grand_loop mov esp,DWORD [108+esp] pop edi pop esi @@ -3183,9 +3182,9 @@ L$004shaext: pshufd xmm2,xmm2,27 db 102,15,58,15,202,8 punpcklqdq xmm2,xmm0 - jmp NEAR L$011loop_shaext + jmp NEAR L$010loop_shaext align 16 -L$011loop_shaext: +L$010loop_shaext: movdqu xmm3,[edi] movdqu xmm4,[16+edi] movdqu xmm5,[32+edi] @@ -3355,7 +3354,7 @@ db 15,56,203,209 db 15,56,203,202 paddd xmm2,[16+esp] paddd xmm1,[esp] - jnz NEAR L$011loop_shaext + jnz NEAR L$010loop_shaext pshufd xmm2,xmm2,177 pshufd xmm7,xmm1,27 pshufd xmm1,xmm1,177 @@ -3370,7 +3369,7 @@ db 102,15,58,15,215,8 pop ebp ret align 32 -L$006SSSE3: +L$005SSSE3: lea esp,[esp-96] mov eax,DWORD [esi] mov ebx,DWORD [4+esi] @@ -3389,9 +3388,9 @@ L$006SSSE3: mov DWORD [24+esp],ecx mov DWORD [28+esp],esi movdqa xmm7,[256+ebp] - jmp NEAR L$012grand_ssse3 + jmp NEAR L$011grand_ssse3 align 16 -L$012grand_ssse3: +L$011grand_ssse3: movdqu xmm0,[edi] movdqu xmm1,[16+edi] movdqu xmm2,[32+edi] @@ -3414,9 +3413,9 @@ db 102,15,56,0,223 paddd xmm7,xmm3 movdqa [64+esp],xmm6 movdqa [80+esp],xmm7 - jmp NEAR L$013ssse3_00_47 + jmp NEAR L$012ssse3_00_47 align 16 -L$013ssse3_00_47: +L$012ssse3_00_47: add ebp,64 mov ecx,edx movdqa xmm4,xmm1 @@ -4059,7 +4058,7 @@ db 102,15,58,15,249,4 add eax,ecx movdqa [80+esp],xmm6 cmp DWORD [64+ebp],66051 - jne NEAR L$013ssse3_00_47 + jne NEAR L$012ssse3_00_47 mov ecx,edx ror edx,14 mov esi,DWORD [20+esp] @@ -4573,2217 +4572,12 @@ db 102,15,58,15,249,4 movdqa xmm7,[64+ebp] sub ebp,192 cmp edi,DWORD [104+esp] - jb NEAR L$012grand_ssse3 + jb NEAR L$011grand_ssse3 mov esp,DWORD [108+esp] pop edi pop esi pop ebx pop ebp ret -align 32 -L$005AVX: - and edx,264 - cmp edx,264 - je NEAR L$014AVX_BMI - lea esp,[esp-96] - vzeroall - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [4+esp],ebx - xor ebx,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edi - mov edx,DWORD [16+esi] - mov edi,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ecx - mov DWORD [28+esp],esi - vmovdqa xmm7,[256+ebp] - jmp NEAR L$015grand_avx -align 32 -L$015grand_avx: - vmovdqu xmm0,[edi] - vmovdqu xmm1,[16+edi] - vmovdqu xmm2,[32+edi] - vmovdqu xmm3,[48+edi] - add edi,64 - vpshufb xmm0,xmm0,xmm7 - mov DWORD [100+esp],edi - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,[ebp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,[16+ebp] - vpaddd xmm6,xmm2,[32+ebp] - vpaddd xmm7,xmm3,[48+ebp] - vmovdqa [32+esp],xmm4 - vmovdqa [48+esp],xmm5 - vmovdqa [64+esp],xmm6 - vmovdqa [80+esp],xmm7 - jmp NEAR L$016avx_00_47 -align 16 -L$016avx_00_47: - add ebp,64 - vpalignr xmm4,xmm1,xmm0,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [20+esp] - vpalignr xmm7,xmm3,xmm2,4 - xor edx,ecx - mov edi,DWORD [24+esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - vpaddd xmm0,xmm0,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD [4+esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD [28+esp] - vpshufd xmm7,xmm3,250 - shrd ecx,ecx,11 - and ebx,eax - 
xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD [32+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD [12+esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [16+esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD [20+esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD [12+esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD [esp] - vpaddd xmm0,xmm0,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [28+esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD [24+esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD [36+esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD [8+esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [12+esp] - vpaddd xmm0,xmm0,xmm7 - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - vpshufd xmm7,xmm0,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [24+esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD [40+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD [4+esp] - add ebx,ecx - vpaddd xmm0,xmm0,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [8+esp] - vpaddd xmm6,xmm0,[ebp] - xor edx,ecx - mov edi,DWORD [12+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [4+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [44+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - vmovdqa [32+esp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [4+esp] - vpalignr xmm7,xmm0,xmm3,4 - xor edx,ecx - mov edi,DWORD [8+esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD [esp],ecx - vpaddd xmm1,xmm1,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD [20+esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [16+esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD [12+esp] - vpshufd xmm7,xmm0,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD [48+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD [28+esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD [4+esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD [28+esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD [16+esp] - vpaddd xmm1,xmm1,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [12+esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD [8+esp] - 
vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD [52+esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD [24+esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [28+esp] - vpaddd xmm1,xmm1,xmm7 - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - vpshufd xmm7,xmm1,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [8+esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD [56+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD [20+esp] - add ebx,ecx - vpaddd xmm1,xmm1,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [24+esp] - vpaddd xmm6,xmm1,[16+ebp] - xor edx,ecx - mov edi,DWORD [28+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [20+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [60+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - vmovdqa [48+esp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [20+esp] - vpalignr xmm7,xmm1,xmm0,4 - xor edx,ecx - mov edi,DWORD [24+esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - vpaddd xmm2,xmm2,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD [4+esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD [28+esp] - vpshufd xmm7,xmm1,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD [64+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD [12+esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [16+esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD [20+esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD [12+esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD [esp] - vpaddd xmm2,xmm2,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [28+esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD [24+esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD [68+esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD [8+esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [12+esp] - vpaddd xmm2,xmm2,xmm7 - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - vpshufd xmm7,xmm2,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [24+esp],eax - vpsrlq 
xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD [72+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD [4+esp] - add ebx,ecx - vpaddd xmm2,xmm2,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [8+esp] - vpaddd xmm6,xmm2,[32+ebp] - xor edx,ecx - mov edi,DWORD [12+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [4+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [76+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - vmovdqa [64+esp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [4+esp] - vpalignr xmm7,xmm2,xmm1,4 - xor edx,ecx - mov edi,DWORD [8+esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD [esp],ecx - vpaddd xmm3,xmm3,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD [20+esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [16+esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD [12+esp] - vpshufd xmm7,xmm2,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD [80+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD [28+esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD [4+esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD [28+esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD [16+esp] - vpaddd xmm3,xmm3,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [12+esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD [8+esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD [84+esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD [24+esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [28+esp] - vpaddd xmm3,xmm3,xmm7 - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - vpshufd xmm7,xmm3,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [8+esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD [88+esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD [20+esp] - add ebx,ecx - vpaddd xmm3,xmm3,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [24+esp] - vpaddd xmm6,xmm3,[48+ebp] - xor edx,ecx - mov edi,DWORD [28+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [20+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add 
edx,DWORD [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [92+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - vmovdqa [80+esp],xmm6 - cmp DWORD [64+ebp],66051 - jne NEAR L$016avx_00_47 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [20+esp] - xor edx,ecx - mov edi,DWORD [24+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [4+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [28+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [32+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [12+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [16+esp] - xor edx,ecx - mov edi,DWORD [20+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [12+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [28+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [24+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [36+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [24+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [40+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [4+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [4+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [44+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [4+esp] - xor edx,ecx - mov edi,DWORD [8+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [20+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [16+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [12+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [48+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [28+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [esp] - xor edx,ecx - mov edi,DWORD [4+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [28+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [16+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [12+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [8+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [52+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [24+esp],ecx 
- xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [8+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [56+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [20+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [24+esp] - xor edx,ecx - mov edi,DWORD [28+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [20+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [60+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [20+esp] - xor edx,ecx - mov edi,DWORD [24+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [16+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [4+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [28+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [64+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [12+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [16+esp] - xor edx,ecx - mov edi,DWORD [20+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [12+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [28+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [24+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [68+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [8+esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [12+esp] - xor edx,ecx - mov edi,DWORD [16+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [8+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [28+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [24+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [20+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [72+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [4+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [8+esp] - xor edx,ecx - mov edi,DWORD [12+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [4+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [24+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [20+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [16+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [76+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [4+esp] - xor edx,ecx - mov edi,DWORD [8+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [20+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [16+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [12+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [80+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [28+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD 
[esp] - xor edx,ecx - mov edi,DWORD [4+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [28+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [16+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [12+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [8+esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [84+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [24+esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [28+esp] - xor edx,ecx - mov edi,DWORD [esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [24+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD [12+esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD [8+esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD [4+esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD [88+esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD [20+esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD [24+esp] - xor edx,ecx - mov edi,DWORD [28+esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD [20+esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD [8+esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD [4+esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD [92+esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD [16+esp] - add eax,ecx - mov esi,DWORD [96+esp] - xor ebx,edi - mov ecx,DWORD [12+esp] - add eax,DWORD [esi] - add ebx,DWORD [4+esi] - add edi,DWORD [8+esi] - add ecx,DWORD [12+esi] - mov DWORD [esi],eax - mov DWORD [4+esi],ebx - mov DWORD [8+esi],edi - mov DWORD [12+esi],ecx - mov DWORD [4+esp],ebx - xor ebx,edi - mov DWORD [8+esp],edi - mov DWORD [12+esp],ecx - mov edi,DWORD [20+esp] - mov ecx,DWORD [24+esp] - add edx,DWORD [16+esi] - add edi,DWORD [20+esi] - add ecx,DWORD [24+esi] - mov DWORD [16+esi],edx - mov DWORD [20+esi],edi - mov DWORD [20+esp],edi - mov edi,DWORD [28+esp] - mov DWORD [24+esi],ecx - add edi,DWORD [28+esi] - mov DWORD [24+esp],ecx - mov DWORD [28+esi],edi - mov DWORD [28+esp],edi - mov edi,DWORD [100+esp] - vmovdqa xmm7,[64+ebp] - sub ebp,192 - cmp edi,DWORD [104+esp] - jb NEAR L$015grand_avx - mov esp,DWORD [108+esp] - vzeroall - pop edi - pop esi - pop ebx - pop ebp - ret -align 32 -L$014AVX_BMI: - lea esp,[esp-96] - vzeroall - mov eax,DWORD [esi] - mov ebx,DWORD [4+esi] - mov ecx,DWORD [8+esi] - mov edi,DWORD [12+esi] - mov DWORD [4+esp],ebx - xor ebx,ecx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],edi - mov edx,DWORD [16+esi] - mov edi,DWORD [20+esi] - mov ecx,DWORD [24+esi] - mov esi,DWORD [28+esi] - mov DWORD [20+esp],edi - mov edi,DWORD [100+esp] - mov DWORD [24+esp],ecx - mov DWORD [28+esp],esi - vmovdqa xmm7,[256+ebp] - jmp NEAR L$017grand_avx_bmi -align 32 -L$017grand_avx_bmi: - vmovdqu xmm0,[edi] - vmovdqu xmm1,[16+edi] - vmovdqu xmm2,[32+edi] - vmovdqu xmm3,[48+edi] - add edi,64 - vpshufb xmm0,xmm0,xmm7 - mov DWORD [100+esp],edi - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,[ebp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,[16+ebp] - vpaddd xmm6,xmm2,[32+ebp] - vpaddd xmm7,xmm3,[48+ebp] - vmovdqa [32+esp],xmm4 - vmovdqa [48+esp],xmm5 - vmovdqa [64+esp],xmm6 - vmovdqa [80+esp],xmm7 - jmp NEAR L$018avx_bmi_00_47 -align 16 -L$018avx_bmi_00_47: - add ebp,64 - vpalignr xmm4,xmm1,xmm0,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [16+esp],edx - vpalignr 
xmm7,xmm3,xmm2,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [24+esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD [20+esp] - mov DWORD [esp],eax - vpaddd xmm0,xmm0,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD [4+esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD [28+esp] - and ebx,eax - add edx,DWORD [32+esp] - vpshufd xmm7,xmm3,250 - xor ebx,edi - add ecx,edx - add edx,DWORD [12+esp] - vpsrld xmm6,xmm6,11 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD [12+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD [20+esp] - xor ecx,edi - and edx,DWORD [16+esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD [28+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD [esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD [24+esp] - and eax,ebx - add edx,DWORD [36+esp] - vpaddd xmm0,xmm0,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD [8+esp] - vpxor xmm6,xmm6,xmm5 - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD [8+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD [16+esp] - xor ecx,edi - and edx,DWORD [12+esp] - vpshufd xmm7,xmm6,132 - mov DWORD [24+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm0,xmm0,xmm7 - mov edi,DWORD [28+esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm0,80 - add edx,DWORD [20+esp] - and ebx,eax - add edx,DWORD [40+esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD [4+esp] - vpsrlq xmm5,xmm7,17 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD [4+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD [12+esp] - xor ecx,edi - and edx,DWORD [8+esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD [20+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD [24+esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm0,xmm0,xmm7 - add edx,DWORD [16+esp] - and eax,ebx - add edx,DWORD [44+esp] - vpaddd xmm6,xmm0,[ebp] - xor eax,edi - add ecx,edx - add edx,DWORD [esp] - lea eax,[ecx*1+eax] - vmovdqa [32+esp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [esp],edx - vpalignr xmm7,xmm0,xmm3,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [8+esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD [4+esp] - mov DWORD [16+esp],eax - vpaddd xmm1,xmm1,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD [20+esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD [12+esp] - and ebx,eax - add edx,DWORD [48+esp] - vpshufd xmm7,xmm0,250 - xor ebx,edi - add ecx,edx - add edx,DWORD [28+esp] - vpsrld xmm6,xmm6,11 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD [28+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD [4+esp] - xor ecx,edi - and edx,DWORD [esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD [12+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx 
esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD [16+esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD [8+esp] - and eax,ebx - add edx,DWORD [52+esp] - vpaddd xmm1,xmm1,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD [24+esp] - vpxor xmm6,xmm6,xmm5 - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD [24+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD [esp] - xor ecx,edi - and edx,DWORD [28+esp] - vpshufd xmm7,xmm6,132 - mov DWORD [8+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm1,xmm1,xmm7 - mov edi,DWORD [12+esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm1,80 - add edx,DWORD [4+esp] - and ebx,eax - add edx,DWORD [56+esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD [20+esp] - vpsrlq xmm5,xmm7,17 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD [20+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD [28+esp] - xor ecx,edi - and edx,DWORD [24+esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD [4+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD [8+esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm1,xmm1,xmm7 - add edx,DWORD [esp] - and eax,ebx - add edx,DWORD [60+esp] - vpaddd xmm6,xmm1,[16+ebp] - xor eax,edi - add ecx,edx - add edx,DWORD [16+esp] - lea eax,[ecx*1+eax] - vmovdqa [48+esp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [16+esp],edx - vpalignr xmm7,xmm1,xmm0,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [24+esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD [20+esp] - mov DWORD [esp],eax - vpaddd xmm2,xmm2,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD [4+esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD [28+esp] - and ebx,eax - add edx,DWORD [64+esp] - vpshufd xmm7,xmm1,250 - xor ebx,edi - add ecx,edx - add edx,DWORD [12+esp] - vpsrld xmm6,xmm6,11 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD [12+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD [20+esp] - xor ecx,edi - and edx,DWORD [16+esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD [28+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD [esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD [24+esp] - and eax,ebx - add edx,DWORD [68+esp] - vpaddd xmm2,xmm2,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD [8+esp] - vpxor xmm6,xmm6,xmm5 - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD [8+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD [16+esp] - xor ecx,edi - and edx,DWORD [12+esp] - vpshufd xmm7,xmm6,132 - mov DWORD [24+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm2,xmm2,xmm7 - mov edi,DWORD [28+esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm2,80 - add edx,DWORD [20+esp] - and 
ebx,eax - add edx,DWORD [72+esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD [4+esp] - vpsrlq xmm5,xmm7,17 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD [4+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD [12+esp] - xor ecx,edi - and edx,DWORD [8+esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD [20+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD [24+esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm2,xmm2,xmm7 - add edx,DWORD [16+esp] - and eax,ebx - add edx,DWORD [76+esp] - vpaddd xmm6,xmm2,[32+ebp] - xor eax,edi - add ecx,edx - add edx,DWORD [esp] - lea eax,[ecx*1+eax] - vmovdqa [64+esp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [esp],edx - vpalignr xmm7,xmm2,xmm1,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [8+esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD [4+esp] - mov DWORD [16+esp],eax - vpaddd xmm3,xmm3,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD [20+esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD [12+esp] - and ebx,eax - add edx,DWORD [80+esp] - vpshufd xmm7,xmm2,250 - xor ebx,edi - add ecx,edx - add edx,DWORD [28+esp] - vpsrld xmm6,xmm6,11 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD [28+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD [4+esp] - xor ecx,edi - and edx,DWORD [esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD [12+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD [16+esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD [8+esp] - and eax,ebx - add edx,DWORD [84+esp] - vpaddd xmm3,xmm3,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD [24+esp] - vpxor xmm6,xmm6,xmm5 - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD [24+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD [esp] - xor ecx,edi - and edx,DWORD [28+esp] - vpshufd xmm7,xmm6,132 - mov DWORD [8+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm3,xmm3,xmm7 - mov edi,DWORD [12+esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm3,80 - add edx,DWORD [4+esp] - and ebx,eax - add edx,DWORD [88+esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD [20+esp] - vpsrlq xmm5,xmm7,17 - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD [20+esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD [28+esp] - xor ecx,edi - and edx,DWORD [24+esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD [4+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD [8+esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm3,xmm3,xmm7 - add edx,DWORD [esp] - and eax,ebx - add edx,DWORD [92+esp] - vpaddd xmm6,xmm3,[48+ebp] - xor eax,edi - add ecx,edx - add edx,DWORD [16+esp] - lea eax,[ecx*1+eax] - vmovdqa [80+esp],xmm6 - cmp DWORD [64+ebp],66051 - jne NEAR L$018avx_bmi_00_47 - rorx 
ecx,edx,6 - rorx esi,edx,11 - mov DWORD [16+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [24+esp] - xor ecx,edi - and edx,DWORD [20+esp] - mov DWORD [esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [4+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [28+esp] - and ebx,eax - add edx,DWORD [32+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [12+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [12+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [20+esp] - xor ecx,edi - and edx,DWORD [16+esp] - mov DWORD [28+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [24+esp] - and eax,ebx - add edx,DWORD [36+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [8+esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [8+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [16+esp] - xor ecx,edi - and edx,DWORD [12+esp] - mov DWORD [24+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [28+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [20+esp] - and ebx,eax - add edx,DWORD [40+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [4+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [4+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [12+esp] - xor ecx,edi - and edx,DWORD [8+esp] - mov DWORD [20+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [24+esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [16+esp] - and eax,ebx - add edx,DWORD [44+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [8+esp] - xor ecx,edi - and edx,DWORD [4+esp] - mov DWORD [16+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [20+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [12+esp] - and ebx,eax - add edx,DWORD [48+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [28+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [28+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [4+esp] - xor ecx,edi - and edx,DWORD [esp] - mov DWORD [12+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [16+esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [8+esp] - and eax,ebx - add edx,DWORD [52+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [24+esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [24+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [esp] - xor ecx,edi - and edx,DWORD [28+esp] - mov DWORD [8+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [12+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [4+esp] - and ebx,eax - add edx,DWORD [56+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [20+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [20+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [28+esp] - xor ecx,edi - and edx,DWORD [24+esp] - mov DWORD [4+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx 
esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [8+esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [esp] - and eax,ebx - add edx,DWORD [60+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [16+esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [16+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [24+esp] - xor ecx,edi - and edx,DWORD [20+esp] - mov DWORD [esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [4+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [28+esp] - and ebx,eax - add edx,DWORD [64+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [12+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [12+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [20+esp] - xor ecx,edi - and edx,DWORD [16+esp] - mov DWORD [28+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [24+esp] - and eax,ebx - add edx,DWORD [68+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [8+esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [8+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [16+esp] - xor ecx,edi - and edx,DWORD [12+esp] - mov DWORD [24+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [28+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [20+esp] - and ebx,eax - add edx,DWORD [72+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [4+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [4+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [12+esp] - xor ecx,edi - and edx,DWORD [8+esp] - mov DWORD [20+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [24+esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [16+esp] - and eax,ebx - add edx,DWORD [76+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [8+esp] - xor ecx,edi - and edx,DWORD [4+esp] - mov DWORD [16+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [20+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [12+esp] - and ebx,eax - add edx,DWORD [80+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD [28+esp] - lea ebx,[ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [28+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [4+esp] - xor ecx,edi - and edx,DWORD [esp] - mov DWORD [12+esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,[ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD [16+esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD [8+esp] - and eax,ebx - add edx,DWORD [84+esp] - xor eax,edi - add ecx,edx - add edx,DWORD [24+esp] - lea eax,[ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD [24+esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD [esp] - xor ecx,edi - and edx,DWORD [28+esp] - mov DWORD [8+esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD [12+esp] - xor ecx,esi - xor eax,edi - add edx,DWORD [4+esp] - and ebx,eax - add edx,DWORD [88+esp] - xor ebx,edi - add ecx,edx - add edx,DWORD 
[20+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [20+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [28+esp]
- xor ecx,edi
- and edx,DWORD [24+esp]
- mov DWORD [4+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [8+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [esp]
- and eax,ebx
- add edx,DWORD [92+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [16+esp]
- lea eax,[ecx*1+eax]
- mov esi,DWORD [96+esp]
- xor ebx,edi
- mov ecx,DWORD [12+esp]
- add eax,DWORD [esi]
- add ebx,DWORD [4+esi]
- add edi,DWORD [8+esi]
- add ecx,DWORD [12+esi]
- mov DWORD [esi],eax
- mov DWORD [4+esi],ebx
- mov DWORD [8+esi],edi
- mov DWORD [12+esi],ecx
- mov DWORD [4+esp],ebx
- xor ebx,edi
- mov DWORD [8+esp],edi
- mov DWORD [12+esp],ecx
- mov edi,DWORD [20+esp]
- mov ecx,DWORD [24+esp]
- add edx,DWORD [16+esi]
- add edi,DWORD [20+esi]
- add ecx,DWORD [24+esi]
- mov DWORD [16+esi],edx
- mov DWORD [20+esi],edi
- mov DWORD [20+esp],edi
- mov edi,DWORD [28+esp]
- mov DWORD [24+esi],ecx
- add edi,DWORD [28+esi]
- mov DWORD [24+esp],ecx
- mov DWORD [28+esi],edi
- mov DWORD [28+esp],edi
- mov edi,DWORD [100+esp]
- vmovdqa xmm7,[64+ebp]
- sub ebp,192
- cmp edi,DWORD [104+esp]
- jb NEAR L$017grand_avx_bmi
- mov esp,DWORD [108+esp]
- vzeroall
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
 segment .bss
 common _OPENSSL_ia32cap_P 16
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
index 7d2428b971f..0b98b38c3f6 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
@@ -2656,7 +2656,7 @@ AES_cbc_encrypt:
 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
 .byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
 .p2align 3
 .long 1f - 0f
 .long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
index 6e9fe9d7514..fd0c91fa45c 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
@@ -7,14 +7,6 @@
 .align 32
 aesni_multi_cbc_encrypt:
 .cfi_startproc
- cmpl $2,%edx
- jb .Lenc_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_enc_shortcut
- jmp .Lenc_non_avx
-.align 16
-.Lenc_non_avx:
 movq %rsp,%rax
 .cfi_def_cfa_register %rax
 pushq %rbx
@@ -298,14 +290,6 @@ aesni_multi_cbc_encrypt:
 .align 32
 aesni_multi_cbc_decrypt:
 .cfi_startproc
- cmpl $2,%edx
- jb .Ldec_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_dec_shortcut
- jmp .Ldec_non_avx
-.align 16
-.Ldec_non_avx:
 movq %rsp,%rax
 .cfi_def_cfa_register %rax
 pushq %rbx
@@ -573,1020 +557,7 @@ aesni_multi_cbc_decrypt:
 .byte 0xf3,0xc3
 .cfi_endproc
 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
-.type aesni_multi_cbc_encrypt_avx,@function
-.align 32
-aesni_multi_cbc_encrypt_avx:
-.cfi_startproc
-_avx_cbc_enc_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset
%r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - - - - - - - - subq $192,%rsp - andq $-128,%rsp - movq %rax,16(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 - -.Lenc8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Lenc8x_loop_grande: - - xorl %edx,%edx - - movl -144(%rdi),%ecx - - movq -160(%rdi),%r8 - cmpl %edx,%ecx - - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - - movl -104(%rdi),%ecx - - movq -120(%rdi),%r9 - cmpl %edx,%ecx - - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - - movl -64(%rdi),%ecx - - movq -80(%rdi),%r10 - cmpl %edx,%ecx - - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - - movl -24(%rdi),%ecx - - movq -40(%rdi),%r11 - cmpl %edx,%ecx - - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - - movl 16(%rdi),%ecx - - movq 0(%rdi),%r12 - cmpl %edx,%ecx - - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - - movl 56(%rdi),%ecx - - movq 40(%rdi),%r13 - cmpl %edx,%ecx - - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - - movl 96(%rdi),%ecx - - movq 80(%rdi),%r14 - cmpl %edx,%ecx - - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - - movl 136(%rdi),%ecx - - movq 120(%rdi),%r15 - cmpl %edx,%ecx - - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - testl %edx,%edx - jz .Lenc8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - - vpxor (%r8),%xmm15,%xmm10 - leaq 128(%rsp),%rbp - vpxor (%r9),%xmm15,%xmm11 - vpxor (%r10),%xmm15,%xmm12 - vpxor (%r11),%xmm15,%xmm13 - vpxor %xmm10,%xmm2,%xmm2 - vpxor (%r12),%xmm15,%xmm10 - vpxor %xmm11,%xmm3,%xmm3 - vpxor (%r13),%xmm15,%xmm11 - vpxor %xmm12,%xmm4,%xmm4 - vpxor (%r14),%xmm15,%xmm12 - vpxor %xmm13,%xmm5,%xmm5 - vpxor (%r15),%xmm15,%xmm13 - vpxor %xmm10,%xmm6,%xmm6 - movl $1,%ecx - vpxor %xmm11,%xmm7,%xmm7 - vpxor %xmm12,%xmm8,%xmm8 - vpxor %xmm13,%xmm9,%xmm9 - jmp .Loop_enc8x - -.align 32 -.Loop_enc8x: - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r8),%xmm15,%xmm10 - movq %rbx,64+0(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,0(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - 
leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r9),%xmm15,%xmm11 - movq %rbx,64+8(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,16(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r10),%xmm15,%xmm12 - movq %rbx,64+16(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,32(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r11),%xmm15,%xmm13 - movq %rbx,64+24(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,48(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r12),%xmm15,%xmm10 - movq %rbx,64+32(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r13),%xmm15,%xmm11 - movq %rbx,64+40(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r14),%xmm15,%xmm12 - movq %rbx,64+48(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r15),%xmm15,%xmm13 - movq %rbx,64+56(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc 
%xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Lenc8x_tail: - vaesenc %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesenc %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesenclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesenclast %xmm0,%xmm3,%xmm3 - vaesenclast %xmm0,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenclast %xmm0,%xmm5,%xmm5 - vaesenclast %xmm0,%xmm6,%xmm6 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesenclast %xmm0,%xmm7,%xmm7 - vaesenclast %xmm0,%xmm8,%xmm8 - vmovdqa %xmm14,48(%rsp) - vaesenclast %xmm0,%xmm9,%xmm9 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vpxor 0(%rbp),%xmm2,%xmm2 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vpxor 16(%rbp),%xmm3,%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vpxor 32(%rbp),%xmm4,%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vpxor 48(%rbp),%xmm5,%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vpxor %xmm10,%xmm6,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vpxor %xmm11,%xmm7,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vpxor %xmm12,%xmm8,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vpxor %xmm13,%xmm9,%xmm9 - - decl %edx - jnz .Loop_enc8x - - movq 16(%rsp),%rax -.cfi_def_cfa %rax,8 - - - - - -.Lenc8x_done: - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lenc8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx - -.type aesni_multi_cbc_decrypt_avx,@function -.align 32 -aesni_multi_cbc_decrypt_avx: -.cfi_startproc -_avx_cbc_dec_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - - - - - - - - - subq $256,%rsp - andq $-256,%rsp - subq $192,%rsp - movq %rax,16(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 - -.Ldec8x_body: - vzeroupper - vmovdqu 
(%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Ldec8x_loop_grande: - - xorl %edx,%edx - - movl -144(%rdi),%ecx - - movq -160(%rdi),%r8 - cmpl %edx,%ecx - - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - vmovdqu %xmm2,192(%rsp) - - movl -104(%rdi),%ecx - - movq -120(%rdi),%r9 - cmpl %edx,%ecx - - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - vmovdqu %xmm3,208(%rsp) - - movl -64(%rdi),%ecx - - movq -80(%rdi),%r10 - cmpl %edx,%ecx - - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - vmovdqu %xmm4,224(%rsp) - - movl -24(%rdi),%ecx - - movq -40(%rdi),%r11 - cmpl %edx,%ecx - - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - vmovdqu %xmm5,240(%rsp) - - movl 16(%rdi),%ecx - - movq 0(%rdi),%r12 - cmpl %edx,%ecx - - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - vmovdqu %xmm6,256(%rsp) - - movl 56(%rdi),%ecx - - movq 40(%rdi),%r13 - cmpl %edx,%ecx - - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - vmovdqu %xmm7,272(%rsp) - - movl 96(%rdi),%ecx - - movq 80(%rdi),%r14 - cmpl %edx,%ecx - - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - vmovdqu %xmm8,288(%rsp) - - movl 136(%rdi),%ecx - - movq 120(%rdi),%r15 - cmpl %edx,%ecx - - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - vmovdqu %xmm9,304(%rsp) - testl %edx,%edx - jz .Ldec8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - leaq 192+128(%rsp),%rbp - - vmovdqu (%r8),%xmm2 - vmovdqu (%r9),%xmm3 - vmovdqu (%r10),%xmm4 - vmovdqu (%r11),%xmm5 - vmovdqu (%r12),%xmm6 - vmovdqu (%r13),%xmm7 - vmovdqu (%r14),%xmm8 - vmovdqu (%r15),%xmm9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm6,64(%rbp) - vpxor %xmm15,%xmm6,%xmm6 - vmovdqu %xmm7,80(%rbp) - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm8,96(%rbp) - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu %xmm9,112(%rbp) - vpxor %xmm15,%xmm9,%xmm9 - xorq $0x80,%rbp - movl $1,%ecx - jmp .Loop_dec8x - -.align 32 -.Loop_dec8x: - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r8),%xmm10 - movq %rbx,64+0(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,128(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - 
vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r9),%xmm11 - movq %rbx,64+8(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,144(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r10),%xmm12 - movq %rbx,64+16(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,160(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r11),%xmm13 - movq %rbx,64+24(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,176(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r12),%xmm10 - movq %rbx,64+32(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r13),%xmm11 - movq %rbx,64+40(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r14),%xmm12 - movq %rbx,64+48(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r15),%xmm13 - movq %rbx,64+56(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - 
cmpl $11,%eax - jb .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Ldec8x_tail: - vaesdec %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesdec %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesdeclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesdeclast %xmm0,%xmm3,%xmm3 - vpxor 0(%rbp),%xmm2,%xmm2 - vaesdeclast %xmm0,%xmm4,%xmm4 - vpxor 16(%rbp),%xmm3,%xmm3 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdeclast %xmm0,%xmm5,%xmm5 - vpxor 32(%rbp),%xmm4,%xmm4 - vaesdeclast %xmm0,%xmm6,%xmm6 - vpxor 48(%rbp),%xmm5,%xmm5 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesdeclast %xmm0,%xmm7,%xmm7 - vpxor 64(%rbp),%xmm6,%xmm6 - vaesdeclast %xmm0,%xmm8,%xmm8 - vpxor 80(%rbp),%xmm7,%xmm7 - vmovdqa %xmm14,48(%rsp) - vaesdeclast %xmm0,%xmm9,%xmm9 - vpxor 96(%rbp),%xmm8,%xmm8 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vmovdqu 128+0(%rsp),%xmm2 - vpxor 112(%rbp),%xmm9,%xmm9 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu 128+16(%rsp),%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu 128+32(%rsp),%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu 128+48(%rsp),%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm10,64(%rbp) - vpxor %xmm10,%xmm15,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vmovdqu %xmm11,80(%rbp) - vpxor %xmm11,%xmm15,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vmovdqu %xmm12,96(%rbp) - vpxor %xmm12,%xmm15,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vmovdqu %xmm13,112(%rbp) - vpxor %xmm13,%xmm15,%xmm9 - - xorq $128,%rbp - decl %edx - jnz .Loop_dec8x - - movq 16(%rsp),%rax -.cfi_def_cfa %rax,8 - - - - - -.Ldec8x_done: - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Ldec8x_epilogue: - .byte 0xf3,0xc3 
-.cfi_endproc
-.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
 .p2align 3
 .long 1f - 0f
 .long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
index 68af8c69a68..303a9518821 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
@@ -11,11 +11,6 @@ aesni_cbc_sha1_enc:
 movq OPENSSL_ia32cap_P+4(%rip),%r11
 btq $61,%r11
 jc aesni_cbc_sha1_enc_shaext
- andl $268435456,%r11d
- andl $1073741824,%r10d
- orl %r11d,%r10d
- cmpl $1342177280,%r10d
- je aesni_cbc_sha1_enc_avx
 jmp aesni_cbc_sha1_enc_ssse3
 .byte 0xf3,0xc3
 .cfi_endproc
@@ -1397,1327 +1392,6 @@ aesni_cbc_sha1_enc_ssse3:
 .byte 0xf3,0xc3
 .cfi_endproc
 .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
-.type aesni_cbc_sha1_enc_avx,@function
-.align 32
-aesni_cbc_sha1_enc_avx:
-.cfi_startproc
- movq 8(%rsp),%r10
-
-
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- leaq -104(%rsp),%rsp
-.cfi_adjust_cfa_offset 104
-
-
- vzeroall
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- leaq 112(%rcx),%r15
- vmovdqu (%r8),%xmm12
- movq %r8,88(%rsp)
- shlq $6,%r14
- subq %r12,%r13
- movl 240-112(%r15),%r8d
- addq %r10,%r14
-
- leaq K_XX_XX(%rip),%r11
- movl 0(%r9),%eax
- movl 4(%r9),%ebx
- movl 8(%r9),%ecx
- movl 12(%r9),%edx
- movl %ebx,%esi
- movl 16(%r9),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r10
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm10,%xmm0,%xmm4
- vpaddd %xmm10,%xmm1,%xmm5
- vpaddd %xmm10,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- jmp .Loop_avx
-.align 32
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- vmovdqu 0(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm9
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpor %xmm8,%xmm4,%xmm4
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - vpxor %xmm9,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm10,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm9 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpor %xmm8,%xmm5,%xmm5 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm9,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa 16(%r11),%xmm10 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpaddd %xmm5,%xmm10,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm9 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpor %xmm8,%xmm6,%xmm6 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm9,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm10,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm9 - vpaddd 
%xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpor %xmm8,%xmm7,%xmm7 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - cmpl $11,%r8d - jb .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast6: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm9,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm10,%xmm9 - addl %esi,%edx - vmovdqu 16(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,0(%r12,%r13,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm10,%xmm9 - vmovdqa 32(%r11),%xmm10 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld 
$2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - vpaddd %xmm3,%xmm10,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm10,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast7: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - vmovdqu 
32(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,16(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm10,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm10,%xmm9 - vmovdqa 48(%r11),%xmm10 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm10,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 
32(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm10,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - cmpl $11,%r8d - jb .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast8: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm10,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vmovdqu 48(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,32(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp 
- addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm10,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r14,%r10 - je .Ldone_avx - vmovdqa 64(%r11),%xmm9 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm9,%xmm0,%xmm0 - addq $64,%r10 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm9,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm10,%xmm0,%xmm8 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm8,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm9,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm10,%xmm1,%xmm8 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm8,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm9,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm10,%xmm2,%xmm8 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm8,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast9: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - 
xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - leaq 64(%r12),%r12 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - addl 12(%r9),%edx - movl %eax,0(%r9) - addl 16(%r9),%ebp - movl %esi,4(%r9) - movl %esi,%ebx - movl %ecx,8(%r9) - movl %ecx,%edi - movl %edx,12(%r9) - xorl %edx,%edi - movl %ebp,16(%r9) - andl %edi,%esi - jmp .Loop_avx - -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast10: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - movq 88(%rsp),%r8 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - movl %eax,0(%r9) - addl 12(%r9),%edx - movl %esi,4(%r9) - addl 16(%r9),%ebp - movl %ecx,8(%r9) - movl %edx,12(%r9) - movl %ebp,16(%r9) - vmovups %xmm12,(%r8) - vzeroall - leaq 104(%rsp),%rsi -.cfi_def_cfa %rsi,56 - movq 0(%rsi),%r15 -.cfi_restore %r15 - movq 8(%rsi),%r14 -.cfi_restore %r14 - movq 16(%rsi),%r13 -.cfi_restore %r13 - movq 24(%rsi),%r12 -.cfi_restore %r12 - movq 32(%rsi),%rbp -.cfi_restore %rbp - movq 40(%rsi),%rbx -.cfi_restore %rbx - leaq 48(%rsi),%rsp -.cfi_def_cfa %rsp,8 -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size 
aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2809,17 +1483,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb .Laesenclast11 + jb .Laesenclast6 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast11 + je .Laesenclast6 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast11: +.Laesenclast6: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2875,17 +1549,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb .Laesenclast12 + jb .Laesenclast7 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast12 + je .Laesenclast7 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast12: +.Laesenclast7: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 @@ -2941,17 +1615,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb .Laesenclast13 + jb .Laesenclast8 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast13 + je .Laesenclast8 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast13: +.Laesenclast8: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -3005,17 +1679,17 @@ aesni_cbc_sha1_enc_shaext: movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb .Laesenclast14 + jb .Laesenclast9 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast14 + je .Laesenclast9 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast14: +.Laesenclast9: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx @@ -3033,7 +1707,7 @@ aesni_cbc_sha1_enc_shaext: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s index 0e022a30c0d..f1256ca0eca 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s @@ -6,25 +6,6 @@ .align 16 aesni_cbc_sha256_enc: .cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl $1,%eax - cmpq $0,%rdi - je .Lprobe - movl 0(%r11),%eax - movq 4(%r11),%r10 - btq $61,%r10 - jc aesni_cbc_sha256_enc_shaext - movq %r10,%r11 - shrq $32,%r11 - - testl $2048,%r10d - jnz aesni_cbc_sha256_enc_xop - andl $296,%r11d - cmpl $296,%r11d - je aesni_cbc_sha256_enc_avx2 - andl $268435456,%r10d - jnz aesni_cbc_sha256_enc_avx - ud2 xorl %eax,%eax cmpq $0,%rdi je .Lprobe @@ -76,4364 +57,7 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.type aesni_cbc_sha256_enc_xop,@function -.align 64 -aesni_cbc_sha256_enc_xop: -.cfi_startproc -.Lxop_shortcut: - movq 8(%rsp),%r10 - movq 
%rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_xop: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm2,%xmm3,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm0,%xmm0 - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,251,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm3,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm0,%xmm0 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,248,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm0,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - 
vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm0,%xmm0 - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm3,%xmm0,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm1,%xmm1 - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,248,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm0,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm1,%xmm1 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,249,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm1,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm1,%xmm1 - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl 
%esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm0,%xmm1,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm2,%xmm2 - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,249,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm1,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm2,%xmm2 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,250,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm2,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm2,%xmm2 - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm1,%xmm2,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm3,%xmm3 - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - 
rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,250,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm2,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm3,%xmm3 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,251,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm3,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm3,%xmm3 - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lxop_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl 
%ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d 
- addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor 
%xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jb .Lloop_xop - - movq 64+32(%rsp),%r8 - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop -.type aesni_cbc_sha256_enc_avx,@function -.align 64 -aesni_cbc_sha256_enc_avx: -.cfi_startproc -.Lavx_shortcut: - movq 8(%rsp),%r10 - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_avx: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %r8d,%r13d - shrdl 
$9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm3,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpaddd %xmm6,%xmm0,%xmm0 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm0,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 0(%rbp),%xmm0,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl 
%r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm0,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpaddd %xmm6,%xmm1,%xmm1 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm1,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 32(%rbp),%xmm1,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm1,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - 
andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpaddd %xmm6,%xmm2,%xmm2 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm2,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 64(%rbp),%xmm2,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm2,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl 
%edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpaddd %xmm6,%xmm3,%xmm3 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm3,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 96(%rbp),%xmm3,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lavx_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl 
%r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl 
%r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl 
%r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - jb .Lloop_avx - - movq 64+32(%rsp),%r8 - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx -.type aesni_cbc_sha256_enc_avx2,@function -.align 64 -aesni_cbc_sha256_enc_avx2: -.cfi_startproc -.Lavx2_shortcut: - movq 8(%rsp),%r10 - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $576,%rsp - andq $-1024,%rsp - addq $448,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %rax,120(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 -.Lprologue_avx2: - vzeroall - - movq %rdi,%r13 - vpinsrq $1,%rsi,%xmm15,%xmm15 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r12 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - leaq -9(%r14),%r14 - - vmovdqa 0(%r12,%r14,8),%xmm14 - vmovdqa 16(%r12,%r14,8),%xmm13 - vmovdqa 32(%r12,%r14,8),%xmm12 - - subq $-64,%r13 - movl 0(%r15),%eax - leaq (%rsi,%r13,1),%r12 - movl 4(%r15),%ebx - cmpq %rdx,%r13 - movl 8(%r15),%ecx - cmoveq %rsp,%r12 - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - vmovdqu 0-128(%rdi),%xmm10 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi,%r13,1),%xmm0 - vmovdqu -64+16(%rsi,%r13,1),%xmm1 - vmovdqu -64+32(%rsi,%r13,1),%xmm2 - vmovdqu -64+48(%rsi,%r13,1),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - leaq -64(%r13),%r13 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 
96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - - movq 120(%rsp),%rsi -.cfi_def_cfa %rsi,8 - leaq -64(%rsp),%rsp - - - - movq %rsi,-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - movl %ebx,%esi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%esi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - leaq -64(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 - - pushq 64-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm0,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal 
(%rax,%r8,1),%eax - vpaddd %ymm6,%ymm0,%ymm0 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 0(%rbp),%ymm0,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm1,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm1,%ymm1 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 32(%rbp),%ymm1,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 - - 
pushq 64-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm2,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm2,%ymm2 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 64(%rbp),%ymm2,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal 
(%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm3,%ymm7 - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm3,%ymm3 - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 96(%rbp),%ymm3,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne .Lavx2_00_47 - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl 
%r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - 
xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - 
rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vpextrq $1,%xmm15,%r12 - vmovq %xmm15,%r13 - movq 552(%rsp),%r15 - addl %r14d,%eax - leaq 448(%rsp),%rbp - - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r13),%r13 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - cmpq 80(%rbp),%r13 - je .Ldone_avx2 - - xorl %r14d,%r14d - movl %ebx,%esi - movl %r9d,%r12d - xorl %ecx,%esi - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - 
andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx 
- leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl 
$11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - leaq -64(%rbp),%rbp - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 552(%rsp),%r15 - leaq 64(%r13),%r13 - movq 560(%rsp),%rsi - addl %r14d,%eax - leaq 448(%rsp),%rsp - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - leaq (%rsi,%r13,1),%r12 - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r13 - - movl %eax,0(%r15) - cmoveq %rsp,%r12 - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - - -.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08 - -.Ldone_avx2: - movq 64+32(%rbp),%r8 - movq 64+56(%rbp),%rsi -.cfi_def_cfa %rsi,8 - vmovdqu %xmm8,(%r8) - vzeroall - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 -.type aesni_cbc_sha256_enc_shaext,@function -.align 32 -aesni_cbc_sha256_enc_shaext: -.cfi_startproc - movq 8(%rsp),%r10 - leaq K256+128(%rip),%rax - movdqu (%r9),%xmm1 - movdqu 16(%r9),%xmm2 - movdqa 512-128(%rax),%xmm3 - - movl 240(%rcx),%r11d - subq %rdi,%rsi - movups (%rcx),%xmm15 - movups (%r8),%xmm6 - movups 16(%rcx),%xmm4 - leaq 112(%rcx),%rcx - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - 
movdqa %xmm3,%xmm7 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%r10),%xmm10 - movdqu 16(%r10),%xmm11 - movdqu 32(%r10),%xmm12 -.byte 102,68,15,56,0,211 - movdqu 48(%r10),%xmm13 - - movdqa 0-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 102,68,15,56,0,219 - movdqa %xmm2,%xmm9 - movdqa %xmm1,%xmm8 - movups 0(%rdi),%xmm14 - xorps %xmm15,%xmm14 - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 32-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 102,68,15,56,0,227 - leaq 64(%r10),%r10 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 64-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 102,68,15,56,0,235 -.byte 69,15,56,204,211 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 96-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 -.byte 15,56,203,202 - movdqa 128-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - cmpl $11,%r11d - jb .Laesenclast1 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast1 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast1: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 16(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,0(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 160-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 192-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 224-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 256-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl 
$11,%r11d - jb .Laesenclast2 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast2 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast2: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 32(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,16(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 288-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 320-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 352-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 384-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 416-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - cmpl $11,%r11d - jb .Laesenclast3 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast3 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast3: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups 48(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,32(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 448-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 - movdqa %xmm7,%xmm3 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 480-128(%rax),%xmm0 - paddd %xmm13,%xmm0 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb .Laesenclast4 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast4 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast4: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop - - paddd %xmm9,%xmm2 - paddd %xmm8,%xmm1 - - decq %rdx - movups %xmm6,48(%rsi,%rdi,1) - leaq 64(%rdi),%rdi 
-	jnz	.Loop_shaext
-
-	pshufd	$0xb1,%xmm2,%xmm2
-	pshufd	$0x1b,%xmm1,%xmm3
-	pshufd	$0xb1,%xmm1,%xmm1
-	punpckhqdq	%xmm2,%xmm1
-.byte	102,15,58,15,211,8
-
-	movups	%xmm6,(%r8)
-	movdqu	%xmm1,(%r9)
-	movdqu	%xmm2,16(%r9)
-	.byte	0xf3,0xc3
-.cfi_endproc
-.size	aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
-	.section ".note.gnu.property", "a"
+	.section	.note.gnu.property, #alloc
 	.p2align	3
 	.long	1f - 0f
 	.long	4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
index aa7585f179a..d637b0d12fe 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
@@ -4483,7 +4483,7 @@ __aesni_set_encrypt_key:
 .byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	64
-	.section ".note.gnu.property", "a"
+	.section	.note.gnu.property, #alloc
 	.p2align	3
 	.long	1f - 0f
 	.long	4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
index 5abda703024..57f86f616e9 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
@@ -2595,7 +2595,7 @@ _bsaes_const:
 .byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
 .align	64
 .size	_bsaes_const,.-_bsaes_const
-	.section ".note.gnu.property", "a"
+	.section	.note.gnu.property, #alloc
 	.p2align	3
 	.long	1f - 0f
 	.long	4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
index 4bd2e683b9f..4ee6ed9dc9a 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
@@ -856,7 +856,7 @@ _vpaes_consts:
 .byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
 .align	64
 .size	_vpaes_consts,.-_vpaes_consts
-	.section ".note.gnu.property", "a"
+	.section	.note.gnu.property, #alloc
 	.p2align	3
 	.long	1f - 0f
 	.long	4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
index 7644d07da74..214f397a33a 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
@@ -1,1748 +1,29 @@
 .text
+.globl	rsaz_avx2_eligible
+.type	rsaz_avx2_eligible,@function
+rsaz_avx2_eligible:
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
 .globl	rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.globl	rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_red2norm_avx2
+.globl	rsaz_1024_scatter5_avx2
+.globl	rsaz_1024_gather5_avx2
 .type
rsaz_1024_sqr_avx2,@function -.align 64 rsaz_1024_sqr_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - movq %rax,%rbp -.cfi_def_cfa_register %rbp - movq %rdx,%r13 - subq $832,%rsp - movq %r13,%r15 - subq $-128,%rdi - subq $-128,%rsi - subq $-128,%r13 - - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - vpxor %ymm9,%ymm9,%ymm9 - jz .Lsqr_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%r13),%ymm0 - andq $-2048,%rsp - vmovdqu 32-128(%r13),%ymm1 - vmovdqu 64-128(%r13),%ymm2 - vmovdqu 96-128(%r13),%ymm3 - vmovdqu 128-128(%r13),%ymm4 - vmovdqu 160-128(%r13),%ymm5 - vmovdqu 192-128(%r13),%ymm6 - vmovdqu 224-128(%r13),%ymm7 - vmovdqu 256-128(%r13),%ymm8 - leaq 832+128(%rsp),%r13 - vmovdqu %ymm0,0-128(%r13) - vmovdqu %ymm1,32-128(%r13) - vmovdqu %ymm2,64-128(%r13) - vmovdqu %ymm3,96-128(%r13) - vmovdqu %ymm4,128-128(%r13) - vmovdqu %ymm5,160-128(%r13) - vmovdqu %ymm6,192-128(%r13) - vmovdqu %ymm7,224-128(%r13) - vmovdqu %ymm8,256-128(%r13) - vmovdqu %ymm9,288-128(%r13) - -.Lsqr_1024_no_n_copy: - andq $-1024,%rsp - - vmovdqu 32-128(%rsi),%ymm1 - vmovdqu 64-128(%rsi),%ymm2 - vmovdqu 96-128(%rsi),%ymm3 - vmovdqu 128-128(%rsi),%ymm4 - vmovdqu 160-128(%rsi),%ymm5 - vmovdqu 192-128(%rsi),%ymm6 - vmovdqu 224-128(%rsi),%ymm7 - vmovdqu 256-128(%rsi),%ymm8 - - leaq 192(%rsp),%rbx - vmovdqu .Land_mask(%rip),%ymm15 - jmp .LOOP_GRANDE_SQR_1024 - -.align 32 -.LOOP_GRANDE_SQR_1024: - leaq 576+128(%rsp),%r9 - leaq 448(%rsp),%r12 - - - - - vpaddq %ymm1,%ymm1,%ymm1 - vpbroadcastq 0-128(%rsi),%ymm10 - vpaddq %ymm2,%ymm2,%ymm2 - vmovdqa %ymm1,0-128(%r9) - vpaddq %ymm3,%ymm3,%ymm3 - vmovdqa %ymm2,32-128(%r9) - vpaddq %ymm4,%ymm4,%ymm4 - vmovdqa %ymm3,64-128(%r9) - vpaddq %ymm5,%ymm5,%ymm5 - vmovdqa %ymm4,96-128(%r9) - vpaddq %ymm6,%ymm6,%ymm6 - vmovdqa %ymm5,128-128(%r9) - vpaddq %ymm7,%ymm7,%ymm7 - vmovdqa %ymm6,160-128(%r9) - vpaddq %ymm8,%ymm8,%ymm8 - vmovdqa %ymm7,192-128(%r9) - vpxor %ymm9,%ymm9,%ymm9 - vmovdqa %ymm8,224-128(%r9) - - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpbroadcastq 32-128(%rsi),%ymm11 - vmovdqu %ymm9,288-192(%rbx) - vpmuludq %ymm10,%ymm1,%ymm1 - vmovdqu %ymm9,320-448(%r12) - vpmuludq %ymm10,%ymm2,%ymm2 - vmovdqu %ymm9,352-448(%r12) - vpmuludq %ymm10,%ymm3,%ymm3 - vmovdqu %ymm9,384-448(%r12) - vpmuludq %ymm10,%ymm4,%ymm4 - vmovdqu %ymm9,416-448(%r12) - vpmuludq %ymm10,%ymm5,%ymm5 - vmovdqu %ymm9,448-448(%r12) - vpmuludq %ymm10,%ymm6,%ymm6 - vmovdqu %ymm9,480-448(%r12) - vpmuludq %ymm10,%ymm7,%ymm7 - vmovdqu %ymm9,512-448(%r12) - vpmuludq %ymm10,%ymm8,%ymm8 - vpbroadcastq 64-128(%rsi),%ymm10 - vmovdqu %ymm9,544-448(%r12) - - movq %rsi,%r15 - movl $4,%r14d - jmp .Lsqr_entry_1024 -.align 32 -.LOOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 
64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -.Lsqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz .LOOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq 
$29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $0x93,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull %ecx,%eax - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $0x1fffffff,%eax - - movl $9,%r14d - jmp .LOOP_REDUCE_1024 - -.align 32 -.LOOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq %ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq 
%ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq $29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq %ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz .LOOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - 
vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu %ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne .LOOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lsqr_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.type rsaz_1024_mul_avx2,@function -.align 64 rsaz_1024_mul_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - movq %rax,%rbp -.cfi_def_cfa_register %rbp - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz .Lmul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu 
%ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu %ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - vmovdqu %ymm9,288-128(%rcx) -.Lmul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu .Land_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp .Loop_mul_1024 - -.align 32 -.Loop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $0x1fffffff,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $0x93,%ymm9,%ymm9 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm12 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - vpblendd $0xfc,%ymm14,%ymm9,%ymm9 - imull %r8d,%eax - vpaddq %ymm9,%ymm4,%ymm4 - andl $0x1fffffff,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - 
vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $0x1fffffff,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $0x1fffffff,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd 
%eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz .Loop_mul_1024 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $0x93,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu %ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand 
%ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lmul_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 -.globl rsaz_1024_red2norm_avx2 -.type rsaz_1024_red2norm_avx2,@function -.align 32 -rsaz_1024_red2norm_avx2: -.cfi_startproc - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq 
%rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,120(%rdi) - movq %r11,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 - -.globl rsaz_1024_norm2red_avx2 -.type rsaz_1024_norm2red_avx2,@function -.align 32 rsaz_1024_norm2red_avx2: -.cfi_startproc - subq $-128,%rdi - movq (%rsi),%r8 - movl $0x1fffffff,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 - andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq 
%r11,40(%rdi) - shrdq $62,%r10,%r9 - andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 -.globl rsaz_1024_scatter5_avx2 -.type rsaz_1024_scatter5_avx2,@function -.align 32 +rsaz_1024_red2norm_avx2: rsaz_1024_scatter5_avx2: -.cfi_startproc - vzeroupper - vmovdqu .Lscatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp .Loop_scatter_1024 - -.align 32 -.Loop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 512(%rdi),%rdi - decl %eax - jnz .Loop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 - -.globl rsaz_1024_gather5_avx2 -.type rsaz_1024_gather5_avx2,@function -.align 32 rsaz_1024_gather5_avx2: -.cfi_startproc - vzeroupper - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - leaq -256(%rsp),%rsp - andq $-32,%rsp - leaq .Linc(%rip),%r10 - leaq -128(%rsp),%rax - - vmovd %edx,%xmm4 - vmovdqa (%r10),%ymm0 - vmovdqa 32(%r10),%ymm1 - vmovdqa 64(%r10),%ymm5 - vpbroadcastd %xmm4,%ymm4 - - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,0+128(%rax) - vpaddd %ymm5,%ymm2,%ymm0 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,32+128(%rax) - vpaddd %ymm5,%ymm3,%ymm1 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,64+128(%rax) - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vmovdqa %ymm3,96+128(%rax) - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,128+128(%rax) - vpaddd %ymm5,%ymm2,%ymm8 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,160+128(%rax) - vpaddd %ymm5,%ymm3,%ymm9 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,192+128(%rax) - vpaddd %ymm5,%ymm8,%ymm10 - vpcmpeqd %ymm4,%ymm8,%ymm8 - vmovdqa %ymm3,224+128(%rax) - vpaddd %ymm5,%ymm9,%ymm11 - vpcmpeqd %ymm4,%ymm9,%ymm9 - vpaddd %ymm5,%ymm10,%ymm12 - vpcmpeqd %ymm4,%ymm10,%ymm10 - vpaddd %ymm5,%ymm11,%ymm13 - vpcmpeqd %ymm4,%ymm11,%ymm11 - vpaddd %ymm5,%ymm12,%ymm14 - vpcmpeqd %ymm4,%ymm12,%ymm12 - vpaddd %ymm5,%ymm13,%ymm15 - vpcmpeqd %ymm4,%ymm13,%ymm13 - vpcmpeqd %ymm4,%ymm14,%ymm14 - vpcmpeqd %ymm4,%ymm15,%ymm15 - - vmovdqa -32(%r10),%ymm7 - leaq 128(%rsi),%rsi - movl $9,%edx - -.Loop_gather_1024: - vmovdqa 0-128(%rsi),%ymm0 - vmovdqa 32-128(%rsi),%ymm1 - vmovdqa 64-128(%rsi),%ymm2 - vmovdqa 96-128(%rsi),%ymm3 - vpand 0+128(%rax),%ymm0,%ymm0 - vpand 32+128(%rax),%ymm1,%ymm1 - vpand 
64+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm1,%ymm4 - vpand 96+128(%rax),%ymm3,%ymm3 - vmovdqa 128-128(%rsi),%ymm0 - vmovdqa 160-128(%rsi),%ymm1 - vpor %ymm2,%ymm3,%ymm5 - vmovdqa 192-128(%rsi),%ymm2 - vmovdqa 224-128(%rsi),%ymm3 - vpand 128+128(%rax),%ymm0,%ymm0 - vpand 160+128(%rax),%ymm1,%ymm1 - vpand 192+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm4,%ymm4 - vpand 224+128(%rax),%ymm3,%ymm3 - vpand 256-128(%rsi),%ymm8,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 288-128(%rsi),%ymm9,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 320-128(%rsi),%ymm10,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 352-128(%rsi),%ymm11,%ymm3 - vpor %ymm0,%ymm4,%ymm4 - vpand 384-128(%rsi),%ymm12,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 416-128(%rsi),%ymm13,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 448-128(%rsi),%ymm14,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 480-128(%rsi),%ymm15,%ymm3 - leaq 512(%rsi),%rsi - vpor %ymm0,%ymm4,%ymm4 - vpor %ymm1,%ymm5,%ymm5 - vpor %ymm2,%ymm4,%ymm4 - vpor %ymm3,%ymm5,%ymm5 - - vpor %ymm5,%ymm4,%ymm4 - vextracti128 $1,%ymm4,%xmm5 - vpor %xmm4,%xmm5,%xmm5 - vpermd %ymm5,%ymm7,%ymm5 - vmovdqu %ymm5,(%rdi) - leaq 32(%rdi),%rdi - decl %edx - jnz .Loop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp +.byte 0x0f,0x0b .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_rsaz_1024_gather5: -.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 - -.globl rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -.align 32 -rsaz_avx2_eligible: - movl OPENSSL_ia32cap_P+8(%rip),%eax - movl $524544,%ecx - movl $0,%edx - andl %eax,%ecx - cmpl $524544,%ecx - cmovel %edx,%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - -.align 64 -.Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff -.Lscatter_permd: -.long 0,2,4,6,7,7,7,7 -.Lgather_permd: -.long 0,7,1,7,2,7,3,7 -.Linc: -.long 0,0,0,0, 1,1,1,1 -.long 2,2,2,2, 3,3,3,3 -.long 4,4,4,4, 4,4,4,4 -.align 64 - .section ".note.gnu.property", "a" +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s index 341cd06cd89..106ae6bbb92 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s @@ -1,884 +1,23 @@ +.text .globl ossl_rsaz_avx512ifma_eligible .type ossl_rsaz_avx512ifma_eligible,@function -.align 32 ossl_rsaz_avx512ifma_eligible: - movl OPENSSL_ia32cap_P+8(%rip),%ecx xorl %eax,%eax - andl $2149777408,%ecx - cmpl $2149777408,%ecx - cmovel %ecx,%eax .byte 0xf3,0xc3 .size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible -.text .globl ossl_rsaz_amm52x20_x1_256 +.globl ossl_rsaz_amm52x20_x2_256 +.globl ossl_extract_multiplier_2x20_win5 .type ossl_rsaz_amm52x20_x1_256,@function -.align 32 ossl_rsaz_amm52x20_x1_256: -.cfi_startproc -.byte 243,15,30,250 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lrsaz_amm52x20_x1_256_body: - - - vpxord %ymm0,%ymm0,%ymm0 - vmovdqa64 %ymm0,%ymm1 - vmovdqa64 %ymm0,%ymm16 - 
vmovdqa64 %ymm0,%ymm17 - vmovdqa64 %ymm0,%ymm18 - vmovdqa64 %ymm0,%ymm19 - - xorl %r9d,%r9d - - movq %rdx,%r11 - movq $0xfffffffffffff,%rax - - - movl $5,%ebx - -.align 32 -.Lloop5: - movq 0(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 0(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - movq %r12,%r10 - adcq $0,%r10 - - movq %r8,%r13 - imulq %r9,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 0(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - adcq %r12,%r10 - - shrq $52,%r9 - salq $12,%r10 - orq %r10,%r9 - - vpmadd52luq 0(%rsi),%ymm3,%ymm1 - vpmadd52luq 32(%rsi),%ymm3,%ymm16 - vpmadd52luq 64(%rsi),%ymm3,%ymm17 - vpmadd52luq 96(%rsi),%ymm3,%ymm18 - vpmadd52luq 128(%rsi),%ymm3,%ymm19 - - vpmadd52luq 0(%rcx),%ymm4,%ymm1 - vpmadd52luq 32(%rcx),%ymm4,%ymm16 - vpmadd52luq 64(%rcx),%ymm4,%ymm17 - vpmadd52luq 96(%rcx),%ymm4,%ymm18 - vpmadd52luq 128(%rcx),%ymm4,%ymm19 - - - valignq $1,%ymm1,%ymm16,%ymm1 - valignq $1,%ymm16,%ymm17,%ymm16 - valignq $1,%ymm17,%ymm18,%ymm17 - valignq $1,%ymm18,%ymm19,%ymm18 - valignq $1,%ymm19,%ymm0,%ymm19 - - vmovq %xmm1,%r13 - addq %r13,%r9 - - vpmadd52huq 0(%rsi),%ymm3,%ymm1 - vpmadd52huq 32(%rsi),%ymm3,%ymm16 - vpmadd52huq 64(%rsi),%ymm3,%ymm17 - vpmadd52huq 96(%rsi),%ymm3,%ymm18 - vpmadd52huq 128(%rsi),%ymm3,%ymm19 - - vpmadd52huq 0(%rcx),%ymm4,%ymm1 - vpmadd52huq 32(%rcx),%ymm4,%ymm16 - vpmadd52huq 64(%rcx),%ymm4,%ymm17 - vpmadd52huq 96(%rcx),%ymm4,%ymm18 - vpmadd52huq 128(%rcx),%ymm4,%ymm19 - movq 8(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 0(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - movq %r12,%r10 - adcq $0,%r10 - - movq %r8,%r13 - imulq %r9,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 0(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - adcq %r12,%r10 - - shrq $52,%r9 - salq $12,%r10 - orq %r10,%r9 - - vpmadd52luq 0(%rsi),%ymm3,%ymm1 - vpmadd52luq 32(%rsi),%ymm3,%ymm16 - vpmadd52luq 64(%rsi),%ymm3,%ymm17 - vpmadd52luq 96(%rsi),%ymm3,%ymm18 - vpmadd52luq 128(%rsi),%ymm3,%ymm19 - - vpmadd52luq 0(%rcx),%ymm4,%ymm1 - vpmadd52luq 32(%rcx),%ymm4,%ymm16 - vpmadd52luq 64(%rcx),%ymm4,%ymm17 - vpmadd52luq 96(%rcx),%ymm4,%ymm18 - vpmadd52luq 128(%rcx),%ymm4,%ymm19 - - - valignq $1,%ymm1,%ymm16,%ymm1 - valignq $1,%ymm16,%ymm17,%ymm16 - valignq $1,%ymm17,%ymm18,%ymm17 - valignq $1,%ymm18,%ymm19,%ymm18 - valignq $1,%ymm19,%ymm0,%ymm19 - - vmovq %xmm1,%r13 - addq %r13,%r9 - - vpmadd52huq 0(%rsi),%ymm3,%ymm1 - vpmadd52huq 32(%rsi),%ymm3,%ymm16 - vpmadd52huq 64(%rsi),%ymm3,%ymm17 - vpmadd52huq 96(%rsi),%ymm3,%ymm18 - vpmadd52huq 128(%rsi),%ymm3,%ymm19 - - vpmadd52huq 0(%rcx),%ymm4,%ymm1 - vpmadd52huq 32(%rcx),%ymm4,%ymm16 - vpmadd52huq 64(%rcx),%ymm4,%ymm17 - vpmadd52huq 96(%rcx),%ymm4,%ymm18 - vpmadd52huq 128(%rcx),%ymm4,%ymm19 - movq 16(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 0(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - movq %r12,%r10 - adcq $0,%r10 - - movq %r8,%r13 - imulq %r9,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 0(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - adcq %r12,%r10 - - shrq $52,%r9 - salq $12,%r10 - orq %r10,%r9 - - vpmadd52luq 0(%rsi),%ymm3,%ymm1 - vpmadd52luq 32(%rsi),%ymm3,%ymm16 - vpmadd52luq 64(%rsi),%ymm3,%ymm17 - vpmadd52luq 96(%rsi),%ymm3,%ymm18 - vpmadd52luq 128(%rsi),%ymm3,%ymm19 - - vpmadd52luq 0(%rcx),%ymm4,%ymm1 - vpmadd52luq 32(%rcx),%ymm4,%ymm16 - vpmadd52luq 64(%rcx),%ymm4,%ymm17 - vpmadd52luq 96(%rcx),%ymm4,%ymm18 - vpmadd52luq 128(%rcx),%ymm4,%ymm19 - - - valignq $1,%ymm1,%ymm16,%ymm1 - valignq $1,%ymm16,%ymm17,%ymm16 - valignq $1,%ymm17,%ymm18,%ymm17 - valignq 
$1,%ymm18,%ymm19,%ymm18 - valignq $1,%ymm19,%ymm0,%ymm19 - - vmovq %xmm1,%r13 - addq %r13,%r9 - - vpmadd52huq 0(%rsi),%ymm3,%ymm1 - vpmadd52huq 32(%rsi),%ymm3,%ymm16 - vpmadd52huq 64(%rsi),%ymm3,%ymm17 - vpmadd52huq 96(%rsi),%ymm3,%ymm18 - vpmadd52huq 128(%rsi),%ymm3,%ymm19 - - vpmadd52huq 0(%rcx),%ymm4,%ymm1 - vpmadd52huq 32(%rcx),%ymm4,%ymm16 - vpmadd52huq 64(%rcx),%ymm4,%ymm17 - vpmadd52huq 96(%rcx),%ymm4,%ymm18 - vpmadd52huq 128(%rcx),%ymm4,%ymm19 - movq 24(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 0(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - movq %r12,%r10 - adcq $0,%r10 - - movq %r8,%r13 - imulq %r9,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 0(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - adcq %r12,%r10 - - shrq $52,%r9 - salq $12,%r10 - orq %r10,%r9 - - vpmadd52luq 0(%rsi),%ymm3,%ymm1 - vpmadd52luq 32(%rsi),%ymm3,%ymm16 - vpmadd52luq 64(%rsi),%ymm3,%ymm17 - vpmadd52luq 96(%rsi),%ymm3,%ymm18 - vpmadd52luq 128(%rsi),%ymm3,%ymm19 - - vpmadd52luq 0(%rcx),%ymm4,%ymm1 - vpmadd52luq 32(%rcx),%ymm4,%ymm16 - vpmadd52luq 64(%rcx),%ymm4,%ymm17 - vpmadd52luq 96(%rcx),%ymm4,%ymm18 - vpmadd52luq 128(%rcx),%ymm4,%ymm19 - - - valignq $1,%ymm1,%ymm16,%ymm1 - valignq $1,%ymm16,%ymm17,%ymm16 - valignq $1,%ymm17,%ymm18,%ymm17 - valignq $1,%ymm18,%ymm19,%ymm18 - valignq $1,%ymm19,%ymm0,%ymm19 - - vmovq %xmm1,%r13 - addq %r13,%r9 - - vpmadd52huq 0(%rsi),%ymm3,%ymm1 - vpmadd52huq 32(%rsi),%ymm3,%ymm16 - vpmadd52huq 64(%rsi),%ymm3,%ymm17 - vpmadd52huq 96(%rsi),%ymm3,%ymm18 - vpmadd52huq 128(%rsi),%ymm3,%ymm19 - - vpmadd52huq 0(%rcx),%ymm4,%ymm1 - vpmadd52huq 32(%rcx),%ymm4,%ymm16 - vpmadd52huq 64(%rcx),%ymm4,%ymm17 - vpmadd52huq 96(%rcx),%ymm4,%ymm18 - vpmadd52huq 128(%rcx),%ymm4,%ymm19 - leaq 32(%r11),%r11 - decl %ebx - jne .Lloop5 - - vmovdqa64 .Lmask52x4(%rip),%ymm4 - - vpbroadcastq %r9,%ymm3 - vpblendd $3,%ymm3,%ymm1,%ymm1 - - - - vpsrlq $52,%ymm1,%ymm24 - vpsrlq $52,%ymm16,%ymm25 - vpsrlq $52,%ymm17,%ymm26 - vpsrlq $52,%ymm18,%ymm27 - vpsrlq $52,%ymm19,%ymm28 - - - valignq $3,%ymm27,%ymm28,%ymm28 - valignq $3,%ymm26,%ymm27,%ymm27 - valignq $3,%ymm25,%ymm26,%ymm26 - valignq $3,%ymm24,%ymm25,%ymm25 - valignq $3,%ymm0,%ymm24,%ymm24 - - - vpandq %ymm4,%ymm1,%ymm1 - vpandq %ymm4,%ymm16,%ymm16 - vpandq %ymm4,%ymm17,%ymm17 - vpandq %ymm4,%ymm18,%ymm18 - vpandq %ymm4,%ymm19,%ymm19 - - - vpaddq %ymm24,%ymm1,%ymm1 - vpaddq %ymm25,%ymm16,%ymm16 - vpaddq %ymm26,%ymm17,%ymm17 - vpaddq %ymm27,%ymm18,%ymm18 - vpaddq %ymm28,%ymm19,%ymm19 - - - - vpcmpuq $1,%ymm1,%ymm4,%k1 - vpcmpuq $1,%ymm16,%ymm4,%k2 - vpcmpuq $1,%ymm17,%ymm4,%k3 - vpcmpuq $1,%ymm18,%ymm4,%k4 - vpcmpuq $1,%ymm19,%ymm4,%k5 - kmovb %k1,%r14d - kmovb %k2,%r13d - kmovb %k3,%r12d - kmovb %k4,%r11d - kmovb %k5,%r10d - - - vpcmpuq $0,%ymm1,%ymm4,%k1 - vpcmpuq $0,%ymm16,%ymm4,%k2 - vpcmpuq $0,%ymm17,%ymm4,%k3 - vpcmpuq $0,%ymm18,%ymm4,%k4 - vpcmpuq $0,%ymm19,%ymm4,%k5 - kmovb %k1,%r9d - kmovb %k2,%r8d - kmovb %k3,%ebx - kmovb %k4,%ecx - kmovb %k5,%edx - - - - shlb $4,%r13b - orb %r13b,%r14b - shlb $4,%r11b - orb %r11b,%r12b - - addb %r14b,%r14b - adcb %r12b,%r12b - adcb %r10b,%r10b - - shlb $4,%r8b - orb %r8b,%r9b - shlb $4,%cl - orb %cl,%bl - - addb %r9b,%r14b - adcb %bl,%r12b - adcb %dl,%r10b - - xorb %r9b,%r14b - xorb %bl,%r12b - xorb %dl,%r10b - - kmovb %r14d,%k1 - shrb $4,%r14b - kmovb %r14d,%k2 - kmovb %r12d,%k3 - shrb $4,%r12b - kmovb %r12d,%k4 - kmovb %r10d,%k5 - - - vpsubq %ymm4,%ymm1,%ymm1{%k1} - vpsubq %ymm4,%ymm16,%ymm16{%k2} - vpsubq %ymm4,%ymm17,%ymm17{%k3} - vpsubq %ymm4,%ymm18,%ymm18{%k4} - vpsubq 
%ymm4,%ymm19,%ymm19{%k5} - - vpandq %ymm4,%ymm1,%ymm1 - vpandq %ymm4,%ymm16,%ymm16 - vpandq %ymm4,%ymm17,%ymm17 - vpandq %ymm4,%ymm18,%ymm18 - vpandq %ymm4,%ymm19,%ymm19 - - vmovdqu64 %ymm1,(%rdi) - vmovdqu64 %ymm16,32(%rdi) - vmovdqu64 %ymm17,64(%rdi) - vmovdqu64 %ymm18,96(%rdi) - vmovdqu64 %ymm19,128(%rdi) - - vzeroupper - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lrsaz_amm52x20_x1_256_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 -.data -.align 32 -.Lmask52x4: -.quad 0xfffffffffffff -.quad 0xfffffffffffff -.quad 0xfffffffffffff -.quad 0xfffffffffffff -.text - -.globl ossl_rsaz_amm52x20_x2_256 -.type ossl_rsaz_amm52x20_x2_256,@function -.align 32 ossl_rsaz_amm52x20_x2_256: -.cfi_startproc -.byte 243,15,30,250 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lrsaz_amm52x20_x2_256_body: - - - vpxord %ymm0,%ymm0,%ymm0 - vmovdqa64 %ymm0,%ymm1 - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm0,%ymm17 - vmovdqa64 %ymm0,%ymm18 - vmovdqa64 %ymm0,%ymm19 - vmovdqa64 %ymm0,%ymm2 - vmovdqa64 %ymm0,%ymm20 - vmovdqa64 %ymm0,%ymm21 - vmovdqa64 %ymm0,%ymm22 - vmovdqa64 %ymm0,%ymm23 - - xorl %r9d,%r9d - xorl %r15d,%r15d - - movq %rdx,%r11 - movq $0xfffffffffffff,%rax - - movl $20,%ebx - -.align 32 -.Lloop20: - movq 0(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 0(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - movq %r12,%r10 - adcq $0,%r10 - - movq (%r8),%r13 - imulq %r9,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 0(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r9 - adcq %r12,%r10 - - shrq $52,%r9 - salq $12,%r10 - orq %r10,%r9 - - vpmadd52luq 0(%rsi),%ymm3,%ymm1 - vpmadd52luq 32(%rsi),%ymm3,%ymm16 - vpmadd52luq 64(%rsi),%ymm3,%ymm17 - vpmadd52luq 96(%rsi),%ymm3,%ymm18 - vpmadd52luq 128(%rsi),%ymm3,%ymm19 - - vpmadd52luq 0(%rcx),%ymm4,%ymm1 - vpmadd52luq 32(%rcx),%ymm4,%ymm16 - vpmadd52luq 64(%rcx),%ymm4,%ymm17 - vpmadd52luq 96(%rcx),%ymm4,%ymm18 - vpmadd52luq 128(%rcx),%ymm4,%ymm19 - - - valignq $1,%ymm1,%ymm16,%ymm1 - valignq $1,%ymm16,%ymm17,%ymm16 - valignq $1,%ymm17,%ymm18,%ymm17 - valignq $1,%ymm18,%ymm19,%ymm18 - valignq $1,%ymm19,%ymm0,%ymm19 - - vmovq %xmm1,%r13 - addq %r13,%r9 - - vpmadd52huq 0(%rsi),%ymm3,%ymm1 - vpmadd52huq 32(%rsi),%ymm3,%ymm16 - vpmadd52huq 64(%rsi),%ymm3,%ymm17 - vpmadd52huq 96(%rsi),%ymm3,%ymm18 - vpmadd52huq 128(%rsi),%ymm3,%ymm19 - - vpmadd52huq 0(%rcx),%ymm4,%ymm1 - vpmadd52huq 32(%rcx),%ymm4,%ymm16 - vpmadd52huq 64(%rcx),%ymm4,%ymm17 - vpmadd52huq 96(%rcx),%ymm4,%ymm18 - vpmadd52huq 128(%rcx),%ymm4,%ymm19 - movq 160(%r11),%r13 - - vpbroadcastq %r13,%ymm3 - movq 160(%rsi),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r15 - movq %r12,%r10 - adcq $0,%r10 - - movq 8(%r8),%r13 - imulq %r15,%r13 - andq %rax,%r13 - - vpbroadcastq %r13,%ymm4 - movq 160(%rcx),%rdx - mulxq %r13,%r13,%r12 - addq %r13,%r15 - adcq %r12,%r10 - - shrq $52,%r15 - salq $12,%r10 - orq %r10,%r15 - - vpmadd52luq 160(%rsi),%ymm3,%ymm2 - vpmadd52luq 192(%rsi),%ymm3,%ymm20 - vpmadd52luq 
224(%rsi),%ymm3,%ymm21 - vpmadd52luq 256(%rsi),%ymm3,%ymm22 - vpmadd52luq 288(%rsi),%ymm3,%ymm23 - - vpmadd52luq 160(%rcx),%ymm4,%ymm2 - vpmadd52luq 192(%rcx),%ymm4,%ymm20 - vpmadd52luq 224(%rcx),%ymm4,%ymm21 - vpmadd52luq 256(%rcx),%ymm4,%ymm22 - vpmadd52luq 288(%rcx),%ymm4,%ymm23 - - - valignq $1,%ymm2,%ymm20,%ymm2 - valignq $1,%ymm20,%ymm21,%ymm20 - valignq $1,%ymm21,%ymm22,%ymm21 - valignq $1,%ymm22,%ymm23,%ymm22 - valignq $1,%ymm23,%ymm0,%ymm23 - - vmovq %xmm2,%r13 - addq %r13,%r15 - - vpmadd52huq 160(%rsi),%ymm3,%ymm2 - vpmadd52huq 192(%rsi),%ymm3,%ymm20 - vpmadd52huq 224(%rsi),%ymm3,%ymm21 - vpmadd52huq 256(%rsi),%ymm3,%ymm22 - vpmadd52huq 288(%rsi),%ymm3,%ymm23 - - vpmadd52huq 160(%rcx),%ymm4,%ymm2 - vpmadd52huq 192(%rcx),%ymm4,%ymm20 - vpmadd52huq 224(%rcx),%ymm4,%ymm21 - vpmadd52huq 256(%rcx),%ymm4,%ymm22 - vpmadd52huq 288(%rcx),%ymm4,%ymm23 - leaq 8(%r11),%r11 - decl %ebx - jne .Lloop20 - - vmovdqa64 .Lmask52x4(%rip),%ymm4 - - vpbroadcastq %r9,%ymm3 - vpblendd $3,%ymm3,%ymm1,%ymm1 - - - - vpsrlq $52,%ymm1,%ymm24 - vpsrlq $52,%ymm16,%ymm25 - vpsrlq $52,%ymm17,%ymm26 - vpsrlq $52,%ymm18,%ymm27 - vpsrlq $52,%ymm19,%ymm28 - - - valignq $3,%ymm27,%ymm28,%ymm28 - valignq $3,%ymm26,%ymm27,%ymm27 - valignq $3,%ymm25,%ymm26,%ymm26 - valignq $3,%ymm24,%ymm25,%ymm25 - valignq $3,%ymm0,%ymm24,%ymm24 - - - vpandq %ymm4,%ymm1,%ymm1 - vpandq %ymm4,%ymm16,%ymm16 - vpandq %ymm4,%ymm17,%ymm17 - vpandq %ymm4,%ymm18,%ymm18 - vpandq %ymm4,%ymm19,%ymm19 - - - vpaddq %ymm24,%ymm1,%ymm1 - vpaddq %ymm25,%ymm16,%ymm16 - vpaddq %ymm26,%ymm17,%ymm17 - vpaddq %ymm27,%ymm18,%ymm18 - vpaddq %ymm28,%ymm19,%ymm19 - - - - vpcmpuq $1,%ymm1,%ymm4,%k1 - vpcmpuq $1,%ymm16,%ymm4,%k2 - vpcmpuq $1,%ymm17,%ymm4,%k3 - vpcmpuq $1,%ymm18,%ymm4,%k4 - vpcmpuq $1,%ymm19,%ymm4,%k5 - kmovb %k1,%r14d - kmovb %k2,%r13d - kmovb %k3,%r12d - kmovb %k4,%r11d - kmovb %k5,%r10d - - - vpcmpuq $0,%ymm1,%ymm4,%k1 - vpcmpuq $0,%ymm16,%ymm4,%k2 - vpcmpuq $0,%ymm17,%ymm4,%k3 - vpcmpuq $0,%ymm18,%ymm4,%k4 - vpcmpuq $0,%ymm19,%ymm4,%k5 - kmovb %k1,%r9d - kmovb %k2,%r8d - kmovb %k3,%ebx - kmovb %k4,%ecx - kmovb %k5,%edx - - - - shlb $4,%r13b - orb %r13b,%r14b - shlb $4,%r11b - orb %r11b,%r12b - - addb %r14b,%r14b - adcb %r12b,%r12b - adcb %r10b,%r10b - - shlb $4,%r8b - orb %r8b,%r9b - shlb $4,%cl - orb %cl,%bl - - addb %r9b,%r14b - adcb %bl,%r12b - adcb %dl,%r10b - - xorb %r9b,%r14b - xorb %bl,%r12b - xorb %dl,%r10b - - kmovb %r14d,%k1 - shrb $4,%r14b - kmovb %r14d,%k2 - kmovb %r12d,%k3 - shrb $4,%r12b - kmovb %r12d,%k4 - kmovb %r10d,%k5 - - - vpsubq %ymm4,%ymm1,%ymm1{%k1} - vpsubq %ymm4,%ymm16,%ymm16{%k2} - vpsubq %ymm4,%ymm17,%ymm17{%k3} - vpsubq %ymm4,%ymm18,%ymm18{%k4} - vpsubq %ymm4,%ymm19,%ymm19{%k5} - - vpandq %ymm4,%ymm1,%ymm1 - vpandq %ymm4,%ymm16,%ymm16 - vpandq %ymm4,%ymm17,%ymm17 - vpandq %ymm4,%ymm18,%ymm18 - vpandq %ymm4,%ymm19,%ymm19 - - vpbroadcastq %r15,%ymm3 - vpblendd $3,%ymm3,%ymm2,%ymm2 - - - - vpsrlq $52,%ymm2,%ymm24 - vpsrlq $52,%ymm20,%ymm25 - vpsrlq $52,%ymm21,%ymm26 - vpsrlq $52,%ymm22,%ymm27 - vpsrlq $52,%ymm23,%ymm28 - - - valignq $3,%ymm27,%ymm28,%ymm28 - valignq $3,%ymm26,%ymm27,%ymm27 - valignq $3,%ymm25,%ymm26,%ymm26 - valignq $3,%ymm24,%ymm25,%ymm25 - valignq $3,%ymm0,%ymm24,%ymm24 - - - vpandq %ymm4,%ymm2,%ymm2 - vpandq %ymm4,%ymm20,%ymm20 - vpandq %ymm4,%ymm21,%ymm21 - vpandq %ymm4,%ymm22,%ymm22 - vpandq %ymm4,%ymm23,%ymm23 - - - vpaddq %ymm24,%ymm2,%ymm2 - vpaddq %ymm25,%ymm20,%ymm20 - vpaddq %ymm26,%ymm21,%ymm21 - vpaddq %ymm27,%ymm22,%ymm22 - vpaddq %ymm28,%ymm23,%ymm23 - - - - vpcmpuq $1,%ymm2,%ymm4,%k1 - 
vpcmpuq $1,%ymm20,%ymm4,%k2 - vpcmpuq $1,%ymm21,%ymm4,%k3 - vpcmpuq $1,%ymm22,%ymm4,%k4 - vpcmpuq $1,%ymm23,%ymm4,%k5 - kmovb %k1,%r14d - kmovb %k2,%r13d - kmovb %k3,%r12d - kmovb %k4,%r11d - kmovb %k5,%r10d - - - vpcmpuq $0,%ymm2,%ymm4,%k1 - vpcmpuq $0,%ymm20,%ymm4,%k2 - vpcmpuq $0,%ymm21,%ymm4,%k3 - vpcmpuq $0,%ymm22,%ymm4,%k4 - vpcmpuq $0,%ymm23,%ymm4,%k5 - kmovb %k1,%r9d - kmovb %k2,%r8d - kmovb %k3,%ebx - kmovb %k4,%ecx - kmovb %k5,%edx - - - - shlb $4,%r13b - orb %r13b,%r14b - shlb $4,%r11b - orb %r11b,%r12b - - addb %r14b,%r14b - adcb %r12b,%r12b - adcb %r10b,%r10b - - shlb $4,%r8b - orb %r8b,%r9b - shlb $4,%cl - orb %cl,%bl - - addb %r9b,%r14b - adcb %bl,%r12b - adcb %dl,%r10b - - xorb %r9b,%r14b - xorb %bl,%r12b - xorb %dl,%r10b - - kmovb %r14d,%k1 - shrb $4,%r14b - kmovb %r14d,%k2 - kmovb %r12d,%k3 - shrb $4,%r12b - kmovb %r12d,%k4 - kmovb %r10d,%k5 - - - vpsubq %ymm4,%ymm2,%ymm2{%k1} - vpsubq %ymm4,%ymm20,%ymm20{%k2} - vpsubq %ymm4,%ymm21,%ymm21{%k3} - vpsubq %ymm4,%ymm22,%ymm22{%k4} - vpsubq %ymm4,%ymm23,%ymm23{%k5} - - vpandq %ymm4,%ymm2,%ymm2 - vpandq %ymm4,%ymm20,%ymm20 - vpandq %ymm4,%ymm21,%ymm21 - vpandq %ymm4,%ymm22,%ymm22 - vpandq %ymm4,%ymm23,%ymm23 - - vmovdqu64 %ymm1,(%rdi) - vmovdqu64 %ymm16,32(%rdi) - vmovdqu64 %ymm17,64(%rdi) - vmovdqu64 %ymm18,96(%rdi) - vmovdqu64 %ymm19,128(%rdi) - - vmovdqu64 %ymm2,160(%rdi) - vmovdqu64 %ymm20,192(%rdi) - vmovdqu64 %ymm21,224(%rdi) - vmovdqu64 %ymm22,256(%rdi) - vmovdqu64 %ymm23,288(%rdi) - - vzeroupper - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbp -.cfi_restore %rbp - movq 40(%rsp),%rbx -.cfi_restore %rbx - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lrsaz_amm52x20_x2_256_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256 -.text - -.align 32 -.globl ossl_extract_multiplier_2x20_win5 -.type ossl_extract_multiplier_2x20_win5,@function ossl_extract_multiplier_2x20_win5: -.cfi_startproc -.byte 243,15,30,250 - leaq (%rcx,%rcx,4),%rax - salq $5,%rax - addq %rax,%rsi - - vmovdqa64 .Lones(%rip),%ymm23 - vpbroadcastq %rdx,%ymm22 - leaq 10240(%rsi),%rax - - vpxor %xmm4,%xmm4,%xmm4 - vmovdqa64 %ymm4,%ymm3 - vmovdqa64 %ymm4,%ymm2 - vmovdqa64 %ymm4,%ymm1 - vmovdqa64 %ymm4,%ymm0 - vmovdqa64 %ymm4,%ymm21 - -.align 32 -.Lloop: - vpcmpq $0,%ymm21,%ymm22,%k1 - addq $320,%rsi - vpaddq %ymm23,%ymm21,%ymm21 - vmovdqu64 -320(%rsi),%ymm16 - vmovdqu64 -288(%rsi),%ymm17 - vmovdqu64 -256(%rsi),%ymm18 - vmovdqu64 -224(%rsi),%ymm19 - vmovdqu64 -192(%rsi),%ymm20 - vpblendmq %ymm16,%ymm0,%ymm0{%k1} - vpblendmq %ymm17,%ymm1,%ymm1{%k1} - vpblendmq %ymm18,%ymm2,%ymm2{%k1} - vpblendmq %ymm19,%ymm3,%ymm3{%k1} - vpblendmq %ymm20,%ymm4,%ymm4{%k1} - cmpq %rsi,%rax - jne .Lloop - - vmovdqu64 %ymm0,(%rdi) - vmovdqu64 %ymm1,32(%rdi) - vmovdqu64 %ymm2,64(%rdi) - vmovdqu64 %ymm3,96(%rdi) - vmovdqu64 %ymm4,128(%rdi) - +.byte 0x0f,0x0b .byte 0xf3,0xc3 -.cfi_endproc -.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 -.data -.align 32 -.Lones: -.quad 1,1,1,1 - .section ".note.gnu.property", "a" +.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s index ea1ae389119..d68613212f1 100644 --- 
a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s @@ -33,10 +33,6 @@ rsaz_512_sqr: movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -407,282 +403,6 @@ rsaz_512_sqr: decl %r8d jnz .Loop_sqr - jmp .Lsqr_tail - -.align 32 -.Loop_sqrx: - movl %r8d,128+8(%rsp) -.byte 102,72,15,110,199 - - mulxq %rax,%r8,%r9 - movq %rax,%rbx - - mulxq 16(%rsi),%rcx,%r10 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r11 - adcxq %rcx,%r9 - -.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 - adcxq %rcx,%r11 - - mulxq 48(%rsi),%rcx,%r14 - adcxq %rax,%r12 - adcxq %rcx,%r13 - - mulxq 56(%rsi),%rax,%r15 - adcxq %rax,%r14 - adcxq %rbp,%r15 - - mulxq %rdx,%rax,%rdi - movq %rbx,%rdx - xorq %rcx,%rcx - adoxq %r8,%r8 - adcxq %rdi,%r8 - adoxq %rbp,%rcx - adcxq %rbp,%rcx - - movq %rax,(%rsp) - movq %r8,8(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - - mulxq 24(%rsi),%rdi,%r8 - adoxq %rdi,%r11 -.byte 0x66 - adcxq %r8,%r12 - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - - mulxq 40(%rsi),%rdi,%r8 - adoxq %rdi,%r13 - adcxq %r8,%r14 - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adoxq %rdi,%r15 - adcxq %rbp,%r8 - mulxq %rdx,%rax,%rdi - adoxq %rbp,%r8 -.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 - - xorq %rbx,%rbx - adoxq %r9,%r9 - - adcxq %rcx,%rax - adoxq %r10,%r10 - adcxq %rax,%r9 - adoxq %rbp,%rbx - adcxq %rdi,%r10 - adcxq %rbp,%rbx - - movq %r9,16(%rsp) -.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - - mulxq 24(%rsi),%rdi,%r9 - adoxq %rdi,%r12 - adcxq %r9,%r13 - - mulxq 32(%rsi),%rax,%rcx - adoxq %rax,%r13 - adcxq %rcx,%r14 - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 - adoxq %rdi,%r14 - adcxq %r9,%r15 - -.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adoxq %rax,%r15 - adcxq %rcx,%r8 - - mulxq 56(%rsi),%rdi,%r9 - adoxq %rdi,%r8 - adcxq %rbp,%r9 - mulxq %rdx,%rax,%rdi - adoxq %rbp,%r9 - movq 24(%rsi),%rdx - - xorq %rcx,%rcx - adoxq %r11,%r11 - - adcxq %rbx,%rax - adoxq %r12,%r12 - adcxq %rax,%r11 - adoxq %rbp,%rcx - adcxq %rdi,%r12 - adcxq %rbp,%rcx - - movq %r11,32(%rsp) - movq %r12,40(%rsp) - - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r14 - adcxq %rbx,%r15 - - mulxq 40(%rsi),%rdi,%r10 - adoxq %rdi,%r15 - adcxq %r10,%r8 - - mulxq 48(%rsi),%rax,%rbx - adoxq %rax,%r8 - adcxq %rbx,%r9 - - mulxq 56(%rsi),%rdi,%r10 - adoxq %rdi,%r9 - adcxq %rbp,%r10 - mulxq %rdx,%rax,%rdi - adoxq %rbp,%r10 - movq 32(%rsi),%rdx - - xorq %rbx,%rbx - adoxq %r13,%r13 - - adcxq %rcx,%rax - adoxq %r14,%r14 - adcxq %rax,%r13 - adoxq %rbp,%rbx - adcxq %rdi,%r14 - adcxq %rbp,%rbx - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - - - mulxq 40(%rsi),%rdi,%r11 - adoxq %rdi,%r8 - adcxq %r11,%r9 - - mulxq 48(%rsi),%rax,%rcx - adoxq %rax,%r9 - adcxq %rcx,%r10 - - mulxq 56(%rsi),%rdi,%r11 - adoxq %rdi,%r10 - adcxq %rbp,%r11 - mulxq %rdx,%rax,%rdi - movq 40(%rsi),%rdx - adoxq %rbp,%r11 - - xorq %rcx,%rcx - adoxq %r15,%r15 - - adcxq %rbx,%rax - adoxq %r8,%r8 - adcxq %rax,%r15 - adoxq %rbp,%rcx - adcxq %rdi,%r8 - adcxq %rbp,%rcx - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %rbp,%r12 - mulxq %rdx,%rax,%rdi - adoxq %rbp,%r12 - movq 48(%rsi),%rdx - - xorq %rbx,%rbx - adoxq %r9,%r9 - - adcxq %rcx,%rax - adoxq %r10,%r10 - adcxq %rax,%r9 - adcxq %rdi,%r10 - adoxq %rbp,%rbx - adcxq %rbp,%rbx - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adoxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq %rdx,%rax,%rdi - xorq %rcx,%rcx - movq 56(%rsi),%rdx - adoxq %r11,%r11 - - adcxq %rbx,%rax - adoxq %r12,%r12 - adcxq %rax,%r11 - adoxq %rbp,%rcx - adcxq %rdi,%r12 - adcxq %rbp,%rcx - -.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulxq %rdx,%rax,%rdx - xorq %rbx,%rbx - adoxq %r13,%r13 - - adcxq %rcx,%rax - adoxq %rbp,%rbx - adcxq %r13,%rax - adcxq %rdx,%rbx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - movq %rax,112(%rsp) - movq %rbx,120(%rsp) - - call __rsaz_512_reducex - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz .Loop_sqrx - -.Lsqr_tail: leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 @@ -734,10 +454,6 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -755,29 +471,6 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_tail - -.align 32 -.Lmulx: - movq %rdx,%rbp - movq (%rdx),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex -.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -891,10 +584,6 @@ rsaz_512_mul_gather4: por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) @@ -1075,142 +764,6 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_gather_tail - -.align 32 -.Lmulx_gather: -.byte 102,76,15,126,194 - - movq %r8,128(%rsp) - movq %rdi,128+8(%rsp) - movq %rcx,128+16(%rsp) - - mulxq (%rsi),%rbx,%r8 - movq %rbx,(%rsp) - xorl %edi,%edi - - mulxq 8(%rsi),%rax,%r9 - - mulxq 16(%rsi),%rbx,%r10 - adcxq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcxq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - adcxq %rbx,%r13 - adcxq %rax,%r14 -.byte 0x67 - movq %r8,%rbx - adcxq %rdi,%r15 - - movq $-7,%rcx - jmp .Loop_mulx_gather - -.align 32 -.Loop_mulx_gather: - movdqa 0(%rbp),%xmm8 - movdqa 16(%rbp),%xmm9 - movdqa 32(%rbp),%xmm10 - movdqa 48(%rbp),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rbp),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rbp),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rbp),%xmm14 - pand %xmm3,%xmm11 - movdqa 
112(%rbp),%xmm15 - leaq 128(%rbp),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,194 - -.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 -.byte 0x67 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq %rbx,64(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - movq %r8,%rbx - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx_gather - - movq %r8,64(%rsp) - movq %r9,64+8(%rsp) - movq %r10,64+16(%rsp) - movq %r11,64+24(%rsp) - movq %r12,64+32(%rsp) - movq %r13,64+40(%rsp) - movq %r14,64+48(%rsp) - movq %r15,64+56(%rsp) - - movq 128(%rsp),%rdx - movq 128+8(%rsp),%rdi - movq 128+16(%rsp),%rbp - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1278,10 +831,6 @@ rsaz_512_mul_scatter4: movq %rcx,128(%rsp) movq %rdi,%rbp - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -1298,29 +847,6 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_scatter_tail - -.align 32 -.Lmulx_scatter: - movq (%rdi),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1390,7 +916,6 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: - movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -1411,16 +936,7 @@ rsaz_512_mul_by_one: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) - andl $0x80100,%eax - cmpl $0x80100,%eax - je .Lby_one_callx call __rsaz_512_reduce - jmp .Lby_one_tail -.align 32 -.Lby_one_callx: - movq 128(%rsp),%rdx - call __rsaz_512_reducex -.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1535,64 +1051,6 @@ __rsaz_512_reduce: .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce -.type __rsaz_512_reducex,@function -.align 32 -__rsaz_512_reducex: -.cfi_startproc - - imulq %r8,%rdx - xorq %rsi,%rsi - movl $8,%ecx - jmp .Lreduction_loopx - -.align 32 -.Lreduction_loopx: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 
128+8(%rsp),%rbx,%rdx - movq %rax,%rdx - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - - decl %ecx - jne .Lreduction_loopx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: @@ -1796,128 +1254,6 @@ __rsaz_512_mul: .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul -.type __rsaz_512_mulx,@function -.align 32 -__rsaz_512_mulx: -.cfi_startproc - mulxq (%rsi),%rbx,%r8 - movq $-6,%rcx - - mulxq 8(%rsi),%rax,%r9 - movq %rbx,8(%rsp) - - mulxq 16(%rsi),%rbx,%r10 - adcq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - movq 8(%rbp),%rdx - adcq %rbx,%r13 - adcq %rax,%r14 - adcq $0,%r15 - - xorq %rdi,%rdi - jmp .Loop_mulx - -.align 32 -.Loop_mulx: - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rsi),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq 64(%rbp,%rcx,8),%rdx - movq %rbx,8+64-8(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx - - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - -.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - -.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - movq %rbx,8+64-8(%rsp) - movq %r8,8+64(%rsp) - movq %r9,8+64+8(%rsp) - movq %r10,8+64+16(%rsp) - movq %r11,8+64+24(%rsp) - movq %r12,8+64+32(%rsp) - movq %r13,8+64+40(%rsp) - movq %r14,8+64+48(%rsp) - movq %r15,8+64+56(%rsp) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 @@ -2013,7 +1349,7 @@ rsaz_512_gather4: .Linc: .long 0,0, 1,1 .long 2,2, 2,2 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s index 4f259df94bc..3c1e47c6ddd 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s @@ -309,7 +309,7 @@ bn_GF2m_mul_2x2: .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 .byte 
71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s index f412eee41c9..ba7bb44ca38 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s @@ -14,7 +14,6 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter - movl OPENSSL_ia32cap_P+8(%rip),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -263,9 +262,6 @@ bn_mul4x_mont: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: - andl $0x80100,%r11d - cmpl $0x80100,%r11d - je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -691,7 +687,6 @@ bn_mul4x_mont: .size bn_mul4x_mont,.-bn_mul4x_mont - .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: @@ -773,25 +768,6 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $0x80100,%eax - cmpl $0x80100,%eax - jne .Lsqr8x_nox - - call bn_sqrx8x_internal - - - - - leaq (%r8,%rcx,1),%rbx - movq %rcx,%r9 - movq %rcx,%rdx -.byte 102,72,15,126,207 - sarq $3+2,%rcx - jmp .Lsqr8x_sub - -.align 32 -.Lsqr8x_nox: call bn_sqr8x_internal @@ -879,365 +855,9 @@ bn_sqr8x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont -.type bn_mulx4x_mont,@function -.align 32 -bn_mulx4x_mont: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmulx4x_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lmulx4x_prologue: - - shll $3,%r9d - xorq %r10,%r10 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,1),%rbp - andq $-128,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk - jmp .Lmulx4x_page_walk_done - -.align 16 -.Lmulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk -.Lmulx4x_page_walk_done: - - leaq (%rdx,%r9,1),%r10 - - - - - - - - - - - - - movq %r9,0(%rsp) - shrq $5,%r9 - movq %r10,16(%rsp) - subq $1,%r9 - movq %r8,24(%rsp) - movq %rdi,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 - movq %r9,48(%rsp) - jmp .Lmulx4x_body - -.align 32 -.Lmulx4x_body: - leaq 8(%rdx),%rdi - movq (%rdx),%rdx - leaq 64+32(%rsp),%rbx - movq %rdx,%r9 - - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r14 - addq %rax,%r11 - movq %rdi,8(%rsp) - mulxq 16(%rsi),%r12,%r13 - adcq %r14,%r12 - adcq $0,%r13 - - movq %r8,%rdi - imulq 24(%rsp),%r8 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%rdi - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 -.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - movq 48(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 
32(%rcx),%rcx - movq %r12,-16(%rbx) - - jmp .Lmulx4x_1st - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - addq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - movq (%rdi),%rdx - leaq 8(%rdi),%rdi - subq %rax,%rsi - movq %r15,(%rbx) - leaq 64+32(%rsp),%rbx - subq %rax,%rcx - - mulxq 0(%rsi),%r8,%r11 - xorl %ebp,%ebp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - adoxq -16(%rbx),%r12 - adcxq %rbp,%r13 - adoxq %rbp,%r13 - - movq %rdi,8(%rsp) - movq %r8,%r15 - imulq 24(%rsp),%r8 - xorl %ebp,%ebp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - adcxq %rax,%r13 - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - adoxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - leaq 32(%rcx),%rcx - adcxq %rax,%r12 - adoxq %rbp,%r15 - movq 48(%rsp),%rdi - movq %r12,-16(%rbx) - - jmp .Lmulx4x_inner - -.align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-32(%rbx) - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - subq 0(%rbx),%rbp - adcq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - - cmpq 16(%rsp),%rdi - jne .Lmulx4x_outer - - leaq 64(%rsp),%rbx - subq %rax,%rcx - negq %r15 - movq %rax,%rdx - shrq $3+2,%rax - movq 32(%rsp),%rdi - jmp .Lmulx4x_sub - -.align 32 -.Lmulx4x_sub: - movq 0(%rbx),%r11 - movq 8(%rbx),%r12 - movq 16(%rbx),%r13 - movq 24(%rbx),%r14 - leaq 32(%rbx),%rbx - sbbq 0(%rcx),%r11 - sbbq 8(%rcx),%r12 - sbbq 16(%rcx),%r13 - sbbq 24(%rcx),%r14 - leaq 32(%rcx),%rcx - movq %r11,0(%rdi) - movq %r12,8(%rdi) - movq %r13,16(%rdi) - movq %r14,24(%rdi) - leaq 32(%rdi),%rdi - decq %rax - jnz .Lmulx4x_sub - - sbbq $0,%r15 - leaq 64(%rsp),%rbx - subq %rdx,%rdi - -.byte 102,73,15,110,207 - pxor 
%xmm0,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - jmp .Lmulx4x_cond_copy - -.align 32 -.Lmulx4x_cond_copy: - movdqa 0(%rbx),%xmm2 - movdqa 16(%rbx),%xmm3 - leaq 32(%rbx),%rbx - movdqu 0(%rdi),%xmm4 - movdqu 16(%rdi),%xmm5 - leaq 32(%rdi),%rdi - movdqa %xmm0,-32(%rbx) - movdqa %xmm0,-16(%rbx) - pcmpeqd %xmm1,%xmm0 - pand %xmm1,%xmm2 - pand %xmm1,%xmm3 - pand %xmm0,%xmm4 - pand %xmm0,%xmm5 - pxor %xmm0,%xmm0 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqu %xmm4,-32(%rdi) - movdqu %xmm5,-16(%rdi) - subq $32,%rdx - jnz .Lmulx4x_cond_copy - - movq %rdx,(%rbx) - - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmulx4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s index d0025f94e2d..4614a037ae6 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s @@ -12,7 +12,6 @@ bn_mul_mont_gather5: .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter - movl OPENSSL_ia32cap_P+8(%rip),%r11d jmp .Lmul4x_enter .align 16 @@ -449,9 +448,6 @@ bn_mul4x_mont_gather5: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1081,10 +1077,6 @@ bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax - movl OPENSSL_ia32cap_P+8(%rip),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2048,1348 +2040,6 @@ __bn_post4x_internal: .byte 0xf3,0xc3 .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal -.type bn_mulx4x_mont_gather5,@function -.align 32 -bn_mulx4x_mont_gather5: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmulx4x_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lmulx4x_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lmulx4xsp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lmulx4xsp_done - -.Lmulx4xsp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lmulx4xsp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk - jmp .Lmulx4x_page_walk_done - -.Lmulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp 
- ja .Lmulx4x_page_walk -.Lmulx4x_page_walk_done: - - - - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lmulx4x_body: - call mulx4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmulx4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 - -.type mulx4x_internal,@function -.align 32 -mulx4x_internal: -.cfi_startproc - movq %r9,8(%rsp) - movq %r9,%r10 - negq %r9 - shlq $5,%r9 - negq %r10 - leaq 128(%rdx,%r9,1),%r13 - shrq $5+5,%r9 - movd 8(%rax),%xmm5 - subq $1,%r9 - leaq .Linc(%rip),%rax - movq %r13,16+8(%rsp) - movq %r9,24+8(%rsp) - movq %rdi,56+8(%rsp) - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 88-112(%rsp,%r10,1),%r10 - leaq 128(%rdx),%rdi - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 -.byte 0x67 - movdqa %xmm1,%xmm2 -.byte 0x67 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 -.byte 0x67 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - - pand 64(%rdi),%xmm0 - pand 80(%rdi),%xmm1 - pand 96(%rdi),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%rdi),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%rdi),%xmm4 - movdqa -112(%rdi),%xmm5 - movdqa -96(%rdi),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%rdi),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%rdi),%xmm4 - movdqa -48(%rdi),%xmm5 - movdqa -32(%rdi),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%rdi),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%rdi),%xmm4 - movdqa 16(%rdi),%xmm5 - movdqa 32(%rdi),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%rdi),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - pxor %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%rdi),%rdi -.byte 
102,72,15,126,194 - leaq 64+32+8(%rsp),%rbx - - movq %rdx,%r9 - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r12 - addq %rax,%r11 - mulxq 16(%rsi),%rax,%r13 - adcq %rax,%r12 - adcq $0,%r13 - mulxq 24(%rsi),%rax,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - xorq %rbp,%rbp - movq %r8,%rdx - - movq %rdi,8+8(%rsp) - - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - jmp .Lmulx4x_1st - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 8(%rsp),%rax - adcq %rbp,%r15 - leaq (%rsi,%rax,1),%rsi - addq %r15,%r14 - movq 8+8(%rsp),%rdi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - leaq 16-256(%rbx),%r10 - pxor %xmm4,%xmm4 -.byte 0x67,0x67 - pxor %xmm5,%xmm5 - movdqa -128(%rdi),%xmm0 - movdqa -112(%rdi),%xmm1 - movdqa -96(%rdi),%xmm2 - pand 256(%r10),%xmm0 - movdqa -80(%rdi),%xmm3 - pand 272(%r10),%xmm1 - por %xmm0,%xmm4 - pand 288(%r10),%xmm2 - por %xmm1,%xmm5 - pand 304(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%rdi),%xmm0 - movdqa -48(%rdi),%xmm1 - movdqa -32(%rdi),%xmm2 - pand 320(%r10),%xmm0 - movdqa -16(%rdi),%xmm3 - pand 336(%r10),%xmm1 - por %xmm0,%xmm4 - pand 352(%r10),%xmm2 - por %xmm1,%xmm5 - pand 368(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%rdi),%xmm0 - movdqa 16(%rdi),%xmm1 - movdqa 32(%rdi),%xmm2 - pand 384(%r10),%xmm0 - movdqa 48(%rdi),%xmm3 - pand 400(%r10),%xmm1 - por %xmm0,%xmm4 - pand 416(%r10),%xmm2 - por %xmm1,%xmm5 - pand 432(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%rdi),%xmm0 - movdqa 80(%rdi),%xmm1 - movdqa 96(%rdi),%xmm2 - pand 448(%r10),%xmm0 - movdqa 112(%rdi),%xmm3 - pand 464(%r10),%xmm1 - por %xmm0,%xmm4 - pand 480(%r10),%xmm2 - por %xmm1,%xmm5 - pand 496(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%rdi),%rdi -.byte 102,72,15,126,194 - - movq %rbp,(%rbx) - leaq 32(%rbx,%rax,1),%rbx - mulxq 0(%rsi),%r8,%r11 - xorq %rbp,%rbp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - mulxq 24(%rsi),%rdx,%r14 - adoxq -16(%rbx),%r12 - adcxq %rdx,%r13 - leaq (%rcx,%rax,1),%rcx - leaq 32(%rsi),%rsi - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - adoxq %rbp,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - - movq %r8,%rdx - xorq %rbp,%rbp - movq %rdi,8+8(%rsp) - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - 
adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r12 - movq %r11,-24(%rbx) - adoxq %rbp,%r15 - movq %r12,-16(%rbx) - leaq 32(%rcx),%rcx - jmp .Lmulx4x_inner - -.align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - movq %r11,-32(%rbx) - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - leaq 32(%rcx),%rcx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner - - movq 0+8(%rsp),%rax - adcq %rbp,%r15 - subq 0(%rbx),%rdi - movq 8+8(%rsp),%rdi - movq 16+8(%rsp),%r10 - adcq %r15,%r14 - leaq (%rsi,%rax,1),%rsi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - - cmpq %r10,%rdi - jb .Lmulx4x_outer - - movq -8(%rcx),%r10 - movq %rbp,%r8 - movq (%rcx,%rax,1),%r12 - leaq (%rcx,%rax,1),%rbp - movq %rax,%rcx - leaq (%rbx,%rax,1),%rdi - xorl %eax,%eax - xorq %r15,%r15 - subq %r14,%r10 - adcq %r15,%r15 - orq %r15,%r8 - sarq $3+2,%rcx - subq %r8,%rax - movq 56+8(%rsp),%rdx - decq %r12 - movq 8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqrx4x_sub_entry -.cfi_endproc -.size mulx4x_internal,.-mulx4x_internal -.type bn_powerx5,@function -.align 32 -bn_powerx5: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lpowerx5_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lpowerx5_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lpwrx_sp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lpwrx_sp_done - -.align 32 -.Lpwrx_sp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lpwrx_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwrx_page_walk - jmp .Lpwrx_page_walk_done - -.Lpwrx_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwrx_page_walk -.Lpwrx_page_walk_done: - - movq %r9,%r10 - negq %r9 - - - - - - - - - - - - - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lpowerx5_body: - - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - - movq %r10,%r9 - movq %rsi,%rdi -.byte 102,72,15,126,209 -.byte 
102,72,15,126,226 - movq 40(%rsp),%rax - - call mulx4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpowerx5_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_powerx5,.-bn_powerx5 - -.globl bn_sqrx8x_internal -.hidden bn_sqrx8x_internal -.type bn_sqrx8x_internal,@function -.align 32 -bn_sqrx8x_internal: -__bn_sqrx8x_internal: -.cfi_startproc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 48+8(%rsp),%rdi - leaq (%rsi,%r9,1),%rbp - movq %r9,0+8(%rsp) - movq %rbp,8+8(%rsp) - jmp .Lsqr8x_zero_start - -.align 32 -.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -.Lsqrx8x_zero: -.byte 0x3e - movdqa %xmm0,0(%rdi) - movdqa %xmm0,16(%rdi) - movdqa %xmm0,32(%rdi) - movdqa %xmm0,48(%rdi) -.Lsqr8x_zero_start: - movdqa %xmm0,64(%rdi) - movdqa %xmm0,80(%rdi) - movdqa %xmm0,96(%rdi) - movdqa %xmm0,112(%rdi) - leaq 128(%rdi),%rdi - subq $64,%r9 - jnz .Lsqrx8x_zero - - movq 0(%rsi),%rdx - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - leaq 48+8(%rsp),%rdi - xorq %rbp,%rbp - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_loop: - mulxq 8(%rsi),%r8,%rax - adcxq %r9,%r8 - adoxq %rax,%r10 - mulxq 16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 -.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcxq %r11,%r10 - adoxq %rax,%r12 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcxq %r12,%r11 - adoxq %rax,%r13 - mulxq 40(%rsi),%r12,%rax - adcxq %r13,%r12 - adoxq %rax,%r14 - mulxq 48(%rsi),%r13,%rax - adcxq %r14,%r13 - adoxq %r15,%rax - mulxq 56(%rsi),%r14,%r15 - movq 8(%rsi),%rdx - adcxq %rax,%r14 - adoxq %rbp,%r15 - adcq 64(%rdi),%r15 - movq %r8,8(%rdi) - movq %r9,16(%rdi) - sbbq %rcx,%rcx - xorq %rbp,%rbp - - - mulxq 16(%rsi),%r8,%rbx - mulxq 24(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 32(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %rbx,%r11 -.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcxq %r13,%r11 - adoxq %r14,%r12 -.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - movq 16(%rsi),%rdx - adcxq %rax,%r12 - adoxq %rbx,%r13 - adcxq %r15,%r13 - adoxq %rbp,%r14 - adcxq %rbp,%r14 - - movq %r8,24(%rdi) - movq %r9,32(%rdi) - - mulxq 24(%rsi),%r8,%rbx - mulxq 32(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 40(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %r13,%r11 -.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -.byte 0x3e - movq 24(%rsi),%rdx - adcxq %rbx,%r11 - adoxq %rax,%r12 - adcxq %r14,%r12 - movq %r8,40(%rdi) - movq %r9,48(%rdi) - mulxq 32(%rsi),%r8,%rax - adoxq %rbp,%r13 - adcxq %rbp,%r13 - - mulxq 40(%rsi),%r9,%rbx - adcxq %r10,%r8 - adoxq %rax,%r9 - mulxq 48(%rsi),%r10,%rax - adcxq %r11,%r9 - adoxq %r12,%r10 - mulxq 56(%rsi),%r11,%r12 - movq 32(%rsi),%rdx - movq 40(%rsi),%r14 - adcxq %rbx,%r10 - adoxq %rax,%r11 - movq 48(%rsi),%r15 - adcxq %r13,%r11 - adoxq %rbp,%r12 - adcxq %rbp,%r12 - - movq %r8,56(%rdi) - movq %r9,64(%rdi) - - mulxq %r14,%r9,%rax - movq 56(%rsi),%r8 - adcxq %r10,%r9 - mulxq %r15,%r10,%rbx - adoxq %rax,%r10 - adcxq %r11,%r10 - mulxq 
%r8,%r11,%rax - movq %r14,%rdx - adoxq %rbx,%r11 - adcxq %r12,%r11 - - adcxq %rbp,%rax - - mulxq %r15,%r14,%rbx - mulxq %r8,%r12,%r13 - movq %r15,%rdx - leaq 64(%rsi),%rsi - adcxq %r14,%r11 - adoxq %rbx,%r12 - adcxq %rax,%r12 - adoxq %rbp,%r13 - -.byte 0x67,0x67 - mulxq %r8,%r8,%r14 - adcxq %r8,%r13 - adcxq %rbp,%r14 - - cmpq 8+8(%rsp),%rsi - je .Lsqrx8x_outer_break - - negq %rcx - movq $-8,%rcx - movq %rbp,%r15 - movq 64(%rdi),%r8 - adcxq 72(%rdi),%r9 - adcxq 80(%rdi),%r10 - adcxq 88(%rdi),%r11 - adcq 96(%rdi),%r12 - adcq 104(%rdi),%r13 - adcq 112(%rdi),%r14 - adcq 120(%rdi),%r15 - leaq (%rsi),%rbp - leaq 128(%rdi),%rdi - sbbq %rax,%rax - - movq -64(%rsi),%rdx - movq %rax,16+8(%rsp) - movq %rdi,24+8(%rsp) - - - xorl %eax,%eax - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_loop: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - movq %rbx,(%rdi,%rcx,8) - movl $0,%ebx - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - movq 8(%rsi,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rbx,%r15 - adcxq %rbx,%r15 - -.byte 0x67 - incq %rcx - jnz .Lsqrx8x_loop - - leaq 64(%rbp),%rbp - movq $-8,%rcx - cmpq 8+8(%rsp),%rbp - je .Lsqrx8x_break - - subq 16+8(%rsp),%rbx -.byte 0x66 - movq -64(%rsi),%rdx - adcxq 0(%rdi),%r8 - adcxq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi -.byte 0x67 - sbbq %rax,%rax - xorl %ebx,%ebx - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_break: - xorq %rbp,%rbp - subq 16+8(%rsp),%rbx - adcxq %rbp,%r8 - movq 24+8(%rsp),%rcx - adcxq %rbp,%r9 - movq 0(%rsi),%rdx - adcq $0,%r10 - movq %r8,0(%rdi) - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - cmpq %rcx,%rdi - je .Lsqrx8x_outer_loop - - movq %r9,8(%rdi) - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - movq 40(%rcx),%r13 - movq %r14,48(%rdi) - movq 48(%rcx),%r14 - movq %r15,56(%rdi) - movq 56(%rcx),%r15 - movq %rcx,%rdi - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_break: - movq %r9,72(%rdi) -.byte 102,72,15,126,217 - movq %r10,80(%rdi) - movq %r11,88(%rdi) - movq %r12,96(%rdi) - movq %r13,104(%rdi) - movq %r14,112(%rdi) - leaq 48+8(%rsp),%rdi - movq (%rsi,%rcx,1),%rdx - - movq 8(%rdi),%r11 - xorq %r10,%r10 - movq 0+8(%rsp),%r9 - adoxq %r11,%r11 - movq 16(%rdi),%r12 - movq 24(%rdi),%r13 - - -.align 32 -.Lsqrx4x_shift_n_add: - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax -.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 40(%rdi),%r11 - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - movq 16(%rsi,%rcx,1),%rdx - movq 48(%rdi),%r12 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 56(%rdi),%r13 - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax - movq 24(%rsi,%rcx,1),%rdx - leaq 32(%rcx),%rcx - movq 64(%rdi),%r10 - adoxq %r13,%r13 - adcxq 
%r11,%rbx - movq 72(%rdi),%r11 - movq %rax,32(%rdi) - movq %rbx,40(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - jrcxz .Lsqrx4x_shift_n_add_break -.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 80(%rdi),%r12 - movq 88(%rdi),%r13 - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi - nop - jmp .Lsqrx4x_shift_n_add - -.align 32 -.Lsqrx4x_shift_n_add_break: - adcxq %r13,%rbx - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi -.byte 102,72,15,126,213 -__bn_sqrx8x_reduction: - xorl %eax,%eax - movq 32+8(%rsp),%rbx - movq 48+8(%rsp),%rdx - leaq -64(%rbp,%r9,1),%rcx - - movq %rcx,0+8(%rsp) - movq %rdi,8+8(%rsp) - - leaq 48+8(%rsp),%rdi - jmp .Lsqrx8x_reduction_loop - -.align 32 -.Lsqrx8x_reduction_loop: - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq %rdx,%r8 - imulq %rbx,%rdx - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,24+8(%rsp) - - leaq 64(%rdi),%rdi - xorq %rsi,%rsi - movq $-8,%rcx - jmp .Lsqrx8x_reduce - -.align 32 -.Lsqrx8x_reduce: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rbx,%r9 - adcxq %rbx,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 32+8(%rsp),%rbx,%rdx - movq %rax,%rdx - movq %rax,64+48+8(%rsp,%rcx,8) - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - -.byte 0x67,0x67,0x67 - incq %rcx - jnz .Lsqrx8x_reduce - - movq %rsi,%rax - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_no_tail - - movq 48+8(%rsp),%rdx - addq 0(%rdi),%r8 - leaq 64(%rbp),%rbp - movq $-8,%rcx - adcxq 8(%rdi),%r9 - adcxq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq 72+48+8(%rsp,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - movq %rbx,(%rdi,%rcx,8) - movq %r8,%rbx - adcxq %rsi,%r15 - - incq %rcx - jnz .Lsqrx8x_tail - - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_tail_done - - subq 16+8(%rsp),%rsi - movq 48+8(%rsp),%rdx - leaq 64(%rbp),%rbp - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - subq $8,%rcx - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail_done: - xorq %rax,%rax - addq 24+8(%rsp),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rax - - 
subq 16+8(%rsp),%rsi -.Lsqrx8x_no_tail: - adcq 0(%rdi),%r8 -.byte 102,72,15,126,217 - adcq 8(%rdi),%r9 - movq 56(%rbp),%rsi -.byte 102,72,15,126,213 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq $0,%rax - - movq 32+8(%rsp),%rbx - movq 64(%rdi,%rcx,1),%rdx - - movq %r8,0(%rdi) - leaq 64(%rdi),%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 64(%rdi,%rcx,1),%rdi - cmpq 8+8(%rsp),%r8 - jb .Lsqrx8x_reduction_loop - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal -.align 32 -__bn_postx4x_internal: -.cfi_startproc - movq 0(%rbp),%r12 - movq %rcx,%r10 - movq %rcx,%r9 - negq %rax - sarq $3+2,%rcx - -.byte 102,72,15,126,202 -.byte 102,72,15,126,206 - decq %r12 - movq 8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqrx4x_sub_entry - -.align 16 -.Lsqrx4x_sub: - movq 0(%rbp),%r12 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 -.Lsqrx4x_sub_entry: - andnq %rax,%r12,%r12 - leaq 32(%rbp),%rbp - andnq %rax,%r13,%r13 - andnq %rax,%r14,%r14 - andnq %rax,%r15,%r15 - - negq %r8 - adcq 0(%rdi),%r12 - adcq 8(%rdi),%r13 - adcq 16(%rdi),%r14 - adcq 24(%rdi),%r15 - movq %r12,0(%rdx) - leaq 32(%rdi),%rdi - movq %r13,8(%rdx) - sbbq %r8,%r8 - movq %r14,16(%rdx) - movq %r15,24(%rdx) - leaq 32(%rdx),%rdx - - incq %rcx - jnz .Lsqrx4x_sub - - negq %r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_get_bits5 .type bn_get_bits5,@function .align 16 @@ -3601,7 +2251,7 @@ bn_gather5: .long 0,0, 1,1 .long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s index 4e05eefb1ee..be22fac090f 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s @@ -2788,10 +2788,6 @@ ecp_nistz256_neg: .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3120,10 +3116,6 @@ ecp_nistz256_ord_mul_mont: .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3411,462 +3403,6 @@ ecp_nistz256_ord_sqr_mont: .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_mul_montx,@function -.align 32 -ecp_nistz256_ord_mul_montx: -.cfi_startproc -.Lecp_nistz256_ord_mul_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 
-.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_mulx_body: - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - leaq .Lord-128(%rip),%r14 - movq .LordK(%rip),%r15 - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - mulxq %r11,%rbp,%r11 - addq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - mulxq %r15,%rdx,%rax - adcq %rbp,%r10 - adcq %rcx,%r11 - adcq $0,%r12 - - - xorq %r13,%r13 - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%r14),%rcx,%rbp - movq 8(%rbx),%rdx - adcxq %rcx,%r11 - adoxq %rbp,%r12 - adcxq %r8,%r12 - adoxq %r8,%r13 - adcq $0,%r13 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%r14),%rcx,%rbp - movq 16(%rbx),%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcxq %r9,%r13 - adoxq %r9,%r8 - adcq $0,%r8 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%r14),%rcx,%rbp - movq 24(%rbx),%rdx - adcxq %rcx,%r13 - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcq $0,%r9 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%r14),%rcx,%rbp - leaq 128(%r14),%r14 - movq %r12,%rbx - adcxq %rcx,%r8 - adoxq %rbp,%r9 - movq %r13,%rdx - adcxq %r11,%r9 - adoxq %r11,%r10 - adcq $0,%r10 - - - - movq %r8,%rcx - subq 0(%r14),%r12 - sbbq 8(%r14),%r13 - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 
32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_mulx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx - -.type ecp_nistz256_ord_sqr_montx,@function -.align 32 -ecp_nistz256_ord_sqr_montx: -.cfi_startproc -.Lecp_nistz256_ord_sqr_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_sqrx_body: - - movq %rdx,%rbx - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq .Lord(%rip),%rsi - jmp .Loop_ord_sqrx - -.align 32 -.Loop_ord_sqrx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - movq %rdx,%rax -.byte 102,73,15,110,206 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - addq %rcx,%r10 -.byte 102,73,15,110,215 - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - mulxq %r8,%rcx,%r14 - movq %rax,%rdx -.byte 102,73,15,110,216 - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - - mulxq %rdx,%r8,%rbp -.byte 102,72,15,126,202 - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax -.byte 102,72,15,126,210 - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 - mulxq %rdx,%rcx,%rbp -.byte 0x67 -.byte 102,72,15,126,218 - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - adoxq %rbp,%r13 - mulxq %rdx,%rcx,%rax - adoxq %rcx,%r14 - adoxq %rax,%r15 - - - movq %r8,%rdx - mulxq 32(%rsi),%rdx,%rcx - - xorq %rax,%rax - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - adcxq %rax,%r8 - - - movq %r9,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - adoxq %rax,%r9 - - - movq %r10,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - adcxq %rax,%r10 - - - movq %r11,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - adoxq %rax,%r11 - - - addq %r8,%r12 - adcq %r13,%r9 - movq %r12,%rdx - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%r14 - adcq $0,%rax - - - subq 0(%rsi),%r12 - movq %r10,%r15 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r8 - sbbq 24(%rsi),%r11 - sbbq $0,%rax - - cmovncq %r12,%rdx - cmovncq %r9,%r14 - cmovncq %r10,%r15 - cmovncq %r11,%r8 - - decq %rbx - jnz .Loop_ord_sqrx - - movq %rdx,0(%rdi) - movq %r14,8(%rdi) - pxor %xmm1,%xmm1 - movq 
%r15,16(%rdi) - pxor %xmm2,%xmm2 - movq %r8,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_sqrx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx - @@ -3875,8 +3411,6 @@ ecp_nistz256_ord_sqr_montx: .align 32 ecp_nistz256_to_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx leaq .LRR(%rip),%rdx jmp .Lmul_mont .cfi_endproc @@ -3893,8 +3427,6 @@ ecp_nistz256_to_mont: .align 32 ecp_nistz256_mul_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 @@ -3915,8 +3447,6 @@ ecp_nistz256_mul_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: - cmpl $0x80100,%ecx - je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -3925,19 +3455,6 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp .Lmul_mont_done - -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx .Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -4188,8 +3705,6 @@ __ecp_nistz256_mul_montq: .align 32 ecp_nistz256_sqr_mont: .cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -4209,25 +3724,12 @@ ecp_nistz256_sqr_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: - cmpl $0x80100,%ecx - je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx .Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -4411,342 +3913,44 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq -.type __ecp_nistz256_mul_montx,@function -.align 32 -__ecp_nistz256_mul_montx: -.cfi_startproc - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq .Lpoly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - addq %rbp,%r9 - adcq %rcx,%r10 - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 +.globl ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lfrom_body: + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - 
adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq .Lpoly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx - -.type __ecp_nistz256_sqr_montx,@function -.align 32 -__ecp_nistz256_sqr_montx: -.cfi_startproc - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq .Lpoly+24(%rip),%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq %rdx,%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %r8,%rcx,%r8 - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %r9,%rcx,%r9 - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %r10,%rcx,%r10 - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %r11,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - addq %r8,%r12 - movq .Lpoly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - 
movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx - - - - - - -.globl ecp_nistz256_from_mont -.type ecp_nistz256_from_mont,@function -.align 32 -ecp_nistz256_from_mont: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-24 -.Lfrom_body: - - movq 0(%rsi),%rax - movq .Lpoly+24(%rip),%r13 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %rax,%r8 - movq .Lpoly+8(%rip),%r12 - - - - movq %rax,%rcx - shlq $32,%r8 - mulq %r13 - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx @@ -4850,9 +4054,6 @@ ecp_nistz256_scatter_w5: .align 32 ecp_nistz256_gather_w5: .cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_gather_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -4936,9 +4137,6 @@ ecp_nistz256_scatter_w7: .align 32 ecp_nistz256_gather_w7: .cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_gather_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -4957,1291 +4155,46 @@ ecp_nistz256_gather_w7: movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 pcmpeqd %xmm1,%xmm15 - movdqa 32(%rsi),%xmm11 - movdqa 48(%rsi),%xmm12 - leaq 64(%rsi),%rsi - - pand %xmm15,%xmm9 - pand %xmm15,%xmm10 - por %xmm9,%xmm2 - pand %xmm15,%xmm11 - por %xmm10,%xmm3 - pand %xmm15,%xmm12 - por %xmm11,%xmm4 - prefetcht0 255(%rsi) - por %xmm12,%xmm5 - - decq %rax - jnz .Lselect_loop_sse_w7 - - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_gather_w7: -.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 - - -.type ecp_nistz256_avx2_gather_w5,@function -.align 32 -ecp_nistz256_avx2_gather_w5: -.cfi_startproc -.Lavx2_gather_w5: - vzeroupper - vmovdqa .LTwo(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa .LOne(%rip),%ymm5 - vmovdqa .LTwo(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -.Lselect_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz .Lselect_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_avx2_gather_w5: -.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 - - - -.globl ecp_nistz256_avx2_gather_w7 -.type ecp_nistz256_avx2_gather_w7,@function -.align 32 -ecp_nistz256_avx2_gather_w7: -.cfi_startproc -.Lavx2_gather_w7: - vzeroupper - vmovdqa .LThree(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa 
.LOne(%rip),%ymm4 - vmovdqa .LTwo(%rip),%ymm8 - vmovdqa .LThree(%rip),%ymm12 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -.Lselect_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 - vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz .Lselect_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_avx2_gather_w7: -.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 -.type __ecp_nistz256_add_toq,@function -.align 32 -__ecp_nistz256_add_toq: -.cfi_startproc - xorq %r11,%r11 - addq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq - -.type __ecp_nistz256_sub_fromq,@function -.align 32 -__ecp_nistz256_sub_fromq: -.cfi_startproc - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - addq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - cmovzq %rbp,%r13 - movq %r12,0(%rdi) - cmovzq %rcx,%r8 - movq %r13,8(%rdi) - cmovzq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq - -.type __ecp_nistz256_subq,@function -.align 32 -__ecp_nistz256_subq: -.cfi_startproc - subq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq %r11,%r11 - - addq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - testq %r11,%r11 - - cmovnzq %rax,%r12 - cmovnzq %rbp,%r13 - cmovnzq %rcx,%r8 - cmovnzq %r10,%r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_subq,.-__ecp_nistz256_subq - -.type __ecp_nistz256_mul_by_2q,@function -.align 32 -__ecp_nistz256_mul_by_2q: -.cfi_startproc - xorq %r11,%r11 - addq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 
-.globl ecp_nistz256_point_double -.type ecp_nistz256_point_double,@function -.align 32 -ecp_nistz256_point_double: -.cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_doublex - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $160+8,%rsp -.cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doubleq_body: - -.Lpoint_double_shortcutq: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq .Lpoly+8(%rip),%r14 - movq .Lpoly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-0(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 32(%rbx),%rax - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-0(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montq - call __ecp_nistz256_mul_by_2q - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montq - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rax - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 0+32(%rsp),%rax - movq 8+32(%rsp),%r14 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 
-.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montq - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subq - - movq 32(%rsp),%rax - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-0(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromq - - leaq 160+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_doubleq_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.type ecp_nistz256_point_add,@function -.align 32 -ecp_nistz256_point_add: -.cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_addx - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $576+8,%rsp -.cfi_adjust_cfa_offset 32*18+8 -.Lpoint_addq_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - - leaq 64-0(%rsi),%rsi - movq %rax,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rax - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-0(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 
8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 416(%rsp),%rax - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 512(%rsp),%rax - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq 0+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 480(%rsp),%rax - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - - orq %r8,%r12 - orq %r9,%r12 - - -.byte 0x3e - jnz .Ladd_proceedq - -.Ladd_doubleq: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp -.cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutq -.cfi_adjust_cfa_offset 416 - -.align 32 -.Ladd_proceedq: - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq 0+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0(%rsp),%rax - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 160(%rsp),%rax - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subq - - movq 
%r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -.Ladd_doneq: - leaq 576+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_addq_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,@function -.align 32 -ecp_nistz256_point_add_affine: -.cfi_startproc - movl $0x80100,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $0x80100,%ecx - je .Lpoint_add_affinex - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $480+8,%rsp -.cfi_adjust_cfa_offset 32*15+8 -.Ladd_affineq_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa 
%xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-0(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rax - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-0(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+96(%rsp),%rax - movq 8+96(%rsp),%r14 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq 0+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subq - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rax - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq 0+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - 
movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand .LONE_mont(%rip),%xmm2 - pand .LONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 - leaq 480+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Ladd_affineq_epilogue: + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine -.type __ecp_nistz256_add_tox,@function +.LSEH_end_ecp_nistz256_gather_w7: +.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 +.globl ecp_nistz256_avx2_gather_w7 +.type ecp_nistz256_avx2_gather_w7,@function +.align 32 +ecp_nistz256_avx2_gather_w7: +.cfi_startproc +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 +.type __ecp_nistz256_add_toq,@function .align 32 -__ecp_nistz256_add_tox: +__ecp_nistz256_add_toq: .cfi_startproc xorq %r11,%r11 - adcq 0(%rbx),%r12 + addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 @@ -6249,8 +4202,7 @@ __ecp_nistz256_add_tox: movq %r13,%rbp adcq $0,%r11 - xorq %r10,%r10 - sbbq $-1,%r12 + subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 @@ -6269,80 +4221,76 @@ __ecp_nistz256_add_tox: .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq -.type __ecp_nistz256_sub_fromx,@function +.type 
__ecp_nistz256_sub_fromq,@function .align 32 -__ecp_nistz256_sub_fromx: +__ecp_nistz256_sub_fromq: .cfi_startproc - xorq %r11,%r11 - sbbq 0(%rbx),%r12 + subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp - sbbq $0,%r11 + sbbq %r11,%r11 - xorq %r10,%r10 - adcq $-1,%r12 + addq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 + testq %r11,%r11 - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 + cmovzq %rax,%r12 + cmovzq %rbp,%r13 movq %r12,0(%rdi) - cmovncq %rcx,%r8 + cmovzq %rcx,%r8 movq %r13,8(%rdi) - cmovncq %r10,%r9 + cmovzq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq -.type __ecp_nistz256_subx,@function +.type __ecp_nistz256_subq,@function .align 32 -__ecp_nistz256_subx: +__ecp_nistz256_subq: .cfi_startproc - xorq %r11,%r11 - sbbq %r12,%rax + subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 - sbbq $0,%r11 + sbbq %r11,%r11 - xorq %r9,%r9 - adcq $-1,%rax + addq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 + testq %r11,%r11 - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_subx,.-__ecp_nistz256_subx +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq -.type __ecp_nistz256_mul_by_2x,@function +.type __ecp_nistz256_mul_by_2q,@function .align 32 -__ecp_nistz256_mul_by_2x: +__ecp_nistz256_mul_by_2q: .cfi_startproc xorq %r11,%r11 - adcq %r12,%r12 + addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 @@ -6350,8 +4298,7 @@ __ecp_nistz256_mul_by_2x: movq %r13,%rbp adcq $0,%r11 - xorq %r10,%r10 - sbbq $-1,%r12 + subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 @@ -6370,12 +4317,12 @@ __ecp_nistz256_mul_by_2x: .byte 0xf3,0xc3 .cfi_endproc -.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function .align 32 -ecp_nistz256_point_doublex: +ecp_nistz256_point_double: .cfi_startproc -.Lpoint_doublex: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -6396,9 +4343,9 @@ ecp_nistz256_point_doublex: .cfi_offset %r15,-56 subq $160+8,%rsp .cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doublex_body: +.Lpoint_doubleq_body: -.Lpoint_double_shortcutx: +.Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -6417,34 +4364,34 @@ ecp_nistz256_point_doublex: .byte 102,73,15,110,211 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q - movq 64+0(%rsi),%rdx + movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi + leaq 64-0(%rsi),%rsi leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 0+0(%rsp),%rdx + movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi + leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 32(%rbx),%rdx + movq 32(%rbx),%rax movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi + leaq 64-0(%rbx),%rsi leaq 32(%rbx),%rbx .byte 
102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 @@ -6452,7 +4399,7 @@ ecp_nistz256_point_doublex: movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox + call __ecp_nistz256_add_toq movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 @@ -6460,15 +4407,15 @@ ecp_nistz256_point_doublex: movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - movq 0+0(%rsp),%rdx + movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi + leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 @@ -6507,59 +4454,59 @@ ecp_nistz256_point_doublex: orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) - movq 64(%rsp),%rdx + movq 64(%rsp),%rax leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox + call __ecp_nistz256_add_toq - movq 96(%rsp),%rdx + movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi + leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q - movq 0+32(%rsp),%rdx + movq 0+32(%rsp),%rax movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi - call __ecp_nistz256_subx + call __ecp_nistz256_subq - movq 32(%rsp),%rdx + movq 32(%rsp),%rax leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx @@ -6568,16 +4515,16 @@ ecp_nistz256_point_doublex: movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi + leaq 0-0(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq .byte 102,72,15,126,203 .byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq leaq 160+56(%rsp),%rsi .cfi_def_cfa %rsi,8 @@ -6595,15 +4542,15 @@ ecp_nistz256_point_doublex: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Lpoint_doublex_epilogue: +.Lpoint_doubleq_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function .align 32 -ecp_nistz256_point_addx: +ecp_nistz256_point_add: .cfi_startproc -.Lpoint_addx: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -6624,7 +4571,7 @@ ecp_nistz256_point_addx: .cfi_offset %r15,-56 subq $576+8,%rsp .cfi_adjust_cfa_offset 32*18+8 
-.Lpoint_addx_body: +.Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 @@ -6648,7 +4595,7 @@ ecp_nistz256_point_addx: movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx + movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 @@ -6664,13 +4611,13 @@ ecp_nistz256_point_addx: por %xmm0,%xmm1 .byte 102,72,15,110,199 - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 @@ -6681,59 +4628,59 @@ ecp_nistz256_point_addx: pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx + movq 64+0(%rbx),%rax movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 - leaq 64-128(%rbx),%rsi + leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 544(%rsp),%rdx + movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi + leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 448(%rsp),%rdx + movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 416(%rsp),%rdx + movq 416(%rsp),%rax leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi + leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 512(%rsp),%rdx + movq 512(%rsp),%rax leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi + leaq 0+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq orq %r13,%r12 movdqa %xmm4,%xmm2 @@ -6742,29 +4689,29 @@ ecp_nistz256_point_addx: por %xmm5,%xmm2 .byte 102,73,15,110,220 - movq 384(%rsp),%rdx + movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi + leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 480(%rsp),%rdx + movq 480(%rsp),%rax leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq orq %r13,%r12 orq %r8,%r12 @@ -6778,73 +4725,73 @@ ecp_nistz256_point_addx: .byte 0x3e - jnz .Ladd_proceedx + jnz .Ladd_proceedq -.Ladd_doublex: +.Ladd_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp .cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutx + jmp .Lpoint_double_shortcutq .cfi_adjust_cfa_offset 416 .align 32 -.Ladd_proceedx: - movq 0+64(%rsp),%rdx +.Ladd_proceedq: + movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 - leaq 
-128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 448(%rsp),%rdx + movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi + leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 0+0(%rsp),%rdx + movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi + leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 544(%rsp),%rdx + movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi + leaq 0+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 0(%rsp),%rdx + movq 0(%rsp),%rax leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 160(%rsp),%rdx + movq 160(%rsp),%rax leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq @@ -6876,11 +4823,11 @@ ecp_nistz256_point_addx: cmovcq %r10,%r9 movq 24(%rsi),%r10 - call __ecp_nistz256_subx + call __ecp_nistz256_subq leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp @@ -6888,35 +4835,35 @@ ecp_nistz256_point_addx: movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi - call __ecp_nistz256_subx + call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) - movq 128(%rsp),%rdx + movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi + leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 320(%rsp),%rdx + movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 @@ -6992,7 +4939,7 @@ ecp_nistz256_point_addx: movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) -.Ladd_donex: +.Ladd_doneq: leaq 576+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 @@ -7009,15 +4956,15 @@ ecp_nistz256_point_addx: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Lpoint_addx_epilogue: +.Lpoint_addq_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function .align 32 -ecp_nistz256_point_add_affinex: +ecp_nistz256_point_add_affine: .cfi_startproc -.Lpoint_add_affinex: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -7038,7 +4985,7 @@ 
ecp_nistz256_point_add_affinex: .cfi_offset %r15,-56 subq $480+8,%rsp .cfi_adjust_cfa_offset 32*15+8 -.Ladd_affinex_body: +.Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx @@ -7047,7 +4994,7 @@ ecp_nistz256_point_add_affinex: movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx + movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 @@ -7077,13 +5024,13 @@ ecp_nistz256_point_add_affinex: pxor %xmm4,%xmm4 por %xmm1,%xmm3 - leaq 64-128(%rsi),%rsi + leaq 64-0(%rsi),%rsi leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rdx + movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 @@ -7096,84 +5043,84 @@ ecp_nistz256_point_add_affinex: pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 - leaq 32-128(%rsp),%rsi + leaq 32-0(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - movq 384(%rsp),%rdx + movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 384(%rsp),%rdx + movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 448(%rsp),%rdx + movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi + leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - movq 0+64(%rsp),%rdx + movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 0+96(%rsp),%rdx + movq 0+96(%rsp),%rax movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi + leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - movq 128(%rsp),%rdx + movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 320(%rsp),%rdx + movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi + leaq 0+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq @@ -7205,11 +5152,11 @@ ecp_nistz256_point_add_affinex: cmovcq %r10,%r9 movq 24(%rsi),%r10 - call __ecp_nistz256_subx + call __ecp_nistz256_subq leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp @@ -7217,35 +5164,35 @@ ecp_nistz256_point_add_affinex: movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi - call __ecp_nistz256_subx + call __ecp_nistz256_subq movq %r12,0(%rdi) movq 
%r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) - movq 352(%rsp),%rdx + movq 352(%rsp),%rax leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi + leaq 0+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - movq 96(%rsp),%rdx + movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi + leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 @@ -7337,11 +5284,11 @@ ecp_nistz256_point_add_affinex: .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp -.Ladd_affinex_epilogue: +.Ladd_affineq_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex - .section ".note.gnu.property", "a" +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s index dd5a6efce58..3ee9bc6bbb8 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s @@ -395,412 +395,36 @@ x25519_fe51_mul121666: .Lfe51_mul121666_epilogue: .cfi_endproc .size x25519_fe51_mul121666,.-x25519_fe51_mul121666 - .globl x25519_fe64_eligible .type x25519_fe64_eligible,@function .align 32 x25519_fe64_eligible: .cfi_startproc - movl OPENSSL_ia32cap_P+8(%rip),%ecx xorl %eax,%eax - andl $0x80100,%ecx - cmpl $0x80100,%ecx - cmovel %ecx,%eax .byte 0xf3,0xc3 .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,@function -.align 32 -x25519_fe64_mul: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 -.cfi_offset %rdi,-64 - leaq -16(%rsp),%rsp -.cfi_adjust_cfa_offset 16 -.Lfe64_mul_body: - - movq %rdx,%rax - movq 0(%rdx),%rbp - movq 0(%rsi),%rdx - movq 8(%rax),%rcx - movq 16(%rax),%r14 - movq 24(%rax),%r15 - - mulxq %rbp,%r8,%rax - xorl %edi,%edi - mulxq %rcx,%r9,%rbx - adcxq %rax,%r9 - mulxq %r14,%r10,%rax - adcxq %rbx,%r10 - mulxq %r15,%r11,%r12 - movq 8(%rsi),%rdx - adcxq %rax,%r11 - movq %r14,(%rsp) - adcxq %rdi,%r12 - - mulxq %rbp,%rax,%rbx - adoxq %rax,%r9 - adcxq %rbx,%r10 - mulxq %rcx,%rax,%rbx - adoxq %rax,%r10 - adcxq %rbx,%r11 - mulxq %r14,%rax,%rbx - adoxq %rax,%r11 - adcxq %rbx,%r12 - mulxq %r15,%rax,%r13 - movq 16(%rsi),%rdx - adoxq %rax,%r12 - adcxq %rdi,%r13 - adoxq %rdi,%r13 - - mulxq %rbp,%rax,%rbx - adcxq %rax,%r10 - adoxq %rbx,%r11 - mulxq %rcx,%rax,%rbx - adcxq %rax,%r11 - adoxq %rbx,%r12 - mulxq %r14,%rax,%rbx - adcxq %rax,%r12 - adoxq %rbx,%r13 - mulxq %r15,%rax,%r14 - movq 24(%rsi),%rdx - adcxq %rax,%r13 - adoxq %rdi,%r14 - adcxq %rdi,%r14 - - mulxq %rbp,%rax,%rbx - adoxq %rax,%r11 - adcxq %rbx,%r12 - mulxq 
%rcx,%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - mulxq (%rsp),%rax,%rbx - adoxq %rax,%r13 - adcxq %rbx,%r14 - mulxq %r15,%rax,%r15 - movl $38,%edx - adoxq %rax,%r14 - adcxq %rdi,%r15 - adoxq %rdi,%r15 - - jmp .Lreduce64 -.Lfe64_mul_epilogue: -.cfi_endproc -.size x25519_fe64_mul,.-x25519_fe64_mul - .globl x25519_fe64_sqr -.type x25519_fe64_sqr,@function -.align 32 -x25519_fe64_sqr: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 -.cfi_offset %rdi,-64 - leaq -16(%rsp),%rsp -.cfi_adjust_cfa_offset 16 -.Lfe64_sqr_body: - - movq 0(%rsi),%rdx - movq 8(%rsi),%rcx - movq 16(%rsi),%rbp - movq 24(%rsi),%rsi - - - mulxq %rdx,%r8,%r15 - mulxq %rcx,%r9,%rax - xorl %edi,%edi - mulxq %rbp,%r10,%rbx - adcxq %rax,%r10 - mulxq %rsi,%r11,%r12 - movq %rcx,%rdx - adcxq %rbx,%r11 - adcxq %rdi,%r12 - - - mulxq %rbp,%rax,%rbx - adoxq %rax,%r11 - adcxq %rbx,%r12 - mulxq %rsi,%rax,%r13 - movq %rbp,%rdx - adoxq %rax,%r12 - adcxq %rdi,%r13 - - - mulxq %rsi,%rax,%r14 - movq %rcx,%rdx - adoxq %rax,%r13 - adcxq %rdi,%r14 - adoxq %rdi,%r14 - - adcxq %r9,%r9 - adoxq %r15,%r9 - adcxq %r10,%r10 - mulxq %rdx,%rax,%rbx - movq %rbp,%rdx - adcxq %r11,%r11 - adoxq %rax,%r10 - adcxq %r12,%r12 - adoxq %rbx,%r11 - mulxq %rdx,%rax,%rbx - movq %rsi,%rdx - adcxq %r13,%r13 - adoxq %rax,%r12 - adcxq %r14,%r14 - adoxq %rbx,%r13 - mulxq %rdx,%rax,%r15 - movl $38,%edx - adoxq %rax,%r14 - adcxq %rdi,%r15 - adoxq %rdi,%r15 - jmp .Lreduce64 - -.align 32 -.Lreduce64: - mulxq %r12,%rax,%rbx - adcxq %rax,%r8 - adoxq %rbx,%r9 - mulxq %r13,%rax,%rbx - adcxq %rax,%r9 - adoxq %rbx,%r10 - mulxq %r14,%rax,%rbx - adcxq %rax,%r10 - adoxq %rbx,%r11 - mulxq %r15,%rax,%r12 - adcxq %rax,%r11 - adoxq %rdi,%r12 - adcxq %rdi,%r12 - - movq 16(%rsp),%rdi - imulq %rdx,%r12 - - addq %r12,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - sbbq %rax,%rax - andq $38,%rax - - addq %rax,%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r8,0(%rdi) - - movq 24(%rsp),%r15 -.cfi_restore %r15 - movq 32(%rsp),%r14 -.cfi_restore %r14 - movq 40(%rsp),%r13 -.cfi_restore %r13 - movq 48(%rsp),%r12 -.cfi_restore %r12 - movq 56(%rsp),%rbx -.cfi_restore %rbx - movq 64(%rsp),%rbp -.cfi_restore %rbp - leaq 72(%rsp),%rsp -.cfi_adjust_cfa_offset 88 -.Lfe64_sqr_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size x25519_fe64_sqr,.-x25519_fe64_sqr - .globl x25519_fe64_mul121666 -.type x25519_fe64_mul121666,@function -.align 32 -x25519_fe64_mul121666: -.Lfe64_mul121666_body: -.cfi_startproc - movl $121666,%edx - mulxq 0(%rsi),%r8,%rcx - mulxq 8(%rsi),%r9,%rax - addq %rcx,%r9 - mulxq 16(%rsi),%r10,%rcx - adcq %rax,%r10 - mulxq 24(%rsi),%r11,%rax - adcq %rcx,%r11 - adcq $0,%rax - - imulq $38,%rax,%rax - - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - sbbq %rax,%rax - andq $38,%rax - - addq %rax,%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r8,0(%rdi) - -.Lfe64_mul121666_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 - .globl x25519_fe64_add -.type x25519_fe64_add,@function -.align 32 -x25519_fe64_add: -.Lfe64_add_body: -.cfi_startproc - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 
- - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - - sbbq %rax,%rax - andq $38,%rax - - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - movq %r9,8(%rdi) - adcq $0,%r11 - movq %r10,16(%rdi) - sbbq %rax,%rax - movq %r11,24(%rdi) - andq $38,%rax - - addq %rax,%r8 - movq %r8,0(%rdi) - -.Lfe64_add_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size x25519_fe64_add,.-x25519_fe64_add - .globl x25519_fe64_sub -.type x25519_fe64_sub,@function -.align 32 -x25519_fe64_sub: -.Lfe64_sub_body: -.cfi_startproc - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - sbbq 8(%rdx),%r9 - sbbq 16(%rdx),%r10 - sbbq 24(%rdx),%r11 - - sbbq %rax,%rax - andq $38,%rax - - subq %rax,%r8 - sbbq $0,%r9 - sbbq $0,%r10 - movq %r9,8(%rdi) - sbbq $0,%r11 - movq %r10,16(%rdi) - sbbq %rax,%rax - movq %r11,24(%rdi) - andq $38,%rax - - subq %rax,%r8 - movq %r8,0(%rdi) - -.Lfe64_sub_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size x25519_fe64_sub,.-x25519_fe64_sub - .globl x25519_fe64_tobytes -.type x25519_fe64_tobytes,@function -.align 32 +x25519_fe64_mul: +x25519_fe64_sqr: +x25519_fe64_mul121666: +x25519_fe64_add: +x25519_fe64_sub: x25519_fe64_tobytes: -.Lfe64_to_body: .cfi_startproc - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - - leaq (%r11,%r11,1),%rax - sarq $63,%r11 - shrq $1,%rax - andq $19,%r11 - addq $19,%r11 - - addq %r11,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%rax - - leaq (%rax,%rax,1),%r11 - sarq $63,%rax - shrq $1,%r11 - notq %rax - andq $19,%rax - - subq %rax,%r8 - sbbq $0,%r9 - sbbq $0,%r10 - sbbq $0,%r11 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - -.Lfe64_to_epilogue: +.byte 0x0f,0x0b .byte 0xf3,0xc3 .cfi_endproc -.size x25519_fe64_tobytes,.-x25519_fe64_tobytes +.size x25519_fe64_mul,.-x25519_fe64_mul .byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s index 40bfc69f380..3d1a966de9b 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s @@ -681,7 +681,7 @@ ossl_md5_block_asm_data_order: .byte 0xf3,0xc3 .cfi_endproc .size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s index 288f44af921..19e0b738366 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s @@ -1,793 +1,23 @@ .text -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +aesni_gcm_encrypt: .cfi_startproc - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - 
vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp .Loop6x - -.align 32 -.Loop6x: - addl $100663296,%ebx - jc .Lhandle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -.Lresume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $0x60,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 - vaesenc 
%xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor %xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp .Lenc_tail - -.align 32 -.Lhandle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp .Lresume_ctr32 - -.align 32 -.Lenc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 
0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $0x60,%r10 - subq $0x6,%rdx - jc .L6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups %xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp .Loop6x - -.L6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - + xorl %eax,%eax .byte 0xf3,0xc3 .cfi_endproc -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function -.align 32 aesni_gcm_decrypt: .cfi_startproc - xorq %r10,%r10 - cmpq $0x60,%rdx - jb .Lgcm_dec_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Ldec_no_key_aliasing - cmpq $768,%r15 - jnc .Ldec_no_key_aliasing - subq %r15,%rsp -.Ldec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_dec_abort: - movq %r10,%rax + xorl %eax,%eax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: -.cfi_startproc - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl $100663296,%ebx - jc .Lhandle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 - -.align 16 -.Loop_ctr32: 
- vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz .Loop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.align 32 -.Lhandle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 -.cfi_endproc -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x - -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: -.cfi_startproc - xorq %r10,%r10 - cmpq $288,%rdx - jb .Lgcm_enc_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Lenc_no_key_aliasing - cmpq $768,%r15 - jnc .Lenc_no_key_aliasing - subq %r15,%rsp -.Lenc_no_key_aliasing: - - leaq (%rsi),%r14 - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - vpshufb %xmm0,%xmm11,%xmm11 - vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups 
%xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 - - vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq 
$0x11,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 - vpxor %xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_enc_abort: - movq %r10,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lpoly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.Lone_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -.Ltwo_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lone_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s index ac4823fe589..3aa9f6c1784 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s @@ -1306,108 +1306,7 @@ gcm_ghash_clmul: .align 32 gcm_init_avx: .cfi_startproc - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd $78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp .Linit_start_avx -.align 32 -.Linit_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 
-.Linit_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd $78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz .Linit_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 + jmp .L_init_clmul .cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx @@ -1425,377 +1324,7 @@ gcm_gmult_avx: gcm_ghash_avx: .cfi_startproc .byte 243,15,30,250 - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq .L0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu .Lbswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $0x80,%rcx - jb .Lshort_avx - subq $0x80,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq 
$0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $0x80,%rcx - jb .Ltail_avx - - vpxor %xmm10,%xmm15,%xmm15 - subq $0x80,%rcx - jmp .Loop8x_avx - -.align 32 -.Loop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - 
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $0x80,%rcx - jnc .Loop8x_avx - - addq $0x80,%rcx - jmp .Ltail_no_xor_avx - -.align 32 -.Lshort_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovq 184-64(%rsi),%xmm7 - subq $0x10,%rcx - jmp .Ltail_avx - -.align 32 -.Ltail_avx: - vpxor %xmm10,%xmm15,%xmm15 -.Ltail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - 
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne .Lshort_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 + jmp .L_ghash_clmul .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 @@ -1851,7 +1380,7 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s index 9c0054aa175..6beb92e69e3 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s @@ -522,7 +522,7 @@ iotas: .quad 0x8000000080008008 .size iotas,.-iotas .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s index 589ffb37468..76135fdbb10 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s @@ -10,8 +10,6 @@ sha1_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -2949,4343 +2947,6 @@ _shaext_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext -.type sha1_multi_block_avx,@function -.align 32 -sha1_multi_block_avx: -.cfi_startproc -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb .Lavx - testl $32,%ecx - jnz _avx2_shortcut - jmp .Lavx -.align 32 -.Lavx: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 -.Lbody_avx: - leaq K_XX_XX(%rip),%rbp - leaq 256(%rsp),%rbx - - vzeroupper -.Loop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - - movq 0(%rsi),%r8 - - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - - movq 16(%rsi),%r9 - - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - - movq 32(%rsi),%r10 - - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - - movq 48(%rsi),%r11 - - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz .Ldone_avx - - vmovdqu 0(%rdi),%xmm10 - leaq 128(%rsp),%rax - vmovdqu 32(%rdi),%xmm11 - 
vmovdqu 64(%rdi),%xmm12 - vmovdqu 96(%rdi),%xmm13 - vmovdqu 128(%rdi),%xmm14 - vmovdqu 96(%rbp),%xmm5 - jmp .Loop_avx - -.align 32 -.Loop_avx: - vmovdqa -32(%rbp),%xmm15 - vmovd (%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd (%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vmovd -60(%r8),%xmm1 - vpunpckldq %xmm2,%xmm0,%xmm0 - vmovd -60(%r9),%xmm9 - vpshufb %xmm5,%xmm0,%xmm0 - vpinsrd $1,-60(%r10),%xmm1,%xmm1 - vpinsrd $1,-60(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,0-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -56(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -56(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-56(%r10),%xmm2,%xmm2 - vpinsrd $1,-56(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,16-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -52(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -52(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-52(%r10),%xmm3,%xmm3 - vpinsrd $1,-52(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,32-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -48(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -48(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-48(%r10),%xmm4,%xmm4 - vpinsrd $1,-48(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,48-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -44(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -44(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-44(%r10),%xmm0,%xmm0 - vpinsrd $1,-44(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,64-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -40(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -40(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-40(%r10),%xmm1,%xmm1 - vpinsrd $1,-40(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,80-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq 
%xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -36(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -36(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-36(%r10),%xmm2,%xmm2 - vpinsrd $1,-36(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,96-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -32(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -32(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-32(%r10),%xmm3,%xmm3 - vpinsrd $1,-32(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,112-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -28(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -28(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-28(%r10),%xmm4,%xmm4 - vpinsrd $1,-28(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,128-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -24(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -24(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-24(%r10),%xmm0,%xmm0 - vpinsrd $1,-24(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,144-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -20(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -20(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-20(%r10),%xmm1,%xmm1 - vpinsrd $1,-20(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,160-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -16(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -16(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-16(%r10),%xmm2,%xmm2 - vpinsrd $1,-16(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,176-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -12(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd 
-12(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-12(%r10),%xmm3,%xmm3 - vpinsrd $1,-12(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,192-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -8(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -8(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-8(%r10),%xmm4,%xmm4 - vpinsrd $1,-8(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,208-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -4(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -4(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vmovdqa 0-128(%rax),%xmm1 - vpinsrd $1,-4(%r10),%xmm0,%xmm0 - vpinsrd $1,-4(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - prefetcht0 63(%r8) - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,224-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - prefetcht0 63(%r9) - vpxor %xmm7,%xmm6,%xmm6 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - prefetcht0 63(%r10) - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - prefetcht0 63(%r11) - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 16-128(%rax),%xmm2 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 32-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,240-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 128-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 48-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,0-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 144-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 64-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,16-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 160-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 
- - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 80-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,32-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 176-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 96-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,48-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 192-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 0(%rbp),%xmm15 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 112-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,64-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 208-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 128-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,80-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 224-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,96-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 240-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 160-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,112-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 0-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor 
%xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 176-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,128-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 16-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 192-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,144-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 32-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 208-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,160-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 48-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 224-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,176-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 64-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 240-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,192-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 80-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 0-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,208-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 96-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 16-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,224-128(%rax) - vpaddd 
%xmm0,%xmm14,%xmm14 - vpxor 112-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 32-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,240-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 128-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 48-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,0-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 144-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 64-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,16-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 160-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 80-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,32-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 176-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 96-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,48-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 192-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 112-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,64-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 208-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld 
[... deleted lines elided: the remainder of the vectorized SHA-1 round schedule inside the .Loop_avx body of sha1_multi_block_avx (repeated vpslld/vpsrld/vpxor/vpaddd/vpor round groups over the %xmm0-%xmm15 working registers and the 0-128(%rax)..240-128(%rax) message schedule slots) ...]
[... deleted lines elided: the .Loop_avx tail (per-lane counter checks, masked update of the five digest words at (%rdi), re-entry via .Loop_grande_avx), the .Ldone_avx / .Lepilogue_avx epilogue and .size directive of sha1_multi_block_avx, then the removed sha1_multi_block_avx2 routine header (_avx2_shortcut, stack/frame and CFI setup, .Lbody_avx2, K_XX_XX constant pointer in %rbp), the .Loop_grande_avx2 per-lane input setup, and the start of the .Loop_avx2 message-load and round code ...]
[... deleted lines elided: the continuation of the .Loop_avx2 round body of sha1_multi_block_avx2 (ymm-register message gathering via vmovd/vpinsrd/vpunpckldq/vinserti128, vpshufb byte-swapping, and the repeated vectorized SHA-1 round groups keyed off the K_XX_XX constants addressed through (%rbp), with the message schedule spilled to (%rax)/(%rbx)) ...]
96-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 448-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 128-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 480-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 160-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 0-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 192-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 32-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 224-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - - vpsrld $27,%ymm1,%ymm8 - vpaddd %ymm14,%ymm0,%ymm0 - vpxor %ymm3,%ymm5,%ymm5 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm6,%ymm2,%ymm2 - movl $1,%ecx - leaq 512(%rsp),%rbx - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqu (%rbx),%ymm5 - vpxor %ymm7,%ymm7,%ymm7 - vmovdqa %ymm5,%ymm6 - vpcmpgtd %ymm7,%ymm6,%ymm6 - vpaddd %ymm6,%ymm5,%ymm5 - - vpand %ymm6,%ymm0,%ymm0 - vpand %ymm6,%ymm1,%ymm1 - vpaddd 0(%rdi),%ymm0,%ymm0 - vpand %ymm6,%ymm2,%ymm2 - vpaddd 32(%rdi),%ymm1,%ymm1 - vpand %ymm6,%ymm3,%ymm3 - vpaddd 64(%rdi),%ymm2,%ymm2 - vpand %ymm6,%ymm4,%ymm4 - vpaddd 96(%rdi),%ymm3,%ymm3 - vpaddd 128(%rdi),%ymm4,%ymm4 - vmovdqu 
%ymm0,0(%rdi)
- vmovdqu %ymm1,32(%rdi)
- vmovdqu %ymm2,64(%rdi)
- vmovdqu %ymm3,96(%rdi)
- vmovdqu %ymm4,128(%rdi)
-
- vmovdqu %ymm5,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu 96(%rbp),%ymm9
- decl %edx
- jnz .Loop_avx2
-
-
-
-
-
-
-
-.Ldone_avx2:
- movq 544(%rsp),%rax
-.cfi_def_cfa %rax,8
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
 .align 256
 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -7301,7 +2962,7 @@ K_XX_XX:
 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
 .byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
 .p2align 3
 .long 1f - 0f
 .long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
index 3a03212f8b6..e730222f30b 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
@@ -13,14 +13,6 @@ sha1_block_data_order:
 jz .Lialu
 testl $536870912,%r10d
 jnz _shaext_shortcut
- andl $296,%r10d
- cmpl $296,%r10d
- je _avx2_shortcut
- andl $268435456,%r8d
- andl $1073741824,%r9d
- orl %r9d,%r8d
- cmpl $1342177280,%r8d
- je _avx_shortcut
 jmp _ssse3_shortcut
 
 .align 16
@@ -2612,2827 +2604,6 @@ _ssse3_shortcut:
 .byte 0xf3,0xc3
 .cfi_endproc
 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-.type sha1_block_data_order_avx,@function
-.align 16
-sha1_block_data_order_avx:
-_avx_shortcut:
-.cfi_startproc
- movq %rsp,%r11
-.cfi_def_cfa_register %r11
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- leaq -64(%rsp),%rsp
- vzeroupper
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r14),%xmm6
- vmovdqa -64(%r14),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm11,%xmm0,%xmm4
- vpaddd %xmm11,%xmm1,%xmm5
- vpaddd %xmm11,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- jmp .Loop_avx
-.align 16
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl
%ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm10 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm4,%xmm4 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vpxor %xmm10,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm11,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm10 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm10,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa -32(%r14),%xmm11 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vpaddd %xmm5,%xmm11,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm10 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm6,%xmm6 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm10,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm11,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - 
andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm10 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm10,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm11,%xmm9 - addl %esi,%edx - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r14),%xmm11 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl 
%edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - vpaddd %xmm3,%xmm11,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm11,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm11,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r14),%xmm11 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl 
%edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm11,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm11,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm11,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - 
addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm11,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je .Ldone_avx - vmovdqa 64(%r14),%xmm6 - vmovdqa -64(%r14),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm11,%xmm0,%xmm4 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm11,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm5,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm11,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm6,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - 
movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp .Loop_avx - -.align 16 -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroupper - - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 -.cfi_restore %r12 - movq -16(%r11),%rbp -.cfi_restore %rbp - movq -8(%r11),%rbx -.cfi_restore %rbx - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_avx,.-sha1_block_data_order_avx -.type sha1_block_data_order_avx2,@function -.align 16 -sha1_block_data_order_avx2: -_avx2_shortcut: -.cfi_startproc - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - vzeroupper - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - leaq -640(%rsp),%rsp - shlq $6,%r10 - leaq 64(%r9),%r13 - andq $-128,%rsp - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 
0(%r8),%eax - cmpq %r10,%r13 - cmovaeq %r9,%r13 - movl 4(%r8),%ebp - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl 16(%r8),%esi - vmovdqu 64(%r14),%ymm6 - - vmovdqu (%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - leaq 64(%r9),%r9 - vinserti128 $1,(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vpshufb %ymm6,%ymm0,%ymm0 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vpshufb %ymm6,%ymm1,%ymm1 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm3,%ymm3 - - vpaddd %ymm11,%ymm0,%ymm4 - vpaddd %ymm11,%ymm1,%ymm5 - vmovdqu %ymm4,0(%rsp) - vpaddd %ymm11,%ymm2,%ymm6 - vmovdqu %ymm5,32(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - vmovdqu %ymm6,64(%rsp) - vmovdqu %ymm7,96(%rsp) - vpalignr $8,%ymm0,%ymm1,%ymm4 - vpsrldq $4,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $31,%ymm4,%ymm8 - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - vpxor %ymm10,%ymm4,%ymm4 - vpaddd %ymm11,%ymm4,%ymm9 - vmovdqu %ymm9,128(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm5 - vpsrldq $4,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm5,%ymm5 - vpaddd %ymm11,%ymm5,%ymm9 - vmovdqu %ymm9,160(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm6 - vpsrldq $4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $31,%ymm6,%ymm8 - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - vpxor %ymm10,%ymm6,%ymm6 - vpaddd %ymm11,%ymm6,%ymm9 - vmovdqu %ymm9,192(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm7 - vpsrldq $4,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm7,%ymm8 - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - vpxor %ymm10,%ymm7,%ymm7 - vpaddd %ymm11,%ymm7,%ymm9 - vmovdqu %ymm9,224(%rsp) - leaq 128(%rsp),%r13 - jmp .Loop_avx2 -.align 32 -.Loop_avx2: - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - jmp .Lalign32_1 -.align 32 -.Lalign32_1: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - vpxor %ymm1,%ymm0,%ymm0 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpxor %ymm8,%ymm0,%ymm0 - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vpor %ymm8,%ymm0,%ymm0 - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - vpaddd %ymm11,%ymm0,%ymm9 - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - vmovdqu %ymm9,256(%rsp) - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - 
andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - vpxor %ymm2,%ymm1,%ymm1 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpxor %ymm8,%ymm1,%ymm1 - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vpor %ymm8,%ymm1,%ymm1 - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - vpaddd %ymm11,%ymm1,%ymm9 - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vmovdqu %ymm9,288(%rsp) - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r14),%ymm11 - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpxor %ymm8,%ymm2,%ymm2 - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vpor %ymm8,%ymm2,%ymm2 - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - vpaddd %ymm11,%ymm2,%ymm9 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vmovdqu %ymm9,320(%rsp) - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - vpxor %ymm4,%ymm3,%ymm3 - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpxor %ymm8,%ymm3,%ymm3 - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - vpor %ymm8,%ymm3,%ymm3 - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - vpaddd %ymm11,%ymm3,%ymm9 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vmovdqu %ymm9,352(%rsp) - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpalignr $8,%ymm2,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpxor %ymm5,%ymm4,%ymm4 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpxor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - vpsrld $30,%ymm4,%ymm8 - vpslld $2,%ymm4,%ymm4 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl 
%r12d,%ebx - xorl %eax,%ecx - vpor %ymm8,%ymm4,%ymm4 - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpaddd %ymm11,%ymm4,%ymm9 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - vmovdqu %ymm9,384(%rsp) - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpalignr $8,%ymm3,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm6,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpxor %ymm8,%ymm5,%ymm5 - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - vpsrld $30,%ymm5,%ymm8 - vpslld $2,%ymm5,%ymm5 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vpor %ymm8,%ymm5,%ymm5 - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - vmovdqu %ymm9,416(%rsp) - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm7,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - vpxor %ymm8,%ymm6,%ymm6 - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - vpsrld $30,%ymm6,%ymm8 - vpslld $2,%ymm6,%ymm6 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpor %ymm8,%ymm6,%ymm6 - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - vmovdqu %ymm9,448(%rsp) - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm5,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r14),%ymm11 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpxor %ymm8,%ymm7,%ymm7 - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - vpsrld $30,%ymm7,%ymm8 - vpslld $2,%ymm7,%ymm7 - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpor %ymm8,%ymm7,%ymm7 - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - vmovdqu %ymm9,480(%rsp) - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - jmp .Lalign32_2 -.align 32 -.Lalign32_2: - vpalignr 
$8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -64(%r13),%ebp - xorl %esi,%ecx - vpxor %ymm1,%ymm0,%ymm0 - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - vpxor %ymm8,%ymm0,%ymm0 - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - vpor %ymm8,%ymm0,%ymm0 - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpaddd %ymm11,%ymm0,%ymm9 - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - vmovdqu %ymm9,512(%rsp) - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -28(%r13),%ebx - xorl %eax,%edx - vpxor %ymm2,%ymm1,%ymm1 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpxor %ymm8,%ymm1,%ymm1 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - vpor %ymm8,%ymm1,%ymm1 - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpaddd %ymm11,%ymm1,%ymm9 - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - vmovdqu %ymm9,544(%rsp) - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl 8(%r13),%ecx - xorl %ebp,%esi - vpxor %ymm3,%ymm2,%ymm2 - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - vpxor %ymm8,%ymm2,%ymm2 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - vpor %ymm8,%ymm2,%ymm2 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpaddd %ymm11,%ymm2,%ymm9 - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - vmovdqu %ymm9,576(%rsp) - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl 
$2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl 44(%r13),%edx - xorl %ebx,%eax - vpxor %ymm4,%ymm3,%ymm3 - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm3,%ymm3 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl %r12d,%edx - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - vpor %ymm8,%ymm3,%ymm3 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpaddd %ymm11,%ymm3,%ymm9 - addl %r12d,%ecx - andl %edi,%edx - addl 68(%r13),%ebx - xorl %eax,%edx - vmovdqu %ymm9,608(%rsp) - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -96(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -60(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - 
xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%r9),%r13 - leaq 128(%r9),%rdi - cmpq %r10,%r13 - cmovaeq %r9,%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - je .Ldone_avx2 - vmovdqu 64(%r14),%ymm6 - cmpq %r10,%rdi - ja .Last_avx2 - - vmovdqu -64(%rdi),%xmm0 - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 $1,0(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - jmp .Last_avx2 - -.align 32 -.Last_avx2: - leaq 128+16(%rsp),%r13 - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - subq $-128,%r9 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx 
- rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm0,%ymm0 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpshufb %ymm6,%ymm1,%ymm1 - vpaddd %ymm11,%ymm0,%ymm8 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vmovdqu %ymm8,0(%rsp) - vpshufb %ymm6,%ymm2,%ymm2 - vpaddd %ymm11,%ymm1,%ymm9 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 
-96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - vmovdqu %ymm9,32(%rsp) - vpshufb %ymm6,%ymm3,%ymm3 - vpaddd %ymm11,%ymm2,%ymm6 - addl -64(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - jmp .Lalign32_3 -.align 32 -.Lalign32_3: - vmovdqu %ymm6,64(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - addl -28(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vmovdqu %ymm7,96(%rsp) - addl 8(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl 
%ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm0,%ymm1,%ymm4 - addl 44(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - vpsrldq $4,%ymm3,%ymm8 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - xorl %ebp,%esi - addl %r12d,%edx - vpxor %ymm8,%ymm4,%ymm4 - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - vpsrld $31,%ymm4,%ymm8 - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - andl %edi,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - addl 68(%r13),%ebx - xorl %eax,%edx - vpxor %ymm10,%ymm4,%ymm4 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpaddd %ymm11,%ymm4,%ymm9 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vmovdqu %ymm9,128(%rsp) - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm1,%ymm2,%ymm5 - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrldq $4,%ymm4,%ymm8 - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - xorl %eax,%edx - addl %r12d,%ecx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - vpxor %ymm10,%ymm5,%ymm5 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vmovdqu %ymm9,160(%rsp) - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm2,%ymm3,%ymm6 - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpsrldq $4,%ymm5,%ymm8 - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm8,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - vpsrld $31,%ymm6,%ymm8 - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - xorl %ebp,%esi - addl %r12d,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - xorl %ebx,%esi - addl -96(%r13),%ecx - vpxor %ymm10,%ymm6,%ymm6 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd 
%ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vmovdqu %ymm9,192(%rsp) - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpalignr $8,%ymm3,%ymm4,%ymm7 - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpsrldq $4,%ymm6,%ymm8 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm8,%ymm7,%ymm7 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - vpsrld $31,%ymm7,%ymm8 - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - xorl %ebx,%eax - addl %r12d,%esi - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - xorl %ecx,%eax - addl -60(%r13),%edx - vpxor %ymm10,%ymm7,%ymm7 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vmovdqu %ymm9,224(%rsp) - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%rsp),%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - jbe .Loop_avx2 - -.Ldone_avx2: - vzeroupper - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 -.cfi_restore %r12 - movq -16(%r11),%rbp -.cfi_restore %rbp - movq -8(%r11),%rbx -.cfi_restore %rbx - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -5448,7 +2619,7 @@ K_XX_XX: .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s index 8f9e4bfe5cf..5bb4ca7ed49 100644 --- 
a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s @@ -10,8 +10,6 @@ sha256_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -3137,4700 +3135,6 @@ _shaext_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext -.type sha256_multi_block_avx,@function -.align 32 -sha256_multi_block_avx: -.cfi_startproc -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb .Lavx - testl $32,%ecx - jnz _avx2_shortcut - jmp .Lavx -.align 32 -.Lavx: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 -.Lbody_avx: - leaq K256+128(%rip),%rbp - leaq 256(%rsp),%rbx - leaq 128(%rdi),%rdi - -.Loop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - - movq 0(%rsi),%r8 - - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - - movq 16(%rsi),%r9 - - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - - movq 32(%rsi),%r10 - - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - - movq 48(%rsi),%r11 - - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz .Ldone_avx - - vmovdqu 0-128(%rdi),%xmm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - vmovdqu 96-128(%rdi),%xmm11 - vmovdqu 128-128(%rdi),%xmm12 - vmovdqu 160-128(%rdi),%xmm13 - vmovdqu 192-128(%rdi),%xmm14 - vmovdqu 224-128(%rdi),%xmm15 - vmovdqu .Lpbswap(%rip),%xmm6 - jmp .Loop_avx - -.align 32 -.Loop_avx: - vpxor %xmm9,%xmm10,%xmm4 - vmovd 0(%r8),%xmm5 - vmovd 0(%r9),%xmm0 - vpinsrd $1,0(%r10),%xmm5,%xmm5 - vpinsrd $1,0(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 4(%r8),%xmm5 - vmovd 4(%r9),%xmm0 - vpinsrd $1,4(%r10),%xmm5,%xmm5 - vpinsrd $1,4(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,16-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - 
vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 8(%r8),%xmm5 - vmovd 8(%r9),%xmm0 - vpinsrd $1,8(%r10),%xmm5,%xmm5 - vpinsrd $1,8(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 12(%r8),%xmm5 - vmovd 12(%r9),%xmm0 - vpinsrd $1,12(%r10),%xmm5,%xmm5 - vpinsrd $1,12(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,48-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 16(%r8),%xmm5 - vmovd 16(%r9),%xmm0 - vpinsrd $1,16(%r10),%xmm5,%xmm5 - vpinsrd $1,16(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld 
$25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 20(%r8),%xmm5 - vmovd 20(%r9),%xmm0 - vpinsrd $1,20(%r10),%xmm5,%xmm5 - vpinsrd $1,20(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,80-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 24(%r8),%xmm5 - vmovd 24(%r9),%xmm0 - vpinsrd $1,24(%r10),%xmm5,%xmm5 - vpinsrd $1,24(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 28(%r8),%xmm5 - vmovd 28(%r9),%xmm0 - vpinsrd $1,28(%r10),%xmm5,%xmm5 - vpinsrd $1,28(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,112-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - 
vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovd 32(%r8),%xmm5 - vmovd 32(%r9),%xmm0 - vpinsrd $1,32(%r10),%xmm5,%xmm5 - vpinsrd $1,32(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 36(%r8),%xmm5 - vmovd 36(%r9),%xmm0 - vpinsrd $1,36(%r10),%xmm5,%xmm5 - vpinsrd $1,36(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,144-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 40(%r8),%xmm5 - vmovd 40(%r9),%xmm0 - vpinsrd $1,40(%r10),%xmm5,%xmm5 - vpinsrd $1,40(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - 
vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 44(%r8),%xmm5 - vmovd 44(%r9),%xmm0 - vpinsrd $1,44(%r10),%xmm5,%xmm5 - vpinsrd $1,44(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,176-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 48(%r8),%xmm5 - vmovd 48(%r9),%xmm0 - vpinsrd $1,48(%r10),%xmm5,%xmm5 - vpinsrd $1,48(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 52(%r8),%xmm5 - vmovd 52(%r9),%xmm0 - vpinsrd $1,52(%r10),%xmm5,%xmm5 - vpinsrd $1,52(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,208-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - 
- vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 56(%r8),%xmm5 - vmovd 56(%r9),%xmm0 - vpinsrd $1,56(%r10),%xmm5,%xmm5 - vpinsrd $1,56(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 60(%r8),%xmm5 - leaq 64(%r8),%r8 - vmovd 60(%r9),%xmm0 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r10),%xmm5,%xmm5 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r11),%xmm0,%xmm0 - leaq 64(%r11),%r11 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,240-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r8) - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - prefetcht0 63(%r9) - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r10) - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - prefetcht0 63(%r11) - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%xmm5 - movl $3,%ecx - jmp .Loop_16_xx_avx -.align 32 -.Loop_16_xx_avx: - vmovdqu 16-128(%rax),%xmm6 - vpaddd 144-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 224-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 
- vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 32-128(%rax),%xmm5 - vpaddd 160-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 240-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,16-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 48-128(%rax),%xmm6 - vpaddd 176-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 0-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd 
-64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 64-128(%rax),%xmm5 - vpaddd 192-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 16-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,48-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 80-128(%rax),%xmm6 - vpaddd 208-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 32-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor 
%xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 96-128(%rax),%xmm5 - vpaddd 224-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 48-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,80-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 112-128(%rax),%xmm6 - vpaddd 240-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 64-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor 
%xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 128-128(%rax),%xmm5 - vpaddd 0-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 80-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,112-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 144-128(%rax),%xmm6 - vpaddd 16-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 96-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 160-128(%rax),%xmm5 - vpaddd 32-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 112-128(%rax),%xmm0 - vpsrld 
$10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,144-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 176-128(%rax),%xmm6 - vpaddd 48-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 128-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 192-128(%rax),%xmm5 - vpaddd 64-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 144-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld 
$6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,176-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 208-128(%rax),%xmm6 - vpaddd 80-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 160-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 224-128(%rax),%xmm5 - vpaddd 96-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 176-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,208-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand 
%xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 240-128(%rax),%xmm6 - vpaddd 112-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 192-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 0-128(%rax),%xmm5 - vpaddd 128-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 208-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,240-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor 
%xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - decl %ecx - jnz .Loop_16_xx_avx - - movl $1,%ecx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%xmm7 - vpxor %xmm0,%xmm0,%xmm0 - vmovdqa %xmm7,%xmm6 - vpcmpgtd %xmm0,%xmm6,%xmm6 - vpaddd %xmm6,%xmm7,%xmm7 - - vmovdqu 0-128(%rdi),%xmm0 - vpand %xmm6,%xmm8,%xmm8 - vmovdqu 32-128(%rdi),%xmm1 - vpand %xmm6,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm2 - vpand %xmm6,%xmm10,%xmm10 - vmovdqu 96-128(%rdi),%xmm5 - vpand %xmm6,%xmm11,%xmm11 - vpaddd %xmm0,%xmm8,%xmm8 - vmovdqu 128-128(%rdi),%xmm0 - vpand %xmm6,%xmm12,%xmm12 - vpaddd %xmm1,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm1 - vpand %xmm6,%xmm13,%xmm13 - vpaddd %xmm2,%xmm10,%xmm10 - vmovdqu 192-128(%rdi),%xmm2 - vpand %xmm6,%xmm14,%xmm14 - vpaddd %xmm5,%xmm11,%xmm11 - vmovdqu 224-128(%rdi),%xmm5 - vpand %xmm6,%xmm15,%xmm15 - vpaddd %xmm0,%xmm12,%xmm12 - vpaddd %xmm1,%xmm13,%xmm13 - vmovdqu %xmm8,0-128(%rdi) - vpaddd %xmm2,%xmm14,%xmm14 - vmovdqu %xmm9,32-128(%rdi) - vpaddd %xmm5,%xmm15,%xmm15 - vmovdqu %xmm10,64-128(%rdi) - vmovdqu %xmm11,96-128(%rdi) - vmovdqu %xmm12,128-128(%rdi) - vmovdqu %xmm13,160-128(%rdi) - vmovdqu %xmm14,192-128(%rdi) - vmovdqu %xmm15,224-128(%rdi) - - vmovdqu %xmm7,(%rbx) - vmovdqu .Lpbswap(%rip),%xmm6 - decl %edx - jnz .Loop_avx - - movl 280(%rsp),%edx - leaq 16(%rdi),%rdi - leaq 64(%rsi),%rsi - decl %edx - jnz .Loop_grande_avx - -.Ldone_avx: - movq 272(%rsp),%rax -.cfi_def_cfa %rax,8 - vzeroupper - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_multi_block_avx,.-sha256_multi_block_avx -.type sha256_multi_block_avx2,@function -.align 32 -sha256_multi_block_avx2: -.cfi_startproc -_avx2_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $576,%rsp - andq $-256,%rsp - movq %rax,544(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 -.Lbody_avx2: - leaq K256+128(%rip),%rbp - leaq 128(%rdi),%rdi - -.Loop_grande_avx2: - movl %edx,552(%rsp) - xorl %edx,%edx - leaq 512(%rsp),%rbx - - movq 0(%rsi),%r12 - - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r12 - - movq 16(%rsi),%r13 - - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r13 - - movq 32(%rsi),%r14 - - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r14 - - movq 48(%rsi),%r15 - - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r15 - - movq 64(%rsi),%r8 - - movl 72(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,16(%rbx) - cmovleq %rbp,%r8 - - movq 80(%rsi),%r9 - - movl 88(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,20(%rbx) - cmovleq %rbp,%r9 - - movq 96(%rsi),%r10 - - movl 
104(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,24(%rbx) - cmovleq %rbp,%r10 - - movq 112(%rsi),%r11 - - movl 120(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,28(%rbx) - cmovleq %rbp,%r11 - vmovdqu 0-128(%rdi),%ymm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%ymm9 - leaq 256+128(%rsp),%rbx - vmovdqu 64-128(%rdi),%ymm10 - vmovdqu 96-128(%rdi),%ymm11 - vmovdqu 128-128(%rdi),%ymm12 - vmovdqu 160-128(%rdi),%ymm13 - vmovdqu 192-128(%rdi),%ymm14 - vmovdqu 224-128(%rdi),%ymm15 - vmovdqu .Lpbswap(%rip),%ymm6 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - vpxor %ymm9,%ymm10,%ymm4 - vmovd 0(%r12),%xmm5 - vmovd 0(%r8),%xmm0 - vmovd 0(%r13),%xmm1 - vmovd 0(%r9),%xmm2 - vpinsrd $1,0(%r14),%xmm5,%xmm5 - vpinsrd $1,0(%r10),%xmm0,%xmm0 - vpinsrd $1,0(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,0(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 4(%r12),%xmm5 - vmovd 4(%r8),%xmm0 - vmovd 4(%r13),%xmm1 - vmovd 4(%r9),%xmm2 - vpinsrd $1,4(%r14),%xmm5,%xmm5 - vpinsrd $1,4(%r10),%xmm0,%xmm0 - vpinsrd $1,4(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,4(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,32-128(%rax) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 8(%r12),%xmm5 - vmovd 8(%r8),%xmm0 - vmovd 8(%r13),%xmm1 - vmovd 8(%r9),%xmm2 - vpinsrd $1,8(%r14),%xmm5,%xmm5 - vpinsrd $1,8(%r10),%xmm0,%xmm0 - vpinsrd $1,8(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd 
$1,8(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 12(%r12),%xmm5 - vmovd 12(%r8),%xmm0 - vmovd 12(%r13),%xmm1 - vmovd 12(%r9),%xmm2 - vpinsrd $1,12(%r14),%xmm5,%xmm5 - vpinsrd $1,12(%r10),%xmm0,%xmm0 - vpinsrd $1,12(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,12(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm5,96-128(%rax) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 16(%r12),%xmm5 - vmovd 16(%r8),%xmm0 - vmovd 16(%r13),%xmm1 - vmovd 16(%r9),%xmm2 - vpinsrd $1,16(%r14),%xmm5,%xmm5 - vpinsrd $1,16(%r10),%xmm0,%xmm0 - vpinsrd $1,16(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,16(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld 
$22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 20(%r12),%xmm5 - vmovd 20(%r8),%xmm0 - vmovd 20(%r13),%xmm1 - vmovd 20(%r9),%xmm2 - vpinsrd $1,20(%r14),%xmm5,%xmm5 - vpinsrd $1,20(%r10),%xmm0,%xmm0 - vpinsrd $1,20(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,20(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,160-128(%rax) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 24(%r12),%xmm5 - vmovd 24(%r8),%xmm0 - vmovd 24(%r13),%xmm1 - vmovd 24(%r9),%xmm2 - vpinsrd $1,24(%r14),%xmm5,%xmm5 - vpinsrd $1,24(%r10),%xmm0,%xmm0 - vpinsrd $1,24(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,24(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 28(%r12),%xmm5 - vmovd 28(%r8),%xmm0 - vmovd 28(%r13),%xmm1 - vmovd 28(%r9),%xmm2 - vpinsrd $1,28(%r14),%xmm5,%xmm5 - vpinsrd $1,28(%r10),%xmm0,%xmm0 - vpinsrd $1,28(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,28(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm5,224-128(%rax) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor 
%ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovd 32(%r12),%xmm5 - vmovd 32(%r8),%xmm0 - vmovd 32(%r13),%xmm1 - vmovd 32(%r9),%xmm2 - vpinsrd $1,32(%r14),%xmm5,%xmm5 - vpinsrd $1,32(%r10),%xmm0,%xmm0 - vpinsrd $1,32(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,32(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 36(%r12),%xmm5 - vmovd 36(%r8),%xmm0 - vmovd 36(%r13),%xmm1 - vmovd 36(%r9),%xmm2 - vpinsrd $1,36(%r14),%xmm5,%xmm5 - vpinsrd $1,36(%r10),%xmm0,%xmm0 - vpinsrd $1,36(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,36(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,288-256-128(%rbx) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 40(%r12),%xmm5 - vmovd 40(%r8),%xmm0 - vmovd 40(%r13),%xmm1 - vmovd 40(%r9),%xmm2 - vpinsrd $1,40(%r14),%xmm5,%xmm5 - vpinsrd 
$1,40(%r10),%xmm0,%xmm0 - vpinsrd $1,40(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,40(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 44(%r12),%xmm5 - vmovd 44(%r8),%xmm0 - vmovd 44(%r13),%xmm1 - vmovd 44(%r9),%xmm2 - vpinsrd $1,44(%r14),%xmm5,%xmm5 - vpinsrd $1,44(%r10),%xmm0,%xmm0 - vpinsrd $1,44(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,44(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm5,352-256-128(%rbx) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 48(%r12),%xmm5 - vmovd 48(%r8),%xmm0 - vmovd 48(%r13),%xmm1 - vmovd 48(%r9),%xmm2 - vpinsrd $1,48(%r14),%xmm5,%xmm5 - vpinsrd $1,48(%r10),%xmm0,%xmm0 - vpinsrd $1,48(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,48(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld 
$19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 52(%r12),%xmm5 - vmovd 52(%r8),%xmm0 - vmovd 52(%r13),%xmm1 - vmovd 52(%r9),%xmm2 - vpinsrd $1,52(%r14),%xmm5,%xmm5 - vpinsrd $1,52(%r10),%xmm0,%xmm0 - vpinsrd $1,52(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,52(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,416-256-128(%rbx) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 56(%r12),%xmm5 - vmovd 56(%r8),%xmm0 - vmovd 56(%r13),%xmm1 - vmovd 56(%r9),%xmm2 - vpinsrd $1,56(%r14),%xmm5,%xmm5 - vpinsrd $1,56(%r10),%xmm0,%xmm0 - vpinsrd $1,56(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,56(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 60(%r12),%xmm5 - leaq 64(%r12),%r12 - vmovd 60(%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd 60(%r13),%xmm1 - leaq 64(%r13),%r13 - vmovd 60(%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r14),%xmm5,%xmm5 - leaq 64(%r14),%r14 - vpinsrd $1,60(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r15),%xmm1,%xmm1 - leaq 64(%r15),%r15 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,60(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 
- vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm5,480-256-128(%rbx) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r12) - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - prefetcht0 63(%r13) - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r14) - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - prefetcht0 63(%r15) - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - prefetcht0 63(%r8) - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - prefetcht0 63(%r9) - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r10) - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - prefetcht0 63(%r11) - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%ymm5 - movl $3,%ecx - jmp .Loop_16_xx_avx2 -.align 32 -.Loop_16_xx_avx2: - vmovdqu 32-128(%rax),%ymm6 - vpaddd 288-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 448-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 64-128(%rax),%ymm5 - vpaddd 320-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 480-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,32-128(%rax) - 
vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 96-128(%rax),%ymm6 - vpaddd 352-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 0-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 128-128(%rax),%ymm5 - vpaddd 384-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 32-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,96-128(%rax) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld 
$2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 160-128(%rax),%ymm6 - vpaddd 416-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 64-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 192-128(%rax),%ymm5 - vpaddd 448-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 96-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm6,160-128(%rax) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - 
vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 224-128(%rax),%ymm6 - vpaddd 480-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 128-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 256-256-128(%rbx),%ymm5 - vpaddd 0-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 160-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,224-128(%rax) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 288-256-128(%rbx),%ymm6 - vpaddd 32-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld 
$25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 192-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 320-256-128(%rbx),%ymm5 - vpaddd 64-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 224-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,288-256-128(%rbx) - vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 352-256-128(%rbx),%ymm6 - vpaddd 96-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 256-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - 
vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 384-256-128(%rbx),%ymm5 - vpaddd 128-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 288-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,352-256-128(%rbx) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 416-256-128(%rbx),%ymm6 - vpaddd 160-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 320-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld 
$21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 448-256-128(%rbx),%ymm5 - vpaddd 192-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 352-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm6,416-256-128(%rbx) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 480-256-128(%rbx),%ymm6 - vpaddd 224-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 384-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor 
%ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 0-128(%rax),%ymm5 - vpaddd 256-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 416-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,480-256-128(%rbx) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - decl %ecx - jnz .Loop_16_xx_avx2 - - movl $1,%ecx - leaq 512(%rsp),%rbx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%ymm7 - vpxor %ymm0,%ymm0,%ymm0 - vmovdqa %ymm7,%ymm6 - vpcmpgtd %ymm0,%ymm6,%ymm6 - vpaddd %ymm6,%ymm7,%ymm7 - - vmovdqu 0-128(%rdi),%ymm0 - vpand %ymm6,%ymm8,%ymm8 - vmovdqu 32-128(%rdi),%ymm1 - vpand %ymm6,%ymm9,%ymm9 - vmovdqu 64-128(%rdi),%ymm2 - vpand %ymm6,%ymm10,%ymm10 - vmovdqu 96-128(%rdi),%ymm5 - vpand %ymm6,%ymm11,%ymm11 - vpaddd %ymm0,%ymm8,%ymm8 - vmovdqu 128-128(%rdi),%ymm0 - vpand %ymm6,%ymm12,%ymm12 - vpaddd %ymm1,%ymm9,%ymm9 - vmovdqu 160-128(%rdi),%ymm1 - vpand %ymm6,%ymm13,%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vmovdqu 192-128(%rdi),%ymm2 - vpand %ymm6,%ymm14,%ymm14 - vpaddd %ymm5,%ymm11,%ymm11 - vmovdqu 224-128(%rdi),%ymm5 - vpand %ymm6,%ymm15,%ymm15 - vpaddd %ymm0,%ymm12,%ymm12 - vpaddd %ymm1,%ymm13,%ymm13 - vmovdqu %ymm8,0-128(%rdi) - vpaddd %ymm2,%ymm14,%ymm14 - vmovdqu %ymm9,32-128(%rdi) - vpaddd %ymm5,%ymm15,%ymm15 - vmovdqu %ymm10,64-128(%rdi) - vmovdqu %ymm11,96-128(%rdi) - vmovdqu %ymm12,128-128(%rdi) - vmovdqu %ymm13,160-128(%rdi) - vmovdqu %ymm14,192-128(%rdi) - 
vmovdqu %ymm15,224-128(%rdi) - - vmovdqu %ymm7,(%rbx) - leaq 256+128(%rsp),%rbx - vmovdqu .Lpbswap(%rip),%ymm6 - decl %edx - jnz .Loop_avx2 - - - - - - - -.Ldone_avx2: - movq 544(%rsp),%rax -.cfi_def_cfa %rax,8 - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 .align 256 K256: .long 1116352408,1116352408,1116352408,1116352408 @@ -7982,7 +3286,7 @@ K256_shaext: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s index 1b03ce39b99..f2c864d92ec 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s @@ -12,14 +12,6 @@ sha256_block_data_order: movl 8(%r11),%r11d testl $536870912,%r11d jnz _shaext_shortcut - andl $296,%r11d - cmpl $296,%r11d - je .Lavx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax @@ -3093,2368 +3085,7 @@ sha256_block_data_order_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 -.type sha256_block_data_order_avx,@function -.align 64 -sha256_block_data_order_avx: -.cfi_startproc -.Lavx_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 -.Lprologue_avx: - - vzeroupper - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%xmm8 - vmovdqa K256+512+64(%rip),%xmm9 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%edi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%edi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq 
$-128,%rbp - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm3,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm0,%xmm0 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpshufd $80,%xmm0,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm0,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 
- movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm1,%xmm1 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpshufd $80,%xmm1,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm1,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - 
movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm2,%xmm2 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpshufd $80,%xmm2,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm2,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm3,%xmm3 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpshufd $80,%xmm3,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl 
%edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lavx_00_47 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl 
%r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - 
shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_avx - - movq 88(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order_avx,.-sha256_block_data_order_avx -.type sha256_block_data_order_avx2,@function -.align 64 -sha256_block_data_order_avx2: -.cfi_startproc -.Lavx2_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $544,%rsp - shlq $4,%rdx - andq $-1024,%rsp - leaq (%rsi,%rdx,4),%rdx - addq $448,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 -.Lprologue_avx2: - - vzeroupper - subq $-64,%rsi - movl 0(%rdi),%eax - movq %rsi,%r12 - movl 4(%rdi),%ebx - cmpq %rdx,%rsi - movl 8(%rdi),%ecx - cmoveq %rsp,%r12 - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%ymm8 - vmovdqa 
K256+512+64(%rip),%ymm9 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi),%xmm0 - vmovdqu -64+16(%rsi),%xmm1 - vmovdqu -64+32(%rsi),%xmm2 - vmovdqu -64+48(%rsi),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - - movq 88(%rsp),%rdi -.cfi_def_cfa %rdi,8 - leaq -64(%rsp),%rsp - - - - movq %rdi,-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - movl %ebx,%edi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%edi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - leaq -64(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 - - pushq 64-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm0,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - 
leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 0(%rbp),%ymm0,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm1,%ymm7 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 32(%rbp),%ymm1,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 - - 
pushq 64-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm2,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 64(%rbp),%ymm2,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl 
$13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm3,%ymm7 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 96(%rbp),%ymm3,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne .Lavx2_00_47 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - 
andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl 
%r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - 
rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - movq 512(%rsp),%rdi - addl %r14d,%eax - - leaq 448(%rsp),%rbp - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - cmpq 80(%rbp),%rsi - je .Ldone_avx2 - - xorl %r14d,%r14d - movl %ebx,%edi - xorl %ecx,%edi - movl %r9d,%r12d - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - 
rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 512(%rsp),%rdi - addl %r14d,%eax - - leaq 448(%rsp),%rsp - -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - leaq 128(%rsi),%rsi - addl 24(%rdi),%r10d - movq %rsi,%r12 - addl 28(%rdi),%r11d - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - cmoveq %rsp,%r12 - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - - -.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08 - -.Ldone_avx2: - movq 88(%rbp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s index 3744b830146..72f59523725 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s @@ -6,20 +6,6 @@ .align 16 sha512_block_data_order: .cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $2048,%r10d - jnz .Lxop_shortcut - andl $296,%r11d - cmpl $296,%r11d - je .Lavx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -1813,3653 +1799,7 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 
0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_xop,@function -.align 64 -sha512_block_data_order_xop: -.cfi_startproc -.Lxop_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr 
$8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - 
xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - 
vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lxop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 
- addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - 
movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_xop - - movq 152(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order_xop,.-sha512_block_data_order_xop -.type sha512_block_data_order_avx,@function -.align 64 -sha512_block_data_order_avx: -.cfi_startproc -.Lavx_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - 
movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue_avx: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm0,%xmm0 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 0(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm7,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm7,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm7,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 8(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm0,%xmm0 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm1,%xmm1 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 16(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq 
$14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm0,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm0,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm0,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 24(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm1,%xmm1 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm2,%xmm2 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 32(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm1,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm1,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm1,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 40(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm2,%xmm2 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm3,%xmm3 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 48(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm2,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm2,%xmm10 - shrdq $23,%r13,%r13 - movq 
%r14,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm2,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 56(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm3,%xmm3 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm4,%xmm4 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 64(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm3,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm3,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm3,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 72(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm4,%xmm4 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm5,%xmm5 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 80(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm4,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm4,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm4,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 88(%rsp),%r8 - movq %r9,%rdi - 
vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm5,%xmm5 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm6,%xmm6 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 96(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm5,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm5,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm5,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 104(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm6,%xmm6 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm7,%xmm7 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 112(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm6,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm6,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm6,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 120(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm7,%xmm7 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 
- addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lavx_00_47 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - 
andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq 
%rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_avx - - movq 152(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order_avx,.-sha512_block_data_order_avx -.type sha512_block_data_order_avx2,@function -.align 64 -sha512_block_data_order_avx2: -.cfi_startproc -.Lavx2_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - subq $1312,%rsp - shlq $4,%rdx - andq $-2048,%rsp - leaq (%rsi,%rdx,8),%rdx - addq $1152,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue_avx2: - - vzeroupper - subq $-128,%rsi - movq 0(%rdi),%rax - movq %rsi,%r12 - movq 8(%rdi),%rbx - cmpq %rdx,%rsi - movq 16(%rdi),%rcx - cmoveq %rsp,%r12 - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqu -128(%rsi),%xmm0 - vmovdqu -128+16(%rsi),%xmm1 - vmovdqu -128+32(%rsi),%xmm2 - leaq K512+128(%rip),%rbp - vmovdqu -128+48(%rsi),%xmm3 - vmovdqu -128+64(%rsi),%xmm4 - vmovdqu -128+80(%rsi),%xmm5 - vmovdqu -128+96(%rsi),%xmm6 - vmovdqu -128+112(%rsi),%xmm7 - - vmovdqa 1152(%rbp),%ymm10 - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm10,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm10,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - vpshufb %ymm10,%ymm2,%ymm2 - vinserti128 $1,64(%r12),%ymm4,%ymm4 - vpshufb %ymm10,%ymm3,%ymm3 - vinserti128 $1,80(%r12),%ymm5,%ymm5 - vpshufb %ymm10,%ymm4,%ymm4 - vinserti128 $1,96(%r12),%ymm6,%ymm6 - vpshufb %ymm10,%ymm5,%ymm5 - vinserti128 $1,112(%r12),%ymm7,%ymm7 - - vpaddq -128(%rbp),%ymm0,%ymm8 - vpshufb %ymm10,%ymm6,%ymm6 - vpaddq -96(%rbp),%ymm1,%ymm9 - vpshufb %ymm10,%ymm7,%ymm7 - vpaddq -64(%rbp),%ymm2,%ymm10 - vpaddq -32(%rbp),%ymm3,%ymm11 - vmovdqa %ymm8,0(%rsp) - vpaddq 0(%rbp),%ymm4,%ymm8 - vmovdqa %ymm9,32(%rsp) - vpaddq 32(%rbp),%ymm5,%ymm9 - vmovdqa %ymm10,64(%rsp) - vpaddq 64(%rbp),%ymm6,%ymm10 - vmovdqa %ymm11,96(%rsp) - - movq 152(%rsp),%rdi -.cfi_def_cfa %rdi,8 - leaq 
-128(%rsp),%rsp - - - - movq %rdi,-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpaddq 96(%rbp),%ymm7,%ymm11 - vmovdqa %ymm8,0(%rsp) - xorq %r14,%r14 - vmovdqa %ymm9,32(%rsp) - movq %rbx,%rdi - vmovdqa %ymm10,64(%rsp) - xorq %rcx,%rdi - vmovdqa %ymm11,96(%rsp) - movq %r9,%r12 - addq $32*8,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - leaq -128(%rsp),%rsp -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 - - pushq 128-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $8,%ymm0,%ymm1,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm4,%ymm5,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm0,%ymm0 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm7,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm7,%ymm10 - vpaddq %ymm8,%ymm0,%ymm0 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm7,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm0,%ymm0 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq -128(%rbp),%ymm0,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm5,%ymm6,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq $1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm1,%ymm1 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm0,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm0,%ymm10 - vpaddq %ymm8,%ymm1,%ymm1 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm0,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq 
%r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm1,%ymm1 - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - vpaddq -96(%rbp),%ymm1,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm6,%ymm7,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm2,%ymm2 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm1,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm1,%ymm10 - vpaddq %ymm8,%ymm2,%ymm2 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm1,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm2,%ymm2 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq -64(%rbp),%ymm2,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm8 - addq 96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm7,%ymm0,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm3,%ymm3 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm2,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm2,%ymm10 - vpaddq %ymm8,%ymm3,%ymm3 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm2,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm3,%ymm3 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq 
(%r8,%rax,1),%r8 - vpaddq -32(%rbp),%ymm3,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - vmovdqa %ymm10,96(%rsp) - leaq -128(%rsp),%rsp -.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 - - pushq 128-8(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 - leaq 8(%rsp),%rsp -.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 - vpalignr $8,%ymm4,%ymm5,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm0,%ymm1,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm4,%ymm4 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm3,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm3,%ymm10 - vpaddq %ymm8,%ymm4,%ymm4 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm3,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm4,%ymm4 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq 0(%rbp),%ymm4,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm5,%ymm6,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm1,%ymm2,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq $1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm5,%ymm5 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm4,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm4,%ymm10 - vpaddq %ymm8,%ymm5,%ymm5 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm4,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm5,%ymm5 - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - 
leaq (%rax,%r8,1),%rax - vpaddq 32(%rbp),%ymm5,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm6,%ymm7,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm2,%ymm3,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm6,%ymm6 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm5,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm5,%ymm10 - vpaddq %ymm8,%ymm6,%ymm6 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm5,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm6,%ymm6 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq 64(%rbp),%ymm6,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm7,%ymm0,%ymm8 - addq 96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm3,%ymm4,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm7,%ymm7 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm6,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm6,%ymm10 - vpaddq %ymm8,%ymm7,%ymm7 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm6,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm7,%ymm7 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - vpaddq 96(%rbp),%ymm7,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - vmovdqa %ymm10,96(%rsp) - 
leaq 256(%rbp),%rbp - cmpb $0,-121(%rbp) - jne .Lavx2_00_47 - addq 0+128(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+128(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+128(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+128(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+128(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+128(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+128(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 
- addq 104+128(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - addq 0(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq 
(%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rbp - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - cmpq 144(%rbp),%rsi - je .Ldone_avx2 - - xorq %r14,%r14 - movq %rbx,%rdi - xorq %rcx,%rdi - movq %r9,%r12 - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - addq 0+16(%rbp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+16(%rbp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+16(%rbp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+16(%rbp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+16(%rbp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - 
leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+16(%rbp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+16(%rbp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104+16(%rbp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - leaq -128(%rbp),%rbp - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rsp - -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - leaq 256(%rsi),%rsi - addq 48(%rdi),%r10 - movq %rsi,%r12 - addq 56(%rdi),%r11 - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - cmoveq %rsp,%r12 - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - - -.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08 - -.Ldone_avx2: - movq 152(%rbp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s index 5fda386d1df..e23b53af26e 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s @@ -489,7 +489,7 @@ 
OPENSSL_ia32_rdseed_bytes: .byte 0xf3,0xc3 .cfi_endproc .size OPENSSL_ia32_rdseed_bytes,.-OPENSSL_ia32_rdseed_bytes - .section ".note.gnu.property", "a" + .section .note.gnu.property, #alloc .p2align 3 .long 1f - 0f .long 4f - 1f diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm index 7908342cf4c..2ec2d3bd1cd 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm @@ -21,14 +21,6 @@ $L$SEH_begin_aesni_multi_cbc_encrypt: - cmp edx,2 - jb NEAR $L$enc_non_avx - mov ecx,DWORD[((OPENSSL_ia32cap_P+4))] - test ecx,268435456 - jnz NEAR _avx_cbc_enc_shortcut - jmp NEAR $L$enc_non_avx -ALIGN 16 -$L$enc_non_avx: mov rax,rsp push rbx @@ -344,14 +336,6 @@ $L$SEH_begin_aesni_multi_cbc_decrypt: - cmp edx,2 - jb NEAR $L$dec_non_avx - mov ecx,DWORD[((OPENSSL_ia32cap_P+4))] - test ecx,268435456 - jnz NEAR _avx_cbc_dec_shortcut - jmp NEAR $L$dec_non_avx -ALIGN 16 -$L$dec_non_avx: mov rax,rsp push rbx @@ -642,1083 +626,6 @@ $L$dec4x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_aesni_multi_cbc_decrypt: - -ALIGN 32 -aesni_multi_cbc_encrypt_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_multi_cbc_encrypt_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -_avx_cbc_enc_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-168))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[(-120)+rax],xmm12 - movaps XMMWORD[(-104)+rax],xmm13 - movaps XMMWORD[(-88)+rax],xmm14 - movaps XMMWORD[(-72)+rax],xmm15 - - - - - - - - - sub rsp,192 - and rsp,-128 - mov QWORD[16+rsp],rax - - -$L$enc8x_body: - vzeroupper - vmovdqu xmm15,XMMWORD[rsi] - lea rsi,[120+rsi] - lea rdi,[160+rdi] - shr edx,1 - -$L$enc8x_loop_grande: - - xor edx,edx - - mov ecx,DWORD[((-144))+rdi] - - mov r8,QWORD[((-160))+rdi] - cmp ecx,edx - - mov rbx,QWORD[((-152))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm2,XMMWORD[((-136))+rdi] - mov DWORD[32+rsp],ecx - cmovle r8,rsp - sub rbx,r8 - mov QWORD[64+rsp],rbx - - mov ecx,DWORD[((-104))+rdi] - - mov r9,QWORD[((-120))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-112))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm3,XMMWORD[((-96))+rdi] - mov DWORD[36+rsp],ecx - cmovle r9,rsp - sub rbp,r9 - mov QWORD[72+rsp],rbp - - mov ecx,DWORD[((-64))+rdi] - - mov r10,QWORD[((-80))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-72))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm4,XMMWORD[((-56))+rdi] - mov DWORD[40+rsp],ecx - cmovle r10,rsp - sub rbp,r10 - mov QWORD[80+rsp],rbp - - mov ecx,DWORD[((-24))+rdi] - - mov r11,QWORD[((-40))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-32))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm5,XMMWORD[((-16))+rdi] - mov DWORD[44+rsp],ecx - cmovle r11,rsp - sub rbp,r11 - mov QWORD[88+rsp],rbp - - mov ecx,DWORD[16+rdi] - - mov r12,QWORD[rdi] - cmp ecx,edx - - mov rbp,QWORD[8+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm6,XMMWORD[24+rdi] - mov DWORD[48+rsp],ecx - cmovle r12,rsp - sub rbp,r12 - mov QWORD[96+rsp],rbp - - mov ecx,DWORD[56+rdi] - - mov r13,QWORD[40+rdi] - cmp ecx,edx - - mov rbp,QWORD[48+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm7,XMMWORD[64+rdi] - mov 
DWORD[52+rsp],ecx - cmovle r13,rsp - sub rbp,r13 - mov QWORD[104+rsp],rbp - - mov ecx,DWORD[96+rdi] - - mov r14,QWORD[80+rdi] - cmp ecx,edx - - mov rbp,QWORD[88+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm8,XMMWORD[104+rdi] - mov DWORD[56+rsp],ecx - cmovle r14,rsp - sub rbp,r14 - mov QWORD[112+rsp],rbp - - mov ecx,DWORD[136+rdi] - - mov r15,QWORD[120+rdi] - cmp ecx,edx - - mov rbp,QWORD[128+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm9,XMMWORD[144+rdi] - mov DWORD[60+rsp],ecx - cmovle r15,rsp - sub rbp,r15 - mov QWORD[120+rsp],rbp - test edx,edx - jz NEAR $L$enc8x_done - - vmovups xmm1,XMMWORD[((16-120))+rsi] - vmovups xmm0,XMMWORD[((32-120))+rsi] - mov eax,DWORD[((240-120))+rsi] - - vpxor xmm10,xmm15,XMMWORD[r8] - lea rbp,[128+rsp] - vpxor xmm11,xmm15,XMMWORD[r9] - vpxor xmm12,xmm15,XMMWORD[r10] - vpxor xmm13,xmm15,XMMWORD[r11] - vpxor xmm2,xmm2,xmm10 - vpxor xmm10,xmm15,XMMWORD[r12] - vpxor xmm3,xmm3,xmm11 - vpxor xmm11,xmm15,XMMWORD[r13] - vpxor xmm4,xmm4,xmm12 - vpxor xmm12,xmm15,XMMWORD[r14] - vpxor xmm5,xmm5,xmm13 - vpxor xmm13,xmm15,XMMWORD[r15] - vpxor xmm6,xmm6,xmm10 - mov ecx,1 - vpxor xmm7,xmm7,xmm11 - vpxor xmm8,xmm8,xmm12 - vpxor xmm9,xmm9,xmm13 - jmp NEAR $L$oop_enc8x - -ALIGN 32 -$L$oop_enc8x: - vaesenc xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+0))+rsp] - vaesenc xmm3,xmm3,xmm1 - prefetcht0 [31+r8] - vaesenc xmm4,xmm4,xmm1 - vaesenc xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r8] - cmovge r8,rsp - vaesenc xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm1 - sub rbx,r8 - vaesenc xmm8,xmm8,xmm1 - vpxor xmm10,xmm15,XMMWORD[16+r8] - mov QWORD[((64+0))+rsp],rbx - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((-72))+rsi] - lea r8,[16+rbx*1+r8] - vmovdqu XMMWORD[rbp],xmm10 - vaesenc xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+4))+rsp] - mov rbx,QWORD[((64+8))+rsp] - vaesenc xmm3,xmm3,xmm0 - prefetcht0 [31+r9] - vaesenc xmm4,xmm4,xmm0 - vaesenc xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r9] - cmovge r9,rsp - vaesenc xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm0 - sub rbx,r9 - vaesenc xmm8,xmm8,xmm0 - vpxor xmm11,xmm15,XMMWORD[16+r9] - mov QWORD[((64+8))+rsp],rbx - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((-56))+rsi] - lea r9,[16+rbx*1+r9] - vmovdqu XMMWORD[16+rbp],xmm11 - vaesenc xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+8))+rsp] - mov rbx,QWORD[((64+16))+rsp] - vaesenc xmm3,xmm3,xmm1 - prefetcht0 [31+r10] - vaesenc xmm4,xmm4,xmm1 - prefetcht0 [15+r8] - vaesenc xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r10] - cmovge r10,rsp - vaesenc xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm1 - sub rbx,r10 - vaesenc xmm8,xmm8,xmm1 - vpxor xmm12,xmm15,XMMWORD[16+r10] - mov QWORD[((64+16))+rsp],rbx - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((-40))+rsi] - lea r10,[16+rbx*1+r10] - vmovdqu XMMWORD[32+rbp],xmm12 - vaesenc xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+12))+rsp] - mov rbx,QWORD[((64+24))+rsp] - vaesenc xmm3,xmm3,xmm0 - prefetcht0 [31+r11] - vaesenc xmm4,xmm4,xmm0 - prefetcht0 [15+r9] - vaesenc xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r11] - cmovge r11,rsp - vaesenc xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm0 - sub rbx,r11 - vaesenc xmm8,xmm8,xmm0 - vpxor xmm13,xmm15,XMMWORD[16+r11] - mov QWORD[((64+24))+rsp],rbx - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((-24))+rsi] - lea r11,[16+rbx*1+r11] - vmovdqu XMMWORD[48+rbp],xmm13 - vaesenc xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+16))+rsp] - mov rbx,QWORD[((64+32))+rsp] - vaesenc xmm3,xmm3,xmm1 - prefetcht0 [31+r12] - vaesenc xmm4,xmm4,xmm1 - prefetcht0 [15+r10] - vaesenc xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r12] - cmovge r12,rsp - vaesenc xmm6,xmm6,xmm1 - 
cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm1 - sub rbx,r12 - vaesenc xmm8,xmm8,xmm1 - vpxor xmm10,xmm15,XMMWORD[16+r12] - mov QWORD[((64+32))+rsp],rbx - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((-8))+rsi] - lea r12,[16+rbx*1+r12] - vaesenc xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+20))+rsp] - mov rbx,QWORD[((64+40))+rsp] - vaesenc xmm3,xmm3,xmm0 - prefetcht0 [31+r13] - vaesenc xmm4,xmm4,xmm0 - prefetcht0 [15+r11] - vaesenc xmm5,xmm5,xmm0 - lea rbx,[r13*1+rbx] - cmovge r13,rsp - vaesenc xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm0 - sub rbx,r13 - vaesenc xmm8,xmm8,xmm0 - vpxor xmm11,xmm15,XMMWORD[16+r13] - mov QWORD[((64+40))+rsp],rbx - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[8+rsi] - lea r13,[16+rbx*1+r13] - vaesenc xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+24))+rsp] - mov rbx,QWORD[((64+48))+rsp] - vaesenc xmm3,xmm3,xmm1 - prefetcht0 [31+r14] - vaesenc xmm4,xmm4,xmm1 - prefetcht0 [15+r12] - vaesenc xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r14] - cmovge r14,rsp - vaesenc xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm1 - sub rbx,r14 - vaesenc xmm8,xmm8,xmm1 - vpxor xmm12,xmm15,XMMWORD[16+r14] - mov QWORD[((64+48))+rsp],rbx - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[24+rsi] - lea r14,[16+rbx*1+r14] - vaesenc xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+28))+rsp] - mov rbx,QWORD[((64+56))+rsp] - vaesenc xmm3,xmm3,xmm0 - prefetcht0 [31+r15] - vaesenc xmm4,xmm4,xmm0 - prefetcht0 [15+r13] - vaesenc xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r15] - cmovge r15,rsp - vaesenc xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesenc xmm7,xmm7,xmm0 - sub rbx,r15 - vaesenc xmm8,xmm8,xmm0 - vpxor xmm13,xmm15,XMMWORD[16+r15] - mov QWORD[((64+56))+rsp],rbx - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[40+rsi] - lea r15,[16+rbx*1+r15] - vmovdqu xmm14,XMMWORD[32+rsp] - prefetcht0 [15+r14] - prefetcht0 [15+r15] - cmp eax,11 - jb NEAR $L$enc8x_tail - - vaesenc xmm2,xmm2,xmm1 - vaesenc xmm3,xmm3,xmm1 - vaesenc xmm4,xmm4,xmm1 - vaesenc xmm5,xmm5,xmm1 - vaesenc xmm6,xmm6,xmm1 - vaesenc xmm7,xmm7,xmm1 - vaesenc xmm8,xmm8,xmm1 - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((176-120))+rsi] - - vaesenc xmm2,xmm2,xmm0 - vaesenc xmm3,xmm3,xmm0 - vaesenc xmm4,xmm4,xmm0 - vaesenc xmm5,xmm5,xmm0 - vaesenc xmm6,xmm6,xmm0 - vaesenc xmm7,xmm7,xmm0 - vaesenc xmm8,xmm8,xmm0 - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((192-120))+rsi] - je NEAR $L$enc8x_tail - - vaesenc xmm2,xmm2,xmm1 - vaesenc xmm3,xmm3,xmm1 - vaesenc xmm4,xmm4,xmm1 - vaesenc xmm5,xmm5,xmm1 - vaesenc xmm6,xmm6,xmm1 - vaesenc xmm7,xmm7,xmm1 - vaesenc xmm8,xmm8,xmm1 - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((208-120))+rsi] - - vaesenc xmm2,xmm2,xmm0 - vaesenc xmm3,xmm3,xmm0 - vaesenc xmm4,xmm4,xmm0 - vaesenc xmm5,xmm5,xmm0 - vaesenc xmm6,xmm6,xmm0 - vaesenc xmm7,xmm7,xmm0 - vaesenc xmm8,xmm8,xmm0 - vaesenc xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((224-120))+rsi] - -$L$enc8x_tail: - vaesenc xmm2,xmm2,xmm1 - vpxor xmm15,xmm15,xmm15 - vaesenc xmm3,xmm3,xmm1 - vaesenc xmm4,xmm4,xmm1 - vpcmpgtd xmm15,xmm14,xmm15 - vaesenc xmm5,xmm5,xmm1 - vaesenc xmm6,xmm6,xmm1 - vpaddd xmm15,xmm15,xmm14 - vmovdqu xmm14,XMMWORD[48+rsp] - vaesenc xmm7,xmm7,xmm1 - mov rbx,QWORD[64+rsp] - vaesenc xmm8,xmm8,xmm1 - vaesenc xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((16-120))+rsi] - - vaesenclast xmm2,xmm2,xmm0 - vmovdqa XMMWORD[32+rsp],xmm15 - vpxor xmm15,xmm15,xmm15 - vaesenclast xmm3,xmm3,xmm0 - vaesenclast xmm4,xmm4,xmm0 - vpcmpgtd xmm15,xmm14,xmm15 - vaesenclast xmm5,xmm5,xmm0 - vaesenclast xmm6,xmm6,xmm0 - vpaddd xmm14,xmm14,xmm15 - vmovdqu xmm15,XMMWORD[((-120))+rsi] - vaesenclast xmm7,xmm7,xmm0 - 
vaesenclast xmm8,xmm8,xmm0 - vmovdqa XMMWORD[48+rsp],xmm14 - vaesenclast xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((32-120))+rsi] - - vmovups XMMWORD[(-16)+r8],xmm2 - sub r8,rbx - vpxor xmm2,xmm2,XMMWORD[rbp] - vmovups XMMWORD[(-16)+r9],xmm3 - sub r9,QWORD[72+rsp] - vpxor xmm3,xmm3,XMMWORD[16+rbp] - vmovups XMMWORD[(-16)+r10],xmm4 - sub r10,QWORD[80+rsp] - vpxor xmm4,xmm4,XMMWORD[32+rbp] - vmovups XMMWORD[(-16)+r11],xmm5 - sub r11,QWORD[88+rsp] - vpxor xmm5,xmm5,XMMWORD[48+rbp] - vmovups XMMWORD[(-16)+r12],xmm6 - sub r12,QWORD[96+rsp] - vpxor xmm6,xmm6,xmm10 - vmovups XMMWORD[(-16)+r13],xmm7 - sub r13,QWORD[104+rsp] - vpxor xmm7,xmm7,xmm11 - vmovups XMMWORD[(-16)+r14],xmm8 - sub r14,QWORD[112+rsp] - vpxor xmm8,xmm8,xmm12 - vmovups XMMWORD[(-16)+r15],xmm9 - sub r15,QWORD[120+rsp] - vpxor xmm9,xmm9,xmm13 - - dec edx - jnz NEAR $L$oop_enc8x - - mov rax,QWORD[16+rsp] - - - - - - -$L$enc8x_done: - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$enc8x_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_multi_cbc_encrypt_avx: - - -ALIGN 32 -aesni_multi_cbc_decrypt_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_multi_cbc_decrypt_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -_avx_cbc_dec_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-168))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[(-120)+rax],xmm12 - movaps XMMWORD[(-104)+rax],xmm13 - movaps XMMWORD[(-88)+rax],xmm14 - movaps XMMWORD[(-72)+rax],xmm15 - - - - - - - - - - sub rsp,256 - and rsp,-256 - sub rsp,192 - mov QWORD[16+rsp],rax - - -$L$dec8x_body: - vzeroupper - vmovdqu xmm15,XMMWORD[rsi] - lea rsi,[120+rsi] - lea rdi,[160+rdi] - shr edx,1 - -$L$dec8x_loop_grande: - - xor edx,edx - - mov ecx,DWORD[((-144))+rdi] - - mov r8,QWORD[((-160))+rdi] - cmp ecx,edx - - mov rbx,QWORD[((-152))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm2,XMMWORD[((-136))+rdi] - mov DWORD[32+rsp],ecx - cmovle r8,rsp - sub rbx,r8 - mov QWORD[64+rsp],rbx - vmovdqu XMMWORD[192+rsp],xmm2 - - mov ecx,DWORD[((-104))+rdi] - - mov r9,QWORD[((-120))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-112))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm3,XMMWORD[((-96))+rdi] - mov DWORD[36+rsp],ecx - cmovle r9,rsp - sub rbp,r9 - mov QWORD[72+rsp],rbp - vmovdqu XMMWORD[208+rsp],xmm3 - - mov ecx,DWORD[((-64))+rdi] - - mov r10,QWORD[((-80))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-72))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm4,XMMWORD[((-56))+rdi] - mov DWORD[40+rsp],ecx - cmovle r10,rsp - sub rbp,r10 - mov QWORD[80+rsp],rbp - vmovdqu XMMWORD[224+rsp],xmm4 - - mov ecx,DWORD[((-24))+rdi] - - mov r11,QWORD[((-40))+rdi] - cmp ecx,edx - - mov rbp,QWORD[((-32))+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu 
xmm5,XMMWORD[((-16))+rdi] - mov DWORD[44+rsp],ecx - cmovle r11,rsp - sub rbp,r11 - mov QWORD[88+rsp],rbp - vmovdqu XMMWORD[240+rsp],xmm5 - - mov ecx,DWORD[16+rdi] - - mov r12,QWORD[rdi] - cmp ecx,edx - - mov rbp,QWORD[8+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm6,XMMWORD[24+rdi] - mov DWORD[48+rsp],ecx - cmovle r12,rsp - sub rbp,r12 - mov QWORD[96+rsp],rbp - vmovdqu XMMWORD[256+rsp],xmm6 - - mov ecx,DWORD[56+rdi] - - mov r13,QWORD[40+rdi] - cmp ecx,edx - - mov rbp,QWORD[48+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm7,XMMWORD[64+rdi] - mov DWORD[52+rsp],ecx - cmovle r13,rsp - sub rbp,r13 - mov QWORD[104+rsp],rbp - vmovdqu XMMWORD[272+rsp],xmm7 - - mov ecx,DWORD[96+rdi] - - mov r14,QWORD[80+rdi] - cmp ecx,edx - - mov rbp,QWORD[88+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm8,XMMWORD[104+rdi] - mov DWORD[56+rsp],ecx - cmovle r14,rsp - sub rbp,r14 - mov QWORD[112+rsp],rbp - vmovdqu XMMWORD[288+rsp],xmm8 - - mov ecx,DWORD[136+rdi] - - mov r15,QWORD[120+rdi] - cmp ecx,edx - - mov rbp,QWORD[128+rdi] - cmovg edx,ecx - test ecx,ecx - - vmovdqu xmm9,XMMWORD[144+rdi] - mov DWORD[60+rsp],ecx - cmovle r15,rsp - sub rbp,r15 - mov QWORD[120+rsp],rbp - vmovdqu XMMWORD[304+rsp],xmm9 - test edx,edx - jz NEAR $L$dec8x_done - - vmovups xmm1,XMMWORD[((16-120))+rsi] - vmovups xmm0,XMMWORD[((32-120))+rsi] - mov eax,DWORD[((240-120))+rsi] - lea rbp,[((192+128))+rsp] - - vmovdqu xmm2,XMMWORD[r8] - vmovdqu xmm3,XMMWORD[r9] - vmovdqu xmm4,XMMWORD[r10] - vmovdqu xmm5,XMMWORD[r11] - vmovdqu xmm6,XMMWORD[r12] - vmovdqu xmm7,XMMWORD[r13] - vmovdqu xmm8,XMMWORD[r14] - vmovdqu xmm9,XMMWORD[r15] - vmovdqu XMMWORD[rbp],xmm2 - vpxor xmm2,xmm2,xmm15 - vmovdqu XMMWORD[16+rbp],xmm3 - vpxor xmm3,xmm3,xmm15 - vmovdqu XMMWORD[32+rbp],xmm4 - vpxor xmm4,xmm4,xmm15 - vmovdqu XMMWORD[48+rbp],xmm5 - vpxor xmm5,xmm5,xmm15 - vmovdqu XMMWORD[64+rbp],xmm6 - vpxor xmm6,xmm6,xmm15 - vmovdqu XMMWORD[80+rbp],xmm7 - vpxor xmm7,xmm7,xmm15 - vmovdqu XMMWORD[96+rbp],xmm8 - vpxor xmm8,xmm8,xmm15 - vmovdqu XMMWORD[112+rbp],xmm9 - vpxor xmm9,xmm9,xmm15 - xor rbp,0x80 - mov ecx,1 - jmp NEAR $L$oop_dec8x - -ALIGN 32 -$L$oop_dec8x: - vaesdec xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+0))+rsp] - vaesdec xmm3,xmm3,xmm1 - prefetcht0 [31+r8] - vaesdec xmm4,xmm4,xmm1 - vaesdec xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r8] - cmovge r8,rsp - vaesdec xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm1 - sub rbx,r8 - vaesdec xmm8,xmm8,xmm1 - vmovdqu xmm10,XMMWORD[16+r8] - mov QWORD[((64+0))+rsp],rbx - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((-72))+rsi] - lea r8,[16+rbx*1+r8] - vmovdqu XMMWORD[128+rsp],xmm10 - vaesdec xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+4))+rsp] - mov rbx,QWORD[((64+8))+rsp] - vaesdec xmm3,xmm3,xmm0 - prefetcht0 [31+r9] - vaesdec xmm4,xmm4,xmm0 - vaesdec xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r9] - cmovge r9,rsp - vaesdec xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm0 - sub rbx,r9 - vaesdec xmm8,xmm8,xmm0 - vmovdqu xmm11,XMMWORD[16+r9] - mov QWORD[((64+8))+rsp],rbx - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((-56))+rsi] - lea r9,[16+rbx*1+r9] - vmovdqu XMMWORD[144+rsp],xmm11 - vaesdec xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+8))+rsp] - mov rbx,QWORD[((64+16))+rsp] - vaesdec xmm3,xmm3,xmm1 - prefetcht0 [31+r10] - vaesdec xmm4,xmm4,xmm1 - prefetcht0 [15+r8] - vaesdec xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r10] - cmovge r10,rsp - vaesdec xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm1 - sub rbx,r10 - vaesdec xmm8,xmm8,xmm1 - vmovdqu xmm12,XMMWORD[16+r10] - mov QWORD[((64+16))+rsp],rbx - vaesdec xmm9,xmm9,xmm1 - vmovups 
xmm1,XMMWORD[((-40))+rsi] - lea r10,[16+rbx*1+r10] - vmovdqu XMMWORD[160+rsp],xmm12 - vaesdec xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+12))+rsp] - mov rbx,QWORD[((64+24))+rsp] - vaesdec xmm3,xmm3,xmm0 - prefetcht0 [31+r11] - vaesdec xmm4,xmm4,xmm0 - prefetcht0 [15+r9] - vaesdec xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r11] - cmovge r11,rsp - vaesdec xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm0 - sub rbx,r11 - vaesdec xmm8,xmm8,xmm0 - vmovdqu xmm13,XMMWORD[16+r11] - mov QWORD[((64+24))+rsp],rbx - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((-24))+rsi] - lea r11,[16+rbx*1+r11] - vmovdqu XMMWORD[176+rsp],xmm13 - vaesdec xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+16))+rsp] - mov rbx,QWORD[((64+32))+rsp] - vaesdec xmm3,xmm3,xmm1 - prefetcht0 [31+r12] - vaesdec xmm4,xmm4,xmm1 - prefetcht0 [15+r10] - vaesdec xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r12] - cmovge r12,rsp - vaesdec xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm1 - sub rbx,r12 - vaesdec xmm8,xmm8,xmm1 - vmovdqu xmm10,XMMWORD[16+r12] - mov QWORD[((64+32))+rsp],rbx - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((-8))+rsi] - lea r12,[16+rbx*1+r12] - vaesdec xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+20))+rsp] - mov rbx,QWORD[((64+40))+rsp] - vaesdec xmm3,xmm3,xmm0 - prefetcht0 [31+r13] - vaesdec xmm4,xmm4,xmm0 - prefetcht0 [15+r11] - vaesdec xmm5,xmm5,xmm0 - lea rbx,[r13*1+rbx] - cmovge r13,rsp - vaesdec xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm0 - sub rbx,r13 - vaesdec xmm8,xmm8,xmm0 - vmovdqu xmm11,XMMWORD[16+r13] - mov QWORD[((64+40))+rsp],rbx - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[8+rsi] - lea r13,[16+rbx*1+r13] - vaesdec xmm2,xmm2,xmm1 - cmp ecx,DWORD[((32+24))+rsp] - mov rbx,QWORD[((64+48))+rsp] - vaesdec xmm3,xmm3,xmm1 - prefetcht0 [31+r14] - vaesdec xmm4,xmm4,xmm1 - prefetcht0 [15+r12] - vaesdec xmm5,xmm5,xmm1 - lea rbx,[rbx*1+r14] - cmovge r14,rsp - vaesdec xmm6,xmm6,xmm1 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm1 - sub rbx,r14 - vaesdec xmm8,xmm8,xmm1 - vmovdqu xmm12,XMMWORD[16+r14] - mov QWORD[((64+48))+rsp],rbx - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[24+rsi] - lea r14,[16+rbx*1+r14] - vaesdec xmm2,xmm2,xmm0 - cmp ecx,DWORD[((32+28))+rsp] - mov rbx,QWORD[((64+56))+rsp] - vaesdec xmm3,xmm3,xmm0 - prefetcht0 [31+r15] - vaesdec xmm4,xmm4,xmm0 - prefetcht0 [15+r13] - vaesdec xmm5,xmm5,xmm0 - lea rbx,[rbx*1+r15] - cmovge r15,rsp - vaesdec xmm6,xmm6,xmm0 - cmovg rbx,rsp - vaesdec xmm7,xmm7,xmm0 - sub rbx,r15 - vaesdec xmm8,xmm8,xmm0 - vmovdqu xmm13,XMMWORD[16+r15] - mov QWORD[((64+56))+rsp],rbx - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[40+rsi] - lea r15,[16+rbx*1+r15] - vmovdqu xmm14,XMMWORD[32+rsp] - prefetcht0 [15+r14] - prefetcht0 [15+r15] - cmp eax,11 - jb NEAR $L$dec8x_tail - - vaesdec xmm2,xmm2,xmm1 - vaesdec xmm3,xmm3,xmm1 - vaesdec xmm4,xmm4,xmm1 - vaesdec xmm5,xmm5,xmm1 - vaesdec xmm6,xmm6,xmm1 - vaesdec xmm7,xmm7,xmm1 - vaesdec xmm8,xmm8,xmm1 - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((176-120))+rsi] - - vaesdec xmm2,xmm2,xmm0 - vaesdec xmm3,xmm3,xmm0 - vaesdec xmm4,xmm4,xmm0 - vaesdec xmm5,xmm5,xmm0 - vaesdec xmm6,xmm6,xmm0 - vaesdec xmm7,xmm7,xmm0 - vaesdec xmm8,xmm8,xmm0 - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((192-120))+rsi] - je NEAR $L$dec8x_tail - - vaesdec xmm2,xmm2,xmm1 - vaesdec xmm3,xmm3,xmm1 - vaesdec xmm4,xmm4,xmm1 - vaesdec xmm5,xmm5,xmm1 - vaesdec xmm6,xmm6,xmm1 - vaesdec xmm7,xmm7,xmm1 - vaesdec xmm8,xmm8,xmm1 - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((208-120))+rsi] - - vaesdec xmm2,xmm2,xmm0 - vaesdec xmm3,xmm3,xmm0 - vaesdec xmm4,xmm4,xmm0 - vaesdec 
xmm5,xmm5,xmm0 - vaesdec xmm6,xmm6,xmm0 - vaesdec xmm7,xmm7,xmm0 - vaesdec xmm8,xmm8,xmm0 - vaesdec xmm9,xmm9,xmm0 - vmovups xmm0,XMMWORD[((224-120))+rsi] - -$L$dec8x_tail: - vaesdec xmm2,xmm2,xmm1 - vpxor xmm15,xmm15,xmm15 - vaesdec xmm3,xmm3,xmm1 - vaesdec xmm4,xmm4,xmm1 - vpcmpgtd xmm15,xmm14,xmm15 - vaesdec xmm5,xmm5,xmm1 - vaesdec xmm6,xmm6,xmm1 - vpaddd xmm15,xmm15,xmm14 - vmovdqu xmm14,XMMWORD[48+rsp] - vaesdec xmm7,xmm7,xmm1 - mov rbx,QWORD[64+rsp] - vaesdec xmm8,xmm8,xmm1 - vaesdec xmm9,xmm9,xmm1 - vmovups xmm1,XMMWORD[((16-120))+rsi] - - vaesdeclast xmm2,xmm2,xmm0 - vmovdqa XMMWORD[32+rsp],xmm15 - vpxor xmm15,xmm15,xmm15 - vaesdeclast xmm3,xmm3,xmm0 - vpxor xmm2,xmm2,XMMWORD[rbp] - vaesdeclast xmm4,xmm4,xmm0 - vpxor xmm3,xmm3,XMMWORD[16+rbp] - vpcmpgtd xmm15,xmm14,xmm15 - vaesdeclast xmm5,xmm5,xmm0 - vpxor xmm4,xmm4,XMMWORD[32+rbp] - vaesdeclast xmm6,xmm6,xmm0 - vpxor xmm5,xmm5,XMMWORD[48+rbp] - vpaddd xmm14,xmm14,xmm15 - vmovdqu xmm15,XMMWORD[((-120))+rsi] - vaesdeclast xmm7,xmm7,xmm0 - vpxor xmm6,xmm6,XMMWORD[64+rbp] - vaesdeclast xmm8,xmm8,xmm0 - vpxor xmm7,xmm7,XMMWORD[80+rbp] - vmovdqa XMMWORD[48+rsp],xmm14 - vaesdeclast xmm9,xmm9,xmm0 - vpxor xmm8,xmm8,XMMWORD[96+rbp] - vmovups xmm0,XMMWORD[((32-120))+rsi] - - vmovups XMMWORD[(-16)+r8],xmm2 - sub r8,rbx - vmovdqu xmm2,XMMWORD[((128+0))+rsp] - vpxor xmm9,xmm9,XMMWORD[112+rbp] - vmovups XMMWORD[(-16)+r9],xmm3 - sub r9,QWORD[72+rsp] - vmovdqu XMMWORD[rbp],xmm2 - vpxor xmm2,xmm2,xmm15 - vmovdqu xmm3,XMMWORD[((128+16))+rsp] - vmovups XMMWORD[(-16)+r10],xmm4 - sub r10,QWORD[80+rsp] - vmovdqu XMMWORD[16+rbp],xmm3 - vpxor xmm3,xmm3,xmm15 - vmovdqu xmm4,XMMWORD[((128+32))+rsp] - vmovups XMMWORD[(-16)+r11],xmm5 - sub r11,QWORD[88+rsp] - vmovdqu XMMWORD[32+rbp],xmm4 - vpxor xmm4,xmm4,xmm15 - vmovdqu xmm5,XMMWORD[((128+48))+rsp] - vmovups XMMWORD[(-16)+r12],xmm6 - sub r12,QWORD[96+rsp] - vmovdqu XMMWORD[48+rbp],xmm5 - vpxor xmm5,xmm5,xmm15 - vmovdqu XMMWORD[64+rbp],xmm10 - vpxor xmm6,xmm15,xmm10 - vmovups XMMWORD[(-16)+r13],xmm7 - sub r13,QWORD[104+rsp] - vmovdqu XMMWORD[80+rbp],xmm11 - vpxor xmm7,xmm15,xmm11 - vmovups XMMWORD[(-16)+r14],xmm8 - sub r14,QWORD[112+rsp] - vmovdqu XMMWORD[96+rbp],xmm12 - vpxor xmm8,xmm15,xmm12 - vmovups XMMWORD[(-16)+r15],xmm9 - sub r15,QWORD[120+rsp] - vmovdqu XMMWORD[112+rbp],xmm13 - vpxor xmm9,xmm15,xmm13 - - xor rbp,128 - dec edx - jnz NEAR $L$oop_dec8x - - mov rax,QWORD[16+rsp] - - - - - - -$L$dec8x_done: - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$dec8x_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_multi_cbc_decrypt_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -1820,12 +727,6 @@ ALIGN 4 DD $L$SEH_begin_aesni_multi_cbc_decrypt wrt ..imagebase DD $L$SEH_end_aesni_multi_cbc_decrypt wrt ..imagebase DD $L$SEH_info_aesni_multi_cbc_decrypt wrt ..imagebase - DD $L$SEH_begin_aesni_multi_cbc_encrypt_avx wrt ..imagebase - DD $L$SEH_end_aesni_multi_cbc_encrypt_avx wrt ..imagebase - DD 
$L$SEH_info_aesni_multi_cbc_encrypt_avx wrt ..imagebase - DD $L$SEH_begin_aesni_multi_cbc_decrypt_avx wrt ..imagebase - DD $L$SEH_end_aesni_multi_cbc_decrypt_avx wrt ..imagebase - DD $L$SEH_info_aesni_multi_cbc_decrypt_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_aesni_multi_cbc_encrypt: @@ -1836,11 +737,3 @@ $L$SEH_info_aesni_multi_cbc_decrypt: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$dec4x_body wrt ..imagebase,$L$dec4x_epilogue wrt ..imagebase -$L$SEH_info_aesni_multi_cbc_encrypt_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$enc8x_body wrt ..imagebase,$L$enc8x_epilogue wrt ..imagebase -$L$SEH_info_aesni_multi_cbc_decrypt_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$dec8x_body wrt ..imagebase,$L$dec8x_epilogue wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm index f4ed3f70843..e52174799ef 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm @@ -16,11 +16,6 @@ aesni_cbc_sha1_enc: mov r11,QWORD[((OPENSSL_ia32cap_P+4))] bt r11,61 jc NEAR aesni_cbc_sha1_enc_shaext - and r11d,268435456 - and r10d,1073741824 - or r10d,r11d - cmp r10d,1342177280 - je NEAR aesni_cbc_sha1_enc_avx jmp NEAR aesni_cbc_sha1_enc_ssse3 DB 0F3h,0C3h ;repret @@ -1431,1356 +1426,6 @@ $L$epilogue_ssse3: DB 0F3h,0C3h ;repret $L$SEH_end_aesni_cbc_sha1_enc_ssse3: - -ALIGN 32 -aesni_cbc_sha1_enc_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_sha1_enc_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - mov r10,QWORD[56+rsp] - - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-264))+rsp] - - - - movaps XMMWORD[(96+0)+rsp],xmm6 - movaps XMMWORD[(96+16)+rsp],xmm7 - movaps XMMWORD[(96+32)+rsp],xmm8 - movaps XMMWORD[(96+48)+rsp],xmm9 - movaps XMMWORD[(96+64)+rsp],xmm10 - movaps XMMWORD[(96+80)+rsp],xmm11 - movaps XMMWORD[(96+96)+rsp],xmm12 - movaps XMMWORD[(96+112)+rsp],xmm13 - movaps XMMWORD[(96+128)+rsp],xmm14 - movaps XMMWORD[(96+144)+rsp],xmm15 -$L$prologue_avx: - vzeroall - mov r12,rdi - mov r13,rsi - mov r14,rdx - lea r15,[112+rcx] - vmovdqu xmm12,XMMWORD[r8] - mov QWORD[88+rsp],r8 - shl r14,6 - sub r13,r12 - mov r8d,DWORD[((240-112))+r15] - add r14,r10 - - lea r11,[K_XX_XX] - mov eax,DWORD[r9] - mov ebx,DWORD[4+r9] - mov ecx,DWORD[8+r9] - mov edx,DWORD[12+r9] - mov esi,ebx - mov ebp,DWORD[16+r9] - mov edi,ecx - xor edi,edx - and esi,edi - - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm10,XMMWORD[r11] - vmovdqu xmm0,XMMWORD[r10] - vmovdqu xmm1,XMMWORD[16+r10] - vmovdqu xmm2,XMMWORD[32+r10] - vmovdqu xmm3,XMMWORD[48+r10] - vpshufb xmm0,xmm0,xmm6 - add r10,64 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm10 - vpaddd xmm5,xmm1,xmm10 - vpaddd xmm6,xmm2,xmm10 - vmovdqa XMMWORD[rsp],xmm4 - vmovdqa XMMWORD[16+rsp],xmm5 - vmovdqa XMMWORD[32+rsp],xmm6 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - jmp NEAR $L$oop_avx -ALIGN 32 -$L$oop_avx: - shrd ebx,ebx,2 - vmovdqu xmm13,XMMWORD[r12] - vpxor xmm13,xmm13,xmm15 - vpxor xmm12,xmm12,xmm13 - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-80))+r15] - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov edi,eax - add 
ebp,DWORD[rsp] - vpaddd xmm9,xmm10,xmm3 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm8,xmm3,4 - add ebp,esi - and edi,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add ebp,eax - vpxor xmm8,xmm8,xmm2 - shrd eax,eax,7 - xor edi,ecx - mov esi,ebp - add edx,DWORD[4+rsp] - vpxor xmm4,xmm4,xmm8 - xor eax,ebx - shld ebp,ebp,5 - vmovdqa XMMWORD[48+rsp],xmm9 - add edx,edi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-64))+r15] - and esi,eax - vpsrld xmm8,xmm4,31 - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor esi,ebx - vpslldq xmm9,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov edi,edx - add ecx,DWORD[8+rsp] - xor ebp,eax - shld edx,edx,5 - vpor xmm4,xmm4,xmm8 - vpsrld xmm8,xmm9,30 - add ecx,esi - and edi,ebp - xor ebp,eax - add ecx,edx - vpslld xmm9,xmm9,2 - vpxor xmm4,xmm4,xmm8 - shrd edx,edx,7 - xor edi,eax - mov esi,ecx - add ebx,DWORD[12+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-48))+r15] - vpxor xmm4,xmm4,xmm9 - xor edx,ebp - shld ecx,ecx,5 - add ebx,edi - and esi,edx - xor edx,ebp - add ebx,ecx - shrd ecx,ecx,7 - xor esi,ebp - vpalignr xmm5,xmm2,xmm1,8 - mov edi,ebx - add eax,DWORD[16+rsp] - vpaddd xmm9,xmm10,xmm4 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm8,xmm4,4 - add eax,esi - and edi,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm8,xmm8,xmm3 - shrd ebx,ebx,7 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-32))+r15] - xor edi,edx - mov esi,eax - add ebp,DWORD[20+rsp] - vpxor xmm5,xmm5,xmm8 - xor ebx,ecx - shld eax,eax,5 - vmovdqa XMMWORD[rsp],xmm9 - add ebp,edi - and esi,ebx - vpsrld xmm8,xmm5,31 - xor ebx,ecx - add ebp,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm9,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov edi,ebp - add edx,DWORD[24+rsp] - xor eax,ebx - shld ebp,ebp,5 - vpor xmm5,xmm5,xmm8 - vpsrld xmm8,xmm9,30 - add edx,esi - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-16))+r15] - and edi,eax - xor eax,ebx - add edx,ebp - vpslld xmm9,xmm9,2 - vpxor xmm5,xmm5,xmm8 - shrd ebp,ebp,7 - xor edi,ebx - mov esi,edx - add ecx,DWORD[28+rsp] - vpxor xmm5,xmm5,xmm9 - xor ebp,eax - shld edx,edx,5 - vmovdqa xmm10,XMMWORD[16+r11] - add ecx,edi - and esi,ebp - xor ebp,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov edi,ecx - add ebx,DWORD[32+rsp] - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[r15] - vpaddd xmm9,xmm10,xmm5 - xor edx,ebp - shld ecx,ecx,5 - vpsrldq xmm8,xmm5,4 - add ebx,esi - and edi,edx - vpxor xmm6,xmm6,xmm2 - xor edx,ebp - add ebx,ecx - vpxor xmm8,xmm8,xmm4 - shrd ecx,ecx,7 - xor edi,ebp - mov esi,ebx - add eax,DWORD[36+rsp] - vpxor xmm6,xmm6,xmm8 - xor ecx,edx - shld ebx,ebx,5 - vmovdqa XMMWORD[16+rsp],xmm9 - add eax,edi - and esi,ecx - vpsrld xmm8,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[16+r15] - xor esi,edx - vpslldq xmm9,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov edi,eax - add ebp,DWORD[40+rsp] - xor ebx,ecx - shld eax,eax,5 - vpor xmm6,xmm6,xmm8 - vpsrld xmm8,xmm9,30 - add ebp,esi - and edi,ebx - xor ebx,ecx - add ebp,eax - vpslld xmm9,xmm9,2 - vpxor xmm6,xmm6,xmm8 - shrd eax,eax,7 - xor edi,ecx - mov esi,ebp - add edx,DWORD[44+rsp] - vpxor xmm6,xmm6,xmm9 - xor eax,ebx - shld ebp,ebp,5 - add edx,edi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[32+r15] - and esi,eax - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov edi,edx - add ecx,DWORD[48+rsp] - vpaddd xmm9,xmm10,xmm6 - xor ebp,eax - shld edx,edx,5 - vpsrldq xmm8,xmm6,4 - add ecx,esi - and edi,ebp - vpxor xmm7,xmm7,xmm3 - xor 
ebp,eax - add ecx,edx - vpxor xmm8,xmm8,xmm5 - shrd edx,edx,7 - xor edi,eax - mov esi,ecx - add ebx,DWORD[52+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[48+r15] - vpxor xmm7,xmm7,xmm8 - xor edx,ebp - shld ecx,ecx,5 - vmovdqa XMMWORD[32+rsp],xmm9 - add ebx,edi - and esi,edx - vpsrld xmm8,xmm7,31 - xor edx,ebp - add ebx,ecx - shrd ecx,ecx,7 - xor esi,ebp - vpslldq xmm9,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov edi,ebx - add eax,DWORD[56+rsp] - xor ecx,edx - shld ebx,ebx,5 - vpor xmm7,xmm7,xmm8 - vpsrld xmm8,xmm9,30 - add eax,esi - and edi,ecx - xor ecx,edx - add eax,ebx - vpslld xmm9,xmm9,2 - vpxor xmm7,xmm7,xmm8 - shrd ebx,ebx,7 - cmp r8d,11 - jb NEAR $L$vaesenclast6 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[64+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[80+r15] - je NEAR $L$vaesenclast6 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[96+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[112+r15] -$L$vaesenclast6: - vaesenclast xmm12,xmm12,xmm15 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - xor edi,edx - mov esi,eax - add ebp,DWORD[60+rsp] - vpxor xmm7,xmm7,xmm9 - xor ebx,ecx - shld eax,eax,5 - add ebp,edi - and esi,ebx - xor ebx,ecx - add ebp,eax - vpalignr xmm8,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov edi,ebp - add edx,DWORD[rsp] - vpxor xmm0,xmm0,xmm1 - xor eax,ebx - shld ebp,ebp,5 - vpaddd xmm9,xmm10,xmm7 - add edx,esi - vmovdqu xmm13,XMMWORD[16+r12] - vpxor xmm13,xmm13,xmm15 - vmovups XMMWORD[r13*1+r12],xmm12 - vpxor xmm12,xmm12,xmm13 - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-80))+r15] - and edi,eax - vpxor xmm0,xmm0,xmm8 - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor edi,ebx - vpsrld xmm8,xmm0,30 - vmovdqa XMMWORD[48+rsp],xmm9 - mov esi,edx - add ecx,DWORD[4+rsp] - xor ebp,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,edi - and esi,ebp - xor ebp,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov edi,ecx - add ebx,DWORD[8+rsp] - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-64))+r15] - vpor xmm0,xmm0,xmm8 - xor edx,ebp - shld ecx,ecx,5 - add ebx,esi - and edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[12+rsp] - xor edi,ebp - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm8,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ebp,DWORD[16+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-48))+r15] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - add ebp,esi - xor edi,ecx - vpaddd xmm9,xmm10,xmm0 - shrd ebx,ebx,7 - add ebp,eax - vpxor xmm1,xmm1,xmm8 - add edx,DWORD[20+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - vpsrld xmm8,xmm1,30 - vmovdqa XMMWORD[rsp],xmm9 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpslld xmm1,xmm1,2 - add ecx,DWORD[24+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-32))+r15] - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vpor xmm1,xmm1,xmm8 - add ebx,DWORD[28+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - vpalignr xmm8,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD[32+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - add eax,esi - xor edi,edx - vpaddd xmm9,xmm10,xmm1 - vmovdqa xmm10,XMMWORD[32+r11] - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm8 - add ebp,DWORD[36+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-16))+r15] - xor edi,ecx - mov esi,eax 
- shld eax,eax,5 - vpsrld xmm8,xmm2,30 - vmovdqa XMMWORD[16+rsp],xmm9 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpslld xmm2,xmm2,2 - add edx,DWORD[40+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - vpor xmm2,xmm2,xmm8 - add ecx,DWORD[44+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[r15] - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - vpalignr xmm8,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD[48+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - add ebx,esi - xor edi,ebp - vpaddd xmm9,xmm10,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm8 - add eax,DWORD[52+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm8,xmm3,30 - vmovdqa XMMWORD[32+rsp],xmm9 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add ebp,DWORD[56+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[16+r15] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpor xmm3,xmm3,xmm8 - add edx,DWORD[60+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpalignr xmm8,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD[rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - vpxor xmm4,xmm4,xmm5 - add ecx,esi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[32+r15] - xor edi,eax - vpaddd xmm9,xmm10,xmm3 - shrd ebp,ebp,7 - add ecx,edx - vpxor xmm4,xmm4,xmm8 - add ebx,DWORD[4+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm8,xmm4,30 - vmovdqa XMMWORD[48+rsp],xmm9 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD[8+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm8 - add ebp,DWORD[12+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[48+r15] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpalignr xmm8,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD[16+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - vpxor xmm5,xmm5,xmm6 - add edx,esi - xor edi,ebx - vpaddd xmm9,xmm10,xmm4 - shrd eax,eax,7 - add edx,ebp - vpxor xmm5,xmm5,xmm8 - add ecx,DWORD[20+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm8,xmm5,30 - vmovdqa XMMWORD[rsp],xmm9 - add ecx,edi - cmp r8d,11 - jb NEAR $L$vaesenclast7 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[64+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[80+r15] - je NEAR $L$vaesenclast7 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[96+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[112+r15] -$L$vaesenclast7: - vaesenclast xmm12,xmm12,xmm15 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD[24+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm8 - add eax,DWORD[28+rsp] - shrd ecx,ecx,7 - mov esi,ebx - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm8,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add ebp,DWORD[32+rsp] - vmovdqu xmm13,XMMWORD[32+r12] - vpxor xmm13,xmm13,xmm15 - vmovups XMMWORD[16+r12*1+r13],xmm12 - vpxor xmm12,xmm12,xmm13 - vaesenc xmm12,xmm12,xmm14 - vmovups 
xmm15,XMMWORD[((-80))+r15] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - mov edi,eax - xor esi,ecx - vpaddd xmm9,xmm10,xmm5 - shld eax,eax,5 - add ebp,esi - vpxor xmm6,xmm6,xmm8 - xor edi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[36+rsp] - vpsrld xmm8,xmm6,30 - vmovdqa XMMWORD[16+rsp],xmm9 - and edi,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,ebp - vpslld xmm6,xmm6,2 - xor edi,ebx - shld ebp,ebp,5 - add edx,edi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-64))+r15] - xor esi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[40+rsp] - and esi,eax - vpor xmm6,xmm6,xmm8 - xor eax,ebx - shrd ebp,ebp,7 - mov edi,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor edi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[44+rsp] - and edi,ebp - xor ebp,eax - shrd edx,edx,7 - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-48))+r15] - mov esi,ecx - xor edi,ebp - shld ecx,ecx,5 - add ebx,edi - xor esi,edx - xor edx,ebp - add ebx,ecx - vpalignr xmm8,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD[48+rsp] - and esi,edx - xor edx,ebp - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - mov edi,ebx - xor esi,edx - vpaddd xmm9,xmm10,xmm6 - vmovdqa xmm10,XMMWORD[48+r11] - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm8 - xor edi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[52+rsp] - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-32))+r15] - vpsrld xmm8,xmm7,30 - vmovdqa XMMWORD[32+rsp],xmm9 - and edi,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor edi,ecx - shld eax,eax,5 - add ebp,edi - xor esi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[56+rsp] - and esi,ebx - vpor xmm7,xmm7,xmm8 - xor ebx,ecx - shrd eax,eax,7 - mov edi,ebp - xor esi,ebx - shld ebp,ebp,5 - add edx,esi - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-16))+r15] - xor edi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[60+rsp] - and edi,eax - xor eax,ebx - shrd ebp,ebp,7 - mov esi,edx - xor edi,eax - shld edx,edx,5 - add ecx,edi - xor esi,ebp - xor ebp,eax - add ecx,edx - vpalignr xmm8,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD[rsp] - and esi,ebp - xor ebp,eax - shrd edx,edx,7 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[r15] - vpxor xmm0,xmm0,xmm1 - mov edi,ecx - xor esi,ebp - vpaddd xmm9,xmm10,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm8 - xor edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[4+rsp] - vpsrld xmm8,xmm0,30 - vmovdqa XMMWORD[48+rsp],xmm9 - and edi,edx - xor edx,ebp - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[8+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[16+r15] - and esi,ecx - vpor xmm0,xmm0,xmm8 - xor ecx,edx - shrd ebx,ebx,7 - mov edi,eax - xor esi,ecx - shld eax,eax,5 - add ebp,esi - xor edi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[12+rsp] - and edi,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,ebp - xor edi,ebx - shld ebp,ebp,5 - add edx,edi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[32+r15] - xor esi,eax - xor eax,ebx - add edx,ebp - vpalignr xmm8,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD[16+rsp] - and esi,eax - xor eax,ebx - shrd ebp,ebp,7 - vpxor xmm1,xmm1,xmm2 - mov edi,edx - xor esi,eax - vpaddd xmm9,xmm10,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm8 - xor edi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[20+rsp] - vpsrld xmm8,xmm1,30 - vmovdqa XMMWORD[rsp],xmm9 - and edi,ebp - xor ebp,eax - shrd edx,edx,7 - vaesenc xmm12,xmm12,xmm14 - 
vmovups xmm15,XMMWORD[48+r15] - mov esi,ecx - vpslld xmm1,xmm1,2 - xor edi,ebp - shld ecx,ecx,5 - add ebx,edi - xor esi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[24+rsp] - and esi,edx - vpor xmm1,xmm1,xmm8 - xor edx,ebp - shrd ecx,ecx,7 - mov edi,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor edi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[28+rsp] - cmp r8d,11 - jb NEAR $L$vaesenclast8 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[64+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[80+r15] - je NEAR $L$vaesenclast8 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[96+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[112+r15] -$L$vaesenclast8: - vaesenclast xmm12,xmm12,xmm15 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - and edi,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor edi,ecx - shld eax,eax,5 - add ebp,edi - xor esi,ebx - xor ebx,ecx - add ebp,eax - vpalignr xmm8,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD[32+rsp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - mov edi,ebp - xor esi,ebx - vpaddd xmm9,xmm10,xmm1 - shld ebp,ebp,5 - add edx,esi - vmovdqu xmm13,XMMWORD[48+r12] - vpxor xmm13,xmm13,xmm15 - vmovups XMMWORD[32+r12*1+r13],xmm12 - vpxor xmm12,xmm12,xmm13 - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-80))+r15] - vpxor xmm2,xmm2,xmm8 - xor edi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[36+rsp] - vpsrld xmm8,xmm2,30 - vmovdqa XMMWORD[16+rsp],xmm9 - and edi,eax - xor eax,ebx - shrd ebp,ebp,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor edi,eax - shld edx,edx,5 - add ecx,edi - xor esi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[40+rsp] - and esi,ebp - vpor xmm2,xmm2,xmm8 - xor ebp,eax - shrd edx,edx,7 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-64))+r15] - mov edi,ecx - xor esi,ebp - shld ecx,ecx,5 - add ebx,esi - xor edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[44+rsp] - and edi,edx - xor edx,ebp - shrd ecx,ecx,7 - mov esi,ebx - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - add eax,ebx - vpalignr xmm8,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebp,DWORD[48+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-48))+r15] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - add ebp,esi - xor edi,ecx - vpaddd xmm9,xmm10,xmm2 - shrd ebx,ebx,7 - add ebp,eax - vpxor xmm3,xmm3,xmm8 - add edx,DWORD[52+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - vpsrld xmm8,xmm3,30 - vmovdqa XMMWORD[32+rsp],xmm9 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpslld xmm3,xmm3,2 - add ecx,DWORD[56+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[((-32))+r15] - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vpor xmm3,xmm3,xmm8 - add ebx,DWORD[60+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[rsp] - vpaddd xmm9,xmm10,xmm3 - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa XMMWORD[48+rsp],xmm9 - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[4+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[((-16))+r15] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[8+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[12+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - vaesenc xmm12,xmm12,xmm15 
- vmovups xmm14,XMMWORD[r15] - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - cmp r10,r14 - je NEAR $L$done_avx - vmovdqa xmm9,XMMWORD[64+r11] - vmovdqa xmm10,XMMWORD[r11] - vmovdqu xmm0,XMMWORD[r10] - vmovdqu xmm1,XMMWORD[16+r10] - vmovdqu xmm2,XMMWORD[32+r10] - vmovdqu xmm3,XMMWORD[48+r10] - vpshufb xmm0,xmm0,xmm9 - add r10,64 - add ebx,DWORD[16+rsp] - xor esi,ebp - vpshufb xmm1,xmm1,xmm9 - mov edi,ecx - shld ecx,ecx,5 - vpaddd xmm8,xmm0,xmm10 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - vmovdqa XMMWORD[rsp],xmm8 - add eax,DWORD[20+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[24+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[16+r15] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[28+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[32+rsp] - xor esi,eax - vpshufb xmm2,xmm2,xmm9 - mov edi,edx - shld edx,edx,5 - vpaddd xmm8,xmm1,xmm10 - add ecx,esi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[32+r15] - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vmovdqa XMMWORD[16+rsp],xmm8 - add ebx,DWORD[36+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[40+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[44+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[48+r15] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[48+rsp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm9 - mov edi,ebp - shld ebp,ebp,5 - vpaddd xmm8,xmm2,xmm10 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - vmovdqa XMMWORD[32+rsp],xmm8 - add ecx,DWORD[52+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - cmp r8d,11 - jb NEAR $L$vaesenclast9 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[64+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[80+r15] - je NEAR $L$vaesenclast9 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[96+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[112+r15] -$L$vaesenclast9: - vaesenclast xmm12,xmm12,xmm15 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[56+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[60+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - shrd ecx,ecx,7 - add eax,ebx - vmovups XMMWORD[48+r12*1+r13],xmm12 - lea r12,[64+r12] - - add eax,DWORD[r9] - add esi,DWORD[4+r9] - add ecx,DWORD[8+r9] - add edx,DWORD[12+r9] - mov DWORD[r9],eax - add ebp,DWORD[16+r9] - mov DWORD[4+r9],esi - mov ebx,esi - mov DWORD[8+r9],ecx - mov edi,ecx - mov DWORD[12+r9],edx - xor edi,edx - mov DWORD[16+r9],ebp - and esi,edi - jmp NEAR $L$oop_avx - -$L$done_avx: - add ebx,DWORD[16+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[20+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[24+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[16+r15] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[28+rsp] - xor 
edi,ebx - mov esi,ebp - shld ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[32+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[32+r15] - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[36+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[40+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[44+rsp] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[48+r15] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[48+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[52+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - cmp r8d,11 - jb NEAR $L$vaesenclast10 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[64+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[80+r15] - je NEAR $L$vaesenclast10 - vaesenc xmm12,xmm12,xmm15 - vmovups xmm14,XMMWORD[96+r15] - vaesenc xmm12,xmm12,xmm14 - vmovups xmm15,XMMWORD[112+r15] -$L$vaesenclast10: - vaesenclast xmm12,xmm12,xmm15 - vmovups xmm15,XMMWORD[((-112))+r15] - vmovups xmm14,XMMWORD[((16-112))+r15] - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[56+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[60+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - shrd ecx,ecx,7 - add eax,ebx - vmovups XMMWORD[48+r12*1+r13],xmm12 - mov r8,QWORD[88+rsp] - - add eax,DWORD[r9] - add esi,DWORD[4+r9] - add ecx,DWORD[8+r9] - mov DWORD[r9],eax - add edx,DWORD[12+r9] - mov DWORD[4+r9],esi - add ebp,DWORD[16+r9] - mov DWORD[8+r9],ecx - mov DWORD[12+r9],edx - mov DWORD[16+r9],ebp - vmovups XMMWORD[r8],xmm12 - vzeroall - movaps xmm6,XMMWORD[((96+0))+rsp] - movaps xmm7,XMMWORD[((96+16))+rsp] - movaps xmm8,XMMWORD[((96+32))+rsp] - movaps xmm9,XMMWORD[((96+48))+rsp] - movaps xmm10,XMMWORD[((96+64))+rsp] - movaps xmm11,XMMWORD[((96+80))+rsp] - movaps xmm12,XMMWORD[((96+96))+rsp] - movaps xmm13,XMMWORD[((96+112))+rsp] - movaps xmm14,XMMWORD[((96+128))+rsp] - movaps xmm15,XMMWORD[((96+144))+rsp] - lea rsi,[264+rsp] - - mov r15,QWORD[rsi] - - mov r14,QWORD[8+rsi] - - mov r13,QWORD[16+rsi] - - mov r12,QWORD[24+rsi] - - mov rbp,QWORD[32+rsi] - - mov rbx,QWORD[40+rsi] - - lea rsp,[48+rsi] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_cbc_sha1_enc_avx: ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2900,17 +1545,17 @@ DB 15,56,202,227 pxor xmm5,xmm3 DB 15,56,201,243 cmp r11d,11 - jb NEAR $L$aesenclast11 + jb NEAR $L$aesenclast6 movups xmm0,XMMWORD[64+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[80+rcx] DB 102,15,56,220,208 - je NEAR $L$aesenclast11 + je NEAR $L$aesenclast6 movups xmm0,XMMWORD[96+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[112+rcx] DB 102,15,56,220,208 -$L$aesenclast11: +$L$aesenclast6: DB 102,15,56,221,209 movups xmm0,XMMWORD[((16-112))+rcx] movdqa xmm10,xmm8 @@ -2966,17 +1611,17 @@ DB 15,56,202,236 pxor xmm6,xmm4 DB 15,56,201,220 cmp r11d,11 - jb NEAR $L$aesenclast12 + jb NEAR $L$aesenclast7 movups xmm0,XMMWORD[64+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[80+rcx] DB 102,15,56,220,208 - je NEAR $L$aesenclast12 + je NEAR $L$aesenclast7 movups 
xmm0,XMMWORD[96+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[112+rcx] DB 102,15,56,220,208 -$L$aesenclast12: +$L$aesenclast7: DB 102,15,56,221,209 movups xmm0,XMMWORD[((16-112))+rcx] movdqa xmm9,xmm8 @@ -3032,17 +1677,17 @@ DB 15,56,202,245 pxor xmm3,xmm5 DB 15,56,201,229 cmp r11d,11 - jb NEAR $L$aesenclast13 + jb NEAR $L$aesenclast8 movups xmm0,XMMWORD[64+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[80+rcx] DB 102,15,56,220,208 - je NEAR $L$aesenclast13 + je NEAR $L$aesenclast8 movups xmm0,XMMWORD[96+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[112+rcx] DB 102,15,56,220,208 -$L$aesenclast13: +$L$aesenclast8: DB 102,15,56,221,209 movups xmm0,XMMWORD[((16-112))+rcx] movdqa xmm10,xmm8 @@ -3096,17 +1741,17 @@ DB 102,15,56,220,209 movups xmm1,XMMWORD[48+rcx] DB 102,15,56,220,208 cmp r11d,11 - jb NEAR $L$aesenclast14 + jb NEAR $L$aesenclast9 movups xmm0,XMMWORD[64+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[80+rcx] DB 102,15,56,220,208 - je NEAR $L$aesenclast14 + je NEAR $L$aesenclast9 movups xmm0,XMMWORD[96+rcx] DB 102,15,56,220,209 movups xmm1,XMMWORD[112+rcx] DB 102,15,56,220,208 -$L$aesenclast14: +$L$aesenclast9: DB 102,15,56,221,209 movups xmm0,XMMWORD[((16-112))+rcx] dec rdx @@ -3246,9 +1891,6 @@ ALIGN 4 DD $L$SEH_begin_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase DD $L$SEH_end_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase DD $L$SEH_info_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase - DD $L$SEH_begin_aesni_cbc_sha1_enc_avx wrt ..imagebase - DD $L$SEH_end_aesni_cbc_sha1_enc_avx wrt ..imagebase - DD $L$SEH_info_aesni_cbc_sha1_enc_avx wrt ..imagebase DD $L$SEH_begin_aesni_cbc_sha1_enc_shaext wrt ..imagebase DD $L$SEH_end_aesni_cbc_sha1_enc_shaext wrt ..imagebase DD $L$SEH_info_aesni_cbc_sha1_enc_shaext wrt ..imagebase @@ -3258,10 +1900,6 @@ $L$SEH_info_aesni_cbc_sha1_enc_ssse3: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase -$L$SEH_info_aesni_cbc_sha1_enc_avx: -DB 9,0,0,0 - DD ssse3_handler wrt ..imagebase - DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase $L$SEH_info_aesni_cbc_sha1_enc_shaext: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm index b2a9c65f5d0..38beecde894 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm @@ -11,25 +11,6 @@ global aesni_cbc_sha256_enc ALIGN 16 aesni_cbc_sha256_enc: - lea r11,[OPENSSL_ia32cap_P] - mov eax,1 - cmp rcx,0 - je NEAR $L$probe - mov eax,DWORD[r11] - mov r10,QWORD[4+r11] - bt r10,61 - jc NEAR aesni_cbc_sha256_enc_shaext - mov r11,r10 - shr r11,32 - - test r10d,2048 - jnz NEAR aesni_cbc_sha256_enc_xop - and r11d,296 - cmp r11d,296 - je NEAR aesni_cbc_sha256_enc_avx2 - and r10d,268435456 - jnz NEAR aesni_cbc_sha256_enc_avx - ud2 xor eax,eax cmp rcx,0 je NEAR $L$probe @@ -85,4624 +66,3 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98 DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108 DB 46,111,114,103,62,0 ALIGN 64 - -ALIGN 64 -aesni_cbc_sha256_enc_xop: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_sha256_enc_xop: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - -$L$xop_shortcut: - mov r10,QWORD[56+rsp] - mov rax,rsp - - push rbx - - 
push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,288 - and rsp,-64 - - shl rdx,6 - sub rsi,rdi - sub r10,rdi - add rdx,rdi - - - mov QWORD[((64+8))+rsp],rsi - mov QWORD[((64+16))+rsp],rdx - - mov QWORD[((64+32))+rsp],r8 - mov QWORD[((64+40))+rsp],r9 - mov QWORD[((64+48))+rsp],r10 - mov QWORD[120+rsp],rax - - movaps XMMWORD[128+rsp],xmm6 - movaps XMMWORD[144+rsp],xmm7 - movaps XMMWORD[160+rsp],xmm8 - movaps XMMWORD[176+rsp],xmm9 - movaps XMMWORD[192+rsp],xmm10 - movaps XMMWORD[208+rsp],xmm11 - movaps XMMWORD[224+rsp],xmm12 - movaps XMMWORD[240+rsp],xmm13 - movaps XMMWORD[256+rsp],xmm14 - movaps XMMWORD[272+rsp],xmm15 -$L$prologue_xop: - vzeroall - - mov r12,rdi - lea rdi,[128+rcx] - lea r13,[((K256+544))] - mov r14d,DWORD[((240-128))+rdi] - mov r15,r9 - mov rsi,r10 - vmovdqu xmm8,XMMWORD[r8] - sub r14,9 - - mov eax,DWORD[r15] - mov ebx,DWORD[4+r15] - mov ecx,DWORD[8+r15] - mov edx,DWORD[12+r15] - mov r8d,DWORD[16+r15] - mov r9d,DWORD[20+r15] - mov r10d,DWORD[24+r15] - mov r11d,DWORD[28+r15] - - vmovdqa xmm14,XMMWORD[r14*8+r13] - vmovdqa xmm13,XMMWORD[16+r14*8+r13] - vmovdqa xmm12,XMMWORD[32+r14*8+r13] - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - jmp NEAR $L$loop_xop -ALIGN 16 -$L$loop_xop: - vmovdqa xmm7,XMMWORD[((K256+512))] - vmovdqu xmm0,XMMWORD[r12*1+rsi] - vmovdqu xmm1,XMMWORD[16+r12*1+rsi] - vmovdqu xmm2,XMMWORD[32+r12*1+rsi] - vmovdqu xmm3,XMMWORD[48+r12*1+rsi] - vpshufb xmm0,xmm0,xmm7 - lea rbp,[K256] - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,XMMWORD[rbp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,XMMWORD[32+rbp] - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - vpaddd xmm7,xmm3,XMMWORD[96+rbp] - vmovdqa XMMWORD[rsp],xmm4 - mov r14d,eax - vmovdqa XMMWORD[16+rsp],xmm5 - mov esi,ebx - vmovdqa XMMWORD[32+rsp],xmm6 - xor esi,ecx - vmovdqa XMMWORD[48+rsp],xmm7 - mov r13d,r8d - jmp NEAR $L$xop_00_47 - -ALIGN 16 -$L$xop_00_47: - sub rbp,-16*2*4 - vmovdqu xmm9,XMMWORD[r12] - mov QWORD[((64+0))+rsp],r12 - vpalignr xmm4,xmm1,xmm0,4 - ror r13d,14 - mov eax,r14d - vpalignr xmm7,xmm3,xmm2,4 - mov r12d,r9d - xor r13d,r8d -DB 143,232,120,194,236,14 - ror r14d,9 - xor r12d,r10d - vpsrld xmm4,xmm4,3 - ror r13d,5 - xor r14d,eax - vpaddd xmm0,xmm0,xmm7 - and r12d,r8d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax -DB 143,232,120,194,245,11 - ror r14d,11 - xor r12d,r10d - vpxor xmm4,xmm4,xmm5 - xor r15d,ebx - ror r13d,6 - add r11d,r12d - and esi,r15d -DB 143,232,120,194,251,13 - xor r14d,eax - add r11d,r13d - vpxor xmm4,xmm4,xmm6 - xor esi,ebx - add edx,r11d - vpsrld xmm6,xmm3,10 - ror r14d,2 - add r11d,esi - vpaddd xmm0,xmm0,xmm4 - mov r13d,edx - add r14d,r11d -DB 143,232,120,194,239,2 - ror r13d,14 - mov r11d,r14d - vpxor xmm7,xmm7,xmm6 - mov r12d,r8d - xor r13d,edx - ror r14d,9 - xor r12d,r9d - vpxor xmm7,xmm7,xmm5 - ror r13d,5 - xor r14d,r11d - and r12d,edx - vpxor xmm9,xmm9,xmm8 - xor r13d,edx - vpsrldq xmm7,xmm7,8 - add r10d,DWORD[4+rsp] - mov esi,r11d - ror r14d,11 - xor r12d,r9d - vpaddd xmm0,xmm0,xmm7 - xor esi,eax - ror r13d,6 - add r10d,r12d - and r15d,esi -DB 143,232,120,194,248,13 - xor r14d,r11d - add r10d,r13d - vpsrld xmm6,xmm0,10 - xor r15d,eax - add ecx,r10d -DB 143,232,120,194,239,2 - ror r14d,2 - add r10d,r15d - vpxor xmm7,xmm7,xmm6 - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - vpxor xmm7,xmm7,xmm5 - mov r12d,edx - xor r13d,ecx - ror r14d,9 - xor r12d,r8d - vpslldq xmm7,xmm7,8 - ror r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu 
xmm10,XMMWORD[((32-128))+rdi] - xor r13d,ecx - vpaddd xmm0,xmm0,xmm7 - add r9d,DWORD[8+rsp] - mov r15d,r10d - ror r14d,11 - xor r12d,r8d - vpaddd xmm6,xmm0,XMMWORD[rbp] - xor r15d,r11d - ror r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - ror r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - ror r14d,9 - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov esi,r9d - ror r14d,11 - xor r12d,edx - xor esi,r10d - ror r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - ror r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[rsp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - ror r13d,14 - mov r8d,r14d - vpalignr xmm7,xmm0,xmm3,4 - mov r12d,ebx - xor r13d,eax -DB 143,232,120,194,236,14 - ror r14d,9 - xor r12d,ecx - vpsrld xmm4,xmm4,3 - ror r13d,5 - xor r14d,r8d - vpaddd xmm1,xmm1,xmm7 - and r12d,eax - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d -DB 143,232,120,194,245,11 - ror r14d,11 - xor r12d,ecx - vpxor xmm4,xmm4,xmm5 - xor r15d,r9d - ror r13d,6 - add edx,r12d - and esi,r15d -DB 143,232,120,194,248,13 - xor r14d,r8d - add edx,r13d - vpxor xmm4,xmm4,xmm6 - xor esi,r9d - add r11d,edx - vpsrld xmm6,xmm0,10 - ror r14d,2 - add edx,esi - vpaddd xmm1,xmm1,xmm4 - mov r13d,r11d - add r14d,edx -DB 143,232,120,194,239,2 - ror r13d,14 - mov edx,r14d - vpxor xmm7,xmm7,xmm6 - mov r12d,eax - xor r13d,r11d - ror r14d,9 - xor r12d,ebx - vpxor xmm7,xmm7,xmm5 - ror r13d,5 - xor r14d,edx - and r12d,r11d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r13d,r11d - vpsrldq xmm7,xmm7,8 - add ecx,DWORD[20+rsp] - mov esi,edx - ror r14d,11 - xor r12d,ebx - vpaddd xmm1,xmm1,xmm7 - xor esi,r8d - ror r13d,6 - add ecx,r12d - and r15d,esi -DB 143,232,120,194,249,13 - xor r14d,edx - add ecx,r13d - vpsrld xmm6,xmm1,10 - xor r15d,r8d - add r10d,ecx -DB 143,232,120,194,239,2 - ror r14d,2 - add ecx,r15d - vpxor xmm7,xmm7,xmm6 - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - vpxor xmm7,xmm7,xmm5 - mov r12d,r11d - xor r13d,r10d - ror r14d,9 - xor r12d,eax - vpslldq xmm7,xmm7,8 - ror r13d,5 - xor r14d,ecx - and r12d,r10d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r13d,r10d - vpaddd xmm1,xmm1,xmm7 - add ebx,DWORD[24+rsp] - mov r15d,ecx - ror r14d,11 - xor r12d,eax - vpaddd xmm6,xmm1,XMMWORD[32+rbp] - xor r15d,edx - ror r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - ror r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - ror r14d,9 - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r13d,r9d - add eax,DWORD[28+rsp] - mov esi,ebx - ror r14d,11 - xor r12d,r11d - xor esi,ecx - ror r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - ror r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[16+rsp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - ror r13d,14 - mov eax,r14d - vpalignr xmm7,xmm1,xmm0,4 - mov r12d,r9d - xor r13d,r8d -DB 143,232,120,194,236,14 - ror r14d,9 - xor r12d,r10d - vpsrld xmm4,xmm4,3 - ror r13d,5 - xor r14d,eax - vpaddd xmm2,xmm2,xmm7 - and r12d,r8d - vaesenc 
xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax -DB 143,232,120,194,245,11 - ror r14d,11 - xor r12d,r10d - vpxor xmm4,xmm4,xmm5 - xor r15d,ebx - ror r13d,6 - add r11d,r12d - and esi,r15d -DB 143,232,120,194,249,13 - xor r14d,eax - add r11d,r13d - vpxor xmm4,xmm4,xmm6 - xor esi,ebx - add edx,r11d - vpsrld xmm6,xmm1,10 - ror r14d,2 - add r11d,esi - vpaddd xmm2,xmm2,xmm4 - mov r13d,edx - add r14d,r11d -DB 143,232,120,194,239,2 - ror r13d,14 - mov r11d,r14d - vpxor xmm7,xmm7,xmm6 - mov r12d,r8d - xor r13d,edx - ror r14d,9 - xor r12d,r9d - vpxor xmm7,xmm7,xmm5 - ror r13d,5 - xor r14d,r11d - and r12d,edx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r13d,edx - vpsrldq xmm7,xmm7,8 - add r10d,DWORD[36+rsp] - mov esi,r11d - ror r14d,11 - xor r12d,r9d - vpaddd xmm2,xmm2,xmm7 - xor esi,eax - ror r13d,6 - add r10d,r12d - and r15d,esi -DB 143,232,120,194,250,13 - xor r14d,r11d - add r10d,r13d - vpsrld xmm6,xmm2,10 - xor r15d,eax - add ecx,r10d -DB 143,232,120,194,239,2 - ror r14d,2 - add r10d,r15d - vpxor xmm7,xmm7,xmm6 - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - vpxor xmm7,xmm7,xmm5 - mov r12d,edx - xor r13d,ecx - ror r14d,9 - xor r12d,r8d - vpslldq xmm7,xmm7,8 - ror r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r13d,ecx - vpaddd xmm2,xmm2,xmm7 - add r9d,DWORD[40+rsp] - mov r15d,r10d - ror r14d,11 - xor r12d,r8d - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - xor r15d,r11d - ror r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - ror r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - ror r14d,9 - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov esi,r9d - ror r14d,11 - xor r12d,edx - xor esi,r10d - ror r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - ror r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[32+rsp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - ror r13d,14 - mov r8d,r14d - vpalignr xmm7,xmm2,xmm1,4 - mov r12d,ebx - xor r13d,eax -DB 143,232,120,194,236,14 - ror r14d,9 - xor r12d,ecx - vpsrld xmm4,xmm4,3 - ror r13d,5 - xor r14d,r8d - vpaddd xmm3,xmm3,xmm7 - and r12d,eax - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d -DB 143,232,120,194,245,11 - ror r14d,11 - xor r12d,ecx - vpxor xmm4,xmm4,xmm5 - xor r15d,r9d - ror r13d,6 - add edx,r12d - and esi,r15d -DB 143,232,120,194,250,13 - xor r14d,r8d - add edx,r13d - vpxor xmm4,xmm4,xmm6 - xor esi,r9d - add r11d,edx - vpsrld xmm6,xmm2,10 - ror r14d,2 - add edx,esi - vpaddd xmm3,xmm3,xmm4 - mov r13d,r11d - add r14d,edx -DB 143,232,120,194,239,2 - ror r13d,14 - mov edx,r14d - vpxor xmm7,xmm7,xmm6 - mov r12d,eax - xor r13d,r11d - ror r14d,9 - xor r12d,ebx - vpxor xmm7,xmm7,xmm5 - ror r13d,5 - xor r14d,edx - and r12d,r11d - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r13d,r11d - vpsrldq xmm7,xmm7,8 - add ecx,DWORD[52+rsp] - mov esi,edx - ror r14d,11 - xor r12d,ebx - vpaddd xmm3,xmm3,xmm7 - xor esi,r8d - ror r13d,6 - add ecx,r12d - and r15d,esi -DB 143,232,120,194,251,13 - xor r14d,edx - add ecx,r13d - vpsrld xmm6,xmm3,10 - xor 
r15d,r8d - add r10d,ecx -DB 143,232,120,194,239,2 - ror r14d,2 - add ecx,r15d - vpxor xmm7,xmm7,xmm6 - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - vpxor xmm7,xmm7,xmm5 - mov r12d,r11d - xor r13d,r10d - ror r14d,9 - xor r12d,eax - vpslldq xmm7,xmm7,8 - ror r13d,5 - xor r14d,ecx - and r12d,r10d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r13d,r10d - vpaddd xmm3,xmm3,xmm7 - add ebx,DWORD[56+rsp] - mov r15d,ecx - ror r14d,11 - xor r12d,eax - vpaddd xmm6,xmm3,XMMWORD[96+rbp] - xor r15d,edx - ror r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - ror r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - ror r14d,9 - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r13d,r9d - add eax,DWORD[60+rsp] - mov esi,ebx - ror r14d,11 - xor r12d,r11d - xor esi,ecx - ror r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - ror r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[48+rsp],xmm6 - mov r12,QWORD[((64+0))+rsp] - vpand xmm11,xmm11,xmm14 - mov r15,QWORD[((64+8))+rsp] - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r12*1+r15],xmm8 - lea r12,[16+r12] - cmp BYTE[131+rbp],0 - jne NEAR $L$xop_00_47 - vmovdqu xmm9,XMMWORD[r12] - mov QWORD[((64+0))+rsp],r12 - ror r13d,14 - mov eax,r14d - mov r12d,r9d - xor r13d,r8d - ror r14d,9 - xor r12d,r10d - ror r13d,5 - xor r14d,eax - and r12d,r8d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax - ror r14d,11 - xor r12d,r10d - xor r15d,ebx - ror r13d,6 - add r11d,r12d - and esi,r15d - xor r14d,eax - add r11d,r13d - xor esi,ebx - add edx,r11d - ror r14d,2 - add r11d,esi - mov r13d,edx - add r14d,r11d - ror r13d,14 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - ror r14d,9 - xor r12d,r9d - ror r13d,5 - xor r14d,r11d - and r12d,edx - vpxor xmm9,xmm9,xmm8 - xor r13d,edx - add r10d,DWORD[4+rsp] - mov esi,r11d - ror r14d,11 - xor r12d,r9d - xor esi,eax - ror r13d,6 - add r10d,r12d - and r15d,esi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - add ecx,r10d - ror r14d,2 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - mov r12d,edx - xor r13d,ecx - ror r14d,9 - xor r12d,r8d - ror r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r13d,ecx - add r9d,DWORD[8+rsp] - mov r15d,r10d - ror r14d,11 - xor r12d,r8d - xor r15d,r11d - ror r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - ror r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - ror r14d,9 - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov esi,r9d - ror r14d,11 - xor r12d,edx - xor esi,r10d - ror r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - ror r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - ror r13d,14 - mov r8d,r14d - mov r12d,ebx - xor r13d,eax - ror r14d,9 - xor r12d,ecx - ror r13d,5 - xor r14d,r8d - and r12d,eax - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d - ror r14d,11 - xor 
r12d,ecx - xor r15d,r9d - ror r13d,6 - add edx,r12d - and esi,r15d - xor r14d,r8d - add edx,r13d - xor esi,r9d - add r11d,edx - ror r14d,2 - add edx,esi - mov r13d,r11d - add r14d,edx - ror r13d,14 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - ror r14d,9 - xor r12d,ebx - ror r13d,5 - xor r14d,edx - and r12d,r11d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r13d,r11d - add ecx,DWORD[20+rsp] - mov esi,edx - ror r14d,11 - xor r12d,ebx - xor esi,r8d - ror r13d,6 - add ecx,r12d - and r15d,esi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - add r10d,ecx - ror r14d,2 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - xor r13d,r10d - ror r14d,9 - xor r12d,eax - ror r13d,5 - xor r14d,ecx - and r12d,r10d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r13d,r10d - add ebx,DWORD[24+rsp] - mov r15d,ecx - ror r14d,11 - xor r12d,eax - xor r15d,edx - ror r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - ror r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - ror r14d,9 - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r13d,r9d - add eax,DWORD[28+rsp] - mov esi,ebx - ror r14d,11 - xor r12d,r11d - xor esi,ecx - ror r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - ror r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - ror r13d,14 - mov eax,r14d - mov r12d,r9d - xor r13d,r8d - ror r14d,9 - xor r12d,r10d - ror r13d,5 - xor r14d,eax - and r12d,r8d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax - ror r14d,11 - xor r12d,r10d - xor r15d,ebx - ror r13d,6 - add r11d,r12d - and esi,r15d - xor r14d,eax - add r11d,r13d - xor esi,ebx - add edx,r11d - ror r14d,2 - add r11d,esi - mov r13d,edx - add r14d,r11d - ror r13d,14 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - ror r14d,9 - xor r12d,r9d - ror r13d,5 - xor r14d,r11d - and r12d,edx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r13d,edx - add r10d,DWORD[36+rsp] - mov esi,r11d - ror r14d,11 - xor r12d,r9d - xor esi,eax - ror r13d,6 - add r10d,r12d - and r15d,esi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - add ecx,r10d - ror r14d,2 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - mov r12d,edx - xor r13d,ecx - ror r14d,9 - xor r12d,r8d - ror r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r13d,ecx - add r9d,DWORD[40+rsp] - mov r15d,r10d - ror r14d,11 - xor r12d,r8d - xor r15d,r11d - ror r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - ror r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - ror r14d,9 - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov esi,r9d - ror r14d,11 - xor r12d,edx - xor esi,r10d - ror r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - ror r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - ror r13d,14 - mov r8d,r14d - mov r12d,ebx - xor r13d,eax - ror r14d,9 - xor r12d,ecx - ror r13d,5 - xor r14d,r8d - and r12d,eax - 
vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d - ror r14d,11 - xor r12d,ecx - xor r15d,r9d - ror r13d,6 - add edx,r12d - and esi,r15d - xor r14d,r8d - add edx,r13d - xor esi,r9d - add r11d,edx - ror r14d,2 - add edx,esi - mov r13d,r11d - add r14d,edx - ror r13d,14 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - ror r14d,9 - xor r12d,ebx - ror r13d,5 - xor r14d,edx - and r12d,r11d - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r13d,r11d - add ecx,DWORD[52+rsp] - mov esi,edx - ror r14d,11 - xor r12d,ebx - xor esi,r8d - ror r13d,6 - add ecx,r12d - and r15d,esi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - add r10d,ecx - ror r14d,2 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - xor r13d,r10d - ror r14d,9 - xor r12d,eax - ror r13d,5 - xor r14d,ecx - and r12d,r10d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r13d,r10d - add ebx,DWORD[56+rsp] - mov r15d,ecx - ror r14d,11 - xor r12d,eax - xor r15d,edx - ror r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - ror r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - ror r14d,9 - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r13d,r9d - add eax,DWORD[60+rsp] - mov esi,ebx - ror r14d,11 - xor r12d,r11d - xor esi,ecx - ror r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - ror r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - mov r12,QWORD[((64+0))+rsp] - mov r13,QWORD[((64+8))+rsp] - mov r15,QWORD[((64+40))+rsp] - mov rsi,QWORD[((64+48))+rsp] - - vpand xmm11,xmm11,xmm14 - mov eax,r14d - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r13*1+r12],xmm8 - lea r12,[16+r12] - - add eax,DWORD[r15] - add ebx,DWORD[4+r15] - add ecx,DWORD[8+r15] - add edx,DWORD[12+r15] - add r8d,DWORD[16+r15] - add r9d,DWORD[20+r15] - add r10d,DWORD[24+r15] - add r11d,DWORD[28+r15] - - cmp r12,QWORD[((64+16))+rsp] - - mov DWORD[r15],eax - mov DWORD[4+r15],ebx - mov DWORD[8+r15],ecx - mov DWORD[12+r15],edx - mov DWORD[16+r15],r8d - mov DWORD[20+r15],r9d - mov DWORD[24+r15],r10d - mov DWORD[28+r15],r11d - - jb NEAR $L$loop_xop - - mov r8,QWORD[((64+32))+rsp] - mov rsi,QWORD[120+rsp] - - vmovdqu XMMWORD[r8],xmm8 - vzeroall - movaps xmm6,XMMWORD[128+rsp] - movaps xmm7,XMMWORD[144+rsp] - movaps xmm8,XMMWORD[160+rsp] - movaps xmm9,XMMWORD[176+rsp] - movaps xmm10,XMMWORD[192+rsp] - movaps xmm11,XMMWORD[208+rsp] - movaps xmm12,XMMWORD[224+rsp] - movaps xmm13,XMMWORD[240+rsp] - movaps xmm14,XMMWORD[256+rsp] - movaps xmm15,XMMWORD[272+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_xop: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_cbc_sha256_enc_xop: - -ALIGN 64 -aesni_cbc_sha256_enc_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_sha256_enc_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - -$L$avx_shortcut: - mov r10,QWORD[56+rsp] - mov 
rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,288 - and rsp,-64 - - shl rdx,6 - sub rsi,rdi - sub r10,rdi - add rdx,rdi - - - mov QWORD[((64+8))+rsp],rsi - mov QWORD[((64+16))+rsp],rdx - - mov QWORD[((64+32))+rsp],r8 - mov QWORD[((64+40))+rsp],r9 - mov QWORD[((64+48))+rsp],r10 - mov QWORD[120+rsp],rax - - movaps XMMWORD[128+rsp],xmm6 - movaps XMMWORD[144+rsp],xmm7 - movaps XMMWORD[160+rsp],xmm8 - movaps XMMWORD[176+rsp],xmm9 - movaps XMMWORD[192+rsp],xmm10 - movaps XMMWORD[208+rsp],xmm11 - movaps XMMWORD[224+rsp],xmm12 - movaps XMMWORD[240+rsp],xmm13 - movaps XMMWORD[256+rsp],xmm14 - movaps XMMWORD[272+rsp],xmm15 -$L$prologue_avx: - vzeroall - - mov r12,rdi - lea rdi,[128+rcx] - lea r13,[((K256+544))] - mov r14d,DWORD[((240-128))+rdi] - mov r15,r9 - mov rsi,r10 - vmovdqu xmm8,XMMWORD[r8] - sub r14,9 - - mov eax,DWORD[r15] - mov ebx,DWORD[4+r15] - mov ecx,DWORD[8+r15] - mov edx,DWORD[12+r15] - mov r8d,DWORD[16+r15] - mov r9d,DWORD[20+r15] - mov r10d,DWORD[24+r15] - mov r11d,DWORD[28+r15] - - vmovdqa xmm14,XMMWORD[r14*8+r13] - vmovdqa xmm13,XMMWORD[16+r14*8+r13] - vmovdqa xmm12,XMMWORD[32+r14*8+r13] - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - jmp NEAR $L$loop_avx -ALIGN 16 -$L$loop_avx: - vmovdqa xmm7,XMMWORD[((K256+512))] - vmovdqu xmm0,XMMWORD[r12*1+rsi] - vmovdqu xmm1,XMMWORD[16+r12*1+rsi] - vmovdqu xmm2,XMMWORD[32+r12*1+rsi] - vmovdqu xmm3,XMMWORD[48+r12*1+rsi] - vpshufb xmm0,xmm0,xmm7 - lea rbp,[K256] - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,XMMWORD[rbp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,XMMWORD[32+rbp] - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - vpaddd xmm7,xmm3,XMMWORD[96+rbp] - vmovdqa XMMWORD[rsp],xmm4 - mov r14d,eax - vmovdqa XMMWORD[16+rsp],xmm5 - mov esi,ebx - vmovdqa XMMWORD[32+rsp],xmm6 - xor esi,ecx - vmovdqa XMMWORD[48+rsp],xmm7 - mov r13d,r8d - jmp NEAR $L$avx_00_47 - -ALIGN 16 -$L$avx_00_47: - sub rbp,-16*2*4 - vmovdqu xmm9,XMMWORD[r12] - mov QWORD[((64+0))+rsp],r12 - vpalignr xmm4,xmm1,xmm0,4 - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - vpalignr xmm7,xmm3,xmm2,4 - xor r13d,r8d - shrd r14d,r14d,9 - xor r12d,r10d - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vpaddd xmm0,xmm0,xmm7 - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax - vpsrld xmm7,xmm4,3 - shrd r14d,r14d,11 - xor r12d,r10d - xor r15d,ebx - vpslld xmm5,xmm4,14 - shrd r13d,r13d,6 - add r11d,r12d - and esi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,eax - add r11d,r13d - xor esi,ebx - vpshufd xmm7,xmm3,250 - add edx,r11d - shrd r14d,r14d,2 - add r11d,esi - vpsrld xmm6,xmm6,11 - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - vpslld xmm5,xmm5,11 - shrd r14d,r14d,9 - xor r12d,r9d - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,r11d - and r12d,edx - vpxor xmm9,xmm9,xmm8 - xor r13d,edx - vpsrld xmm6,xmm7,10 - add r10d,DWORD[4+rsp] - mov esi,r11d - shrd r14d,r14d,11 - vpxor xmm4,xmm4,xmm5 - xor r12d,r9d - xor esi,eax - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - add r10d,r12d - and r15d,esi - xor r14d,r11d - vpaddd xmm0,xmm0,xmm4 - add r10d,r13d - xor r15d,eax - add ecx,r10d - vpxor xmm6,xmm6,xmm7 - shrd r14d,r14d,2 - add r10d,r15d - mov r13d,ecx - vpsrlq xmm7,xmm7,2 - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,edx - xor r13d,ecx - shrd r14d,r14d,9 - vpshufd xmm6,xmm6,132 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - vpsrldq 
xmm6,xmm6,8 - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r13d,ecx - add r9d,DWORD[8+rsp] - vpaddd xmm0,xmm0,xmm6 - mov r15d,r10d - shrd r14d,r14d,11 - xor r12d,r8d - vpshufd xmm7,xmm0,80 - xor r15d,r11d - shrd r13d,r13d,6 - add r9d,r12d - vpsrld xmm6,xmm7,10 - and esi,r15d - xor r14d,r10d - add r9d,r13d - vpsrlq xmm7,xmm7,17 - xor esi,r11d - add ebx,r9d - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add r9d,esi - mov r13d,ebx - add r14d,r9d - vpsrlq xmm7,xmm7,2 - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - vpxor xmm6,xmm6,xmm7 - xor r13d,ebx - shrd r14d,r14d,9 - xor r12d,edx - vpshufd xmm6,xmm6,232 - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vpslldq xmm6,xmm6,8 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov esi,r9d - vpaddd xmm0,xmm0,xmm6 - shrd r14d,r14d,11 - xor r12d,edx - xor esi,r10d - vpaddd xmm6,xmm0,XMMWORD[rbp] - shrd r13d,r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - shrd r14d,r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[rsp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - vpalignr xmm7,xmm0,xmm3,4 - xor r13d,eax - shrd r14d,r14d,9 - xor r12d,ecx - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vpaddd xmm1,xmm1,xmm7 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d - vpsrld xmm7,xmm4,3 - shrd r14d,r14d,11 - xor r12d,ecx - xor r15d,r9d - vpslld xmm5,xmm4,14 - shrd r13d,r13d,6 - add edx,r12d - and esi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,r8d - add edx,r13d - xor esi,r9d - vpshufd xmm7,xmm0,250 - add r11d,edx - shrd r14d,r14d,2 - add edx,esi - vpsrld xmm6,xmm6,11 - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - vpslld xmm5,xmm5,11 - shrd r14d,r14d,9 - xor r12d,ebx - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,edx - and r12d,r11d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r13d,r11d - vpsrld xmm6,xmm7,10 - add ecx,DWORD[20+rsp] - mov esi,edx - shrd r14d,r14d,11 - vpxor xmm4,xmm4,xmm5 - xor r12d,ebx - xor esi,r8d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - add ecx,r12d - and r15d,esi - xor r14d,edx - vpaddd xmm1,xmm1,xmm4 - add ecx,r13d - xor r15d,r8d - add r10d,ecx - vpxor xmm6,xmm6,xmm7 - shrd r14d,r14d,2 - add ecx,r15d - mov r13d,r10d - vpsrlq xmm7,xmm7,2 - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,r11d - xor r13d,r10d - shrd r14d,r14d,9 - vpshufd xmm6,xmm6,132 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - vpsrldq xmm6,xmm6,8 - and r12d,r10d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r13d,r10d - add ebx,DWORD[24+rsp] - vpaddd xmm1,xmm1,xmm6 - mov r15d,ecx - shrd r14d,r14d,11 - xor r12d,eax - vpshufd xmm7,xmm1,80 - xor r15d,edx - shrd r13d,r13d,6 - add ebx,r12d - vpsrld xmm6,xmm7,10 - and esi,r15d - xor r14d,ecx - add ebx,r13d - vpsrlq xmm7,xmm7,17 - xor esi,edx - add r9d,ebx - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add ebx,esi - mov r13d,r9d - add r14d,ebx - vpsrlq xmm7,xmm7,2 - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - vpxor xmm6,xmm6,xmm7 - xor r13d,r9d - shrd r14d,r14d,9 - xor r12d,r11d - vpshufd xmm6,xmm6,232 - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vpslldq xmm6,xmm6,8 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r13d,r9d - add 
eax,DWORD[28+rsp] - mov esi,ebx - vpaddd xmm1,xmm1,xmm6 - shrd r14d,r14d,11 - xor r12d,r11d - xor esi,ecx - vpaddd xmm6,xmm1,XMMWORD[32+rbp] - shrd r13d,r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - shrd r14d,r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[16+rsp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - vpalignr xmm7,xmm1,xmm0,4 - xor r13d,r8d - shrd r14d,r14d,9 - xor r12d,r10d - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vpaddd xmm2,xmm2,xmm7 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax - vpsrld xmm7,xmm4,3 - shrd r14d,r14d,11 - xor r12d,r10d - xor r15d,ebx - vpslld xmm5,xmm4,14 - shrd r13d,r13d,6 - add r11d,r12d - and esi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,eax - add r11d,r13d - xor esi,ebx - vpshufd xmm7,xmm1,250 - add edx,r11d - shrd r14d,r14d,2 - add r11d,esi - vpsrld xmm6,xmm6,11 - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - vpslld xmm5,xmm5,11 - shrd r14d,r14d,9 - xor r12d,r9d - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,r11d - and r12d,edx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r13d,edx - vpsrld xmm6,xmm7,10 - add r10d,DWORD[36+rsp] - mov esi,r11d - shrd r14d,r14d,11 - vpxor xmm4,xmm4,xmm5 - xor r12d,r9d - xor esi,eax - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - add r10d,r12d - and r15d,esi - xor r14d,r11d - vpaddd xmm2,xmm2,xmm4 - add r10d,r13d - xor r15d,eax - add ecx,r10d - vpxor xmm6,xmm6,xmm7 - shrd r14d,r14d,2 - add r10d,r15d - mov r13d,ecx - vpsrlq xmm7,xmm7,2 - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,edx - xor r13d,ecx - shrd r14d,r14d,9 - vpshufd xmm6,xmm6,132 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - vpsrldq xmm6,xmm6,8 - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r13d,ecx - add r9d,DWORD[40+rsp] - vpaddd xmm2,xmm2,xmm6 - mov r15d,r10d - shrd r14d,r14d,11 - xor r12d,r8d - vpshufd xmm7,xmm2,80 - xor r15d,r11d - shrd r13d,r13d,6 - add r9d,r12d - vpsrld xmm6,xmm7,10 - and esi,r15d - xor r14d,r10d - add r9d,r13d - vpsrlq xmm7,xmm7,17 - xor esi,r11d - add ebx,r9d - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add r9d,esi - mov r13d,ebx - add r14d,r9d - vpsrlq xmm7,xmm7,2 - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - vpxor xmm6,xmm6,xmm7 - xor r13d,ebx - shrd r14d,r14d,9 - xor r12d,edx - vpshufd xmm6,xmm6,232 - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vpslldq xmm6,xmm6,8 - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov esi,r9d - vpaddd xmm2,xmm2,xmm6 - shrd r14d,r14d,11 - xor r12d,edx - xor esi,r10d - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - shrd r13d,r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - shrd r14d,r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[32+rsp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - vpalignr xmm7,xmm2,xmm1,4 - xor r13d,eax - shrd r14d,r14d,9 - xor r12d,ecx - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vpaddd xmm3,xmm3,xmm7 - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d - vpsrld xmm7,xmm4,3 - 
shrd r14d,r14d,11 - xor r12d,ecx - xor r15d,r9d - vpslld xmm5,xmm4,14 - shrd r13d,r13d,6 - add edx,r12d - and esi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,r8d - add edx,r13d - xor esi,r9d - vpshufd xmm7,xmm2,250 - add r11d,edx - shrd r14d,r14d,2 - add edx,esi - vpsrld xmm6,xmm6,11 - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - vpslld xmm5,xmm5,11 - shrd r14d,r14d,9 - xor r12d,ebx - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,edx - and r12d,r11d - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r13d,r11d - vpsrld xmm6,xmm7,10 - add ecx,DWORD[52+rsp] - mov esi,edx - shrd r14d,r14d,11 - vpxor xmm4,xmm4,xmm5 - xor r12d,ebx - xor esi,r8d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - add ecx,r12d - and r15d,esi - xor r14d,edx - vpaddd xmm3,xmm3,xmm4 - add ecx,r13d - xor r15d,r8d - add r10d,ecx - vpxor xmm6,xmm6,xmm7 - shrd r14d,r14d,2 - add ecx,r15d - mov r13d,r10d - vpsrlq xmm7,xmm7,2 - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,r11d - xor r13d,r10d - shrd r14d,r14d,9 - vpshufd xmm6,xmm6,132 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - vpsrldq xmm6,xmm6,8 - and r12d,r10d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r13d,r10d - add ebx,DWORD[56+rsp] - vpaddd xmm3,xmm3,xmm6 - mov r15d,ecx - shrd r14d,r14d,11 - xor r12d,eax - vpshufd xmm7,xmm3,80 - xor r15d,edx - shrd r13d,r13d,6 - add ebx,r12d - vpsrld xmm6,xmm7,10 - and esi,r15d - xor r14d,ecx - add ebx,r13d - vpsrlq xmm7,xmm7,17 - xor esi,edx - add r9d,ebx - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add ebx,esi - mov r13d,r9d - add r14d,ebx - vpsrlq xmm7,xmm7,2 - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - vpxor xmm6,xmm6,xmm7 - xor r13d,r9d - shrd r14d,r14d,9 - xor r12d,r11d - vpshufd xmm6,xmm6,232 - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vpslldq xmm6,xmm6,8 - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r13d,r9d - add eax,DWORD[60+rsp] - mov esi,ebx - vpaddd xmm3,xmm3,xmm6 - shrd r14d,r14d,11 - xor r12d,r11d - xor esi,ecx - vpaddd xmm6,xmm3,XMMWORD[96+rbp] - shrd r13d,r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - shrd r14d,r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[48+rsp],xmm6 - mov r12,QWORD[((64+0))+rsp] - vpand xmm11,xmm11,xmm14 - mov r15,QWORD[((64+8))+rsp] - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r12*1+r15],xmm8 - lea r12,[16+r12] - cmp BYTE[131+rbp],0 - jne NEAR $L$avx_00_47 - vmovdqu xmm9,XMMWORD[r12] - mov QWORD[((64+0))+rsp],r12 - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - xor r13d,r8d - shrd r14d,r14d,9 - xor r12d,r10d - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax - shrd r14d,r14d,11 - xor r12d,r10d - xor r15d,ebx - shrd r13d,r13d,6 - add r11d,r12d - and esi,r15d - xor r14d,eax - add r11d,r13d - xor esi,ebx - add edx,r11d - shrd r14d,r14d,2 - add r11d,esi - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - shrd r14d,r14d,9 - xor r12d,r9d - shrd r13d,r13d,5 - xor r14d,r11d - and r12d,edx - vpxor xmm9,xmm9,xmm8 - xor r13d,edx - add r10d,DWORD[4+rsp] - mov esi,r11d - shrd r14d,r14d,11 - xor r12d,r9d - xor esi,eax - shrd r13d,r13d,6 - add r10d,r12d - and r15d,esi - xor r14d,r11d - add r10d,r13d 
- xor r15d,eax - add ecx,r10d - shrd r14d,r14d,2 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - mov r12d,edx - xor r13d,ecx - shrd r14d,r14d,9 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r13d,ecx - add r9d,DWORD[8+rsp] - mov r15d,r10d - shrd r14d,r14d,11 - xor r12d,r8d - xor r15d,r11d - shrd r13d,r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - shrd r14d,r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - shrd r14d,r14d,9 - xor r12d,edx - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov esi,r9d - shrd r14d,r14d,11 - xor r12d,edx - xor esi,r10d - shrd r13d,r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - shrd r14d,r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - xor r13d,eax - shrd r14d,r14d,9 - xor r12d,ecx - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d - shrd r14d,r14d,11 - xor r12d,ecx - xor r15d,r9d - shrd r13d,r13d,6 - add edx,r12d - and esi,r15d - xor r14d,r8d - add edx,r13d - xor esi,r9d - add r11d,edx - shrd r14d,r14d,2 - add edx,esi - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - shrd r14d,r14d,9 - xor r12d,ebx - shrd r13d,r13d,5 - xor r14d,edx - and r12d,r11d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r13d,r11d - add ecx,DWORD[20+rsp] - mov esi,edx - shrd r14d,r14d,11 - xor r12d,ebx - xor esi,r8d - shrd r13d,r13d,6 - add ecx,r12d - and r15d,esi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - add r10d,ecx - shrd r14d,r14d,2 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - mov r12d,r11d - xor r13d,r10d - shrd r14d,r14d,9 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - and r12d,r10d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r13d,r10d - add ebx,DWORD[24+rsp] - mov r15d,ecx - shrd r14d,r14d,11 - xor r12d,eax - xor r15d,edx - shrd r13d,r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - shrd r14d,r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - shrd r14d,r14d,9 - xor r12d,r11d - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r13d,r9d - add eax,DWORD[28+rsp] - mov esi,ebx - shrd r14d,r14d,11 - xor r12d,r11d - xor esi,ecx - shrd r13d,r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - shrd r14d,r14d,2 - add eax,r15d - mov r13d,r8d - add r14d,eax - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - xor r13d,r8d - shrd r14d,r14d,9 - xor r12d,r10d - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax - shrd r14d,r14d,11 - xor r12d,r10d - xor r15d,ebx - shrd r13d,r13d,6 - add r11d,r12d - and esi,r15d - xor r14d,eax - add r11d,r13d - xor esi,ebx - add edx,r11d - shrd r14d,r14d,2 - add r11d,esi - mov r13d,edx - add r14d,r11d - shrd 
r13d,r13d,14 - mov r11d,r14d - mov r12d,r8d - xor r13d,edx - shrd r14d,r14d,9 - xor r12d,r9d - shrd r13d,r13d,5 - xor r14d,r11d - and r12d,edx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r13d,edx - add r10d,DWORD[36+rsp] - mov esi,r11d - shrd r14d,r14d,11 - xor r12d,r9d - xor esi,eax - shrd r13d,r13d,6 - add r10d,r12d - and r15d,esi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - add ecx,r10d - shrd r14d,r14d,2 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - mov r12d,edx - xor r13d,ecx - shrd r14d,r14d,9 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - and r12d,ecx - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r13d,ecx - add r9d,DWORD[40+rsp] - mov r15d,r10d - shrd r14d,r14d,11 - xor r12d,r8d - xor r15d,r11d - shrd r13d,r13d,6 - add r9d,r12d - and esi,r15d - xor r14d,r10d - add r9d,r13d - xor esi,r11d - add ebx,r9d - shrd r14d,r14d,2 - add r9d,esi - mov r13d,ebx - add r14d,r9d - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - xor r13d,ebx - shrd r14d,r14d,9 - xor r12d,edx - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov esi,r9d - shrd r14d,r14d,11 - xor r12d,edx - xor esi,r10d - shrd r13d,r13d,6 - add r8d,r12d - and r15d,esi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - add eax,r8d - shrd r14d,r14d,2 - add r8d,r15d - mov r13d,eax - add r14d,r8d - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - xor r13d,eax - shrd r14d,r14d,9 - xor r12d,ecx - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d - shrd r14d,r14d,11 - xor r12d,ecx - xor r15d,r9d - shrd r13d,r13d,6 - add edx,r12d - and esi,r15d - xor r14d,r8d - add edx,r13d - xor esi,r9d - add r11d,edx - shrd r14d,r14d,2 - add edx,esi - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - mov edx,r14d - mov r12d,eax - xor r13d,r11d - shrd r14d,r14d,9 - xor r12d,ebx - shrd r13d,r13d,5 - xor r14d,edx - and r12d,r11d - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r13d,r11d - add ecx,DWORD[52+rsp] - mov esi,edx - shrd r14d,r14d,11 - xor r12d,ebx - xor esi,r8d - shrd r13d,r13d,6 - add ecx,r12d - and r15d,esi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - add r10d,ecx - shrd r14d,r14d,2 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - mov r12d,r11d - xor r13d,r10d - shrd r14d,r14d,9 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - and r12d,r10d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r13d,r10d - add ebx,DWORD[56+rsp] - mov r15d,ecx - shrd r14d,r14d,11 - xor r12d,eax - xor r15d,edx - shrd r13d,r13d,6 - add ebx,r12d - and esi,r15d - xor r14d,ecx - add ebx,r13d - xor esi,edx - add r9d,ebx - shrd r14d,r14d,2 - add ebx,esi - mov r13d,r9d - add r14d,ebx - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - xor r13d,r9d - shrd r14d,r14d,9 - xor r12d,r11d - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r13d,r9d - add eax,DWORD[60+rsp] - mov esi,ebx - shrd r14d,r14d,11 - xor r12d,r11d - xor esi,ecx - shrd r13d,r13d,6 - add eax,r12d - and r15d,esi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - add r8d,eax - shrd r14d,r14d,2 - add 
eax,r15d - mov r13d,r8d - add r14d,eax - mov r12,QWORD[((64+0))+rsp] - mov r13,QWORD[((64+8))+rsp] - mov r15,QWORD[((64+40))+rsp] - mov rsi,QWORD[((64+48))+rsp] - - vpand xmm11,xmm11,xmm14 - mov eax,r14d - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r13*1+r12],xmm8 - lea r12,[16+r12] - - add eax,DWORD[r15] - add ebx,DWORD[4+r15] - add ecx,DWORD[8+r15] - add edx,DWORD[12+r15] - add r8d,DWORD[16+r15] - add r9d,DWORD[20+r15] - add r10d,DWORD[24+r15] - add r11d,DWORD[28+r15] - - cmp r12,QWORD[((64+16))+rsp] - - mov DWORD[r15],eax - mov DWORD[4+r15],ebx - mov DWORD[8+r15],ecx - mov DWORD[12+r15],edx - mov DWORD[16+r15],r8d - mov DWORD[20+r15],r9d - mov DWORD[24+r15],r10d - mov DWORD[28+r15],r11d - jb NEAR $L$loop_avx - - mov r8,QWORD[((64+32))+rsp] - mov rsi,QWORD[120+rsp] - - vmovdqu XMMWORD[r8],xmm8 - vzeroall - movaps xmm6,XMMWORD[128+rsp] - movaps xmm7,XMMWORD[144+rsp] - movaps xmm8,XMMWORD[160+rsp] - movaps xmm9,XMMWORD[176+rsp] - movaps xmm10,XMMWORD[192+rsp] - movaps xmm11,XMMWORD[208+rsp] - movaps xmm12,XMMWORD[224+rsp] - movaps xmm13,XMMWORD[240+rsp] - movaps xmm14,XMMWORD[256+rsp] - movaps xmm15,XMMWORD[272+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_cbc_sha256_enc_avx: - -ALIGN 64 -aesni_cbc_sha256_enc_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_sha256_enc_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - -$L$avx2_shortcut: - mov r10,QWORD[56+rsp] - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,736 - and rsp,-256*4 - add rsp,448 - - shl rdx,6 - sub rsi,rdi - sub r10,rdi - add rdx,rdi - - - - mov QWORD[((64+16))+rsp],rdx - - mov QWORD[((64+32))+rsp],r8 - mov QWORD[((64+40))+rsp],r9 - mov QWORD[((64+48))+rsp],r10 - mov QWORD[120+rsp],rax - - movaps XMMWORD[128+rsp],xmm6 - movaps XMMWORD[144+rsp],xmm7 - movaps XMMWORD[160+rsp],xmm8 - movaps XMMWORD[176+rsp],xmm9 - movaps XMMWORD[192+rsp],xmm10 - movaps XMMWORD[208+rsp],xmm11 - movaps XMMWORD[224+rsp],xmm12 - movaps XMMWORD[240+rsp],xmm13 - movaps XMMWORD[256+rsp],xmm14 - movaps XMMWORD[272+rsp],xmm15 -$L$prologue_avx2: - vzeroall - - mov r13,rdi - vpinsrq xmm15,xmm15,rsi,1 - lea rdi,[128+rcx] - lea r12,[((K256+544))] - mov r14d,DWORD[((240-128))+rdi] - mov r15,r9 - mov rsi,r10 - vmovdqu xmm8,XMMWORD[r8] - lea r14,[((-9))+r14] - - vmovdqa xmm14,XMMWORD[r14*8+r12] - vmovdqa xmm13,XMMWORD[16+r14*8+r12] - vmovdqa xmm12,XMMWORD[32+r14*8+r12] - - sub r13,-16*4 - mov eax,DWORD[r15] - lea r12,[r13*1+rsi] - mov ebx,DWORD[4+r15] - cmp r13,rdx - mov ecx,DWORD[8+r15] - cmove r12,rsp - mov edx,DWORD[12+r15] - mov r8d,DWORD[16+r15] - mov r9d,DWORD[20+r15] - mov r10d,DWORD[24+r15] - mov r11d,DWORD[28+r15] - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - jmp NEAR $L$oop_avx2 -ALIGN 16 -$L$oop_avx2: - vmovdqa ymm7,YMMWORD[((K256+512))] - vmovdqu xmm0,XMMWORD[((-64+0))+r13*1+rsi] - vmovdqu xmm1,XMMWORD[((-64+16))+r13*1+rsi] - vmovdqu xmm2,XMMWORD[((-64+32))+r13*1+rsi] - vmovdqu xmm3,XMMWORD[((-64+48))+r13*1+rsi] - - vinserti128 ymm0,ymm0,XMMWORD[r12],1 - vinserti128 ymm1,ymm1,XMMWORD[16+r12],1 - vpshufb ymm0,ymm0,ymm7 - vinserti128 ymm2,ymm2,XMMWORD[32+r12],1 - vpshufb ymm1,ymm1,ymm7 - vinserti128 
ymm3,ymm3,XMMWORD[48+r12],1 - - lea rbp,[K256] - vpshufb ymm2,ymm2,ymm7 - lea r13,[((-64))+r13] - vpaddd ymm4,ymm0,YMMWORD[rbp] - vpshufb ymm3,ymm3,ymm7 - vpaddd ymm5,ymm1,YMMWORD[32+rbp] - vpaddd ymm6,ymm2,YMMWORD[64+rbp] - vpaddd ymm7,ymm3,YMMWORD[96+rbp] - vmovdqa YMMWORD[rsp],ymm4 - xor r14d,r14d - vmovdqa YMMWORD[32+rsp],ymm5 - lea rsp,[((-64))+rsp] - mov esi,ebx - vmovdqa YMMWORD[rsp],ymm6 - xor esi,ecx - vmovdqa YMMWORD[32+rsp],ymm7 - mov r12d,r9d - sub rbp,-16*2*4 - jmp NEAR $L$avx2_00_47 - -ALIGN 16 -$L$avx2_00_47: - vmovdqu xmm9,XMMWORD[r13] - vpinsrq xmm15,xmm15,r13,0 - lea rsp,[((-64))+rsp] - vpalignr ymm4,ymm1,ymm0,4 - add r11d,DWORD[((0+128))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - vpalignr ymm7,ymm3,ymm2,4 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - vpsrld ymm6,ymm4,7 - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - vpaddd ymm0,ymm0,ymm7 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - vpsrld ymm7,ymm4,3 - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - vpslld ymm5,ymm4,14 - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - vpxor ymm4,ymm7,ymm6 - and esi,r15d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r14d,r12d - xor esi,ebx - vpshufd ymm7,ymm3,250 - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - vpsrld ymm6,ymm6,11 - add r10d,DWORD[((4+128))+rsp] - and r12d,edx - rorx r13d,edx,25 - vpxor ymm4,ymm4,ymm5 - rorx esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - vpslld ymm5,ymm5,11 - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - vpxor ymm4,ymm4,ymm6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - vpsrld ymm6,ymm7,10 - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - vpxor ymm4,ymm4,ymm5 - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - vpsrlq ymm7,ymm7,17 - and r15d,esi - vpxor xmm9,xmm9,xmm8 - xor r14d,r12d - xor r15d,eax - vpaddd ymm0,ymm0,ymm4 - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - vpxor ymm6,ymm6,ymm7 - add r9d,DWORD[((8+128))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - vpxor ymm6,ymm6,ymm7 - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - vpshufd ymm6,ymm6,132 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - vpsrldq ymm6,ymm6,8 - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - vpaddd ymm0,ymm0,ymm6 - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - vpshufd ymm7,ymm0,80 - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r14d,r12d - xor esi,r11d - vpsrld ymm6,ymm7,10 - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - vpsrlq ymm7,ymm7,17 - add r8d,DWORD[((12+128))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - vpxor ymm6,ymm6,ymm7 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - vpsrlq ymm7,ymm7,2 - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - vpxor ymm6,ymm6,ymm7 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - vpshufd ymm6,ymm6,232 - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - vpslldq ymm6,ymm6,8 - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - vpaddd ymm0,ymm0,ymm6 - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r14d,r12d - xor r15d,r10d - vpaddd ymm6,ymm0,YMMWORD[rbp] - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - vmovdqa YMMWORD[rsp],ymm6 - vpalignr ymm4,ymm2,ymm1,4 - add edx,DWORD[((32+128))+rsp] - and r12d,eax - rorx r13d,eax,25 - vpalignr ymm7,ymm0,ymm3,4 - rorx r15d,eax,11 - lea 
r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - vpsrld ymm6,ymm4,7 - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - vpaddd ymm1,ymm1,ymm7 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - vpsrld ymm7,ymm4,3 - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - vpslld ymm5,ymm4,14 - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - vpxor ymm4,ymm7,ymm6 - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r14d,r12d - xor esi,r9d - vpshufd ymm7,ymm0,250 - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - vpsrld ymm6,ymm6,11 - add ecx,DWORD[((36+128))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - vpxor ymm4,ymm4,ymm5 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - vpslld ymm5,ymm5,11 - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - vpxor ymm4,ymm4,ymm6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - vpsrld ymm6,ymm7,10 - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - vpxor ymm4,ymm4,ymm5 - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - vpsrlq ymm7,ymm7,17 - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r14d,r12d - xor r15d,r8d - vpaddd ymm1,ymm1,ymm4 - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - vpxor ymm6,ymm6,ymm7 - add ebx,DWORD[((40+128))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - vpxor ymm6,ymm6,ymm7 - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - vpshufd ymm6,ymm6,132 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - vpsrldq ymm6,ymm6,8 - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - vpaddd ymm1,ymm1,ymm6 - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - vpshufd ymm7,ymm1,80 - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r14d,r12d - xor esi,edx - vpsrld ymm6,ymm7,10 - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - vpsrlq ymm7,ymm7,17 - add eax,DWORD[((44+128))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - vpxor ymm6,ymm6,ymm7 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - vpsrlq ymm7,ymm7,2 - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - vpxor ymm6,ymm6,ymm7 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - vpshufd ymm6,ymm6,232 - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - vpslldq ymm6,ymm6,8 - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - vpaddd ymm1,ymm1,ymm6 - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r14d,r12d - xor r15d,ecx - vpaddd ymm6,ymm1,YMMWORD[32+rbp] - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vmovdqa YMMWORD[32+rsp],ymm6 - lea rsp,[((-64))+rsp] - vpalignr ymm4,ymm3,ymm2,4 - add r11d,DWORD[((0+128))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - vpalignr ymm7,ymm1,ymm0,4 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - vpsrld ymm6,ymm4,7 - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - vpaddd ymm2,ymm2,ymm7 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - vpsrld ymm7,ymm4,3 - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - vpslld ymm5,ymm4,14 - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - vpxor ymm4,ymm7,ymm6 - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r14d,r12d - xor esi,ebx - vpshufd ymm7,ymm1,250 - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - vpsrld ymm6,ymm6,11 - add r10d,DWORD[((4+128))+rsp] - and r12d,edx - rorx r13d,edx,25 - vpxor ymm4,ymm4,ymm5 - rorx 
esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - vpslld ymm5,ymm5,11 - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - vpxor ymm4,ymm4,ymm6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - vpsrld ymm6,ymm7,10 - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - vpxor ymm4,ymm4,ymm5 - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - vpsrlq ymm7,ymm7,17 - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r14d,r12d - xor r15d,eax - vpaddd ymm2,ymm2,ymm4 - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - vpxor ymm6,ymm6,ymm7 - add r9d,DWORD[((8+128))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - vpxor ymm6,ymm6,ymm7 - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - vpshufd ymm6,ymm6,132 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - vpsrldq ymm6,ymm6,8 - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - vpaddd ymm2,ymm2,ymm6 - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - vpshufd ymm7,ymm2,80 - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r14d,r12d - xor esi,r11d - vpsrld ymm6,ymm7,10 - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - vpsrlq ymm7,ymm7,17 - add r8d,DWORD[((12+128))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - vpxor ymm6,ymm6,ymm7 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - vpsrlq ymm7,ymm7,2 - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - vpxor ymm6,ymm6,ymm7 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - vpshufd ymm6,ymm6,232 - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - vpslldq ymm6,ymm6,8 - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - vpaddd ymm2,ymm2,ymm6 - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r14d,r12d - xor r15d,r10d - vpaddd ymm6,ymm2,YMMWORD[64+rbp] - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - vmovdqa YMMWORD[rsp],ymm6 - vpalignr ymm4,ymm0,ymm3,4 - add edx,DWORD[((32+128))+rsp] - and r12d,eax - rorx r13d,eax,25 - vpalignr ymm7,ymm2,ymm1,4 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - vpsrld ymm6,ymm4,7 - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - vpaddd ymm3,ymm3,ymm7 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - vpsrld ymm7,ymm4,3 - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - vpslld ymm5,ymm4,14 - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - vpxor ymm4,ymm7,ymm6 - and esi,r15d - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r14d,r12d - xor esi,r9d - vpshufd ymm7,ymm2,250 - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - vpsrld ymm6,ymm6,11 - add ecx,DWORD[((36+128))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - vpxor ymm4,ymm4,ymm5 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - vpslld ymm5,ymm5,11 - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - vpxor ymm4,ymm4,ymm6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - vpsrld ymm6,ymm7,10 - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - vpxor ymm4,ymm4,ymm5 - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - vpsrlq ymm7,ymm7,17 - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r14d,r12d - xor r15d,r8d - vpaddd ymm3,ymm3,ymm4 - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - vpxor ymm6,ymm6,ymm7 - add 
ebx,DWORD[((40+128))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - vpxor ymm6,ymm6,ymm7 - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - vpshufd ymm6,ymm6,132 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - vpsrldq ymm6,ymm6,8 - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - vpaddd ymm3,ymm3,ymm6 - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - vpshufd ymm7,ymm3,80 - and esi,r15d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r14d,r12d - xor esi,edx - vpsrld ymm6,ymm7,10 - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - vpsrlq ymm7,ymm7,17 - add eax,DWORD[((44+128))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - vpxor ymm6,ymm6,ymm7 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - vpsrlq ymm7,ymm7,2 - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - vpxor ymm6,ymm6,ymm7 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - vpshufd ymm6,ymm6,232 - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - vpslldq ymm6,ymm6,8 - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - vpaddd ymm3,ymm3,ymm6 - and r15d,esi - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r14d,r12d - xor r15d,ecx - vpaddd ymm6,ymm3,YMMWORD[96+rbp] - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vmovdqa YMMWORD[32+rsp],ymm6 - vmovq r13,xmm15 - vpextrq r15,xmm15,1 - vpand xmm11,xmm11,xmm14 - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r13*1+r15],xmm8 - lea r13,[16+r13] - lea rbp,[128+rbp] - cmp BYTE[3+rbp],0 - jne NEAR $L$avx2_00_47 - vmovdqu xmm9,XMMWORD[r13] - vpinsrq xmm15,xmm15,r13,0 - add r11d,DWORD[((0+64))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and esi,r15d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r14d,r12d - xor esi,ebx - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - add r10d,DWORD[((4+64))+rsp] - and r12d,edx - rorx r13d,edx,25 - rorx esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,esi - vpxor xmm9,xmm9,xmm8 - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[((8+64))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r14d,r12d - xor esi,r11d - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - add r8d,DWORD[((12+64))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - rorx r14d,r9d,13 - rorx 
r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[((32+64))+rsp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r14d,r12d - xor esi,r9d - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - add ecx,DWORD[((36+64))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[((40+64))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r14d,r12d - xor esi,edx - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - add eax,DWORD[((44+64))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - add r11d,DWORD[rsp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r14d,r12d - xor esi,ebx - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - add r10d,DWORD[4+rsp] - and r12d,edx - rorx r13d,edx,25 - rorx esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[8+rsp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - 
xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r14d,r12d - xor esi,r11d - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - add r8d,DWORD[12+rsp] - and r12d,ebx - rorx r13d,ebx,25 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[32+rsp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and esi,r15d - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r14d,r12d - xor esi,r9d - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - add ecx,DWORD[36+rsp] - and r12d,r11d - rorx r13d,r11d,25 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[40+rsp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and esi,r15d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r14d,r12d - xor esi,edx - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - add eax,DWORD[44+rsp] - and r12d,r9d - rorx r13d,r9d,25 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,esi - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vpextrq r12,xmm15,1 - vmovq r13,xmm15 - mov r15,QWORD[552+rsp] - add eax,r14d - lea rbp,[448+rsp] - - vpand xmm11,xmm11,xmm14 - vpor xmm8,xmm8,xmm11 - vmovdqu XMMWORD[r13*1+r12],xmm8 - lea r13,[16+r13] - - add eax,DWORD[r15] - add ebx,DWORD[4+r15] - add ecx,DWORD[8+r15] - add edx,DWORD[12+r15] - add r8d,DWORD[16+r15] - add r9d,DWORD[20+r15] - add r10d,DWORD[24+r15] - add r11d,DWORD[28+r15] - - mov DWORD[r15],eax - mov DWORD[4+r15],ebx - mov DWORD[8+r15],ecx - mov DWORD[12+r15],edx - mov DWORD[16+r15],r8d - mov DWORD[20+r15],r9d - mov DWORD[24+r15],r10d - mov DWORD[28+r15],r11d - - cmp r13,QWORD[80+rbp] - je NEAR $L$done_avx2 - 
- xor r14d,r14d - mov esi,ebx - mov r12d,r9d - xor esi,ecx - jmp NEAR $L$ower_avx2 -ALIGN 16 -$L$ower_avx2: - vmovdqu xmm9,XMMWORD[r13] - vpinsrq xmm15,xmm15,r13,0 - add r11d,DWORD[((0+16))+rbp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and esi,r15d - vpxor xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((16-128))+rdi] - xor r14d,r12d - xor esi,ebx - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - add r10d,DWORD[((4+16))+rbp] - and r12d,edx - rorx r13d,edx,25 - rorx esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,esi - vpxor xmm9,xmm9,xmm8 - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[((8+16))+rbp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((32-128))+rdi] - xor r14d,r12d - xor esi,r11d - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - add r8d,DWORD[((12+16))+rbp] - and r12d,ebx - rorx r13d,ebx,25 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((48-128))+rdi] - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[((32+16))+rbp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - xor r14d,r12d - xor esi,r9d - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - add ecx,DWORD[((36+16))+rbp] - and r12d,r11d - rorx r13d,r11d,25 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((80-128))+rdi] - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[((40+16))+rbp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea 
r9d,[rbx*1+r9] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((96-128))+rdi] - xor r14d,r12d - xor esi,edx - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - add eax,DWORD[((44+16))+rbp] - and r12d,r9d - rorx r13d,r9d,25 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((112-128))+rdi] - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - lea rbp,[((-64))+rbp] - add r11d,DWORD[((0+16))+rbp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((128-128))+rdi] - xor r14d,r12d - xor esi,ebx - xor r14d,r13d - lea r11d,[rsi*1+r11] - mov r12d,r8d - add r10d,DWORD[((4+16))+rbp] - and r12d,edx - rorx r13d,edx,25 - rorx esi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,esi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov esi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor esi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,esi - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((144-128))+rdi] - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[((8+16))+rbp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and esi,r15d - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((160-128))+rdi] - xor r14d,r12d - xor esi,r11d - xor r14d,r13d - lea r9d,[rsi*1+r9] - mov r12d,ecx - add r8d,DWORD[((12+16))+rbp] - and r12d,ebx - rorx r13d,ebx,25 - rorx esi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,esi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov esi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor esi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((176-128))+rdi] - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[((32+16))+rbp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and esi,r15d - vpand xmm8,xmm11,xmm12 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((192-128))+rdi] - xor r14d,r12d - xor esi,r9d - xor r14d,r13d - lea edx,[rsi*1+rdx] - mov r12d,eax - add ecx,DWORD[((36+16))+rbp] - and r12d,r11d - rorx r13d,r11d,25 - rorx esi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,esi - rorx r14d,r11d,6 - lea 
ecx,[r12*1+rcx] - xor r13d,r14d - mov esi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor esi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,esi - vaesenclast xmm11,xmm9,xmm10 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((208-128))+rdi] - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[((40+16))+rbp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and esi,r15d - vpand xmm11,xmm11,xmm13 - vaesenc xmm9,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((224-128))+rdi] - xor r14d,r12d - xor esi,edx - xor r14d,r13d - lea ebx,[rsi*1+rbx] - mov r12d,r10d - add eax,DWORD[((44+16))+rbp] - and r12d,r9d - rorx r13d,r9d,25 - rorx esi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,esi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov esi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor esi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,esi - vpor xmm8,xmm8,xmm11 - vaesenclast xmm11,xmm9,xmm10 - vmovdqu xmm10,XMMWORD[((0-128))+rdi] - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vmovq r13,xmm15 - vpextrq r15,xmm15,1 - vpand xmm11,xmm11,xmm14 - vpor xmm8,xmm8,xmm11 - lea rbp,[((-64))+rbp] - vmovdqu XMMWORD[r13*1+r15],xmm8 - lea r13,[16+r13] - cmp rbp,rsp - jae NEAR $L$ower_avx2 - - mov r15,QWORD[552+rsp] - lea r13,[64+r13] - mov rsi,QWORD[560+rsp] - add eax,r14d - lea rsp,[448+rsp] - - add eax,DWORD[r15] - add ebx,DWORD[4+r15] - add ecx,DWORD[8+r15] - add edx,DWORD[12+r15] - add r8d,DWORD[16+r15] - add r9d,DWORD[20+r15] - add r10d,DWORD[24+r15] - lea r12,[r13*1+rsi] - add r11d,DWORD[28+r15] - - cmp r13,QWORD[((64+16))+rsp] - - mov DWORD[r15],eax - cmove r12,rsp - mov DWORD[4+r15],ebx - mov DWORD[8+r15],ecx - mov DWORD[12+r15],edx - mov DWORD[16+r15],r8d - mov DWORD[20+r15],r9d - mov DWORD[24+r15],r10d - mov DWORD[28+r15],r11d - - jbe NEAR $L$oop_avx2 - lea rbp,[rsp] - - - - -$L$done_avx2: - mov r8,QWORD[((64+32))+rbp] - mov rsi,QWORD[((64+56))+rbp] - - vmovdqu XMMWORD[r8],xmm8 - vzeroall - movaps xmm6,XMMWORD[128+rbp] - movaps xmm7,XMMWORD[144+rbp] - movaps xmm8,XMMWORD[160+rbp] - movaps xmm9,XMMWORD[176+rbp] - movaps xmm10,XMMWORD[192+rbp] - movaps xmm11,XMMWORD[208+rbp] - movaps xmm12,XMMWORD[224+rbp] - movaps xmm13,XMMWORD[240+rbp] - movaps xmm14,XMMWORD[256+rbp] - movaps xmm15,XMMWORD[272+rbp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_cbc_sha256_enc_avx2: - -ALIGN 32 -aesni_cbc_sha256_enc_shaext: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_cbc_sha256_enc_shaext: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - mov r10,QWORD[56+rsp] - lea rsp,[((-168))+rsp] - movaps XMMWORD[(-8-160)+rax],xmm6 - movaps XMMWORD[(-8-144)+rax],xmm7 - movaps XMMWORD[(-8-128)+rax],xmm8 - movaps XMMWORD[(-8-112)+rax],xmm9 - movaps XMMWORD[(-8-96)+rax],xmm10 - movaps 
XMMWORD[(-8-80)+rax],xmm11 - movaps XMMWORD[(-8-64)+rax],xmm12 - movaps XMMWORD[(-8-48)+rax],xmm13 - movaps XMMWORD[(-8-32)+rax],xmm14 - movaps XMMWORD[(-8-16)+rax],xmm15 -$L$prologue_shaext: - lea rax,[((K256+128))] - movdqu xmm1,XMMWORD[r9] - movdqu xmm2,XMMWORD[16+r9] - movdqa xmm3,XMMWORD[((512-128))+rax] - - mov r11d,DWORD[240+rcx] - sub rsi,rdi - movups xmm15,XMMWORD[rcx] - movups xmm6,XMMWORD[r8] - movups xmm4,XMMWORD[16+rcx] - lea rcx,[112+rcx] - - pshufd xmm0,xmm1,0x1b - pshufd xmm1,xmm1,0xb1 - pshufd xmm2,xmm2,0x1b - movdqa xmm7,xmm3 -DB 102,15,58,15,202,8 - punpcklqdq xmm2,xmm0 - - jmp NEAR $L$oop_shaext - -ALIGN 16 -$L$oop_shaext: - movdqu xmm10,XMMWORD[r10] - movdqu xmm11,XMMWORD[16+r10] - movdqu xmm12,XMMWORD[32+r10] -DB 102,68,15,56,0,211 - movdqu xmm13,XMMWORD[48+r10] - - movdqa xmm0,XMMWORD[((0-128))+rax] - paddd xmm0,xmm10 -DB 102,68,15,56,0,219 - movdqa xmm9,xmm2 - movdqa xmm8,xmm1 - movups xmm14,XMMWORD[rdi] - xorps xmm14,xmm15 - xorps xmm6,xmm14 - movups xmm5,XMMWORD[((-80))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movups xmm4,XMMWORD[((-64))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD[((32-128))+rax] - paddd xmm0,xmm11 -DB 102,68,15,56,0,227 - lea r10,[64+r10] - movups xmm5,XMMWORD[((-48))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movups xmm4,XMMWORD[((-32))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD[((64-128))+rax] - paddd xmm0,xmm12 -DB 102,68,15,56,0,235 -DB 69,15,56,204,211 - movups xmm5,XMMWORD[((-16))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm13 -DB 102,65,15,58,15,220,4 - paddd xmm10,xmm3 - movups xmm4,XMMWORD[rcx] - aesenc xmm6,xmm5 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD[((96-128))+rax] - paddd xmm0,xmm13 -DB 69,15,56,205,213 -DB 69,15,56,204,220 - movups xmm5,XMMWORD[16+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movups xmm4,XMMWORD[32+rcx] - aesenc xmm6,xmm5 - movdqa xmm3,xmm10 -DB 102,65,15,58,15,221,4 - paddd xmm11,xmm3 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((128-128))+rax] - paddd xmm0,xmm10 -DB 69,15,56,205,218 -DB 69,15,56,204,229 - movups xmm5,XMMWORD[48+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm11 -DB 102,65,15,58,15,218,4 - paddd xmm12,xmm3 - cmp r11d,11 - jb NEAR $L$aesenclast1 - movups xmm4,XMMWORD[64+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[80+rcx] - aesenc xmm6,xmm4 - je NEAR $L$aesenclast1 - movups xmm4,XMMWORD[96+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[112+rcx] - aesenc xmm6,xmm4 -$L$aesenclast1: - aesenclast xmm6,xmm5 - movups xmm4,XMMWORD[((16-112))+rcx] - nop -DB 15,56,203,202 - movups xmm14,XMMWORD[16+rdi] - xorps xmm14,xmm15 - movups XMMWORD[rdi*1+rsi],xmm6 - xorps xmm6,xmm14 - movups xmm5,XMMWORD[((-80))+rcx] - aesenc xmm6,xmm4 - movdqa xmm0,XMMWORD[((160-128))+rax] - paddd xmm0,xmm11 -DB 69,15,56,205,227 -DB 69,15,56,204,234 - movups xmm4,XMMWORD[((-64))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm12 -DB 102,65,15,58,15,219,4 - paddd xmm13,xmm3 - movups xmm5,XMMWORD[((-48))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((192-128))+rax] - paddd xmm0,xmm12 -DB 69,15,56,205,236 -DB 69,15,56,204,211 - movups xmm4,XMMWORD[((-32))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm13 -DB 102,65,15,58,15,220,4 - paddd xmm10,xmm3 - movups xmm5,XMMWORD[((-16))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((224-128))+rax] - 
paddd xmm0,xmm13 -DB 69,15,56,205,213 -DB 69,15,56,204,220 - movups xmm4,XMMWORD[rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm10 -DB 102,65,15,58,15,221,4 - paddd xmm11,xmm3 - movups xmm5,XMMWORD[16+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((256-128))+rax] - paddd xmm0,xmm10 -DB 69,15,56,205,218 -DB 69,15,56,204,229 - movups xmm4,XMMWORD[32+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm11 -DB 102,65,15,58,15,218,4 - paddd xmm12,xmm3 - movups xmm5,XMMWORD[48+rcx] - aesenc xmm6,xmm4 - cmp r11d,11 - jb NEAR $L$aesenclast2 - movups xmm4,XMMWORD[64+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[80+rcx] - aesenc xmm6,xmm4 - je NEAR $L$aesenclast2 - movups xmm4,XMMWORD[96+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[112+rcx] - aesenc xmm6,xmm4 -$L$aesenclast2: - aesenclast xmm6,xmm5 - movups xmm4,XMMWORD[((16-112))+rcx] - nop -DB 15,56,203,202 - movups xmm14,XMMWORD[32+rdi] - xorps xmm14,xmm15 - movups XMMWORD[16+rdi*1+rsi],xmm6 - xorps xmm6,xmm14 - movups xmm5,XMMWORD[((-80))+rcx] - aesenc xmm6,xmm4 - movdqa xmm0,XMMWORD[((288-128))+rax] - paddd xmm0,xmm11 -DB 69,15,56,205,227 -DB 69,15,56,204,234 - movups xmm4,XMMWORD[((-64))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm12 -DB 102,65,15,58,15,219,4 - paddd xmm13,xmm3 - movups xmm5,XMMWORD[((-48))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((320-128))+rax] - paddd xmm0,xmm12 -DB 69,15,56,205,236 -DB 69,15,56,204,211 - movups xmm4,XMMWORD[((-32))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm13 -DB 102,65,15,58,15,220,4 - paddd xmm10,xmm3 - movups xmm5,XMMWORD[((-16))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((352-128))+rax] - paddd xmm0,xmm13 -DB 69,15,56,205,213 -DB 69,15,56,204,220 - movups xmm4,XMMWORD[rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm10 -DB 102,65,15,58,15,221,4 - paddd xmm11,xmm3 - movups xmm5,XMMWORD[16+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((384-128))+rax] - paddd xmm0,xmm10 -DB 69,15,56,205,218 -DB 69,15,56,204,229 - movups xmm4,XMMWORD[32+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm11 -DB 102,65,15,58,15,218,4 - paddd xmm12,xmm3 - movups xmm5,XMMWORD[48+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - movdqa xmm0,XMMWORD[((416-128))+rax] - paddd xmm0,xmm11 -DB 69,15,56,205,227 -DB 69,15,56,204,234 - cmp r11d,11 - jb NEAR $L$aesenclast3 - movups xmm4,XMMWORD[64+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[80+rcx] - aesenc xmm6,xmm4 - je NEAR $L$aesenclast3 - movups xmm4,XMMWORD[96+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[112+rcx] - aesenc xmm6,xmm4 -$L$aesenclast3: - aesenclast xmm6,xmm5 - movups xmm4,XMMWORD[((16-112))+rcx] - nop -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movdqa xmm3,xmm12 -DB 102,65,15,58,15,219,4 - paddd xmm13,xmm3 - movups xmm14,XMMWORD[48+rdi] - xorps xmm14,xmm15 - movups XMMWORD[32+rdi*1+rsi],xmm6 - xorps xmm6,xmm14 - movups xmm5,XMMWORD[((-80))+rcx] - aesenc xmm6,xmm4 - movups xmm4,XMMWORD[((-64))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD[((448-128))+rax] - paddd xmm0,xmm12 -DB 69,15,56,205,236 - movdqa xmm3,xmm7 - movups xmm5,XMMWORD[((-48))+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movups xmm4,XMMWORD[((-32))+rcx] - aesenc xmm6,xmm5 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD[((480-128))+rax] - paddd xmm0,xmm13 - movups 
xmm5,XMMWORD[((-16))+rcx] - aesenc xmm6,xmm4 - movups xmm4,XMMWORD[rcx] - aesenc xmm6,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,0x0e - movups xmm5,XMMWORD[16+rcx] - aesenc xmm6,xmm4 -DB 15,56,203,202 - - movups xmm4,XMMWORD[32+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[48+rcx] - aesenc xmm6,xmm4 - cmp r11d,11 - jb NEAR $L$aesenclast4 - movups xmm4,XMMWORD[64+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[80+rcx] - aesenc xmm6,xmm4 - je NEAR $L$aesenclast4 - movups xmm4,XMMWORD[96+rcx] - aesenc xmm6,xmm5 - movups xmm5,XMMWORD[112+rcx] - aesenc xmm6,xmm4 -$L$aesenclast4: - aesenclast xmm6,xmm5 - movups xmm4,XMMWORD[((16-112))+rcx] - nop - - paddd xmm2,xmm9 - paddd xmm1,xmm8 - - dec rdx - movups XMMWORD[48+rdi*1+rsi],xmm6 - lea rdi,[64+rdi] - jnz NEAR $L$oop_shaext - - pshufd xmm2,xmm2,0xb1 - pshufd xmm3,xmm1,0x1b - pshufd xmm1,xmm1,0xb1 - punpckhqdq xmm1,xmm2 -DB 102,15,58,15,211,8 - - movups XMMWORD[r8],xmm6 - movdqu XMMWORD[r9],xmm1 - movdqu XMMWORD[16+r9],xmm2 - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - lea rsp,[((8+160))+rsp] -$L$epilogue_shaext: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_cbc_sha256_enc_shaext: -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - lea r10,[aesni_cbc_sha256_enc_shaext] - cmp rbx,r10 - jb NEAR $L$not_in_shaext - - lea rsi,[rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[168+rax] - jmp NEAR $L$in_prologue -$L$not_in_shaext: - lea r10,[$L$avx2_shortcut] - cmp rbx,r10 - jb NEAR $L$not_in_avx2 - - and rax,-256*4 - add rax,448 -$L$not_in_avx2: - mov rsi,rax - mov rax,QWORD[((64+56))+rax] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - lea rsi,[((64+64))+rsi] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 - DD $L$SEH_begin_aesni_cbc_sha256_enc_xop wrt ..imagebase - DD $L$SEH_end_aesni_cbc_sha256_enc_xop wrt ..imagebase - DD $L$SEH_info_aesni_cbc_sha256_enc_xop wrt ..imagebase - - DD 
$L$SEH_begin_aesni_cbc_sha256_enc_avx wrt ..imagebase - DD $L$SEH_end_aesni_cbc_sha256_enc_avx wrt ..imagebase - DD $L$SEH_info_aesni_cbc_sha256_enc_avx wrt ..imagebase - DD $L$SEH_begin_aesni_cbc_sha256_enc_avx2 wrt ..imagebase - DD $L$SEH_end_aesni_cbc_sha256_enc_avx2 wrt ..imagebase - DD $L$SEH_info_aesni_cbc_sha256_enc_avx2 wrt ..imagebase - DD $L$SEH_begin_aesni_cbc_sha256_enc_shaext wrt ..imagebase - DD $L$SEH_end_aesni_cbc_sha256_enc_shaext wrt ..imagebase - DD $L$SEH_info_aesni_cbc_sha256_enc_shaext wrt ..imagebase -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_aesni_cbc_sha256_enc_xop: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase - -$L$SEH_info_aesni_cbc_sha256_enc_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_aesni_cbc_sha256_enc_avx2: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase -$L$SEH_info_aesni_cbc_sha256_enc_shaext: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm index 7342e16c22c..f097a539bb1 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm @@ -5,1977 +5,26 @@ default rel section .text code align=64 -global rsaz_1024_sqr_avx2 - -ALIGN 64 -rsaz_1024_sqr_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_1024_sqr_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - - lea rax,[rsp] - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - vzeroupper - lea rsp,[((-168))+rsp] - vmovaps XMMWORD[(-216)+rax],xmm6 - vmovaps XMMWORD[(-200)+rax],xmm7 - vmovaps XMMWORD[(-184)+rax],xmm8 - vmovaps XMMWORD[(-168)+rax],xmm9 - vmovaps XMMWORD[(-152)+rax],xmm10 - vmovaps XMMWORD[(-136)+rax],xmm11 - vmovaps XMMWORD[(-120)+rax],xmm12 - vmovaps XMMWORD[(-104)+rax],xmm13 - vmovaps XMMWORD[(-88)+rax],xmm14 - vmovaps XMMWORD[(-72)+rax],xmm15 -$L$sqr_1024_body: - mov rbp,rax - - mov r13,rdx - sub rsp,832 - mov r15,r13 - sub rdi,-128 - sub rsi,-128 - sub r13,-128 - - and r15,4095 - add r15,32*10 - shr r15,12 - vpxor ymm9,ymm9,ymm9 - jz NEAR $L$sqr_1024_no_n_copy - - - - - - sub rsp,32*10 - vmovdqu ymm0,YMMWORD[((0-128))+r13] - and rsp,-2048 - vmovdqu ymm1,YMMWORD[((32-128))+r13] - vmovdqu ymm2,YMMWORD[((64-128))+r13] - vmovdqu ymm3,YMMWORD[((96-128))+r13] - vmovdqu ymm4,YMMWORD[((128-128))+r13] - vmovdqu ymm5,YMMWORD[((160-128))+r13] - vmovdqu ymm6,YMMWORD[((192-128))+r13] - vmovdqu ymm7,YMMWORD[((224-128))+r13] - vmovdqu ymm8,YMMWORD[((256-128))+r13] - lea r13,[((832+128))+rsp] - vmovdqu YMMWORD[(0-128)+r13],ymm0 - vmovdqu YMMWORD[(32-128)+r13],ymm1 - vmovdqu YMMWORD[(64-128)+r13],ymm2 - vmovdqu YMMWORD[(96-128)+r13],ymm3 - vmovdqu YMMWORD[(128-128)+r13],ymm4 - vmovdqu YMMWORD[(160-128)+r13],ymm5 - vmovdqu YMMWORD[(192-128)+r13],ymm6 - vmovdqu YMMWORD[(224-128)+r13],ymm7 - vmovdqu YMMWORD[(256-128)+r13],ymm8 - vmovdqu YMMWORD[(288-128)+r13],ymm9 - -$L$sqr_1024_no_n_copy: - and rsp,-1024 - - vmovdqu ymm1,YMMWORD[((32-128))+rsi] - vmovdqu ymm2,YMMWORD[((64-128))+rsi] - vmovdqu ymm3,YMMWORD[((96-128))+rsi] - vmovdqu 
ymm4,YMMWORD[((128-128))+rsi] - vmovdqu ymm5,YMMWORD[((160-128))+rsi] - vmovdqu ymm6,YMMWORD[((192-128))+rsi] - vmovdqu ymm7,YMMWORD[((224-128))+rsi] - vmovdqu ymm8,YMMWORD[((256-128))+rsi] - - lea rbx,[192+rsp] - vmovdqu ymm15,YMMWORD[$L$and_mask] - jmp NEAR $L$OOP_GRANDE_SQR_1024 - -ALIGN 32 -$L$OOP_GRANDE_SQR_1024: - lea r9,[((576+128))+rsp] - lea r12,[448+rsp] - - - - - vpaddq ymm1,ymm1,ymm1 - vpbroadcastq ymm10,QWORD[((0-128))+rsi] - vpaddq ymm2,ymm2,ymm2 - vmovdqa YMMWORD[(0-128)+r9],ymm1 - vpaddq ymm3,ymm3,ymm3 - vmovdqa YMMWORD[(32-128)+r9],ymm2 - vpaddq ymm4,ymm4,ymm4 - vmovdqa YMMWORD[(64-128)+r9],ymm3 - vpaddq ymm5,ymm5,ymm5 - vmovdqa YMMWORD[(96-128)+r9],ymm4 - vpaddq ymm6,ymm6,ymm6 - vmovdqa YMMWORD[(128-128)+r9],ymm5 - vpaddq ymm7,ymm7,ymm7 - vmovdqa YMMWORD[(160-128)+r9],ymm6 - vpaddq ymm8,ymm8,ymm8 - vmovdqa YMMWORD[(192-128)+r9],ymm7 - vpxor ymm9,ymm9,ymm9 - vmovdqa YMMWORD[(224-128)+r9],ymm8 - - vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] - vpbroadcastq ymm11,QWORD[((32-128))+rsi] - vmovdqu YMMWORD[(288-192)+rbx],ymm9 - vpmuludq ymm1,ymm1,ymm10 - vmovdqu YMMWORD[(320-448)+r12],ymm9 - vpmuludq ymm2,ymm2,ymm10 - vmovdqu YMMWORD[(352-448)+r12],ymm9 - vpmuludq ymm3,ymm3,ymm10 - vmovdqu YMMWORD[(384-448)+r12],ymm9 - vpmuludq ymm4,ymm4,ymm10 - vmovdqu YMMWORD[(416-448)+r12],ymm9 - vpmuludq ymm5,ymm5,ymm10 - vmovdqu YMMWORD[(448-448)+r12],ymm9 - vpmuludq ymm6,ymm6,ymm10 - vmovdqu YMMWORD[(480-448)+r12],ymm9 - vpmuludq ymm7,ymm7,ymm10 - vmovdqu YMMWORD[(512-448)+r12],ymm9 - vpmuludq ymm8,ymm8,ymm10 - vpbroadcastq ymm10,QWORD[((64-128))+rsi] - vmovdqu YMMWORD[(544-448)+r12],ymm9 - - mov r15,rsi - mov r14d,4 - jmp NEAR $L$sqr_entry_1024 -ALIGN 32 -$L$OOP_SQR_1024: - vpbroadcastq ymm11,QWORD[((32-128))+r15] - vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] - vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx] - vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9] - vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx] - vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9] - vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx] - vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9] - vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx] - vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9] - vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx] - vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9] - vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx] - vpmuludq ymm6,ymm10,YMMWORD[((160-128))+r9] - vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx] - vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9] - vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx] - vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9] - vpbroadcastq ymm10,QWORD[((64-128))+r15] - vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx] -$L$sqr_entry_1024: - vmovdqu YMMWORD[(0-192)+rbx],ymm0 - vmovdqu YMMWORD[(32-192)+rbx],ymm1 - - vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9] - vpaddq ymm3,ymm3,ymm14 - vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9] - vpaddq ymm4,ymm4,ymm13 - vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9] - vpaddq ymm6,ymm6,ymm14 - vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9] - vpaddq ymm7,ymm7,ymm13 - vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9] - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9] - vpbroadcastq ymm11,QWORD[((96-128))+r15] - vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] - - vmovdqu YMMWORD[(64-192)+rbx],ymm2 - vmovdqu YMMWORD[(96-192)+rbx],ymm3 - - vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi] - vpaddq ymm4,ymm4,ymm13 - vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9] - vpaddq ymm5,ymm5,ymm12 - vpmuludq 
ymm14,ymm10,YMMWORD[((96-128))+r9] - vpaddq ymm6,ymm6,ymm14 - vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9] - vpaddq ymm7,ymm7,ymm13 - vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9] - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9] - vpaddq ymm0,ymm0,ymm14 - vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9] - vpbroadcastq ymm10,QWORD[((128-128))+r15] - vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] - - vmovdqu YMMWORD[(128-192)+rbx],ymm4 - vmovdqu YMMWORD[(160-192)+rbx],ymm5 - - vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi] - vpaddq ymm6,ymm6,ymm12 - vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9] - vpaddq ymm7,ymm7,ymm14 - vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9] - vpaddq ymm8,ymm8,ymm13 - vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] - vpaddq ymm0,ymm0,ymm12 - vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] - vpaddq ymm1,ymm1,ymm14 - vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9] - vpbroadcastq ymm11,QWORD[((160-128))+r15] - vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] - - vmovdqu YMMWORD[(192-192)+rbx],ymm6 - vmovdqu YMMWORD[(224-192)+rbx],ymm7 - - vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi] - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9] - vpaddq ymm0,ymm0,ymm14 - vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9] - vpaddq ymm1,ymm1,ymm13 - vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9] - vpbroadcastq ymm10,QWORD[((192-128))+r15] - vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] - - vmovdqu YMMWORD[(256-192)+rbx],ymm8 - vmovdqu YMMWORD[(288-192)+rbx],ymm0 - lea rbx,[8+rbx] - - vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi] - vpaddq ymm1,ymm1,ymm13 - vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] - vpaddq ymm3,ymm3,ymm14 - vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9] - vpbroadcastq ymm11,QWORD[((224-128))+r15] - vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] - - vmovdqu YMMWORD[(320-448)+r12],ymm1 - vmovdqu YMMWORD[(352-448)+r12],ymm2 - - vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi] - vpaddq ymm3,ymm3,ymm12 - vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9] - vpbroadcastq ymm0,QWORD[((256-128))+r15] - vpaddq ymm4,ymm4,ymm14 - vpmuludq ymm5,ymm10,YMMWORD[((224-128))+r9] - vpbroadcastq ymm10,QWORD[((0+8-128))+r15] - vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] - - vmovdqu YMMWORD[(384-448)+r12],ymm3 - vmovdqu YMMWORD[(416-448)+r12],ymm4 - lea r15,[8+r15] - - vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9] - vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] - - vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi] - vmovdqu YMMWORD[(448-448)+r12],ymm5 - vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] - vmovdqu YMMWORD[(480-448)+r12],ymm6 - vmovdqu YMMWORD[(512-448)+r12],ymm7 - lea r12,[8+r12] - - dec r14d - jnz NEAR $L$OOP_SQR_1024 - - vmovdqu ymm8,YMMWORD[256+rsp] - vmovdqu ymm1,YMMWORD[288+rsp] - vmovdqu ymm2,YMMWORD[320+rsp] - lea rbx,[192+rsp] - - vpsrlq ymm14,ymm8,29 - vpand ymm8,ymm8,ymm15 - vpsrlq ymm11,ymm1,29 - vpand ymm1,ymm1,ymm15 - - vpermq ymm14,ymm14,0x93 - vpxor ymm9,ymm9,ymm9 - vpermq ymm11,ymm11,0x93 - - vpblendd ymm10,ymm14,ymm9,3 - vpblendd ymm14,ymm11,ymm14,3 - vpaddq ymm8,ymm8,ymm10 - vpblendd ymm11,ymm9,ymm11,3 - vpaddq ymm1,ymm1,ymm14 - vpaddq ymm2,ymm2,ymm11 - vmovdqu YMMWORD[(288-192)+rbx],ymm1 - vmovdqu YMMWORD[(320-192)+rbx],ymm2 - - mov rax,QWORD[rsp] - mov r10,QWORD[8+rsp] - mov r11,QWORD[16+rsp] - mov r12,QWORD[24+rsp] - vmovdqu ymm1,YMMWORD[32+rsp] - vmovdqu 
ymm2,YMMWORD[((64-192))+rbx] - vmovdqu ymm3,YMMWORD[((96-192))+rbx] - vmovdqu ymm4,YMMWORD[((128-192))+rbx] - vmovdqu ymm5,YMMWORD[((160-192))+rbx] - vmovdqu ymm6,YMMWORD[((192-192))+rbx] - vmovdqu ymm7,YMMWORD[((224-192))+rbx] - - mov r9,rax - imul eax,ecx - and eax,0x1fffffff - vmovd xmm12,eax - - mov rdx,rax - imul rax,QWORD[((-128))+r13] - vpbroadcastq ymm12,xmm12 - add r9,rax - mov rax,rdx - imul rax,QWORD[((8-128))+r13] - shr r9,29 - add r10,rax - mov rax,rdx - imul rax,QWORD[((16-128))+r13] - add r10,r9 - add r11,rax - imul rdx,QWORD[((24-128))+r13] - add r12,rdx - - mov rax,r10 - imul eax,ecx - and eax,0x1fffffff - - mov r14d,9 - jmp NEAR $L$OOP_REDUCE_1024 - -ALIGN 32 -$L$OOP_REDUCE_1024: - vmovd xmm13,eax - vpbroadcastq ymm13,xmm13 - - vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13] - mov rdx,rax - imul rax,QWORD[((-128))+r13] - vpaddq ymm1,ymm1,ymm10 - add r10,rax - vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13] - mov rax,rdx - imul rax,QWORD[((8-128))+r13] - vpaddq ymm2,ymm2,ymm14 - vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13] -DB 0x67 - add r11,rax -DB 0x67 - mov rax,rdx - imul rax,QWORD[((16-128))+r13] - shr r10,29 - vpaddq ymm3,ymm3,ymm11 - vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13] - add r12,rax - add r11,r10 - vpaddq ymm4,ymm4,ymm10 - vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13] - mov rax,r11 - imul eax,ecx - vpaddq ymm5,ymm5,ymm14 - vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13] - and eax,0x1fffffff - vpaddq ymm6,ymm6,ymm11 - vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13] - vpaddq ymm7,ymm7,ymm10 - vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13] - vmovd xmm12,eax - - vpaddq ymm8,ymm8,ymm14 - - vpbroadcastq ymm12,xmm12 - - vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13] - vmovdqu ymm14,YMMWORD[((96-8-128))+r13] - mov rdx,rax - imul rax,QWORD[((-128))+r13] - vpaddq ymm1,ymm1,ymm11 - vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13] - vmovdqu ymm11,YMMWORD[((128-8-128))+r13] - add r11,rax - mov rax,rdx - imul rax,QWORD[((8-128))+r13] - vpaddq ymm2,ymm2,ymm10 - add rax,r12 - shr r11,29 - vpmuludq ymm14,ymm14,ymm13 - vmovdqu ymm10,YMMWORD[((160-8-128))+r13] - add rax,r11 - vpaddq ymm3,ymm3,ymm14 - vpmuludq ymm11,ymm11,ymm13 - vmovdqu ymm14,YMMWORD[((192-8-128))+r13] -DB 0x67 - mov r12,rax - imul eax,ecx - vpaddq ymm4,ymm4,ymm11 - vpmuludq ymm10,ymm10,ymm13 -DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - and eax,0x1fffffff - vpaddq ymm5,ymm5,ymm10 - vpmuludq ymm14,ymm14,ymm13 - vmovdqu ymm10,YMMWORD[((256-8-128))+r13] - vpaddq ymm6,ymm6,ymm14 - vpmuludq ymm11,ymm11,ymm13 - vmovdqu ymm9,YMMWORD[((288-8-128))+r13] - vmovd xmm0,eax - imul rax,QWORD[((-128))+r13] - vpaddq ymm7,ymm7,ymm11 - vpmuludq ymm10,ymm10,ymm13 - vmovdqu ymm14,YMMWORD[((32-16-128))+r13] - vpbroadcastq ymm0,xmm0 - vpaddq ymm8,ymm8,ymm10 - vpmuludq ymm9,ymm9,ymm13 - vmovdqu ymm11,YMMWORD[((64-16-128))+r13] - add r12,rax - - vmovdqu ymm13,YMMWORD[((32-24-128))+r13] - vpmuludq ymm14,ymm14,ymm12 - vmovdqu ymm10,YMMWORD[((96-16-128))+r13] - vpaddq ymm1,ymm1,ymm14 - vpmuludq ymm13,ymm13,ymm0 - vpmuludq ymm11,ymm11,ymm12 -DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq ymm13,ymm13,ymm1 - vpaddq ymm2,ymm2,ymm11 - vpmuludq ymm10,ymm10,ymm12 - vmovdqu ymm11,YMMWORD[((160-16-128))+r13] -DB 0x67 - vmovq rax,xmm13 - vmovdqu YMMWORD[rsp],ymm13 - vpaddq ymm3,ymm3,ymm10 - vpmuludq ymm14,ymm14,ymm12 - vmovdqu ymm10,YMMWORD[((192-16-128))+r13] - vpaddq ymm4,ymm4,ymm14 - vpmuludq ymm11,ymm11,ymm12 - vmovdqu ymm14,YMMWORD[((224-16-128))+r13] - vpaddq ymm5,ymm5,ymm11 - vpmuludq ymm10,ymm10,ymm12 - vmovdqu 
ymm11,YMMWORD[((256-16-128))+r13] - vpaddq ymm6,ymm6,ymm10 - vpmuludq ymm14,ymm14,ymm12 - shr r12,29 - vmovdqu ymm10,YMMWORD[((288-16-128))+r13] - add rax,r12 - vpaddq ymm7,ymm7,ymm14 - vpmuludq ymm11,ymm11,ymm12 - - mov r9,rax - imul eax,ecx - vpaddq ymm8,ymm8,ymm11 - vpmuludq ymm10,ymm10,ymm12 - and eax,0x1fffffff - vmovd xmm12,eax - vmovdqu ymm11,YMMWORD[((96-24-128))+r13] -DB 0x67 - vpaddq ymm9,ymm9,ymm10 - vpbroadcastq ymm12,xmm12 - - vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13] - vmovdqu ymm10,YMMWORD[((128-24-128))+r13] - mov rdx,rax - imul rax,QWORD[((-128))+r13] - mov r10,QWORD[8+rsp] - vpaddq ymm1,ymm2,ymm14 - vpmuludq ymm11,ymm11,ymm0 - vmovdqu ymm14,YMMWORD[((160-24-128))+r13] - add r9,rax - mov rax,rdx - imul rax,QWORD[((8-128))+r13] -DB 0x67 - shr r9,29 - mov r11,QWORD[16+rsp] - vpaddq ymm2,ymm3,ymm11 - vpmuludq ymm10,ymm10,ymm0 - vmovdqu ymm11,YMMWORD[((192-24-128))+r13] - add r10,rax - mov rax,rdx - imul rax,QWORD[((16-128))+r13] - vpaddq ymm3,ymm4,ymm10 - vpmuludq ymm14,ymm14,ymm0 - vmovdqu ymm10,YMMWORD[((224-24-128))+r13] - imul rdx,QWORD[((24-128))+r13] - add r11,rax - lea rax,[r10*1+r9] - vpaddq ymm4,ymm5,ymm14 - vpmuludq ymm11,ymm11,ymm0 - vmovdqu ymm14,YMMWORD[((256-24-128))+r13] - mov r10,rax - imul eax,ecx - vpmuludq ymm10,ymm10,ymm0 - vpaddq ymm5,ymm6,ymm11 - vmovdqu ymm11,YMMWORD[((288-24-128))+r13] - and eax,0x1fffffff - vpaddq ymm6,ymm7,ymm10 - vpmuludq ymm14,ymm14,ymm0 - add rdx,QWORD[24+rsp] - vpaddq ymm7,ymm8,ymm14 - vpmuludq ymm11,ymm11,ymm0 - vpaddq ymm8,ymm9,ymm11 - vmovq xmm9,r12 - mov r12,rdx - - dec r14d - jnz NEAR $L$OOP_REDUCE_1024 - lea r12,[448+rsp] - vpaddq ymm0,ymm13,ymm9 - vpxor ymm9,ymm9,ymm9 - - vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] - vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] - vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] - vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] - vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] - vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] - vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] - vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] - vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12] - - vpsrlq ymm14,ymm0,29 - vpand ymm0,ymm0,ymm15 - vpsrlq ymm11,ymm1,29 - vpand ymm1,ymm1,ymm15 - vpsrlq ymm12,ymm2,29 - vpermq ymm14,ymm14,0x93 - vpand ymm2,ymm2,ymm15 - vpsrlq ymm13,ymm3,29 - vpermq ymm11,ymm11,0x93 - vpand ymm3,ymm3,ymm15 - vpermq ymm12,ymm12,0x93 - - vpblendd ymm10,ymm14,ymm9,3 - vpermq ymm13,ymm13,0x93 - vpblendd ymm14,ymm11,ymm14,3 - vpaddq ymm0,ymm0,ymm10 - vpblendd ymm11,ymm12,ymm11,3 - vpaddq ymm1,ymm1,ymm14 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm2,ymm2,ymm11 - vpblendd ymm13,ymm9,ymm13,3 - vpaddq ymm3,ymm3,ymm12 - vpaddq ymm4,ymm4,ymm13 - - vpsrlq ymm14,ymm0,29 - vpand ymm0,ymm0,ymm15 - vpsrlq ymm11,ymm1,29 - vpand ymm1,ymm1,ymm15 - vpsrlq ymm12,ymm2,29 - vpermq ymm14,ymm14,0x93 - vpand ymm2,ymm2,ymm15 - vpsrlq ymm13,ymm3,29 - vpermq ymm11,ymm11,0x93 - vpand ymm3,ymm3,ymm15 - vpermq ymm12,ymm12,0x93 - - vpblendd ymm10,ymm14,ymm9,3 - vpermq ymm13,ymm13,0x93 - vpblendd ymm14,ymm11,ymm14,3 - vpaddq ymm0,ymm0,ymm10 - vpblendd ymm11,ymm12,ymm11,3 - vpaddq ymm1,ymm1,ymm14 - vmovdqu YMMWORD[(0-128)+rdi],ymm0 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm2,ymm2,ymm11 - vmovdqu YMMWORD[(32-128)+rdi],ymm1 - vpblendd ymm13,ymm9,ymm13,3 - vpaddq ymm3,ymm3,ymm12 - vmovdqu YMMWORD[(64-128)+rdi],ymm2 - vpaddq ymm4,ymm4,ymm13 - vmovdqu YMMWORD[(96-128)+rdi],ymm3 - vpsrlq ymm14,ymm4,29 - vpand ymm4,ymm4,ymm15 - vpsrlq ymm11,ymm5,29 - vpand ymm5,ymm5,ymm15 - vpsrlq ymm12,ymm6,29 - vpermq ymm14,ymm14,0x93 - vpand ymm6,ymm6,ymm15 - vpsrlq ymm13,ymm7,29 - 
vpermq ymm11,ymm11,0x93 - vpand ymm7,ymm7,ymm15 - vpsrlq ymm0,ymm8,29 - vpermq ymm12,ymm12,0x93 - vpand ymm8,ymm8,ymm15 - vpermq ymm13,ymm13,0x93 - - vpblendd ymm10,ymm14,ymm9,3 - vpermq ymm0,ymm0,0x93 - vpblendd ymm14,ymm11,ymm14,3 - vpaddq ymm4,ymm4,ymm10 - vpblendd ymm11,ymm12,ymm11,3 - vpaddq ymm5,ymm5,ymm14 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm6,ymm6,ymm11 - vpblendd ymm13,ymm0,ymm13,3 - vpaddq ymm7,ymm7,ymm12 - vpaddq ymm8,ymm8,ymm13 - - vpsrlq ymm14,ymm4,29 - vpand ymm4,ymm4,ymm15 - vpsrlq ymm11,ymm5,29 - vpand ymm5,ymm5,ymm15 - vpsrlq ymm12,ymm6,29 - vpermq ymm14,ymm14,0x93 - vpand ymm6,ymm6,ymm15 - vpsrlq ymm13,ymm7,29 - vpermq ymm11,ymm11,0x93 - vpand ymm7,ymm7,ymm15 - vpsrlq ymm0,ymm8,29 - vpermq ymm12,ymm12,0x93 - vpand ymm8,ymm8,ymm15 - vpermq ymm13,ymm13,0x93 - - vpblendd ymm10,ymm14,ymm9,3 - vpermq ymm0,ymm0,0x93 - vpblendd ymm14,ymm11,ymm14,3 - vpaddq ymm4,ymm4,ymm10 - vpblendd ymm11,ymm12,ymm11,3 - vpaddq ymm5,ymm5,ymm14 - vmovdqu YMMWORD[(128-128)+rdi],ymm4 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm6,ymm6,ymm11 - vmovdqu YMMWORD[(160-128)+rdi],ymm5 - vpblendd ymm13,ymm0,ymm13,3 - vpaddq ymm7,ymm7,ymm12 - vmovdqu YMMWORD[(192-128)+rdi],ymm6 - vpaddq ymm8,ymm8,ymm13 - vmovdqu YMMWORD[(224-128)+rdi],ymm7 - vmovdqu YMMWORD[(256-128)+rdi],ymm8 - - mov rsi,rdi - dec r8d - jne NEAR $L$OOP_GRANDE_SQR_1024 - - vzeroall - mov rax,rbp - -$L$sqr_1024_in_tail: - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$sqr_1024_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_rsaz_1024_sqr_avx2: -global rsaz_1024_mul_avx2 - -ALIGN 64 -rsaz_1024_mul_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_1024_mul_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - - lea rax,[rsp] - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - vzeroupper - lea rsp,[((-168))+rsp] - vmovaps XMMWORD[(-216)+rax],xmm6 - vmovaps XMMWORD[(-200)+rax],xmm7 - vmovaps XMMWORD[(-184)+rax],xmm8 - vmovaps XMMWORD[(-168)+rax],xmm9 - vmovaps XMMWORD[(-152)+rax],xmm10 - vmovaps XMMWORD[(-136)+rax],xmm11 - vmovaps XMMWORD[(-120)+rax],xmm12 - vmovaps XMMWORD[(-104)+rax],xmm13 - vmovaps XMMWORD[(-88)+rax],xmm14 - vmovaps XMMWORD[(-72)+rax],xmm15 -$L$mul_1024_body: - mov rbp,rax - - vzeroall - mov r13,rdx - sub rsp,64 - - - - - - -DB 0x67,0x67 - mov r15,rsi - and r15,4095 - add r15,32*10 - shr r15,12 - mov r15,rsi - cmovnz rsi,r13 - cmovnz r13,r15 - - mov r15,rcx - sub rsi,-128 - sub rcx,-128 - sub rdi,-128 - - and r15,4095 - add r15,32*10 -DB 0x67,0x67 - shr r15,12 - jz NEAR $L$mul_1024_no_n_copy - - - - - - sub rsp,32*10 - vmovdqu ymm0,YMMWORD[((0-128))+rcx] - and rsp,-512 - vmovdqu ymm1,YMMWORD[((32-128))+rcx] - vmovdqu ymm2,YMMWORD[((64-128))+rcx] - vmovdqu ymm3,YMMWORD[((96-128))+rcx] - vmovdqu ymm4,YMMWORD[((128-128))+rcx] - vmovdqu ymm5,YMMWORD[((160-128))+rcx] - vmovdqu ymm6,YMMWORD[((192-128))+rcx] - vmovdqu 
ymm7,YMMWORD[((224-128))+rcx] - vmovdqu ymm8,YMMWORD[((256-128))+rcx] - lea rcx,[((64+128))+rsp] - vmovdqu YMMWORD[(0-128)+rcx],ymm0 - vpxor ymm0,ymm0,ymm0 - vmovdqu YMMWORD[(32-128)+rcx],ymm1 - vpxor ymm1,ymm1,ymm1 - vmovdqu YMMWORD[(64-128)+rcx],ymm2 - vpxor ymm2,ymm2,ymm2 - vmovdqu YMMWORD[(96-128)+rcx],ymm3 - vpxor ymm3,ymm3,ymm3 - vmovdqu YMMWORD[(128-128)+rcx],ymm4 - vpxor ymm4,ymm4,ymm4 - vmovdqu YMMWORD[(160-128)+rcx],ymm5 - vpxor ymm5,ymm5,ymm5 - vmovdqu YMMWORD[(192-128)+rcx],ymm6 - vpxor ymm6,ymm6,ymm6 - vmovdqu YMMWORD[(224-128)+rcx],ymm7 - vpxor ymm7,ymm7,ymm7 - vmovdqu YMMWORD[(256-128)+rcx],ymm8 - vmovdqa ymm8,ymm0 - vmovdqu YMMWORD[(288-128)+rcx],ymm9 -$L$mul_1024_no_n_copy: - and rsp,-64 - - mov rbx,QWORD[r13] - vpbroadcastq ymm10,QWORD[r13] - vmovdqu YMMWORD[rsp],ymm0 - xor r9,r9 -DB 0x67 - xor r10,r10 - xor r11,r11 - xor r12,r12 - - vmovdqu ymm15,YMMWORD[$L$and_mask] - mov r14d,9 - vmovdqu YMMWORD[(288-128)+rdi],ymm9 - jmp NEAR $L$oop_mul_1024 - -ALIGN 32 -$L$oop_mul_1024: - vpsrlq ymm9,ymm3,29 - mov rax,rbx - imul rax,QWORD[((-128))+rsi] - add rax,r9 - mov r10,rbx - imul r10,QWORD[((8-128))+rsi] - add r10,QWORD[8+rsp] - - mov r9,rax - imul eax,r8d - and eax,0x1fffffff - - mov r11,rbx - imul r11,QWORD[((16-128))+rsi] - add r11,QWORD[16+rsp] - - mov r12,rbx - imul r12,QWORD[((24-128))+rsi] - add r12,QWORD[24+rsp] - vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi] - vmovd xmm11,eax - vpaddq ymm1,ymm1,ymm0 - vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi] - vpbroadcastq ymm11,xmm11 - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi] - vpand ymm3,ymm3,ymm15 - vpaddq ymm3,ymm3,ymm13 - vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi] - vpaddq ymm4,ymm4,ymm0 - vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi] - vpaddq ymm6,ymm6,ymm13 - vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi] - vpermq ymm9,ymm9,0x93 - vpaddq ymm7,ymm7,ymm0 - vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi] - vpbroadcastq ymm10,QWORD[8+r13] - vpaddq ymm8,ymm8,ymm12 - - mov rdx,rax - imul rax,QWORD[((-128))+rcx] - add r9,rax - mov rax,rdx - imul rax,QWORD[((8-128))+rcx] - add r10,rax - mov rax,rdx - imul rax,QWORD[((16-128))+rcx] - add r11,rax - shr r9,29 - imul rdx,QWORD[((24-128))+rcx] - add r12,rdx - add r10,r9 - - vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx] - vmovq rbx,xmm10 - vpaddq ymm1,ymm1,ymm13 - vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx] - vpaddq ymm2,ymm2,ymm0 - vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx] - vpaddq ymm3,ymm3,ymm12 - vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx] - vpaddq ymm4,ymm4,ymm13 - vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx] - vpaddq ymm5,ymm5,ymm0 - vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx] - vpaddq ymm6,ymm6,ymm12 - vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx] - vpblendd ymm12,ymm9,ymm14,3 - vpaddq ymm7,ymm7,ymm13 - vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx] - vpaddq ymm3,ymm3,ymm12 - vpaddq ymm8,ymm8,ymm0 - - mov rax,rbx - imul rax,QWORD[((-128))+rsi] - add r10,rax - vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi] - mov rax,rbx - imul rax,QWORD[((8-128))+rsi] - add r11,rax - vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi] - - mov rax,r10 - vpblendd ymm9,ymm9,ymm14,0xfc - imul eax,r8d - vpaddq ymm4,ymm4,ymm9 - and eax,0x1fffffff - - imul rbx,QWORD[((16-128))+rsi] - add r12,rbx - vpmuludq ymm12,ymm12,ymm10 - vmovd xmm11,eax - vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi] - vpaddq ymm1,ymm1,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vpbroadcastq ymm11,xmm11 - vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi] - vpaddq 
ymm2,ymm2,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi] - vpaddq ymm3,ymm3,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi] - vpaddq ymm4,ymm4,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi] - vpaddq ymm5,ymm5,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi] - vpaddq ymm6,ymm6,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vmovdqu ymm9,YMMWORD[((-8+288-128))+rsi] - vpaddq ymm7,ymm7,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vpaddq ymm8,ymm8,ymm13 - vpmuludq ymm9,ymm9,ymm10 - vpbroadcastq ymm10,QWORD[16+r13] - - mov rdx,rax - imul rax,QWORD[((-128))+rcx] - add r10,rax - vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx] - mov rax,rdx - imul rax,QWORD[((8-128))+rcx] - add r11,rax - vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx] - shr r10,29 - imul rdx,QWORD[((16-128))+rcx] - add r12,rdx - add r11,r10 - - vpmuludq ymm0,ymm0,ymm11 - vmovq rbx,xmm10 - vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx] - vpaddq ymm1,ymm1,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx] - vpaddq ymm3,ymm3,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx] - vpaddq ymm4,ymm4,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx] - vpaddq ymm6,ymm6,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx] - vpaddq ymm7,ymm7,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vpaddq ymm9,ymm9,ymm13 - - vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi] - mov rax,rbx - imul rax,QWORD[((-128))+rsi] - add rax,r11 - - vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi] - mov r11,rax - imul eax,r8d - and eax,0x1fffffff - - imul rbx,QWORD[((8-128))+rsi] - add r12,rbx - vpmuludq ymm0,ymm0,ymm10 - vmovd xmm11,eax - vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi] - vpaddq ymm1,ymm1,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vpbroadcastq ymm11,xmm11 - vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi] - vpaddq ymm3,ymm3,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi] - vpaddq ymm4,ymm4,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vmovdqu ymm0,YMMWORD[((-16+224-128))+rsi] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi] - vpaddq ymm6,ymm6,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi] - vpaddq ymm7,ymm7,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vpbroadcastq ymm10,QWORD[24+r13] - vpaddq ymm9,ymm9,ymm13 - - vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx] - mov rdx,rax - imul rax,QWORD[((-128))+rcx] - add r11,rax - vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx] - imul rdx,QWORD[((8-128))+rcx] - add r12,rdx - shr r11,29 - - vpmuludq ymm0,ymm0,ymm11 - vmovq rbx,xmm10 - vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx] - vpaddq ymm1,ymm1,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx] - vpaddq ymm3,ymm3,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx] - vpaddq ymm4,ymm4,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm13,ymm13,ymm11 
- vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx] - vpaddq ymm6,ymm6,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx] - vpaddq ymm7,ymm7,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi] - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi] - vpaddq ymm9,ymm9,ymm13 - - add r12,r11 - imul rbx,QWORD[((-128))+rsi] - add r12,rbx - - mov rax,r12 - imul eax,r8d - and eax,0x1fffffff - - vpmuludq ymm0,ymm0,ymm10 - vmovd xmm11,eax - vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi] - vpaddq ymm1,ymm1,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vpbroadcastq ymm11,xmm11 - vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi] - vpaddq ymm2,ymm2,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi] - vpaddq ymm3,ymm3,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-24+192-128))+rsi] - vpaddq ymm4,ymm4,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi] - vpaddq ymm5,ymm5,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi] - vpaddq ymm6,ymm6,ymm13 - vpmuludq ymm0,ymm0,ymm10 - vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi] - vpaddq ymm7,ymm7,ymm0 - vpmuludq ymm12,ymm12,ymm10 - vpaddq ymm8,ymm8,ymm12 - vpmuludq ymm13,ymm13,ymm10 - vpbroadcastq ymm10,QWORD[32+r13] - vpaddq ymm9,ymm9,ymm13 - add r13,32 - - vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx] - imul rax,QWORD[((-128))+rcx] - add r12,rax - shr r12,29 - - vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx] - vpmuludq ymm0,ymm0,ymm11 - vmovq rbx,xmm10 - vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx] - vpaddq ymm0,ymm1,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu YMMWORD[rsp],ymm0 - vpaddq ymm1,ymm2,ymm12 - vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx] - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx] - vpaddq ymm2,ymm3,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx] - vpaddq ymm3,ymm4,ymm0 - vpmuludq ymm12,ymm12,ymm11 - vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx] - vpaddq ymm4,ymm5,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx] - vpaddq ymm5,ymm6,ymm13 - vpmuludq ymm0,ymm0,ymm11 - vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx] - mov r9,r12 - vpaddq ymm6,ymm7,ymm0 - vpmuludq ymm12,ymm12,ymm11 - add r9,QWORD[rsp] - vpaddq ymm7,ymm8,ymm12 - vpmuludq ymm13,ymm13,ymm11 - vmovq xmm12,r12 - vpaddq ymm8,ymm9,ymm13 - - dec r14d - jnz NEAR $L$oop_mul_1024 - vpaddq ymm0,ymm12,YMMWORD[rsp] - - vpsrlq ymm12,ymm0,29 - vpand ymm0,ymm0,ymm15 - vpsrlq ymm13,ymm1,29 - vpand ymm1,ymm1,ymm15 - vpsrlq ymm10,ymm2,29 - vpermq ymm12,ymm12,0x93 - vpand ymm2,ymm2,ymm15 - vpsrlq ymm11,ymm3,29 - vpermq ymm13,ymm13,0x93 - vpand ymm3,ymm3,ymm15 - - vpblendd ymm9,ymm12,ymm14,3 - vpermq ymm10,ymm10,0x93 - vpblendd ymm12,ymm13,ymm12,3 - vpermq ymm11,ymm11,0x93 - vpaddq ymm0,ymm0,ymm9 - vpblendd ymm13,ymm10,ymm13,3 - vpaddq ymm1,ymm1,ymm12 - vpblendd ymm10,ymm11,ymm10,3 - vpaddq ymm2,ymm2,ymm13 - vpblendd ymm11,ymm14,ymm11,3 - vpaddq ymm3,ymm3,ymm10 - vpaddq ymm4,ymm4,ymm11 - - vpsrlq ymm12,ymm0,29 - vpand ymm0,ymm0,ymm15 - vpsrlq ymm13,ymm1,29 - vpand ymm1,ymm1,ymm15 - vpsrlq ymm10,ymm2,29 - vpermq ymm12,ymm12,0x93 - vpand ymm2,ymm2,ymm15 - vpsrlq ymm11,ymm3,29 - vpermq ymm13,ymm13,0x93 - vpand ymm3,ymm3,ymm15 - vpermq ymm10,ymm10,0x93 - - vpblendd ymm9,ymm12,ymm14,3 - vpermq ymm11,ymm11,0x93 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm0,ymm0,ymm9 - vpblendd ymm13,ymm10,ymm13,3 - vpaddq ymm1,ymm1,ymm12 - vpblendd ymm10,ymm11,ymm10,3 - vpaddq ymm2,ymm2,ymm13 - vpblendd 
ymm11,ymm14,ymm11,3 - vpaddq ymm3,ymm3,ymm10 - vpaddq ymm4,ymm4,ymm11 - - vmovdqu YMMWORD[(0-128)+rdi],ymm0 - vmovdqu YMMWORD[(32-128)+rdi],ymm1 - vmovdqu YMMWORD[(64-128)+rdi],ymm2 - vmovdqu YMMWORD[(96-128)+rdi],ymm3 - vpsrlq ymm12,ymm4,29 - vpand ymm4,ymm4,ymm15 - vpsrlq ymm13,ymm5,29 - vpand ymm5,ymm5,ymm15 - vpsrlq ymm10,ymm6,29 - vpermq ymm12,ymm12,0x93 - vpand ymm6,ymm6,ymm15 - vpsrlq ymm11,ymm7,29 - vpermq ymm13,ymm13,0x93 - vpand ymm7,ymm7,ymm15 - vpsrlq ymm0,ymm8,29 - vpermq ymm10,ymm10,0x93 - vpand ymm8,ymm8,ymm15 - vpermq ymm11,ymm11,0x93 - - vpblendd ymm9,ymm12,ymm14,3 - vpermq ymm0,ymm0,0x93 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm4,ymm4,ymm9 - vpblendd ymm13,ymm10,ymm13,3 - vpaddq ymm5,ymm5,ymm12 - vpblendd ymm10,ymm11,ymm10,3 - vpaddq ymm6,ymm6,ymm13 - vpblendd ymm11,ymm0,ymm11,3 - vpaddq ymm7,ymm7,ymm10 - vpaddq ymm8,ymm8,ymm11 - - vpsrlq ymm12,ymm4,29 - vpand ymm4,ymm4,ymm15 - vpsrlq ymm13,ymm5,29 - vpand ymm5,ymm5,ymm15 - vpsrlq ymm10,ymm6,29 - vpermq ymm12,ymm12,0x93 - vpand ymm6,ymm6,ymm15 - vpsrlq ymm11,ymm7,29 - vpermq ymm13,ymm13,0x93 - vpand ymm7,ymm7,ymm15 - vpsrlq ymm0,ymm8,29 - vpermq ymm10,ymm10,0x93 - vpand ymm8,ymm8,ymm15 - vpermq ymm11,ymm11,0x93 - - vpblendd ymm9,ymm12,ymm14,3 - vpermq ymm0,ymm0,0x93 - vpblendd ymm12,ymm13,ymm12,3 - vpaddq ymm4,ymm4,ymm9 - vpblendd ymm13,ymm10,ymm13,3 - vpaddq ymm5,ymm5,ymm12 - vpblendd ymm10,ymm11,ymm10,3 - vpaddq ymm6,ymm6,ymm13 - vpblendd ymm11,ymm0,ymm11,3 - vpaddq ymm7,ymm7,ymm10 - vpaddq ymm8,ymm8,ymm11 - - vmovdqu YMMWORD[(128-128)+rdi],ymm4 - vmovdqu YMMWORD[(160-128)+rdi],ymm5 - vmovdqu YMMWORD[(192-128)+rdi],ymm6 - vmovdqu YMMWORD[(224-128)+rdi],ymm7 - vmovdqu YMMWORD[(256-128)+rdi],ymm8 - vzeroupper - - mov rax,rbp - -$L$mul_1024_in_tail: - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$mul_1024_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_rsaz_1024_mul_avx2: -global rsaz_1024_red2norm_avx2 - -ALIGN 32 -rsaz_1024_red2norm_avx2: +global rsaz_avx2_eligible - sub rdx,-128 - xor rax,rax - mov r8,QWORD[((-128))+rdx] - mov r9,QWORD[((-120))+rdx] - mov r10,QWORD[((-112))+rdx] - shl r8,0 - shl r9,29 - mov r11,r10 - shl r10,58 - shr r11,6 - add rax,r8 - add rax,r9 - add rax,r10 - adc r11,0 - mov QWORD[rcx],rax - mov rax,r11 - mov r8,QWORD[((-104))+rdx] - mov r9,QWORD[((-96))+rdx] - shl r8,23 - mov r10,r9 - shl r9,52 - shr r10,12 - add rax,r8 - add rax,r9 - adc r10,0 - mov QWORD[8+rcx],rax - mov rax,r10 - mov r11,QWORD[((-88))+rdx] - mov r8,QWORD[((-80))+rdx] - shl r11,17 - mov r9,r8 - shl r8,46 - shr r9,18 - add rax,r11 - add rax,r8 - adc r9,0 - mov QWORD[16+rcx],rax - mov rax,r9 - mov r10,QWORD[((-72))+rdx] - mov r11,QWORD[((-64))+rdx] - shl r10,11 - mov r8,r11 - shl r11,40 - shr r8,24 - add rax,r10 - add rax,r11 - adc r8,0 - mov QWORD[24+rcx],rax - mov rax,r8 - mov r9,QWORD[((-56))+rdx] - mov r10,QWORD[((-48))+rdx] - mov r11,QWORD[((-40))+rdx] - shl r9,5 - shl r10,34 - mov r8,r11 - shl r11,63 - shr r8,1 - add rax,r9 - add rax,r10 - add 
rax,r11 - adc r8,0 - mov QWORD[32+rcx],rax - mov rax,r8 - mov r9,QWORD[((-32))+rdx] - mov r10,QWORD[((-24))+rdx] - shl r9,28 - mov r11,r10 - shl r10,57 - shr r11,7 - add rax,r9 - add rax,r10 - adc r11,0 - mov QWORD[40+rcx],rax - mov rax,r11 - mov r8,QWORD[((-16))+rdx] - mov r9,QWORD[((-8))+rdx] - shl r8,22 - mov r10,r9 - shl r9,51 - shr r10,13 - add rax,r8 - add rax,r9 - adc r10,0 - mov QWORD[48+rcx],rax - mov rax,r10 - mov r11,QWORD[rdx] - mov r8,QWORD[8+rdx] - shl r11,16 - mov r9,r8 - shl r8,45 - shr r9,19 - add rax,r11 - add rax,r8 - adc r9,0 - mov QWORD[56+rcx],rax - mov rax,r9 - mov r10,QWORD[16+rdx] - mov r11,QWORD[24+rdx] - shl r10,10 - mov r8,r11 - shl r11,39 - shr r8,25 - add rax,r10 - add rax,r11 - adc r8,0 - mov QWORD[64+rcx],rax - mov rax,r8 - mov r9,QWORD[32+rdx] - mov r10,QWORD[40+rdx] - mov r11,QWORD[48+rdx] - shl r9,4 - shl r10,33 - mov r8,r11 - shl r11,62 - shr r8,2 - add rax,r9 - add rax,r10 - add rax,r11 - adc r8,0 - mov QWORD[72+rcx],rax - mov rax,r8 - mov r9,QWORD[56+rdx] - mov r10,QWORD[64+rdx] - shl r9,27 - mov r11,r10 - shl r10,56 - shr r11,8 - add rax,r9 - add rax,r10 - adc r11,0 - mov QWORD[80+rcx],rax - mov rax,r11 - mov r8,QWORD[72+rdx] - mov r9,QWORD[80+rdx] - shl r8,21 - mov r10,r9 - shl r9,50 - shr r10,14 - add rax,r8 - add rax,r9 - adc r10,0 - mov QWORD[88+rcx],rax - mov rax,r10 - mov r11,QWORD[88+rdx] - mov r8,QWORD[96+rdx] - shl r11,15 - mov r9,r8 - shl r8,44 - shr r9,20 - add rax,r11 - add rax,r8 - adc r9,0 - mov QWORD[96+rcx],rax - mov rax,r9 - mov r10,QWORD[104+rdx] - mov r11,QWORD[112+rdx] - shl r10,9 - mov r8,r11 - shl r11,38 - shr r8,26 - add rax,r10 - add rax,r11 - adc r8,0 - mov QWORD[104+rcx],rax - mov rax,r8 - mov r9,QWORD[120+rdx] - mov r10,QWORD[128+rdx] - mov r11,QWORD[136+rdx] - shl r9,3 - shl r10,32 - mov r8,r11 - shl r11,61 - shr r8,3 - add rax,r9 - add rax,r10 - add rax,r11 - adc r8,0 - mov QWORD[112+rcx],rax - mov rax,r8 - mov r9,QWORD[144+rdx] - mov r10,QWORD[152+rdx] - shl r9,26 - mov r11,r10 - shl r10,55 - shr r11,9 - add rax,r9 - add rax,r10 - adc r11,0 - mov QWORD[120+rcx],rax - mov rax,r11 +rsaz_avx2_eligible: + xor eax,eax DB 0F3h,0C3h ;repret - +global rsaz_1024_sqr_avx2 +global rsaz_1024_mul_avx2 global rsaz_1024_norm2red_avx2 - -ALIGN 32 -rsaz_1024_norm2red_avx2: - - sub rcx,-128 - mov r8,QWORD[rdx] - mov eax,0x1fffffff - mov r9,QWORD[8+rdx] - mov r11,r8 - shr r11,0 - and r11,rax - mov QWORD[((-128))+rcx],r11 - mov r10,r8 - shr r10,29 - and r10,rax - mov QWORD[((-120))+rcx],r10 - shrd r8,r9,58 - and r8,rax - mov QWORD[((-112))+rcx],r8 - mov r10,QWORD[16+rdx] - mov r8,r9 - shr r8,23 - and r8,rax - mov QWORD[((-104))+rcx],r8 - shrd r9,r10,52 - and r9,rax - mov QWORD[((-96))+rcx],r9 - mov r11,QWORD[24+rdx] - mov r9,r10 - shr r9,17 - and r9,rax - mov QWORD[((-88))+rcx],r9 - shrd r10,r11,46 - and r10,rax - mov QWORD[((-80))+rcx],r10 - mov r8,QWORD[32+rdx] - mov r10,r11 - shr r10,11 - and r10,rax - mov QWORD[((-72))+rcx],r10 - shrd r11,r8,40 - and r11,rax - mov QWORD[((-64))+rcx],r11 - mov r9,QWORD[40+rdx] - mov r11,r8 - shr r11,5 - and r11,rax - mov QWORD[((-56))+rcx],r11 - mov r10,r8 - shr r10,34 - and r10,rax - mov QWORD[((-48))+rcx],r10 - shrd r8,r9,63 - and r8,rax - mov QWORD[((-40))+rcx],r8 - mov r10,QWORD[48+rdx] - mov r8,r9 - shr r8,28 - and r8,rax - mov QWORD[((-32))+rcx],r8 - shrd r9,r10,57 - and r9,rax - mov QWORD[((-24))+rcx],r9 - mov r11,QWORD[56+rdx] - mov r9,r10 - shr r9,22 - and r9,rax - mov QWORD[((-16))+rcx],r9 - shrd r10,r11,51 - and r10,rax - mov QWORD[((-8))+rcx],r10 - mov r8,QWORD[64+rdx] - mov r10,r11 - shr 
r10,16 - and r10,rax - mov QWORD[rcx],r10 - shrd r11,r8,45 - and r11,rax - mov QWORD[8+rcx],r11 - mov r9,QWORD[72+rdx] - mov r11,r8 - shr r11,10 - and r11,rax - mov QWORD[16+rcx],r11 - shrd r8,r9,39 - and r8,rax - mov QWORD[24+rcx],r8 - mov r10,QWORD[80+rdx] - mov r8,r9 - shr r8,4 - and r8,rax - mov QWORD[32+rcx],r8 - mov r11,r9 - shr r11,33 - and r11,rax - mov QWORD[40+rcx],r11 - shrd r9,r10,62 - and r9,rax - mov QWORD[48+rcx],r9 - mov r11,QWORD[88+rdx] - mov r9,r10 - shr r9,27 - and r9,rax - mov QWORD[56+rcx],r9 - shrd r10,r11,56 - and r10,rax - mov QWORD[64+rcx],r10 - mov r8,QWORD[96+rdx] - mov r10,r11 - shr r10,21 - and r10,rax - mov QWORD[72+rcx],r10 - shrd r11,r8,50 - and r11,rax - mov QWORD[80+rcx],r11 - mov r9,QWORD[104+rdx] - mov r11,r8 - shr r11,15 - and r11,rax - mov QWORD[88+rcx],r11 - shrd r8,r9,44 - and r8,rax - mov QWORD[96+rcx],r8 - mov r10,QWORD[112+rdx] - mov r8,r9 - shr r8,9 - and r8,rax - mov QWORD[104+rcx],r8 - shrd r9,r10,38 - and r9,rax - mov QWORD[112+rcx],r9 - mov r11,QWORD[120+rdx] - mov r9,r10 - shr r9,3 - and r9,rax - mov QWORD[120+rcx],r9 - mov r8,r10 - shr r8,32 - and r8,rax - mov QWORD[128+rcx],r8 - shrd r10,r11,61 - and r10,rax - mov QWORD[136+rcx],r10 - xor r8,r8 - mov r10,r11 - shr r10,26 - and r10,rax - mov QWORD[144+rcx],r10 - shrd r11,r8,55 - and r11,rax - mov QWORD[152+rcx],r11 - mov QWORD[160+rcx],r8 - mov QWORD[168+rcx],r8 - mov QWORD[176+rcx],r8 - mov QWORD[184+rcx],r8 - DB 0F3h,0C3h ;repret - - +global rsaz_1024_red2norm_avx2 global rsaz_1024_scatter5_avx2 - -ALIGN 32 -rsaz_1024_scatter5_avx2: - - vzeroupper - vmovdqu ymm5,YMMWORD[$L$scatter_permd] - shl r8d,4 - lea rcx,[r8*1+rcx] - mov eax,9 - jmp NEAR $L$oop_scatter_1024 - -ALIGN 32 -$L$oop_scatter_1024: - vmovdqu ymm0,YMMWORD[rdx] - lea rdx,[32+rdx] - vpermd ymm0,ymm5,ymm0 - vmovdqu XMMWORD[rcx],xmm0 - lea rcx,[512+rcx] - dec eax - jnz NEAR $L$oop_scatter_1024 - - vzeroupper - DB 0F3h,0C3h ;repret - - - global rsaz_1024_gather5_avx2 -ALIGN 32 +rsaz_1024_sqr_avx2: +rsaz_1024_mul_avx2: +rsaz_1024_norm2red_avx2: +rsaz_1024_red2norm_avx2: +rsaz_1024_scatter5_avx2: rsaz_1024_gather5_avx2: - - vzeroupper - mov r11,rsp - - lea rax,[((-136))+rsp] -$L$SEH_begin_rsaz_1024_gather5: - -DB 0x48,0x8d,0x60,0xe0 -DB 0xc5,0xf8,0x29,0x70,0xe0 -DB 0xc5,0xf8,0x29,0x78,0xf0 -DB 0xc5,0x78,0x29,0x40,0x00 -DB 0xc5,0x78,0x29,0x48,0x10 -DB 0xc5,0x78,0x29,0x50,0x20 -DB 0xc5,0x78,0x29,0x58,0x30 -DB 0xc5,0x78,0x29,0x60,0x40 -DB 0xc5,0x78,0x29,0x68,0x50 -DB 0xc5,0x78,0x29,0x70,0x60 -DB 0xc5,0x78,0x29,0x78,0x70 - lea rsp,[((-256))+rsp] - and rsp,-32 - lea r10,[$L$inc] - lea rax,[((-128))+rsp] - - vmovd xmm4,r8d - vmovdqa ymm0,YMMWORD[r10] - vmovdqa ymm1,YMMWORD[32+r10] - vmovdqa ymm5,YMMWORD[64+r10] - vpbroadcastd ymm4,xmm4 - - vpaddd ymm2,ymm0,ymm5 - vpcmpeqd ymm0,ymm0,ymm4 - vpaddd ymm3,ymm1,ymm5 - vpcmpeqd ymm1,ymm1,ymm4 - vmovdqa YMMWORD[(0+128)+rax],ymm0 - vpaddd ymm0,ymm2,ymm5 - vpcmpeqd ymm2,ymm2,ymm4 - vmovdqa YMMWORD[(32+128)+rax],ymm1 - vpaddd ymm1,ymm3,ymm5 - vpcmpeqd ymm3,ymm3,ymm4 - vmovdqa YMMWORD[(64+128)+rax],ymm2 - vpaddd ymm2,ymm0,ymm5 - vpcmpeqd ymm0,ymm0,ymm4 - vmovdqa YMMWORD[(96+128)+rax],ymm3 - vpaddd ymm3,ymm1,ymm5 - vpcmpeqd ymm1,ymm1,ymm4 - vmovdqa YMMWORD[(128+128)+rax],ymm0 - vpaddd ymm8,ymm2,ymm5 - vpcmpeqd ymm2,ymm2,ymm4 - vmovdqa YMMWORD[(160+128)+rax],ymm1 - vpaddd ymm9,ymm3,ymm5 - vpcmpeqd ymm3,ymm3,ymm4 - vmovdqa YMMWORD[(192+128)+rax],ymm2 - vpaddd ymm10,ymm8,ymm5 - vpcmpeqd ymm8,ymm8,ymm4 - vmovdqa YMMWORD[(224+128)+rax],ymm3 - vpaddd ymm11,ymm9,ymm5 - vpcmpeqd ymm9,ymm9,ymm4 - vpaddd 
ymm12,ymm10,ymm5 - vpcmpeqd ymm10,ymm10,ymm4 - vpaddd ymm13,ymm11,ymm5 - vpcmpeqd ymm11,ymm11,ymm4 - vpaddd ymm14,ymm12,ymm5 - vpcmpeqd ymm12,ymm12,ymm4 - vpaddd ymm15,ymm13,ymm5 - vpcmpeqd ymm13,ymm13,ymm4 - vpcmpeqd ymm14,ymm14,ymm4 - vpcmpeqd ymm15,ymm15,ymm4 - - vmovdqa ymm7,YMMWORD[((-32))+r10] - lea rdx,[128+rdx] - mov r8d,9 - -$L$oop_gather_1024: - vmovdqa ymm0,YMMWORD[((0-128))+rdx] - vmovdqa ymm1,YMMWORD[((32-128))+rdx] - vmovdqa ymm2,YMMWORD[((64-128))+rdx] - vmovdqa ymm3,YMMWORD[((96-128))+rdx] - vpand ymm0,ymm0,YMMWORD[((0+128))+rax] - vpand ymm1,ymm1,YMMWORD[((32+128))+rax] - vpand ymm2,ymm2,YMMWORD[((64+128))+rax] - vpor ymm4,ymm1,ymm0 - vpand ymm3,ymm3,YMMWORD[((96+128))+rax] - vmovdqa ymm0,YMMWORD[((128-128))+rdx] - vmovdqa ymm1,YMMWORD[((160-128))+rdx] - vpor ymm5,ymm3,ymm2 - vmovdqa ymm2,YMMWORD[((192-128))+rdx] - vmovdqa ymm3,YMMWORD[((224-128))+rdx] - vpand ymm0,ymm0,YMMWORD[((128+128))+rax] - vpand ymm1,ymm1,YMMWORD[((160+128))+rax] - vpand ymm2,ymm2,YMMWORD[((192+128))+rax] - vpor ymm4,ymm4,ymm0 - vpand ymm3,ymm3,YMMWORD[((224+128))+rax] - vpand ymm0,ymm8,YMMWORD[((256-128))+rdx] - vpor ymm5,ymm5,ymm1 - vpand ymm1,ymm9,YMMWORD[((288-128))+rdx] - vpor ymm4,ymm4,ymm2 - vpand ymm2,ymm10,YMMWORD[((320-128))+rdx] - vpor ymm5,ymm5,ymm3 - vpand ymm3,ymm11,YMMWORD[((352-128))+rdx] - vpor ymm4,ymm4,ymm0 - vpand ymm0,ymm12,YMMWORD[((384-128))+rdx] - vpor ymm5,ymm5,ymm1 - vpand ymm1,ymm13,YMMWORD[((416-128))+rdx] - vpor ymm4,ymm4,ymm2 - vpand ymm2,ymm14,YMMWORD[((448-128))+rdx] - vpor ymm5,ymm5,ymm3 - vpand ymm3,ymm15,YMMWORD[((480-128))+rdx] - lea rdx,[512+rdx] - vpor ymm4,ymm4,ymm0 - vpor ymm5,ymm5,ymm1 - vpor ymm4,ymm4,ymm2 - vpor ymm5,ymm5,ymm3 - - vpor ymm4,ymm4,ymm5 - vextracti128 xmm5,ymm4,1 - vpor xmm5,xmm5,xmm4 - vpermd ymm5,ymm7,ymm5 - vmovdqu YMMWORD[rcx],ymm5 - lea rcx,[32+rcx] - dec r8d - jnz NEAR $L$oop_gather_1024 - - vpxor ymm0,ymm0,ymm0 - vmovdqu YMMWORD[rcx],ymm0 - vzeroupper - movaps xmm6,XMMWORD[((-168))+r11] - movaps xmm7,XMMWORD[((-152))+r11] - movaps xmm8,XMMWORD[((-136))+r11] - movaps xmm9,XMMWORD[((-120))+r11] - movaps xmm10,XMMWORD[((-104))+r11] - movaps xmm11,XMMWORD[((-88))+r11] - movaps xmm12,XMMWORD[((-72))+r11] - movaps xmm13,XMMWORD[((-56))+r11] - movaps xmm14,XMMWORD[((-40))+r11] - movaps xmm15,XMMWORD[((-24))+r11] - lea rsp,[r11] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_rsaz_1024_gather5: - -EXTERN OPENSSL_ia32cap_P -global rsaz_avx2_eligible - -ALIGN 32 -rsaz_avx2_eligible: - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - mov ecx,524544 - mov edx,0 - and ecx,eax - cmp ecx,524544 - cmove eax,edx - and eax,32 - shr eax,5 +DB 0x0f,0x0b DB 0F3h,0C3h ;repret - -ALIGN 64 -$L$and_mask: - DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff -$L$scatter_permd: - DD 0,2,4,6,7,7,7,7 -$L$gather_permd: - DD 0,7,1,7,2,7,3,7 -$L$inc: - DD 0,0,0,0,1,1,1,1 - DD 2,2,2,2,3,3,3,3 - DD 4,4,4,4,4,4,4,4 -ALIGN 64 -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -rsaz_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - - mov rbp,QWORD[160+r8] - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - cmovc rax,rbp - - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov 
rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - mov QWORD[240+r8],r15 - mov QWORD[232+r8],r14 - mov QWORD[224+r8],r13 - mov QWORD[216+r8],r12 - mov QWORD[160+r8],rbp - mov QWORD[144+r8],rbx - - lea rsi,[((-216))+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - -$L$common_seh_tail: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase - DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase - DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase - - DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase - DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase - DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase - - DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase - DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase - DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_rsaz_1024_sqr_avx2: -DB 9,0,0,0 - DD rsaz_se_handler wrt ..imagebase - DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase - DD 0 -$L$SEH_info_rsaz_1024_mul_avx2: -DB 9,0,0,0 - DD rsaz_se_handler wrt ..imagebase - DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase - DD 0 -$L$SEH_info_rsaz_1024_gather5: -DB 0x01,0x36,0x17,0x0b -DB 0x36,0xf8,0x09,0x00 -DB 0x31,0xe8,0x08,0x00 -DB 0x2c,0xd8,0x07,0x00 -DB 0x27,0xc8,0x06,0x00 -DB 0x22,0xb8,0x05,0x00 -DB 0x1d,0xa8,0x04,0x00 -DB 0x18,0x98,0x03,0x00 -DB 0x13,0x88,0x02,0x00 -DB 0x0e,0x78,0x01,0x00 -DB 0x09,0x68,0x00,0x00 -DB 0x04,0x01,0x15,0x00 -DB 0x00,0xb3,0x00,0x00 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm index fc15281fa46..f8e4aa1c20e 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm @@ -2,1030 +2,23 @@ default rel %define XMMWORD %define YMMWORD %define ZMMWORD -EXTERN OPENSSL_ia32cap_P +section .text code align=64 + + global ossl_rsaz_avx512ifma_eligible -ALIGN 32 ossl_rsaz_avx512ifma_eligible: - mov ecx,DWORD[((OPENSSL_ia32cap_P+8))] xor eax,eax - and ecx,2149777408 - cmp ecx,2149777408 - cmove eax,ecx DB 0F3h,0C3h ;repret -section .text code align=64 - global ossl_rsaz_amm52x20_x1_256 - -ALIGN 32 -ossl_rsaz_amm52x20_x1_256: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ossl_rsaz_amm52x20_x1_256: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - -DB 243,15,30,250 - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - -$L$rsaz_amm52x20_x1_256_body: - - - vpxord ymm0,ymm0,ymm0 - vmovdqa64 ymm1,ymm0 - vmovdqa64 ymm16,ymm0 - vmovdqa64 ymm17,ymm0 - vmovdqa64 ymm18,ymm0 - vmovdqa64 ymm19,ymm0 - - xor r9d,r9d - - mov r11,rdx - mov rax,0xfffffffffffff - 
- - mov ebx,5 - -ALIGN 32 -$L$loop5: - mov r13,QWORD[r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[rsi] - mulx r12,r13,r13 - add r9,r13 - mov r10,r12 - adc r10,0 - - mov r13,r8 - imul r13,r9 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[rcx] - mulx r12,r13,r13 - add r9,r13 - adc r10,r12 - - shr r9,52 - sal r10,12 - or r9,r10 - - vpmadd52luq ymm1,ymm3,YMMWORD[rsi] - vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52luq ymm1,ymm4,YMMWORD[rcx] - vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] - - - valignq ymm1,ymm16,ymm1,1 - valignq ymm16,ymm17,ymm16,1 - valignq ymm17,ymm18,ymm17,1 - valignq ymm18,ymm19,ymm18,1 - valignq ymm19,ymm0,ymm19,1 - - vmovq r13,xmm1 - add r9,r13 - - vpmadd52huq ymm1,ymm3,YMMWORD[rsi] - vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52huq ymm1,ymm4,YMMWORD[rcx] - vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] - mov r13,QWORD[8+r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[rsi] - mulx r12,r13,r13 - add r9,r13 - mov r10,r12 - adc r10,0 - - mov r13,r8 - imul r13,r9 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[rcx] - mulx r12,r13,r13 - add r9,r13 - adc r10,r12 - - shr r9,52 - sal r10,12 - or r9,r10 - - vpmadd52luq ymm1,ymm3,YMMWORD[rsi] - vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52luq ymm1,ymm4,YMMWORD[rcx] - vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] - - - valignq ymm1,ymm16,ymm1,1 - valignq ymm16,ymm17,ymm16,1 - valignq ymm17,ymm18,ymm17,1 - valignq ymm18,ymm19,ymm18,1 - valignq ymm19,ymm0,ymm19,1 - - vmovq r13,xmm1 - add r9,r13 - - vpmadd52huq ymm1,ymm3,YMMWORD[rsi] - vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52huq ymm1,ymm4,YMMWORD[rcx] - vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] - mov r13,QWORD[16+r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[rsi] - mulx r12,r13,r13 - add r9,r13 - mov r10,r12 - adc r10,0 - - mov r13,r8 - imul r13,r9 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[rcx] - mulx r12,r13,r13 - add r9,r13 - adc r10,r12 - - shr r9,52 - sal r10,12 - or r9,r10 - - vpmadd52luq ymm1,ymm3,YMMWORD[rsi] - vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52luq ymm1,ymm4,YMMWORD[rcx] - vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] - - - valignq ymm1,ymm16,ymm1,1 - valignq ymm16,ymm17,ymm16,1 - valignq ymm17,ymm18,ymm17,1 - valignq ymm18,ymm19,ymm18,1 - valignq ymm19,ymm0,ymm19,1 - - vmovq r13,xmm1 - add r9,r13 - - vpmadd52huq 
ymm1,ymm3,YMMWORD[rsi] - vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52huq ymm1,ymm4,YMMWORD[rcx] - vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] - mov r13,QWORD[24+r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[rsi] - mulx r12,r13,r13 - add r9,r13 - mov r10,r12 - adc r10,0 - - mov r13,r8 - imul r13,r9 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[rcx] - mulx r12,r13,r13 - add r9,r13 - adc r10,r12 - - shr r9,52 - sal r10,12 - or r9,r10 - - vpmadd52luq ymm1,ymm3,YMMWORD[rsi] - vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52luq ymm1,ymm4,YMMWORD[rcx] - vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] - - - valignq ymm1,ymm16,ymm1,1 - valignq ymm16,ymm17,ymm16,1 - valignq ymm17,ymm18,ymm17,1 - valignq ymm18,ymm19,ymm18,1 - valignq ymm19,ymm0,ymm19,1 - - vmovq r13,xmm1 - add r9,r13 - - vpmadd52huq ymm1,ymm3,YMMWORD[rsi] - vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52huq ymm1,ymm4,YMMWORD[rcx] - vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] - lea r11,[32+r11] - dec ebx - jne NEAR $L$loop5 - - vmovdqa64 ymm4,YMMWORD[$L$mask52x4] - - vpbroadcastq ymm3,r9 - vpblendd ymm1,ymm1,ymm3,3 - - - - vpsrlq ymm24,ymm1,52 - vpsrlq ymm25,ymm16,52 - vpsrlq ymm26,ymm17,52 - vpsrlq ymm27,ymm18,52 - vpsrlq ymm28,ymm19,52 - - - valignq ymm28,ymm28,ymm27,3 - valignq ymm27,ymm27,ymm26,3 - valignq ymm26,ymm26,ymm25,3 - valignq ymm25,ymm25,ymm24,3 - valignq ymm24,ymm24,ymm0,3 - - - vpandq ymm1,ymm1,ymm4 - vpandq ymm16,ymm16,ymm4 - vpandq ymm17,ymm17,ymm4 - vpandq ymm18,ymm18,ymm4 - vpandq ymm19,ymm19,ymm4 - - - vpaddq ymm1,ymm1,ymm24 - vpaddq ymm16,ymm16,ymm25 - vpaddq ymm17,ymm17,ymm26 - vpaddq ymm18,ymm18,ymm27 - vpaddq ymm19,ymm19,ymm28 - - - - vpcmpuq k1,ymm4,ymm1,1 - vpcmpuq k2,ymm4,ymm16,1 - vpcmpuq k3,ymm4,ymm17,1 - vpcmpuq k4,ymm4,ymm18,1 - vpcmpuq k5,ymm4,ymm19,1 - kmovb r14d,k1 - kmovb r13d,k2 - kmovb r12d,k3 - kmovb r11d,k4 - kmovb r10d,k5 - - - vpcmpuq k1,ymm4,ymm1,0 - vpcmpuq k2,ymm4,ymm16,0 - vpcmpuq k3,ymm4,ymm17,0 - vpcmpuq k4,ymm4,ymm18,0 - vpcmpuq k5,ymm4,ymm19,0 - kmovb r9d,k1 - kmovb r8d,k2 - kmovb ebx,k3 - kmovb ecx,k4 - kmovb edx,k5 - - - - shl r13b,4 - or r14b,r13b - shl r11b,4 - or r12b,r11b - - add r14b,r14b - adc r12b,r12b - adc r10b,r10b - - shl r8b,4 - or r9b,r8b - shl cl,4 - or bl,cl - - add r14b,r9b - adc r12b,bl - adc r10b,dl - - xor r14b,r9b - xor r12b,bl - xor r10b,dl - - kmovb k1,r14d - shr r14b,4 - kmovb k2,r14d - kmovb k3,r12d - shr r12b,4 - kmovb k4,r12d - kmovb k5,r10d - - - vpsubq ymm1{k1},ymm1,ymm4 - vpsubq ymm16{k2},ymm16,ymm4 - vpsubq ymm17{k3},ymm17,ymm4 - vpsubq ymm18{k4},ymm18,ymm4 - vpsubq ymm19{k5},ymm19,ymm4 - - vpandq ymm1,ymm1,ymm4 - vpandq ymm16,ymm16,ymm4 - vpandq ymm17,ymm17,ymm4 - vpandq ymm18,ymm18,ymm4 - vpandq ymm19,ymm19,ymm4 - - vmovdqu64 YMMWORD[rdi],ymm1 - vmovdqu64 YMMWORD[32+rdi],ymm16 - vmovdqu64 YMMWORD[64+rdi],ymm17 - 
vmovdqu64 YMMWORD[96+rdi],ymm18 - vmovdqu64 YMMWORD[128+rdi],ymm19 - - vzeroupper - mov r15,QWORD[rsp] - - mov r14,QWORD[8+rsp] - - mov r13,QWORD[16+rsp] - - mov r12,QWORD[24+rsp] - - mov rbp,QWORD[32+rsp] - - mov rbx,QWORD[40+rsp] - - lea rsp,[48+rsp] - -$L$rsaz_amm52x20_x1_256_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ossl_rsaz_amm52x20_x1_256: -section .data data align=8 - -ALIGN 32 -$L$mask52x4: - DQ 0xfffffffffffff - DQ 0xfffffffffffff - DQ 0xfffffffffffff - DQ 0xfffffffffffff -section .text code align=64 - - global ossl_rsaz_amm52x20_x2_256 - -ALIGN 32 -ossl_rsaz_amm52x20_x2_256: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ossl_rsaz_amm52x20_x2_256: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - -DB 243,15,30,250 - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - -$L$rsaz_amm52x20_x2_256_body: - - - vpxord ymm0,ymm0,ymm0 - vmovdqa64 ymm1,ymm0 - vmovdqa64 ymm16,ymm0 - vmovdqa64 ymm17,ymm0 - vmovdqa64 ymm18,ymm0 - vmovdqa64 ymm19,ymm0 - vmovdqa64 ymm2,ymm0 - vmovdqa64 ymm20,ymm0 - vmovdqa64 ymm21,ymm0 - vmovdqa64 ymm22,ymm0 - vmovdqa64 ymm23,ymm0 - - xor r9d,r9d - xor r15d,r15d - - mov r11,rdx - mov rax,0xfffffffffffff - - mov ebx,20 - -ALIGN 32 -$L$loop20: - mov r13,QWORD[r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[rsi] - mulx r12,r13,r13 - add r9,r13 - mov r10,r12 - adc r10,0 - - mov r13,QWORD[r8] - imul r13,r9 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[rcx] - mulx r12,r13,r13 - add r9,r13 - adc r10,r12 - - shr r9,52 - sal r10,12 - or r9,r10 - - vpmadd52luq ymm1,ymm3,YMMWORD[rsi] - vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52luq ymm1,ymm4,YMMWORD[rcx] - vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx] - - - valignq ymm1,ymm16,ymm1,1 - valignq ymm16,ymm17,ymm16,1 - valignq ymm17,ymm18,ymm17,1 - valignq ymm18,ymm19,ymm18,1 - valignq ymm19,ymm0,ymm19,1 - - vmovq r13,xmm1 - add r9,r13 - - vpmadd52huq ymm1,ymm3,YMMWORD[rsi] - vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi] - vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi] - vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi] - vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi] - - vpmadd52huq ymm1,ymm4,YMMWORD[rcx] - vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx] - vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx] - vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx] - vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx] - mov r13,QWORD[160+r11] - - vpbroadcastq ymm3,r13 - mov rdx,QWORD[160+rsi] - mulx r12,r13,r13 - add r15,r13 - mov r10,r12 - adc r10,0 - - mov r13,QWORD[8+r8] - imul r13,r15 - and r13,rax - - vpbroadcastq ymm4,r13 - mov rdx,QWORD[160+rcx] - mulx r12,r13,r13 - add r15,r13 - adc r10,r12 - - shr r15,52 - sal r10,12 - or r15,r10 - - vpmadd52luq ymm2,ymm3,YMMWORD[160+rsi] - vpmadd52luq ymm20,ymm3,YMMWORD[192+rsi] - vpmadd52luq ymm21,ymm3,YMMWORD[224+rsi] - vpmadd52luq ymm22,ymm3,YMMWORD[256+rsi] - vpmadd52luq ymm23,ymm3,YMMWORD[288+rsi] - - vpmadd52luq ymm2,ymm4,YMMWORD[160+rcx] - vpmadd52luq ymm20,ymm4,YMMWORD[192+rcx] - vpmadd52luq ymm21,ymm4,YMMWORD[224+rcx] - vpmadd52luq ymm22,ymm4,YMMWORD[256+rcx] - vpmadd52luq ymm23,ymm4,YMMWORD[288+rcx] - - - valignq ymm2,ymm20,ymm2,1 - valignq ymm20,ymm21,ymm20,1 - valignq ymm21,ymm22,ymm21,1 - valignq ymm22,ymm23,ymm22,1 - valignq 
ymm23,ymm0,ymm23,1 - - vmovq r13,xmm2 - add r15,r13 - - vpmadd52huq ymm2,ymm3,YMMWORD[160+rsi] - vpmadd52huq ymm20,ymm3,YMMWORD[192+rsi] - vpmadd52huq ymm21,ymm3,YMMWORD[224+rsi] - vpmadd52huq ymm22,ymm3,YMMWORD[256+rsi] - vpmadd52huq ymm23,ymm3,YMMWORD[288+rsi] - - vpmadd52huq ymm2,ymm4,YMMWORD[160+rcx] - vpmadd52huq ymm20,ymm4,YMMWORD[192+rcx] - vpmadd52huq ymm21,ymm4,YMMWORD[224+rcx] - vpmadd52huq ymm22,ymm4,YMMWORD[256+rcx] - vpmadd52huq ymm23,ymm4,YMMWORD[288+rcx] - lea r11,[8+r11] - dec ebx - jne NEAR $L$loop20 - - vmovdqa64 ymm4,YMMWORD[$L$mask52x4] - - vpbroadcastq ymm3,r9 - vpblendd ymm1,ymm1,ymm3,3 - - - - vpsrlq ymm24,ymm1,52 - vpsrlq ymm25,ymm16,52 - vpsrlq ymm26,ymm17,52 - vpsrlq ymm27,ymm18,52 - vpsrlq ymm28,ymm19,52 - - - valignq ymm28,ymm28,ymm27,3 - valignq ymm27,ymm27,ymm26,3 - valignq ymm26,ymm26,ymm25,3 - valignq ymm25,ymm25,ymm24,3 - valignq ymm24,ymm24,ymm0,3 - - - vpandq ymm1,ymm1,ymm4 - vpandq ymm16,ymm16,ymm4 - vpandq ymm17,ymm17,ymm4 - vpandq ymm18,ymm18,ymm4 - vpandq ymm19,ymm19,ymm4 - - - vpaddq ymm1,ymm1,ymm24 - vpaddq ymm16,ymm16,ymm25 - vpaddq ymm17,ymm17,ymm26 - vpaddq ymm18,ymm18,ymm27 - vpaddq ymm19,ymm19,ymm28 - - - - vpcmpuq k1,ymm4,ymm1,1 - vpcmpuq k2,ymm4,ymm16,1 - vpcmpuq k3,ymm4,ymm17,1 - vpcmpuq k4,ymm4,ymm18,1 - vpcmpuq k5,ymm4,ymm19,1 - kmovb r14d,k1 - kmovb r13d,k2 - kmovb r12d,k3 - kmovb r11d,k4 - kmovb r10d,k5 - - - vpcmpuq k1,ymm4,ymm1,0 - vpcmpuq k2,ymm4,ymm16,0 - vpcmpuq k3,ymm4,ymm17,0 - vpcmpuq k4,ymm4,ymm18,0 - vpcmpuq k5,ymm4,ymm19,0 - kmovb r9d,k1 - kmovb r8d,k2 - kmovb ebx,k3 - kmovb ecx,k4 - kmovb edx,k5 - - - - shl r13b,4 - or r14b,r13b - shl r11b,4 - or r12b,r11b - - add r14b,r14b - adc r12b,r12b - adc r10b,r10b - - shl r8b,4 - or r9b,r8b - shl cl,4 - or bl,cl - - add r14b,r9b - adc r12b,bl - adc r10b,dl - - xor r14b,r9b - xor r12b,bl - xor r10b,dl - - kmovb k1,r14d - shr r14b,4 - kmovb k2,r14d - kmovb k3,r12d - shr r12b,4 - kmovb k4,r12d - kmovb k5,r10d - - - vpsubq ymm1{k1},ymm1,ymm4 - vpsubq ymm16{k2},ymm16,ymm4 - vpsubq ymm17{k3},ymm17,ymm4 - vpsubq ymm18{k4},ymm18,ymm4 - vpsubq ymm19{k5},ymm19,ymm4 - - vpandq ymm1,ymm1,ymm4 - vpandq ymm16,ymm16,ymm4 - vpandq ymm17,ymm17,ymm4 - vpandq ymm18,ymm18,ymm4 - vpandq ymm19,ymm19,ymm4 - - vpbroadcastq ymm3,r15 - vpblendd ymm2,ymm2,ymm3,3 - - - - vpsrlq ymm24,ymm2,52 - vpsrlq ymm25,ymm20,52 - vpsrlq ymm26,ymm21,52 - vpsrlq ymm27,ymm22,52 - vpsrlq ymm28,ymm23,52 - - - valignq ymm28,ymm28,ymm27,3 - valignq ymm27,ymm27,ymm26,3 - valignq ymm26,ymm26,ymm25,3 - valignq ymm25,ymm25,ymm24,3 - valignq ymm24,ymm24,ymm0,3 - - - vpandq ymm2,ymm2,ymm4 - vpandq ymm20,ymm20,ymm4 - vpandq ymm21,ymm21,ymm4 - vpandq ymm22,ymm22,ymm4 - vpandq ymm23,ymm23,ymm4 - - - vpaddq ymm2,ymm2,ymm24 - vpaddq ymm20,ymm20,ymm25 - vpaddq ymm21,ymm21,ymm26 - vpaddq ymm22,ymm22,ymm27 - vpaddq ymm23,ymm23,ymm28 - - - - vpcmpuq k1,ymm4,ymm2,1 - vpcmpuq k2,ymm4,ymm20,1 - vpcmpuq k3,ymm4,ymm21,1 - vpcmpuq k4,ymm4,ymm22,1 - vpcmpuq k5,ymm4,ymm23,1 - kmovb r14d,k1 - kmovb r13d,k2 - kmovb r12d,k3 - kmovb r11d,k4 - kmovb r10d,k5 - - - vpcmpuq k1,ymm4,ymm2,0 - vpcmpuq k2,ymm4,ymm20,0 - vpcmpuq k3,ymm4,ymm21,0 - vpcmpuq k4,ymm4,ymm22,0 - vpcmpuq k5,ymm4,ymm23,0 - kmovb r9d,k1 - kmovb r8d,k2 - kmovb ebx,k3 - kmovb ecx,k4 - kmovb edx,k5 - - - - shl r13b,4 - or r14b,r13b - shl r11b,4 - or r12b,r11b - - add r14b,r14b - adc r12b,r12b - adc r10b,r10b - - shl r8b,4 - or r9b,r8b - shl cl,4 - or bl,cl - - add r14b,r9b - adc r12b,bl - adc r10b,dl - - xor r14b,r9b - xor r12b,bl - xor r10b,dl - - kmovb k1,r14d - shr r14b,4 - kmovb k2,r14d - 
kmovb k3,r12d - shr r12b,4 - kmovb k4,r12d - kmovb k5,r10d - - - vpsubq ymm2{k1},ymm2,ymm4 - vpsubq ymm20{k2},ymm20,ymm4 - vpsubq ymm21{k3},ymm21,ymm4 - vpsubq ymm22{k4},ymm22,ymm4 - vpsubq ymm23{k5},ymm23,ymm4 - - vpandq ymm2,ymm2,ymm4 - vpandq ymm20,ymm20,ymm4 - vpandq ymm21,ymm21,ymm4 - vpandq ymm22,ymm22,ymm4 - vpandq ymm23,ymm23,ymm4 - - vmovdqu64 YMMWORD[rdi],ymm1 - vmovdqu64 YMMWORD[32+rdi],ymm16 - vmovdqu64 YMMWORD[64+rdi],ymm17 - vmovdqu64 YMMWORD[96+rdi],ymm18 - vmovdqu64 YMMWORD[128+rdi],ymm19 - - vmovdqu64 YMMWORD[160+rdi],ymm2 - vmovdqu64 YMMWORD[192+rdi],ymm20 - vmovdqu64 YMMWORD[224+rdi],ymm21 - vmovdqu64 YMMWORD[256+rdi],ymm22 - vmovdqu64 YMMWORD[288+rdi],ymm23 - - vzeroupper - mov r15,QWORD[rsp] - - mov r14,QWORD[8+rsp] - - mov r13,QWORD[16+rsp] - - mov r12,QWORD[24+rsp] - - mov rbp,QWORD[32+rsp] - - mov rbx,QWORD[40+rsp] - - lea rsp,[48+rsp] - -$L$rsaz_amm52x20_x2_256_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ossl_rsaz_amm52x20_x2_256: -section .text code align=64 - - -ALIGN 32 global ossl_extract_multiplier_2x20_win5 +ossl_rsaz_amm52x20_x1_256: +ossl_rsaz_amm52x20_x2_256: ossl_extract_multiplier_2x20_win5: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ossl_extract_multiplier_2x20_win5: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - - - -DB 243,15,30,250 - lea rax,[rcx*4+rcx] - sal rax,5 - add rsi,rax - - vmovdqa64 ymm23,YMMWORD[$L$ones] - vpbroadcastq ymm22,rdx - lea rax,[10240+rsi] - - vpxor xmm4,xmm4,xmm4 - vmovdqa64 ymm3,ymm4 - vmovdqa64 ymm2,ymm4 - vmovdqa64 ymm1,ymm4 - vmovdqa64 ymm0,ymm4 - vmovdqa64 ymm21,ymm4 - -ALIGN 32 -$L$loop: - vpcmpq k1,ymm22,ymm21,0 - add rsi,320 - vpaddq ymm21,ymm21,ymm23 - vmovdqu64 ymm16,YMMWORD[((-320))+rsi] - vmovdqu64 ymm17,YMMWORD[((-288))+rsi] - vmovdqu64 ymm18,YMMWORD[((-256))+rsi] - vmovdqu64 ymm19,YMMWORD[((-224))+rsi] - vmovdqu64 ymm20,YMMWORD[((-192))+rsi] - vpblendmq ymm0{k1},ymm0,ymm16 - vpblendmq ymm1{k1},ymm1,ymm17 - vpblendmq ymm2{k1},ymm2,ymm18 - vpblendmq ymm3{k1},ymm3,ymm19 - vpblendmq ymm4{k1},ymm4,ymm20 - cmp rax,rsi - jne NEAR $L$loop - - vmovdqu64 YMMWORD[rdi],ymm0 - vmovdqu64 YMMWORD[32+rdi],ymm1 - vmovdqu64 YMMWORD[64+rdi],ymm2 - vmovdqu64 YMMWORD[96+rdi],ymm3 - vmovdqu64 YMMWORD[128+rdi],ymm4 - - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ossl_extract_multiplier_2x20_win5: -section .data data align=8 - -ALIGN 32 -$L$ones: - DQ 1,1,1,1 -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -rsaz_def_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - - lea rax,[48+rax] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$common_seh_tail: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 
0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi +DB 0x0f,0x0b DB 0F3h,0C3h ;repret - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase - DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase - DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase - - DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase - DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase - DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase - - DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase - DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase - DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_ossl_rsaz_amm52x20_x1_256: -DB 9,0,0,0 - DD rsaz_def_handler wrt ..imagebase - DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase -$L$SEH_info_ossl_rsaz_amm52x20_x2_256: -DB 9,0,0,0 - DD rsaz_def_handler wrt ..imagebase - DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase -$L$SEH_info_ossl_extract_multiplier_2x20_win5: -DB 9,0,0,0 - DD rsaz_def_handler wrt ..imagebase - DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm index f407312e950..9f1c3f9b250 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm @@ -43,10 +43,6 @@ DB 102,72,15,110,202 mov rdx,QWORD[rsi] mov rax,QWORD[8+rsi] mov QWORD[128+rsp],rcx - mov r11d,0x80100 - and r11d,DWORD[((OPENSSL_ia32cap_P+8))] - cmp r11d,0x80100 - je NEAR $L$oop_sqrx jmp NEAR $L$oop_sqr ALIGN 32 @@ -417,282 +413,6 @@ DB 102,72,15,126,205 dec r8d jnz NEAR $L$oop_sqr - jmp NEAR $L$sqr_tail - -ALIGN 32 -$L$oop_sqrx: - mov DWORD[((128+8))+rsp],r8d -DB 102,72,15,110,199 - - mulx r9,r8,rax - mov rbx,rax - - mulx r10,rcx,QWORD[16+rsi] - xor rbp,rbp - - mulx r11,rax,QWORD[24+rsi] - adcx r9,rcx - -DB 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcx r10,rax - -DB 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 - adcx r11,rcx - - mulx r14,rcx,QWORD[48+rsi] - adcx r12,rax - adcx r13,rcx - - mulx r15,rax,QWORD[56+rsi] - adcx r14,rax - adcx r15,rbp - - mulx rdi,rax,rdx - mov rdx,rbx - xor rcx,rcx - adox r8,r8 - adcx r8,rdi - adox rcx,rbp - adcx rcx,rbp - - mov QWORD[rsp],rax - mov QWORD[8+rsp],r8 - - -DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 - adox r10,rax - adcx r11,rbx - - mulx r8,rdi,QWORD[24+rsi] - adox r11,rdi -DB 0x66 - adcx r12,r8 - - mulx rbx,rax,QWORD[32+rsi] - adox r12,rax - adcx r13,rbx - - mulx r8,rdi,QWORD[40+rsi] - adox r13,rdi - adcx r14,r8 - -DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adox r14,rax - adcx r15,rbx - -DB 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adox r15,rdi - adcx r8,rbp - mulx rdi,rax,rdx - adox r8,rbp -DB 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 - - xor rbx,rbx - adox r9,r9 - - adcx rax,rcx - 
adox r10,r10 - adcx r9,rax - adox rbx,rbp - adcx r10,rdi - adcx rbx,rbp - - mov QWORD[16+rsp],r9 -DB 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - - mulx r9,rdi,QWORD[24+rsi] - adox r12,rdi - adcx r13,r9 - - mulx rcx,rax,QWORD[32+rsi] - adox r13,rax - adcx r14,rcx - -DB 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 - adox r14,rdi - adcx r15,r9 - -DB 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adox r15,rax - adcx r8,rcx - - mulx r9,rdi,QWORD[56+rsi] - adox r8,rdi - adcx r9,rbp - mulx rdi,rax,rdx - adox r9,rbp - mov rdx,QWORD[24+rsi] - - xor rcx,rcx - adox r11,r11 - - adcx rax,rbx - adox r12,r12 - adcx r11,rax - adox rcx,rbp - adcx r12,rdi - adcx rcx,rbp - - mov QWORD[32+rsp],r11 - mov QWORD[40+rsp],r12 - - - mulx rbx,rax,QWORD[32+rsi] - adox r14,rax - adcx r15,rbx - - mulx r10,rdi,QWORD[40+rsi] - adox r15,rdi - adcx r8,r10 - - mulx rbx,rax,QWORD[48+rsi] - adox r8,rax - adcx r9,rbx - - mulx r10,rdi,QWORD[56+rsi] - adox r9,rdi - adcx r10,rbp - mulx rdi,rax,rdx - adox r10,rbp - mov rdx,QWORD[32+rsi] - - xor rbx,rbx - adox r13,r13 - - adcx rax,rcx - adox r14,r14 - adcx r13,rax - adox rbx,rbp - adcx r14,rdi - adcx rbx,rbp - - mov QWORD[48+rsp],r13 - mov QWORD[56+rsp],r14 - - - mulx r11,rdi,QWORD[40+rsi] - adox r8,rdi - adcx r9,r11 - - mulx rcx,rax,QWORD[48+rsi] - adox r9,rax - adcx r10,rcx - - mulx r11,rdi,QWORD[56+rsi] - adox r10,rdi - adcx r11,rbp - mulx rdi,rax,rdx - mov rdx,QWORD[40+rsi] - adox r11,rbp - - xor rcx,rcx - adox r15,r15 - - adcx rax,rbx - adox r8,r8 - adcx r15,rax - adox rcx,rbp - adcx r8,rdi - adcx rcx,rbp - - mov QWORD[64+rsp],r15 - mov QWORD[72+rsp],r8 - - -DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adox r10,rax - adcx r11,rbx - -DB 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adox r11,rdi - adcx r12,rbp - mulx rdi,rax,rdx - adox r12,rbp - mov rdx,QWORD[48+rsi] - - xor rbx,rbx - adox r9,r9 - - adcx rax,rcx - adox r10,r10 - adcx r9,rax - adcx r10,rdi - adox rbx,rbp - adcx rbx,rbp - - mov QWORD[80+rsp],r9 - mov QWORD[88+rsp],r10 - - -DB 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adox r12,rax - adox r13,rbp - - mulx rdi,rax,rdx - xor rcx,rcx - mov rdx,QWORD[56+rsi] - adox r11,r11 - - adcx rax,rbx - adox r12,r12 - adcx r11,rax - adox rcx,rbp - adcx r12,rdi - adcx rcx,rbp - -DB 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -DB 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulx rdx,rax,rdx - xor rbx,rbx - adox r13,r13 - - adcx rax,rcx - adox rbx,rbp - adcx rax,r13 - adcx rbx,rdx - -DB 102,72,15,126,199 -DB 102,72,15,126,205 - - mov rdx,QWORD[128+rsp] - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - mov QWORD[112+rsp],rax - mov QWORD[120+rsp],rbx - - call __rsaz_512_reducex - - add r8,QWORD[64+rsp] - adc r9,QWORD[72+rsp] - adc r10,QWORD[80+rsp] - adc r11,QWORD[88+rsp] - adc r12,QWORD[96+rsp] - adc r13,QWORD[104+rsp] - adc r14,QWORD[112+rsp] - adc r15,QWORD[120+rsp] - sbb rcx,rcx - - call __rsaz_512_subtract - - mov rdx,r8 - mov rax,r9 - mov r8d,DWORD[((128+8))+rsp] - mov rsi,rdi - - dec r8d - jnz NEAR $L$oop_sqrx - -$L$sqr_tail: lea rax,[((128+24+48))+rsp] @@ -751,10 +471,6 @@ $L$mul_body: DB 102,72,15,110,199 DB 102,72,15,110,201 mov QWORD[128+rsp],r8 - mov r11d,0x80100 - and r11d,DWORD[((OPENSSL_ia32cap_P+8))] - cmp r11d,0x80100 - je NEAR $L$mulx mov rbx,QWORD[rdx] mov rbp,rdx call __rsaz_512_mul @@ -772,29 +488,6 @@ DB 102,72,15,126,205 mov r15,QWORD[56+rsp] call __rsaz_512_reduce - jmp NEAR $L$mul_tail - -ALIGN 32 -$L$mulx: - mov 
rbp,rdx - mov rdx,QWORD[rdx] - call __rsaz_512_mulx - -DB 102,72,15,126,199 -DB 102,72,15,126,205 - - mov rdx,QWORD[128+rsp] - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reducex -$L$mul_tail: add r8,QWORD[64+rsp] adc r9,QWORD[72+rsp] adc r10,QWORD[80+rsp] @@ -926,10 +619,6 @@ $L$mul_gather4_body: por xmm8,xmm9 pshufd xmm9,xmm8,0x4e por xmm8,xmm9 - mov r11d,0x80100 - and r11d,DWORD[((OPENSSL_ia32cap_P+8))] - cmp r11d,0x80100 - je NEAR $L$mulx_gather DB 102,76,15,126,195 mov QWORD[128+rsp],r8 @@ -1110,142 +799,6 @@ DB 102,76,15,126,195 mov r15,QWORD[56+rsp] call __rsaz_512_reduce - jmp NEAR $L$mul_gather_tail - -ALIGN 32 -$L$mulx_gather: -DB 102,76,15,126,194 - - mov QWORD[128+rsp],r8 - mov QWORD[((128+8))+rsp],rdi - mov QWORD[((128+16))+rsp],rcx - - mulx r8,rbx,QWORD[rsi] - mov QWORD[rsp],rbx - xor edi,edi - - mulx r9,rax,QWORD[8+rsi] - - mulx r10,rbx,QWORD[16+rsi] - adcx r8,rax - - mulx r11,rax,QWORD[24+rsi] - adcx r9,rbx - - mulx r12,rbx,QWORD[32+rsi] - adcx r10,rax - - mulx r13,rax,QWORD[40+rsi] - adcx r11,rbx - - mulx r14,rbx,QWORD[48+rsi] - adcx r12,rax - - mulx r15,rax,QWORD[56+rsi] - adcx r13,rbx - adcx r14,rax -DB 0x67 - mov rbx,r8 - adcx r15,rdi - - mov rcx,-7 - jmp NEAR $L$oop_mulx_gather - -ALIGN 32 -$L$oop_mulx_gather: - movdqa xmm8,XMMWORD[rbp] - movdqa xmm9,XMMWORD[16+rbp] - movdqa xmm10,XMMWORD[32+rbp] - movdqa xmm11,XMMWORD[48+rbp] - pand xmm8,xmm0 - movdqa xmm12,XMMWORD[64+rbp] - pand xmm9,xmm1 - movdqa xmm13,XMMWORD[80+rbp] - pand xmm10,xmm2 - movdqa xmm14,XMMWORD[96+rbp] - pand xmm11,xmm3 - movdqa xmm15,XMMWORD[112+rbp] - lea rbp,[128+rbp] - pand xmm12,xmm4 - pand xmm13,xmm5 - pand xmm14,xmm6 - pand xmm15,xmm7 - por xmm8,xmm10 - por xmm9,xmm11 - por xmm8,xmm12 - por xmm9,xmm13 - por xmm8,xmm14 - por xmm9,xmm15 - - por xmm8,xmm9 - pshufd xmm9,xmm8,0x4e - por xmm8,xmm9 -DB 102,76,15,126,194 - -DB 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 - adcx rbx,rax - adox r8,r9 - - mulx r9,rax,QWORD[8+rsi] - adcx r8,rax - adox r9,r10 - - mulx r10,rax,QWORD[16+rsi] - adcx r9,rax - adox r10,r11 - -DB 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - adcx r10,rax - adox r11,r12 - - mulx r12,rax,QWORD[32+rsi] - adcx r11,rax - adox r12,r13 - - mulx r13,rax,QWORD[40+rsi] - adcx r12,rax - adox r13,r14 - -DB 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcx r13,rax -DB 0x67 - adox r14,r15 - - mulx r15,rax,QWORD[56+rsi] - mov QWORD[64+rcx*8+rsp],rbx - adcx r14,rax - adox r15,rdi - mov rbx,r8 - adcx r15,rdi - - inc rcx - jnz NEAR $L$oop_mulx_gather - - mov QWORD[64+rsp],r8 - mov QWORD[((64+8))+rsp],r9 - mov QWORD[((64+16))+rsp],r10 - mov QWORD[((64+24))+rsp],r11 - mov QWORD[((64+32))+rsp],r12 - mov QWORD[((64+40))+rsp],r13 - mov QWORD[((64+48))+rsp],r14 - mov QWORD[((64+56))+rsp],r15 - - mov rdx,QWORD[128+rsp] - mov rdi,QWORD[((128+8))+rsp] - mov rbp,QWORD[((128+16))+rsp] - - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reducex - -$L$mul_gather_tail: add r8,QWORD[64+rsp] adc r9,QWORD[72+rsp] adc r10,QWORD[80+rsp] @@ -1332,10 +885,6 @@ DB 102,73,15,110,208 mov QWORD[128+rsp],rcx mov rbp,rdi - mov r11d,0x80100 - and r11d,DWORD[((OPENSSL_ia32cap_P+8))] - cmp r11d,0x80100 - je NEAR $L$mulx_scatter mov rbx,QWORD[rdi] call __rsaz_512_mul @@ -1352,29 +901,6 @@ DB 102,72,15,126,205 mov 
r15,QWORD[56+rsp] call __rsaz_512_reduce - jmp NEAR $L$mul_scatter_tail - -ALIGN 32 -$L$mulx_scatter: - mov rdx,QWORD[rdi] - call __rsaz_512_mulx - -DB 102,72,15,126,199 -DB 102,72,15,126,205 - - mov rdx,QWORD[128+rsp] - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reducex - -$L$mul_scatter_tail: add r8,QWORD[64+rsp] adc r9,QWORD[72+rsp] adc r10,QWORD[80+rsp] @@ -1450,7 +976,6 @@ $L$SEH_begin_rsaz_512_mul_by_one: sub rsp,128+24 $L$mul_by_one_body: - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] mov rbp,rdx mov QWORD[128+rsp],rcx @@ -1471,16 +996,7 @@ $L$mul_by_one_body: movdqa XMMWORD[64+rsp],xmm0 movdqa XMMWORD[80+rsp],xmm0 movdqa XMMWORD[96+rsp],xmm0 - and eax,0x80100 - cmp eax,0x80100 - je NEAR $L$by_one_callx call __rsaz_512_reduce - jmp NEAR $L$by_one_tail -ALIGN 32 -$L$by_one_callx: - mov rdx,QWORD[128+rsp] - call __rsaz_512_reducex -$L$by_one_tail: mov QWORD[rdi],r8 mov QWORD[8+rdi],r9 mov QWORD[16+rdi],r10 @@ -1598,64 +1114,6 @@ $L$reduction_loop: -ALIGN 32 -__rsaz_512_reducex: - - - imul rdx,r8 - xor rsi,rsi - mov ecx,8 - jmp NEAR $L$reduction_loopx - -ALIGN 32 -$L$reduction_loopx: - mov rbx,r8 - mulx r8,rax,QWORD[rbp] - adcx rax,rbx - adox r8,r9 - - mulx r9,rax,QWORD[8+rbp] - adcx r8,rax - adox r9,r10 - - mulx r10,rbx,QWORD[16+rbp] - adcx r9,rbx - adox r10,r11 - - mulx r11,rbx,QWORD[24+rbp] - adcx r10,rbx - adox r11,r12 - -DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - mov rax,rdx - mov rdx,r8 - adcx r11,rbx - adox r12,r13 - - mulx rdx,rbx,QWORD[((128+8))+rsp] - mov rdx,rax - - mulx r13,rax,QWORD[40+rbp] - adcx r12,rax - adox r13,r14 - -DB 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcx r13,rax - adox r14,r15 - - mulx r15,rax,QWORD[56+rbp] - mov rdx,rbx - adcx r14,rax - adox r15,rsi - adcx r15,rsi - - dec ecx - jne NEAR $L$reduction_loopx - - DB 0F3h,0C3h ;repret - - - ALIGN 32 __rsaz_512_subtract: @@ -1858,128 +1316,6 @@ $L$oop_mul: DB 0F3h,0C3h ;repret - -ALIGN 32 -__rsaz_512_mulx: - - mulx r8,rbx,QWORD[rsi] - mov rcx,-6 - - mulx r9,rax,QWORD[8+rsi] - mov QWORD[8+rsp],rbx - - mulx r10,rbx,QWORD[16+rsi] - adc r8,rax - - mulx r11,rax,QWORD[24+rsi] - adc r9,rbx - - mulx r12,rbx,QWORD[32+rsi] - adc r10,rax - - mulx r13,rax,QWORD[40+rsi] - adc r11,rbx - - mulx r14,rbx,QWORD[48+rsi] - adc r12,rax - - mulx r15,rax,QWORD[56+rsi] - mov rdx,QWORD[8+rbp] - adc r13,rbx - adc r14,rax - adc r15,0 - - xor rdi,rdi - jmp NEAR $L$oop_mulx - -ALIGN 32 -$L$oop_mulx: - mov rbx,r8 - mulx r8,rax,QWORD[rsi] - adcx rbx,rax - adox r8,r9 - - mulx r9,rax,QWORD[8+rsi] - adcx r8,rax - adox r9,r10 - - mulx r10,rax,QWORD[16+rsi] - adcx r9,rax - adox r10,r11 - - mulx r11,rax,QWORD[24+rsi] - adcx r10,rax - adox r11,r12 - -DB 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcx r11,rax - adox r12,r13 - - mulx r13,rax,QWORD[40+rsi] - adcx r12,rax - adox r13,r14 - - mulx r14,rax,QWORD[48+rsi] - adcx r13,rax - adox r14,r15 - - mulx r15,rax,QWORD[56+rsi] - mov rdx,QWORD[64+rcx*8+rbp] - mov QWORD[((8+64-8))+rcx*8+rsp],rbx - adcx r14,rax - adox r15,rdi - adcx r15,rdi - - inc rcx - jnz NEAR $L$oop_mulx - - mov rbx,r8 - mulx r8,rax,QWORD[rsi] - adcx rbx,rax - adox r8,r9 - -DB 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcx r8,rax - adox r9,r10 - -DB 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcx r9,rax - adox r10,r11 - - mulx r11,rax,QWORD[24+rsi] - adcx r10,rax - adox r11,r12 - - mulx r12,rax,QWORD[32+rsi] - adcx r11,rax - adox r12,r13 - - mulx 
r13,rax,QWORD[40+rsi] - adcx r12,rax - adox r13,r14 - -DB 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcx r13,rax - adox r14,r15 - -DB 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcx r14,rax - adox r15,rdi - adcx r15,rdi - - mov QWORD[((8+64-8))+rsp],rbx - mov QWORD[((8+64))+rsp],r8 - mov QWORD[((8+64+8))+rsp],r9 - mov QWORD[((8+64+16))+rsp],r10 - mov QWORD[((8+64+24))+rsp],r11 - mov QWORD[((8+64+32))+rsp],r12 - mov QWORD[((8+64+40))+rsp],r13 - mov QWORD[((8+64+48))+rsp],r14 - mov QWORD[((8+64+56))+rsp],r15 - - DB 0F3h,0C3h ;repret - - global rsaz_512_scatter4 ALIGN 16 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm index b4f755d63e6..80de3a35016 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm @@ -31,7 +31,6 @@ $L$SEH_begin_bn_mul_mont: jnz NEAR $L$mul_enter cmp r9d,8 jb NEAR $L$mul_enter - mov r11d,DWORD[((OPENSSL_ia32cap_P+8))] cmp rdx,rsi jne NEAR $L$mul4x_enter test r9d,7 @@ -294,9 +293,6 @@ $L$SEH_begin_bn_mul4x_mont: mov rax,rsp $L$mul4x_enter: - and r11d,0x80100 - cmp r11d,0x80100 - je NEAR $L$mulx4x_enter push rbx push rbp @@ -722,7 +718,6 @@ $L$mul4x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_mul4x_mont: -EXTERN bn_sqrx8x_internal EXTERN bn_sqr8x_internal @@ -818,25 +813,6 @@ DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - and eax,0x80100 - cmp eax,0x80100 - jne NEAR $L$sqr8x_nox - - call bn_sqrx8x_internal - - - - - lea rbx,[rcx*1+r8] - mov r9,rcx - mov rdx,rcx -DB 102,72,15,126,207 - sar rcx,3+2 - jmp NEAR $L$sqr8x_sub - -ALIGN 32 -$L$sqr8x_nox: call bn_sqr8x_internal @@ -926,376 +902,6 @@ $L$sqr8x_epilogue: DB 0F3h,0C3h ;repret $L$SEH_end_bn_sqr8x_mont: - -ALIGN 32 -bn_mulx4x_mont: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_bn_mulx4x_mont: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - mov rax,rsp - -$L$mulx4x_enter: - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - -$L$mulx4x_prologue: - - shl r9d,3 - xor r10,r10 - sub r10,r9 - mov r8,QWORD[r8] - lea rbp,[((-72))+r10*1+rsp] - and rbp,-128 - mov r11,rsp - sub r11,rbp - and r11,-4096 - lea rsp,[rbp*1+r11] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR $L$mulx4x_page_walk - jmp NEAR $L$mulx4x_page_walk_done - -ALIGN 16 -$L$mulx4x_page_walk: - lea rsp,[((-4096))+rsp] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR $L$mulx4x_page_walk -$L$mulx4x_page_walk_done: - - lea r10,[r9*1+rdx] - - - - - - - - - - - - - mov QWORD[rsp],r9 - shr r9,5 - mov QWORD[16+rsp],r10 - sub r9,1 - mov QWORD[24+rsp],r8 - mov QWORD[32+rsp],rdi - mov QWORD[40+rsp],rax - - mov QWORD[48+rsp],r9 - jmp NEAR $L$mulx4x_body - -ALIGN 32 -$L$mulx4x_body: - lea rdi,[8+rdx] - mov rdx,QWORD[rdx] - lea rbx,[((64+32))+rsp] - mov r9,rdx - - mulx rax,r8,QWORD[rsi] - mulx r14,r11,QWORD[8+rsi] - add r11,rax - mov QWORD[8+rsp],rdi - mulx r13,r12,QWORD[16+rsi] - adc r12,r14 - adc r13,0 - - mov rdi,r8 - imul r8,QWORD[24+rsp] - xor rbp,rbp - - mulx r14,rax,QWORD[24+rsi] - mov rdx,r8 - lea rsi,[32+rsi] - adcx r13,rax - adcx r14,rbp - - mulx r10,rax,QWORD[rcx] - adcx rdi,rax - adox r10,r11 - mulx r11,rax,QWORD[8+rcx] - adcx r10,rax - adox r11,r12 -DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - mov rdi,QWORD[48+rsp] - mov 
QWORD[((-32))+rbx],r10 - adcx r11,rax - adox r12,r13 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-24))+rbx],r11 - adcx r12,rax - adox r15,rbp - lea rcx,[32+rcx] - mov QWORD[((-16))+rbx],r12 - - jmp NEAR $L$mulx4x_1st - -ALIGN 32 -$L$mulx4x_1st: - adcx r15,rbp - mulx rax,r10,QWORD[rsi] - adcx r10,r14 - mulx r14,r11,QWORD[8+rsi] - adcx r11,rax - mulx rax,r12,QWORD[16+rsi] - adcx r12,r14 - mulx r14,r13,QWORD[24+rsi] -DB 0x67,0x67 - mov rdx,r8 - adcx r13,rax - adcx r14,rbp - lea rsi,[32+rsi] - lea rbx,[32+rbx] - - adox r10,r15 - mulx r15,rax,QWORD[rcx] - adcx r10,rax - adox r11,r15 - mulx r15,rax,QWORD[8+rcx] - adcx r11,rax - adox r12,r15 - mulx r15,rax,QWORD[16+rcx] - mov QWORD[((-40))+rbx],r10 - adcx r12,rax - mov QWORD[((-32))+rbx],r11 - adox r13,r15 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-24))+rbx],r12 - adcx r13,rax - adox r15,rbp - lea rcx,[32+rcx] - mov QWORD[((-16))+rbx],r13 - - dec rdi - jnz NEAR $L$mulx4x_1st - - mov rax,QWORD[rsp] - mov rdi,QWORD[8+rsp] - adc r15,rbp - add r14,r15 - sbb r15,r15 - mov QWORD[((-8))+rbx],r14 - jmp NEAR $L$mulx4x_outer - -ALIGN 32 -$L$mulx4x_outer: - mov rdx,QWORD[rdi] - lea rdi,[8+rdi] - sub rsi,rax - mov QWORD[rbx],r15 - lea rbx,[((64+32))+rsp] - sub rcx,rax - - mulx r11,r8,QWORD[rsi] - xor ebp,ebp - mov r9,rdx - mulx r12,r14,QWORD[8+rsi] - adox r8,QWORD[((-32))+rbx] - adcx r11,r14 - mulx r13,r15,QWORD[16+rsi] - adox r11,QWORD[((-24))+rbx] - adcx r12,r15 - adox r12,QWORD[((-16))+rbx] - adcx r13,rbp - adox r13,rbp - - mov QWORD[8+rsp],rdi - mov r15,r8 - imul r8,QWORD[24+rsp] - xor ebp,ebp - - mulx r14,rax,QWORD[24+rsi] - mov rdx,r8 - adcx r13,rax - adox r13,QWORD[((-8))+rbx] - adcx r14,rbp - lea rsi,[32+rsi] - adox r14,rbp - - mulx r10,rax,QWORD[rcx] - adcx r15,rax - adox r10,r11 - mulx r11,rax,QWORD[8+rcx] - adcx r10,rax - adox r11,r12 - mulx r12,rax,QWORD[16+rcx] - mov QWORD[((-32))+rbx],r10 - adcx r11,rax - adox r12,r13 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-24))+rbx],r11 - lea rcx,[32+rcx] - adcx r12,rax - adox r15,rbp - mov rdi,QWORD[48+rsp] - mov QWORD[((-16))+rbx],r12 - - jmp NEAR $L$mulx4x_inner - -ALIGN 32 -$L$mulx4x_inner: - mulx rax,r10,QWORD[rsi] - adcx r15,rbp - adox r10,r14 - mulx r14,r11,QWORD[8+rsi] - adcx r10,QWORD[rbx] - adox r11,rax - mulx rax,r12,QWORD[16+rsi] - adcx r11,QWORD[8+rbx] - adox r12,r14 - mulx r14,r13,QWORD[24+rsi] - mov rdx,r8 - adcx r12,QWORD[16+rbx] - adox r13,rax - adcx r13,QWORD[24+rbx] - adox r14,rbp - lea rsi,[32+rsi] - lea rbx,[32+rbx] - adcx r14,rbp - - adox r10,r15 - mulx r15,rax,QWORD[rcx] - adcx r10,rax - adox r11,r15 - mulx r15,rax,QWORD[8+rcx] - adcx r11,rax - adox r12,r15 - mulx r15,rax,QWORD[16+rcx] - mov QWORD[((-40))+rbx],r10 - adcx r12,rax - adox r13,r15 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-32))+rbx],r11 - mov QWORD[((-24))+rbx],r12 - adcx r13,rax - adox r15,rbp - lea rcx,[32+rcx] - mov QWORD[((-16))+rbx],r13 - - dec rdi - jnz NEAR $L$mulx4x_inner - - mov rax,QWORD[rsp] - mov rdi,QWORD[8+rsp] - adc r15,rbp - sub rbp,QWORD[rbx] - adc r14,r15 - sbb r15,r15 - mov QWORD[((-8))+rbx],r14 - - cmp rdi,QWORD[16+rsp] - jne NEAR $L$mulx4x_outer - - lea rbx,[64+rsp] - sub rcx,rax - neg r15 - mov rdx,rax - shr rax,3+2 - mov rdi,QWORD[32+rsp] - jmp NEAR $L$mulx4x_sub - -ALIGN 32 -$L$mulx4x_sub: - mov r11,QWORD[rbx] - mov r12,QWORD[8+rbx] - mov r13,QWORD[16+rbx] - mov r14,QWORD[24+rbx] - lea rbx,[32+rbx] - sbb r11,QWORD[rcx] - sbb r12,QWORD[8+rcx] - sbb r13,QWORD[16+rcx] - sbb r14,QWORD[24+rcx] - lea rcx,[32+rcx] - mov QWORD[rdi],r11 - mov 
QWORD[8+rdi],r12 - mov QWORD[16+rdi],r13 - mov QWORD[24+rdi],r14 - lea rdi,[32+rdi] - dec rax - jnz NEAR $L$mulx4x_sub - - sbb r15,0 - lea rbx,[64+rsp] - sub rdi,rdx - -DB 102,73,15,110,207 - pxor xmm0,xmm0 - pshufd xmm1,xmm1,0 - mov rsi,QWORD[40+rsp] - - jmp NEAR $L$mulx4x_cond_copy - -ALIGN 32 -$L$mulx4x_cond_copy: - movdqa xmm2,XMMWORD[rbx] - movdqa xmm3,XMMWORD[16+rbx] - lea rbx,[32+rbx] - movdqu xmm4,XMMWORD[rdi] - movdqu xmm5,XMMWORD[16+rdi] - lea rdi,[32+rdi] - movdqa XMMWORD[(-32)+rbx],xmm0 - movdqa XMMWORD[(-16)+rbx],xmm0 - pcmpeqd xmm0,xmm1 - pand xmm2,xmm1 - pand xmm3,xmm1 - pand xmm4,xmm0 - pand xmm5,xmm0 - pxor xmm0,xmm0 - por xmm4,xmm2 - por xmm5,xmm3 - movdqu XMMWORD[(-32)+rdi],xmm4 - movdqu XMMWORD[(-16)+rdi],xmm5 - sub rdx,32 - jnz NEAR $L$mulx4x_cond_copy - - mov QWORD[rbx],rdx - - mov rax,1 - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$mulx4x_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_bn_mulx4x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 @@ -1447,9 +1053,6 @@ ALIGN 4 DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase - DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase - DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase - DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_bn_mul_mont: @@ -1465,8 +1068,3 @@ DB 9,0,0,0 DD sqr_handler wrt ..imagebase DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase ALIGN 8 -$L$SEH_info_bn_mulx4x_mont: -DB 9,0,0,0 - DD sqr_handler wrt ..imagebase - DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase -ALIGN 8 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm index 260113b0176..15715aa9239 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm @@ -29,7 +29,6 @@ $L$SEH_begin_bn_mul_mont_gather5: test r9d,7 jnz NEAR $L$mul_enter - mov r11d,DWORD[((OPENSSL_ia32cap_P+8))] jmp NEAR $L$mul4x_enter ALIGN 16 @@ -480,9 +479,6 @@ DB 0x67 mov rax,rsp $L$mul4x_enter: - and r11d,0x80108 - cmp r11d,0x80108 - je NEAR $L$mulx4x_enter push rbx push rbp @@ -1126,10 +1122,6 @@ $L$SEH_begin_bn_power5: mov rax,rsp - mov r11d,DWORD[((OPENSSL_ia32cap_P+8))] - and r11d,0x80108 - cmp r11d,0x80108 - je NEAR $L$powerx5_enter push rbx push rbp @@ -2095,1376 +2087,6 @@ $L$sqr4x_sub_entry: DB 0F3h,0C3h ;repret - -ALIGN 32 -bn_mulx4x_mont_gather5: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_bn_mulx4x_mont_gather5: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - mov rax,rsp - -$L$mulx4x_enter: - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - -$L$mulx4x_prologue: - - shl r9d,3 - lea r10,[r9*2+r9] - neg r9 - mov r8,QWORD[r8] - - - - - - - - - - - lea r11,[((-320))+r9*2+rsp] - mov rbp,rsp - sub r11,rdi - and r11,4095 - cmp r10,r11 - 
jb NEAR $L$mulx4xsp_alt - sub rbp,r11 - lea rbp,[((-320))+r9*2+rbp] - jmp NEAR $L$mulx4xsp_done - -$L$mulx4xsp_alt: - lea r10,[((4096-320))+r9*2] - lea rbp,[((-320))+r9*2+rbp] - sub r11,r10 - mov r10,0 - cmovc r11,r10 - sub rbp,r11 -$L$mulx4xsp_done: - and rbp,-64 - mov r11,rsp - sub r11,rbp - and r11,-4096 - lea rsp,[rbp*1+r11] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR $L$mulx4x_page_walk - jmp NEAR $L$mulx4x_page_walk_done - -$L$mulx4x_page_walk: - lea rsp,[((-4096))+rsp] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR $L$mulx4x_page_walk -$L$mulx4x_page_walk_done: - - - - - - - - - - - - - - mov QWORD[32+rsp],r8 - mov QWORD[40+rsp],rax - -$L$mulx4x_body: - call mulx4x_internal - - mov rsi,QWORD[40+rsp] - - mov rax,1 - - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$mulx4x_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_bn_mulx4x_mont_gather5: - - -ALIGN 32 -mulx4x_internal: - - mov QWORD[8+rsp],r9 - mov r10,r9 - neg r9 - shl r9,5 - neg r10 - lea r13,[128+r9*1+rdx] - shr r9,5+5 - movd xmm5,DWORD[56+rax] - sub r9,1 - lea rax,[$L$inc] - mov QWORD[((16+8))+rsp],r13 - mov QWORD[((24+8))+rsp],r9 - mov QWORD[((56+8))+rsp],rdi - movdqa xmm0,XMMWORD[rax] - movdqa xmm1,XMMWORD[16+rax] - lea r10,[((88-112))+r10*1+rsp] - lea rdi,[128+rdx] - - pshufd xmm5,xmm5,0 - movdqa xmm4,xmm1 -DB 0x67 - movdqa xmm2,xmm1 -DB 0x67 - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm5 - movdqa xmm3,xmm4 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm5 - movdqa XMMWORD[112+r10],xmm0 - movdqa xmm0,xmm4 - - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm5 - movdqa XMMWORD[128+r10],xmm1 - movdqa xmm1,xmm4 - - paddd xmm0,xmm3 - pcmpeqd xmm3,xmm5 - movdqa XMMWORD[144+r10],xmm2 - movdqa xmm2,xmm4 - - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm5 - movdqa XMMWORD[160+r10],xmm3 - movdqa xmm3,xmm4 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm5 - movdqa XMMWORD[176+r10],xmm0 - movdqa xmm0,xmm4 - - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm5 - movdqa XMMWORD[192+r10],xmm1 - movdqa xmm1,xmm4 - - paddd xmm0,xmm3 - pcmpeqd xmm3,xmm5 - movdqa XMMWORD[208+r10],xmm2 - movdqa xmm2,xmm4 - - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm5 - movdqa XMMWORD[224+r10],xmm3 - movdqa xmm3,xmm4 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm5 - movdqa XMMWORD[240+r10],xmm0 - movdqa xmm0,xmm4 - - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm5 - movdqa XMMWORD[256+r10],xmm1 - movdqa xmm1,xmm4 - - paddd xmm0,xmm3 - pcmpeqd xmm3,xmm5 - movdqa XMMWORD[272+r10],xmm2 - movdqa xmm2,xmm4 - - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm5 - movdqa XMMWORD[288+r10],xmm3 - movdqa xmm3,xmm4 -DB 0x67 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm5 - movdqa XMMWORD[304+r10],xmm0 - - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm5 - movdqa XMMWORD[320+r10],xmm1 - - pcmpeqd xmm3,xmm5 - movdqa XMMWORD[336+r10],xmm2 - - pand xmm0,XMMWORD[64+rdi] - pand xmm1,XMMWORD[80+rdi] - pand xmm2,XMMWORD[96+rdi] - movdqa XMMWORD[352+r10],xmm3 - pand xmm3,XMMWORD[112+rdi] - por xmm0,xmm2 - por xmm1,xmm3 - movdqa xmm4,XMMWORD[((-128))+rdi] - movdqa xmm5,XMMWORD[((-112))+rdi] - movdqa xmm2,XMMWORD[((-96))+rdi] - pand xmm4,XMMWORD[112+r10] - movdqa xmm3,XMMWORD[((-80))+rdi] - pand xmm5,XMMWORD[128+r10] - por xmm0,xmm4 - pand xmm2,XMMWORD[144+r10] - por xmm1,xmm5 - pand xmm3,XMMWORD[160+r10] - por xmm0,xmm2 - por xmm1,xmm3 - movdqa xmm4,XMMWORD[((-64))+rdi] - movdqa xmm5,XMMWORD[((-48))+rdi] - movdqa xmm2,XMMWORD[((-32))+rdi] - pand xmm4,XMMWORD[176+r10] - movdqa xmm3,XMMWORD[((-16))+rdi] - 
pand xmm5,XMMWORD[192+r10] - por xmm0,xmm4 - pand xmm2,XMMWORD[208+r10] - por xmm1,xmm5 - pand xmm3,XMMWORD[224+r10] - por xmm0,xmm2 - por xmm1,xmm3 - movdqa xmm4,XMMWORD[rdi] - movdqa xmm5,XMMWORD[16+rdi] - movdqa xmm2,XMMWORD[32+rdi] - pand xmm4,XMMWORD[240+r10] - movdqa xmm3,XMMWORD[48+rdi] - pand xmm5,XMMWORD[256+r10] - por xmm0,xmm4 - pand xmm2,XMMWORD[272+r10] - por xmm1,xmm5 - pand xmm3,XMMWORD[288+r10] - por xmm0,xmm2 - por xmm1,xmm3 - pxor xmm0,xmm1 - pshufd xmm1,xmm0,0x4e - por xmm0,xmm1 - lea rdi,[256+rdi] -DB 102,72,15,126,194 - lea rbx,[((64+32+8))+rsp] - - mov r9,rdx - mulx rax,r8,QWORD[rsi] - mulx r12,r11,QWORD[8+rsi] - add r11,rax - mulx r13,rax,QWORD[16+rsi] - adc r12,rax - adc r13,0 - mulx r14,rax,QWORD[24+rsi] - - mov r15,r8 - imul r8,QWORD[((32+8))+rsp] - xor rbp,rbp - mov rdx,r8 - - mov QWORD[((8+8))+rsp],rdi - - lea rsi,[32+rsi] - adcx r13,rax - adcx r14,rbp - - mulx r10,rax,QWORD[rcx] - adcx r15,rax - adox r10,r11 - mulx r11,rax,QWORD[8+rcx] - adcx r10,rax - adox r11,r12 - mulx r12,rax,QWORD[16+rcx] - mov rdi,QWORD[((24+8))+rsp] - mov QWORD[((-32))+rbx],r10 - adcx r11,rax - adox r12,r13 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-24))+rbx],r11 - adcx r12,rax - adox r15,rbp - lea rcx,[32+rcx] - mov QWORD[((-16))+rbx],r12 - jmp NEAR $L$mulx4x_1st - -ALIGN 32 -$L$mulx4x_1st: - adcx r15,rbp - mulx rax,r10,QWORD[rsi] - adcx r10,r14 - mulx r14,r11,QWORD[8+rsi] - adcx r11,rax - mulx rax,r12,QWORD[16+rsi] - adcx r12,r14 - mulx r14,r13,QWORD[24+rsi] -DB 0x67,0x67 - mov rdx,r8 - adcx r13,rax - adcx r14,rbp - lea rsi,[32+rsi] - lea rbx,[32+rbx] - - adox r10,r15 - mulx r15,rax,QWORD[rcx] - adcx r10,rax - adox r11,r15 - mulx r15,rax,QWORD[8+rcx] - adcx r11,rax - adox r12,r15 - mulx r15,rax,QWORD[16+rcx] - mov QWORD[((-40))+rbx],r10 - adcx r12,rax - mov QWORD[((-32))+rbx],r11 - adox r13,r15 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov QWORD[((-24))+rbx],r12 - adcx r13,rax - adox r15,rbp - lea rcx,[32+rcx] - mov QWORD[((-16))+rbx],r13 - - dec rdi - jnz NEAR $L$mulx4x_1st - - mov rax,QWORD[8+rsp] - adc r15,rbp - lea rsi,[rax*1+rsi] - add r14,r15 - mov rdi,QWORD[((8+8))+rsp] - adc rbp,rbp - mov QWORD[((-8))+rbx],r14 - jmp NEAR $L$mulx4x_outer - -ALIGN 32 -$L$mulx4x_outer: - lea r10,[((16-256))+rbx] - pxor xmm4,xmm4 -DB 0x67,0x67 - pxor xmm5,xmm5 - movdqa xmm0,XMMWORD[((-128))+rdi] - movdqa xmm1,XMMWORD[((-112))+rdi] - movdqa xmm2,XMMWORD[((-96))+rdi] - pand xmm0,XMMWORD[256+r10] - movdqa xmm3,XMMWORD[((-80))+rdi] - pand xmm1,XMMWORD[272+r10] - por xmm4,xmm0 - pand xmm2,XMMWORD[288+r10] - por xmm5,xmm1 - pand xmm3,XMMWORD[304+r10] - por xmm4,xmm2 - por xmm5,xmm3 - movdqa xmm0,XMMWORD[((-64))+rdi] - movdqa xmm1,XMMWORD[((-48))+rdi] - movdqa xmm2,XMMWORD[((-32))+rdi] - pand xmm0,XMMWORD[320+r10] - movdqa xmm3,XMMWORD[((-16))+rdi] - pand xmm1,XMMWORD[336+r10] - por xmm4,xmm0 - pand xmm2,XMMWORD[352+r10] - por xmm5,xmm1 - pand xmm3,XMMWORD[368+r10] - por xmm4,xmm2 - por xmm5,xmm3 - movdqa xmm0,XMMWORD[rdi] - movdqa xmm1,XMMWORD[16+rdi] - movdqa xmm2,XMMWORD[32+rdi] - pand xmm0,XMMWORD[384+r10] - movdqa xmm3,XMMWORD[48+rdi] - pand xmm1,XMMWORD[400+r10] - por xmm4,xmm0 - pand xmm2,XMMWORD[416+r10] - por xmm5,xmm1 - pand xmm3,XMMWORD[432+r10] - por xmm4,xmm2 - por xmm5,xmm3 - movdqa xmm0,XMMWORD[64+rdi] - movdqa xmm1,XMMWORD[80+rdi] - movdqa xmm2,XMMWORD[96+rdi] - pand xmm0,XMMWORD[448+r10] - movdqa xmm3,XMMWORD[112+rdi] - pand xmm1,XMMWORD[464+r10] - por xmm4,xmm0 - pand xmm2,XMMWORD[480+r10] - por xmm5,xmm1 - pand xmm3,XMMWORD[496+r10] - por xmm4,xmm2 - por xmm5,xmm3 - por 
xmm4,xmm5 - pshufd xmm0,xmm4,0x4e - por xmm0,xmm4 - lea rdi,[256+rdi] -DB 102,72,15,126,194 - - mov QWORD[rbx],rbp - lea rbx,[32+rax*1+rbx] - mulx r11,r8,QWORD[rsi] - xor rbp,rbp - mov r9,rdx - mulx r12,r14,QWORD[8+rsi] - adox r8,QWORD[((-32))+rbx] - adcx r11,r14 - mulx r13,r15,QWORD[16+rsi] - adox r11,QWORD[((-24))+rbx] - adcx r12,r15 - mulx r14,rdx,QWORD[24+rsi] - adox r12,QWORD[((-16))+rbx] - adcx r13,rdx - lea rcx,[rax*1+rcx] - lea rsi,[32+rsi] - adox r13,QWORD[((-8))+rbx] - adcx r14,rbp - adox r14,rbp - - mov r15,r8 - imul r8,QWORD[((32+8))+rsp] - - mov rdx,r8 - xor rbp,rbp - mov QWORD[((8+8))+rsp],rdi - - mulx r10,rax,QWORD[rcx] - adcx r15,rax - adox r10,r11 - mulx r11,rax,QWORD[8+rcx] - adcx r10,rax - adox r11,r12 - mulx r12,rax,QWORD[16+rcx] - adcx r11,rax - adox r12,r13 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - mov rdi,QWORD[((24+8))+rsp] - mov QWORD[((-32))+rbx],r10 - adcx r12,rax - mov QWORD[((-24))+rbx],r11 - adox r15,rbp - mov QWORD[((-16))+rbx],r12 - lea rcx,[32+rcx] - jmp NEAR $L$mulx4x_inner - -ALIGN 32 -$L$mulx4x_inner: - mulx rax,r10,QWORD[rsi] - adcx r15,rbp - adox r10,r14 - mulx r14,r11,QWORD[8+rsi] - adcx r10,QWORD[rbx] - adox r11,rax - mulx rax,r12,QWORD[16+rsi] - adcx r11,QWORD[8+rbx] - adox r12,r14 - mulx r14,r13,QWORD[24+rsi] - mov rdx,r8 - adcx r12,QWORD[16+rbx] - adox r13,rax - adcx r13,QWORD[24+rbx] - adox r14,rbp - lea rsi,[32+rsi] - lea rbx,[32+rbx] - adcx r14,rbp - - adox r10,r15 - mulx r15,rax,QWORD[rcx] - adcx r10,rax - adox r11,r15 - mulx r15,rax,QWORD[8+rcx] - adcx r11,rax - adox r12,r15 - mulx r15,rax,QWORD[16+rcx] - mov QWORD[((-40))+rbx],r10 - adcx r12,rax - adox r13,r15 - mov QWORD[((-32))+rbx],r11 - mulx r15,rax,QWORD[24+rcx] - mov rdx,r9 - lea rcx,[32+rcx] - mov QWORD[((-24))+rbx],r12 - adcx r13,rax - adox r15,rbp - mov QWORD[((-16))+rbx],r13 - - dec rdi - jnz NEAR $L$mulx4x_inner - - mov rax,QWORD[((0+8))+rsp] - adc r15,rbp - sub rdi,QWORD[rbx] - mov rdi,QWORD[((8+8))+rsp] - mov r10,QWORD[((16+8))+rsp] - adc r14,r15 - lea rsi,[rax*1+rsi] - adc rbp,rbp - mov QWORD[((-8))+rbx],r14 - - cmp rdi,r10 - jb NEAR $L$mulx4x_outer - - mov r10,QWORD[((-8))+rcx] - mov r8,rbp - mov r12,QWORD[rax*1+rcx] - lea rbp,[rax*1+rcx] - mov rcx,rax - lea rdi,[rax*1+rbx] - xor eax,eax - xor r15,r15 - sub r10,r14 - adc r15,r15 - or r8,r15 - sar rcx,3+2 - sub rax,r8 - mov rdx,QWORD[((56+8))+rsp] - dec r12 - mov r13,QWORD[8+rbp] - xor r8,r8 - mov r14,QWORD[16+rbp] - mov r15,QWORD[24+rbp] - jmp NEAR $L$sqrx4x_sub_entry - - - -ALIGN 32 -bn_powerx5: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_bn_powerx5: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - mov rax,rsp - -$L$powerx5_enter: - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - -$L$powerx5_prologue: - - shl r9d,3 - lea r10,[r9*2+r9] - neg r9 - mov r8,QWORD[r8] - - - - - - - - - lea r11,[((-320))+r9*2+rsp] - mov rbp,rsp - sub r11,rdi - and r11,4095 - cmp r10,r11 - jb NEAR $L$pwrx_sp_alt - sub rbp,r11 - lea rbp,[((-320))+r9*2+rbp] - jmp NEAR $L$pwrx_sp_done - -ALIGN 32 -$L$pwrx_sp_alt: - lea r10,[((4096-320))+r9*2] - lea rbp,[((-320))+r9*2+rbp] - sub r11,r10 - mov r10,0 - cmovc r11,r10 - sub rbp,r11 -$L$pwrx_sp_done: - and rbp,-64 - mov r11,rsp - sub r11,rbp - and r11,-4096 - lea rsp,[rbp*1+r11] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR $L$pwrx_page_walk - jmp NEAR $L$pwrx_page_walk_done - -$L$pwrx_page_walk: - lea rsp,[((-4096))+rsp] - mov r10,QWORD[rsp] - cmp rsp,rbp - ja NEAR 
$L$pwrx_page_walk -$L$pwrx_page_walk_done: - - mov r10,r9 - neg r9 - - - - - - - - - - - - - pxor xmm0,xmm0 -DB 102,72,15,110,207 -DB 102,72,15,110,209 -DB 102,73,15,110,218 -DB 102,72,15,110,226 - mov QWORD[32+rsp],r8 - mov QWORD[40+rsp],rax - -$L$powerx5_body: - - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - - mov r9,r10 - mov rdi,rsi -DB 102,72,15,126,209 -DB 102,72,15,126,226 - mov rax,QWORD[40+rsp] - - call mulx4x_internal - - mov rsi,QWORD[40+rsp] - - mov rax,1 - - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$powerx5_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_bn_powerx5: - -global bn_sqrx8x_internal - - -ALIGN 32 -bn_sqrx8x_internal: -__bn_sqrx8x_internal: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lea rdi,[((48+8))+rsp] - lea rbp,[r9*1+rsi] - mov QWORD[((0+8))+rsp],r9 - mov QWORD[((8+8))+rsp],rbp - jmp NEAR $L$sqr8x_zero_start - -ALIGN 32 -DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -$L$sqrx8x_zero: -DB 0x3e - movdqa XMMWORD[rdi],xmm0 - movdqa XMMWORD[16+rdi],xmm0 - movdqa XMMWORD[32+rdi],xmm0 - movdqa XMMWORD[48+rdi],xmm0 -$L$sqr8x_zero_start: - movdqa XMMWORD[64+rdi],xmm0 - movdqa XMMWORD[80+rdi],xmm0 - movdqa XMMWORD[96+rdi],xmm0 - movdqa XMMWORD[112+rdi],xmm0 - lea rdi,[128+rdi] - sub r9,64 - jnz NEAR $L$sqrx8x_zero - - mov rdx,QWORD[rsi] - - xor r10,r10 - xor r11,r11 - xor r12,r12 - xor r13,r13 - xor r14,r14 - xor r15,r15 - lea rdi,[((48+8))+rsp] - xor rbp,rbp - jmp NEAR $L$sqrx8x_outer_loop - -ALIGN 32 -$L$sqrx8x_outer_loop: - mulx rax,r8,QWORD[8+rsi] - adcx r8,r9 - adox r10,rax - mulx rax,r9,QWORD[16+rsi] - adcx r9,r10 - adox r11,rax -DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcx r10,r11 - adox r12,rax -DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcx r11,r12 - adox r13,rax - mulx rax,r12,QWORD[40+rsi] - adcx r12,r13 - adox r14,rax - mulx rax,r13,QWORD[48+rsi] - adcx r13,r14 - adox rax,r15 - mulx r15,r14,QWORD[56+rsi] - mov rdx,QWORD[8+rsi] - adcx r14,rax - adox r15,rbp - adc r15,QWORD[64+rdi] - mov QWORD[8+rdi],r8 - mov QWORD[16+rdi],r9 - sbb rcx,rcx - xor rbp,rbp - - - mulx rbx,r8,QWORD[16+rsi] - mulx rax,r9,QWORD[24+rsi] - adcx r8,r10 - adox r9,rbx - mulx rbx,r10,QWORD[32+rsi] - adcx r9,r11 - adox r10,rax -DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcx r10,r12 - adox r11,rbx -DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcx r11,r13 - adox r12,r14 -DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - mov rdx,QWORD[16+rsi] - adcx r12,rax - adox r13,rbx - adcx r13,r15 - adox r14,rbp - adcx r14,rbp - - mov QWORD[24+rdi],r8 - mov QWORD[32+rdi],r9 - - mulx rbx,r8,QWORD[24+rsi] - mulx rax,r9,QWORD[32+rsi] - adcx r8,r10 - adox r9,rbx - mulx rbx,r10,QWORD[40+rsi] - adcx r9,r11 - adox r10,rax -DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcx r10,r12 - adox r11,r13 -DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -DB 0x3e - mov rdx,QWORD[24+rsi] - adcx r11,rbx - adox r12,rax - adcx r12,r14 - mov QWORD[40+rdi],r8 - mov QWORD[48+rdi],r9 - mulx rax,r8,QWORD[32+rsi] - adox r13,rbp - adcx r13,rbp - - mulx rbx,r9,QWORD[40+rsi] - adcx r8,r10 
- adox r9,rax - mulx rax,r10,QWORD[48+rsi] - adcx r9,r11 - adox r10,r12 - mulx r12,r11,QWORD[56+rsi] - mov rdx,QWORD[32+rsi] - mov r14,QWORD[40+rsi] - adcx r10,rbx - adox r11,rax - mov r15,QWORD[48+rsi] - adcx r11,r13 - adox r12,rbp - adcx r12,rbp - - mov QWORD[56+rdi],r8 - mov QWORD[64+rdi],r9 - - mulx rax,r9,r14 - mov r8,QWORD[56+rsi] - adcx r9,r10 - mulx rbx,r10,r15 - adox r10,rax - adcx r10,r11 - mulx rax,r11,r8 - mov rdx,r14 - adox r11,rbx - adcx r11,r12 - - adcx rax,rbp - - mulx rbx,r14,r15 - mulx r13,r12,r8 - mov rdx,r15 - lea rsi,[64+rsi] - adcx r11,r14 - adox r12,rbx - adcx r12,rax - adox r13,rbp - -DB 0x67,0x67 - mulx r14,r8,r8 - adcx r13,r8 - adcx r14,rbp - - cmp rsi,QWORD[((8+8))+rsp] - je NEAR $L$sqrx8x_outer_break - - neg rcx - mov rcx,-8 - mov r15,rbp - mov r8,QWORD[64+rdi] - adcx r9,QWORD[72+rdi] - adcx r10,QWORD[80+rdi] - adcx r11,QWORD[88+rdi] - adc r12,QWORD[96+rdi] - adc r13,QWORD[104+rdi] - adc r14,QWORD[112+rdi] - adc r15,QWORD[120+rdi] - lea rbp,[rsi] - lea rdi,[128+rdi] - sbb rax,rax - - mov rdx,QWORD[((-64))+rsi] - mov QWORD[((16+8))+rsp],rax - mov QWORD[((24+8))+rsp],rdi - - - xor eax,eax - jmp NEAR $L$sqrx8x_loop - -ALIGN 32 -$L$sqrx8x_loop: - mov rbx,r8 - mulx r8,rax,QWORD[rbp] - adcx rbx,rax - adox r8,r9 - - mulx r9,rax,QWORD[8+rbp] - adcx r8,rax - adox r9,r10 - - mulx r10,rax,QWORD[16+rbp] - adcx r9,rax - adox r10,r11 - - mulx r11,rax,QWORD[24+rbp] - adcx r10,rax - adox r11,r12 - -DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcx r11,rax - adox r12,r13 - - mulx r13,rax,QWORD[40+rbp] - adcx r12,rax - adox r13,r14 - - mulx r14,rax,QWORD[48+rbp] - mov QWORD[rcx*8+rdi],rbx - mov ebx,0 - adcx r13,rax - adox r14,r15 - -DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - mov rdx,QWORD[8+rcx*8+rsi] - adcx r14,rax - adox r15,rbx - adcx r15,rbx - -DB 0x67 - inc rcx - jnz NEAR $L$sqrx8x_loop - - lea rbp,[64+rbp] - mov rcx,-8 - cmp rbp,QWORD[((8+8))+rsp] - je NEAR $L$sqrx8x_break - - sub rbx,QWORD[((16+8))+rsp] -DB 0x66 - mov rdx,QWORD[((-64))+rsi] - adcx r8,QWORD[rdi] - adcx r9,QWORD[8+rdi] - adc r10,QWORD[16+rdi] - adc r11,QWORD[24+rdi] - adc r12,QWORD[32+rdi] - adc r13,QWORD[40+rdi] - adc r14,QWORD[48+rdi] - adc r15,QWORD[56+rdi] - lea rdi,[64+rdi] -DB 0x67 - sbb rax,rax - xor ebx,ebx - mov QWORD[((16+8))+rsp],rax - jmp NEAR $L$sqrx8x_loop - -ALIGN 32 -$L$sqrx8x_break: - xor rbp,rbp - sub rbx,QWORD[((16+8))+rsp] - adcx r8,rbp - mov rcx,QWORD[((24+8))+rsp] - adcx r9,rbp - mov rdx,QWORD[rsi] - adc r10,0 - mov QWORD[rdi],r8 - adc r11,0 - adc r12,0 - adc r13,0 - adc r14,0 - adc r15,0 - cmp rdi,rcx - je NEAR $L$sqrx8x_outer_loop - - mov QWORD[8+rdi],r9 - mov r9,QWORD[8+rcx] - mov QWORD[16+rdi],r10 - mov r10,QWORD[16+rcx] - mov QWORD[24+rdi],r11 - mov r11,QWORD[24+rcx] - mov QWORD[32+rdi],r12 - mov r12,QWORD[32+rcx] - mov QWORD[40+rdi],r13 - mov r13,QWORD[40+rcx] - mov QWORD[48+rdi],r14 - mov r14,QWORD[48+rcx] - mov QWORD[56+rdi],r15 - mov r15,QWORD[56+rcx] - mov rdi,rcx - jmp NEAR $L$sqrx8x_outer_loop - -ALIGN 32 -$L$sqrx8x_outer_break: - mov QWORD[72+rdi],r9 -DB 102,72,15,126,217 - mov QWORD[80+rdi],r10 - mov QWORD[88+rdi],r11 - mov QWORD[96+rdi],r12 - mov QWORD[104+rdi],r13 - mov QWORD[112+rdi],r14 - lea rdi,[((48+8))+rsp] - mov rdx,QWORD[rcx*1+rsi] - - mov r11,QWORD[8+rdi] - xor r10,r10 - mov r9,QWORD[((0+8))+rsp] - adox r11,r11 - mov r12,QWORD[16+rdi] - mov r13,QWORD[24+rdi] - - -ALIGN 32 -$L$sqrx4x_shift_n_add: - mulx rbx,rax,rdx - adox r12,r12 - adcx rax,r10 -DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adox r13,r13 - adcx 
rbx,r11 - mov r11,QWORD[40+rdi] - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - - mulx rbx,rax,rdx - adox r10,r10 - adcx rax,r12 - mov rdx,QWORD[16+rcx*1+rsi] - mov r12,QWORD[48+rdi] - adox r11,r11 - adcx rbx,r13 - mov r13,QWORD[56+rdi] - mov QWORD[16+rdi],rax - mov QWORD[24+rdi],rbx - - mulx rbx,rax,rdx - adox r12,r12 - adcx rax,r10 - mov rdx,QWORD[24+rcx*1+rsi] - lea rcx,[32+rcx] - mov r10,QWORD[64+rdi] - adox r13,r13 - adcx rbx,r11 - mov r11,QWORD[72+rdi] - mov QWORD[32+rdi],rax - mov QWORD[40+rdi],rbx - - mulx rbx,rax,rdx - adox r10,r10 - adcx rax,r12 - jrcxz $L$sqrx4x_shift_n_add_break -DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adox r11,r11 - adcx rbx,r13 - mov r12,QWORD[80+rdi] - mov r13,QWORD[88+rdi] - mov QWORD[48+rdi],rax - mov QWORD[56+rdi],rbx - lea rdi,[64+rdi] - nop - jmp NEAR $L$sqrx4x_shift_n_add - -ALIGN 32 -$L$sqrx4x_shift_n_add_break: - adcx rbx,r13 - mov QWORD[48+rdi],rax - mov QWORD[56+rdi],rbx - lea rdi,[64+rdi] -DB 102,72,15,126,213 -__bn_sqrx8x_reduction: - xor eax,eax - mov rbx,QWORD[((32+8))+rsp] - mov rdx,QWORD[((48+8))+rsp] - lea rcx,[((-64))+r9*1+rbp] - - mov QWORD[((0+8))+rsp],rcx - mov QWORD[((8+8))+rsp],rdi - - lea rdi,[((48+8))+rsp] - jmp NEAR $L$sqrx8x_reduction_loop - -ALIGN 32 -$L$sqrx8x_reduction_loop: - mov r9,QWORD[8+rdi] - mov r10,QWORD[16+rdi] - mov r11,QWORD[24+rdi] - mov r12,QWORD[32+rdi] - mov r8,rdx - imul rdx,rbx - mov r13,QWORD[40+rdi] - mov r14,QWORD[48+rdi] - mov r15,QWORD[56+rdi] - mov QWORD[((24+8))+rsp],rax - - lea rdi,[64+rdi] - xor rsi,rsi - mov rcx,-8 - jmp NEAR $L$sqrx8x_reduce - -ALIGN 32 -$L$sqrx8x_reduce: - mov rbx,r8 - mulx r8,rax,QWORD[rbp] - adcx rax,rbx - adox r8,r9 - - mulx r9,rbx,QWORD[8+rbp] - adcx r8,rbx - adox r9,r10 - - mulx r10,rbx,QWORD[16+rbp] - adcx r9,rbx - adox r10,r11 - - mulx r11,rbx,QWORD[24+rbp] - adcx r10,rbx - adox r11,r12 - -DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - mov rax,rdx - mov rdx,r8 - adcx r11,rbx - adox r12,r13 - - mulx rdx,rbx,QWORD[((32+8))+rsp] - mov rdx,rax - mov QWORD[((64+48+8))+rcx*8+rsp],rax - - mulx r13,rax,QWORD[40+rbp] - adcx r12,rax - adox r13,r14 - - mulx r14,rax,QWORD[48+rbp] - adcx r13,rax - adox r14,r15 - - mulx r15,rax,QWORD[56+rbp] - mov rdx,rbx - adcx r14,rax - adox r15,rsi - adcx r15,rsi - -DB 0x67,0x67,0x67 - inc rcx - jnz NEAR $L$sqrx8x_reduce - - mov rax,rsi - cmp rbp,QWORD[((0+8))+rsp] - jae NEAR $L$sqrx8x_no_tail - - mov rdx,QWORD[((48+8))+rsp] - add r8,QWORD[rdi] - lea rbp,[64+rbp] - mov rcx,-8 - adcx r9,QWORD[8+rdi] - adcx r10,QWORD[16+rdi] - adc r11,QWORD[24+rdi] - adc r12,QWORD[32+rdi] - adc r13,QWORD[40+rdi] - adc r14,QWORD[48+rdi] - adc r15,QWORD[56+rdi] - lea rdi,[64+rdi] - sbb rax,rax - - xor rsi,rsi - mov QWORD[((16+8))+rsp],rax - jmp NEAR $L$sqrx8x_tail - -ALIGN 32 -$L$sqrx8x_tail: - mov rbx,r8 - mulx r8,rax,QWORD[rbp] - adcx rbx,rax - adox r8,r9 - - mulx r9,rax,QWORD[8+rbp] - adcx r8,rax - adox r9,r10 - - mulx r10,rax,QWORD[16+rbp] - adcx r9,rax - adox r10,r11 - - mulx r11,rax,QWORD[24+rbp] - adcx r10,rax - adox r11,r12 - -DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcx r11,rax - adox r12,r13 - - mulx r13,rax,QWORD[40+rbp] - adcx r12,rax - adox r13,r14 - - mulx r14,rax,QWORD[48+rbp] - adcx r13,rax - adox r14,r15 - - mulx r15,rax,QWORD[56+rbp] - mov rdx,QWORD[((72+48+8))+rcx*8+rsp] - adcx r14,rax - adox r15,rsi - mov QWORD[rcx*8+rdi],rbx - mov rbx,r8 - adcx r15,rsi - - inc rcx - jnz NEAR $L$sqrx8x_tail - - cmp rbp,QWORD[((0+8))+rsp] - jae NEAR $L$sqrx8x_tail_done - - sub rsi,QWORD[((16+8))+rsp] - mov rdx,QWORD[((48+8))+rsp] - lea rbp,[64+rbp] - adc 
r8,QWORD[rdi] - adc r9,QWORD[8+rdi] - adc r10,QWORD[16+rdi] - adc r11,QWORD[24+rdi] - adc r12,QWORD[32+rdi] - adc r13,QWORD[40+rdi] - adc r14,QWORD[48+rdi] - adc r15,QWORD[56+rdi] - lea rdi,[64+rdi] - sbb rax,rax - sub rcx,8 - - xor rsi,rsi - mov QWORD[((16+8))+rsp],rax - jmp NEAR $L$sqrx8x_tail - -ALIGN 32 -$L$sqrx8x_tail_done: - xor rax,rax - add r8,QWORD[((24+8))+rsp] - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - adc r14,0 - adc r15,0 - adc rax,0 - - sub rsi,QWORD[((16+8))+rsp] -$L$sqrx8x_no_tail: - adc r8,QWORD[rdi] -DB 102,72,15,126,217 - adc r9,QWORD[8+rdi] - mov rsi,QWORD[56+rbp] -DB 102,72,15,126,213 - adc r10,QWORD[16+rdi] - adc r11,QWORD[24+rdi] - adc r12,QWORD[32+rdi] - adc r13,QWORD[40+rdi] - adc r14,QWORD[48+rdi] - adc r15,QWORD[56+rdi] - adc rax,0 - - mov rbx,QWORD[((32+8))+rsp] - mov rdx,QWORD[64+rcx*1+rdi] - - mov QWORD[rdi],r8 - lea r8,[64+rdi] - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - lea rdi,[64+rcx*1+rdi] - cmp r8,QWORD[((8+8))+rsp] - jb NEAR $L$sqrx8x_reduction_loop - DB 0F3h,0C3h ;repret - - -ALIGN 32 -__bn_postx4x_internal: - - mov r12,QWORD[rbp] - mov r10,rcx - mov r9,rcx - neg rax - sar rcx,3+2 - -DB 102,72,15,126,202 -DB 102,72,15,126,206 - dec r12 - mov r13,QWORD[8+rbp] - xor r8,r8 - mov r14,QWORD[16+rbp] - mov r15,QWORD[24+rbp] - jmp NEAR $L$sqrx4x_sub_entry - -ALIGN 16 -$L$sqrx4x_sub: - mov r12,QWORD[rbp] - mov r13,QWORD[8+rbp] - mov r14,QWORD[16+rbp] - mov r15,QWORD[24+rbp] -$L$sqrx4x_sub_entry: - andn r12,r12,rax - lea rbp,[32+rbp] - andn r13,r13,rax - andn r14,r14,rax - andn r15,r15,rax - - neg r8 - adc r12,QWORD[rdi] - adc r13,QWORD[8+rdi] - adc r14,QWORD[16+rdi] - adc r15,QWORD[24+rdi] - mov QWORD[rdx],r12 - lea rdi,[32+rdi] - mov QWORD[8+rdx],r13 - sbb r8,r8 - mov QWORD[16+rdx],r14 - mov QWORD[24+rdx],r15 - lea rdx,[32+rdx] - - inc rcx - jnz NEAR $L$sqrx4x_sub - - neg r9 - - DB 0F3h,0C3h ;repret - - global bn_get_bits5 ALIGN 16 @@ -3797,13 +2419,6 @@ ALIGN 4 DD $L$SEH_begin_bn_power5 wrt ..imagebase DD $L$SEH_end_bn_power5 wrt ..imagebase DD $L$SEH_info_bn_power5 wrt ..imagebase - DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase - DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase - DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase - - DD $L$SEH_begin_bn_powerx5 wrt ..imagebase - DD $L$SEH_end_bn_powerx5 wrt ..imagebase - DD $L$SEH_info_bn_powerx5 wrt ..imagebase DD $L$SEH_begin_bn_gather5 wrt ..imagebase DD $L$SEH_end_bn_gather5 wrt ..imagebase DD $L$SEH_info_bn_gather5 wrt ..imagebase @@ -3825,16 +2440,6 @@ DB 9,0,0,0 DD mul_handler wrt ..imagebase DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase ALIGN 8 -$L$SEH_info_bn_mulx4x_mont_gather5: -DB 9,0,0,0 - DD mul_handler wrt ..imagebase - DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase -ALIGN 8 -$L$SEH_info_bn_powerx5: -DB 9,0,0,0 - DD mul_handler wrt ..imagebase - DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase -ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a DB 0x0b,0x01,0x21,0x00 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm index b35e99bc90b..e5b9c13dbcc 100644 --- 
a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm @@ -2853,10 +2853,6 @@ $L$SEH_begin_ecp_nistz256_ord_mul_mont: - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] - cmp ecx,0x80100 - je NEAR $L$ecp_nistz256_ord_mul_montx push rbp push rbx @@ -3190,10 +3186,6 @@ $L$SEH_begin_ecp_nistz256_ord_sqr_mont: - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] - cmp ecx,0x80100 - je NEAR $L$ecp_nistz256_ord_sqr_montx push rbp push rbx @@ -3478,472 +3470,6 @@ $L$ord_sqr_epilogue: $L$SEH_end_ecp_nistz256_ord_sqr_mont: -ALIGN 32 -ecp_nistz256_ord_mul_montx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_ord_mul_montx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$ecp_nistz256_ord_mul_montx: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - -$L$ord_mulx_body: - - mov rbx,rdx - mov rdx,QWORD[rdx] - mov r9,QWORD[rsi] - mov r10,QWORD[8+rsi] - mov r11,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - lea rsi,[((-128))+rsi] - lea r14,[(($L$ord-128))] - mov r15,QWORD[$L$ordK] - - - mulx r9,r8,r9 - mulx r10,rcx,r10 - mulx r11,rbp,r11 - add r9,rcx - mulx r12,rcx,r12 - mov rdx,r8 - mulx rax,rdx,r15 - adc r10,rbp - adc r11,rcx - adc r12,0 - - - xor r13,r13 - mulx rbp,rcx,QWORD[((0+128))+r14] - adcx r8,rcx - adox r9,rbp - - mulx rbp,rcx,QWORD[((8+128))+r14] - adcx r9,rcx - adox r10,rbp - - mulx rbp,rcx,QWORD[((16+128))+r14] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((24+128))+r14] - mov rdx,QWORD[8+rbx] - adcx r11,rcx - adox r12,rbp - adcx r12,r8 - adox r13,r8 - adc r13,0 - - - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r9,rcx - adox r10,rbp - - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r9 - mulx rax,rdx,r15 - adcx r12,rcx - adox r13,rbp - - adcx r13,r8 - adox r8,r8 - adc r8,0 - - - mulx rbp,rcx,QWORD[((0+128))+r14] - adcx r9,rcx - adox r10,rbp - - mulx rbp,rcx,QWORD[((8+128))+r14] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((16+128))+r14] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((24+128))+r14] - mov rdx,QWORD[16+rbx] - adcx r12,rcx - adox r13,rbp - adcx r13,r9 - adox r8,r9 - adc r8,0 - - - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r12,rcx - adox r13,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r10 - mulx rax,rdx,r15 - adcx r13,rcx - adox r8,rbp - - adcx r8,r9 - adox r9,r9 - adc r9,0 - - - mulx rbp,rcx,QWORD[((0+128))+r14] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((8+128))+r14] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((16+128))+r14] - adcx r12,rcx - adox r13,rbp - - mulx rbp,rcx,QWORD[((24+128))+r14] - mov rdx,QWORD[24+rbx] - adcx r13,rcx - adox r8,rbp - adcx r8,r10 - adox r9,r10 - adc r9,0 - - - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r12,rcx - adox r13,rbp - - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r13,rcx - adox r8,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r11 - mulx rax,rdx,r15 - adcx r8,rcx - adox r9,rbp - - adcx r9,r10 - adox r10,r10 - adc r10,0 - - - mulx rbp,rcx,QWORD[((0+128))+r14] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((8+128))+r14] - adcx r12,rcx - adox 
r13,rbp - - mulx rbp,rcx,QWORD[((16+128))+r14] - adcx r13,rcx - adox r8,rbp - - mulx rbp,rcx,QWORD[((24+128))+r14] - lea r14,[128+r14] - mov rbx,r12 - adcx r8,rcx - adox r9,rbp - mov rdx,r13 - adcx r9,r11 - adox r10,r11 - adc r10,0 - - - - mov rcx,r8 - sub r12,QWORD[r14] - sbb r13,QWORD[8+r14] - sbb r8,QWORD[16+r14] - mov rbp,r9 - sbb r9,QWORD[24+r14] - sbb r10,0 - - cmovc r12,rbx - cmovc r13,rdx - cmovc r8,rcx - cmovc r9,rbp - - mov QWORD[rdi],r12 - mov QWORD[8+rdi],r13 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - - mov r15,QWORD[rsp] - - mov r14,QWORD[8+rsp] - - mov r13,QWORD[16+rsp] - - mov r12,QWORD[24+rsp] - - mov rbx,QWORD[32+rsp] - - mov rbp,QWORD[40+rsp] - - lea rsp,[48+rsp] - -$L$ord_mulx_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_ord_mul_montx: - - -ALIGN 32 -ecp_nistz256_ord_sqr_montx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_ord_sqr_montx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$ecp_nistz256_ord_sqr_montx: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - -$L$ord_sqrx_body: - - mov rbx,rdx - mov rdx,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r15,QWORD[16+rsi] - mov r8,QWORD[24+rsi] - lea rsi,[$L$ord] - jmp NEAR $L$oop_ord_sqrx - -ALIGN 32 -$L$oop_ord_sqrx: - mulx r10,r9,r14 - mulx r11,rcx,r15 - mov rax,rdx -DB 102,73,15,110,206 - mulx r12,rbp,r8 - mov rdx,r14 - add r10,rcx -DB 102,73,15,110,215 - adc r11,rbp - adc r12,0 - xor r13,r13 - - mulx rbp,rcx,r15 - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,r8 - mov rdx,r15 - adcx r12,rcx - adox r13,rbp - adc r13,0 - - mulx r14,rcx,r8 - mov rdx,rax -DB 102,73,15,110,216 - xor r15,r15 - adcx r9,r9 - adox r13,rcx - adcx r10,r10 - adox r14,r15 - - - mulx rbp,r8,rdx -DB 102,72,15,126,202 - adcx r11,r11 - adox r9,rbp - adcx r12,r12 - mulx rax,rcx,rdx -DB 102,72,15,126,210 - adcx r13,r13 - adox r10,rcx - adcx r14,r14 - mulx rbp,rcx,rdx -DB 0x67 -DB 102,72,15,126,218 - adox r11,rax - adcx r15,r15 - adox r12,rcx - adox r13,rbp - mulx rax,rcx,rdx - adox r14,rcx - adox r15,rax - - - mov rdx,r8 - mulx rcx,rdx,QWORD[32+rsi] - - xor rax,rax - mulx rbp,rcx,QWORD[rsi] - adcx r8,rcx - adox r9,rbp - mulx rbp,rcx,QWORD[8+rsi] - adcx r9,rcx - adox r10,rbp - mulx rbp,rcx,QWORD[16+rsi] - adcx r10,rcx - adox r11,rbp - mulx rbp,rcx,QWORD[24+rsi] - adcx r11,rcx - adox r8,rbp - adcx r8,rax - - - mov rdx,r9 - mulx rcx,rdx,QWORD[32+rsi] - - mulx rbp,rcx,QWORD[rsi] - adox r9,rcx - adcx r10,rbp - mulx rbp,rcx,QWORD[8+rsi] - adox r10,rcx - adcx r11,rbp - mulx rbp,rcx,QWORD[16+rsi] - adox r11,rcx - adcx r8,rbp - mulx rbp,rcx,QWORD[24+rsi] - adox r8,rcx - adcx r9,rbp - adox r9,rax - - - mov rdx,r10 - mulx rcx,rdx,QWORD[32+rsi] - - mulx rbp,rcx,QWORD[rsi] - adcx r10,rcx - adox r11,rbp - mulx rbp,rcx,QWORD[8+rsi] - adcx r11,rcx - adox r8,rbp - mulx rbp,rcx,QWORD[16+rsi] - adcx r8,rcx - adox r9,rbp - mulx rbp,rcx,QWORD[24+rsi] - adcx r9,rcx - adox r10,rbp - adcx r10,rax - - - mov rdx,r11 - mulx rcx,rdx,QWORD[32+rsi] - - mulx rbp,rcx,QWORD[rsi] - adox r11,rcx - adcx r8,rbp - mulx rbp,rcx,QWORD[8+rsi] - adox r8,rcx - adcx r9,rbp - mulx rbp,rcx,QWORD[16+rsi] - adox r9,rcx - adcx r10,rbp - mulx rbp,rcx,QWORD[24+rsi] - adox r10,rcx - adcx r11,rbp - adox r11,rax - - - add r12,r8 - adc r9,r13 - mov rdx,r12 - adc r10,r14 - adc r11,r15 - mov r14,r9 - adc rax,0 - - - sub r12,QWORD[rsi] - mov r15,r10 - sbb r9,QWORD[8+rsi] - sbb r10,QWORD[16+rsi] - mov r8,r11 - sbb r11,QWORD[24+rsi] - sbb rax,0 - - 
cmovnc rdx,r12 - cmovnc r14,r9 - cmovnc r15,r10 - cmovnc r8,r11 - - dec rbx - jnz NEAR $L$oop_ord_sqrx - - mov QWORD[rdi],rdx - mov QWORD[8+rdi],r14 - pxor xmm1,xmm1 - mov QWORD[16+rdi],r15 - pxor xmm2,xmm2 - mov QWORD[24+rdi],r8 - pxor xmm3,xmm3 - - mov r15,QWORD[rsp] - - mov r14,QWORD[8+rsp] - - mov r13,QWORD[16+rsp] - - mov r12,QWORD[24+rsp] - - mov rbx,QWORD[32+rsp] - - mov rbp,QWORD[40+rsp] - - lea rsp,[48+rsp] - -$L$ord_sqrx_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_ord_sqr_montx: - - global ecp_nistz256_to_mont @@ -3959,8 +3485,6 @@ $L$SEH_begin_ecp_nistz256_to_mont: - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] lea rdx,[$L$RR] jmp NEAR $L$mul_mont @@ -3986,8 +3510,6 @@ $L$SEH_begin_ecp_nistz256_mul_mont: - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] $L$mul_mont: push rbp @@ -4002,8 +3524,6 @@ $L$mul_mont: push r15 $L$mul_body: - cmp ecx,0x80100 - je NEAR $L$mul_montx mov rbx,rdx mov rax,QWORD[rdx] mov r9,QWORD[rsi] @@ -4012,19 +3532,6 @@ $L$mul_body: mov r12,QWORD[24+rsi] call __ecp_nistz256_mul_montq - jmp NEAR $L$mul_mont_done - -ALIGN 32 -$L$mul_montx: - mov rbx,rdx - mov rdx,QWORD[rdx] - mov r9,QWORD[rsi] - mov r10,QWORD[8+rsi] - mov r11,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - lea rsi,[((-128))+rsi] - - call __ecp_nistz256_mul_montx $L$mul_mont_done: mov r15,QWORD[rsp] @@ -4285,8 +3792,6 @@ $L$SEH_begin_ecp_nistz256_sqr_mont: - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] push rbp push rbx @@ -4300,25 +3805,12 @@ $L$SEH_begin_ecp_nistz256_sqr_mont: push r15 $L$sqr_body: - cmp ecx,0x80100 - je NEAR $L$sqr_montx mov rax,QWORD[rsi] mov r14,QWORD[8+rsi] mov r15,QWORD[16+rsi] mov r8,QWORD[24+rsi] call __ecp_nistz256_sqr_montq - jmp NEAR $L$sqr_mont_done - -ALIGN 32 -$L$sqr_montx: - mov rdx,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r15,QWORD[16+rsi] - mov r8,QWORD[24+rsi] - lea rsi,[((-128))+rsi] - - call __ecp_nistz256_sqr_montx $L$sqr_mont_done: mov r15,QWORD[rsp] @@ -4505,335 +3997,37 @@ __ecp_nistz256_sqr_montq: -ALIGN 32 -__ecp_nistz256_mul_montx: - mulx r9,r8,r9 - mulx r10,rcx,r10 - mov r14,32 - xor r13,r13 - mulx r11,rbp,r11 - mov r15,QWORD[(($L$poly+24))] - adc r9,rcx - mulx r12,rcx,r12 - mov rdx,r8 - adc r10,rbp - shlx rbp,r8,r14 - adc r11,rcx - shrx rcx,r8,r14 - adc r12,0 +global ecp_nistz256_from_mont - add r9,rbp - adc r10,rcx +ALIGN 32 +ecp_nistz256_from_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_from_mont: + mov rdi,rcx + mov rsi,rdx - mulx rbp,rcx,r15 - mov rdx,QWORD[8+rbx] - adc r11,rcx - adc r12,rbp - adc r13,0 - xor r8,r8 + push r12 - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r9,rcx - adox r10,rbp + push r13 - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r10,rcx - adox r11,rbp +$L$from_body: - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r9 - adcx r12,rcx - shlx rcx,r9,r14 - adox r13,rbp - shrx rbp,r9,r14 - - adcx r13,r8 - adox r8,r8 - adc r8,0 - - - - add r10,rcx - adc r11,rbp - - mulx rbp,rcx,r15 - mov rdx,QWORD[16+rbx] - adc r12,rcx - adc r13,rbp - adc r8,0 - xor r9,r9 - - - - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r10,rcx - adox r11,rbp - - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r12,rcx - adox r13,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r10 - adcx r13,rcx - shlx rcx,r10,r14 - adox r8,rbp - shrx rbp,r10,r14 - - adcx r8,r9 - adox r9,r9 
- adc r9,0 - - - - add r11,rcx - adc r12,rbp - - mulx rbp,rcx,r15 - mov rdx,QWORD[24+rbx] - adc r13,rcx - adc r8,rbp - adc r9,0 - xor r10,r10 - - - - mulx rbp,rcx,QWORD[((0+128))+rsi] - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,QWORD[((8+128))+rsi] - adcx r12,rcx - adox r13,rbp - - mulx rbp,rcx,QWORD[((16+128))+rsi] - adcx r13,rcx - adox r8,rbp - - mulx rbp,rcx,QWORD[((24+128))+rsi] - mov rdx,r11 - adcx r8,rcx - shlx rcx,r11,r14 - adox r9,rbp - shrx rbp,r11,r14 - - adcx r9,r10 - adox r10,r10 - adc r10,0 - - - - add r12,rcx - adc r13,rbp - - mulx rbp,rcx,r15 - mov rbx,r12 - mov r14,QWORD[(($L$poly+8))] - adc r8,rcx - mov rdx,r13 - adc r9,rbp - adc r10,0 - - - - xor eax,eax - mov rcx,r8 - sbb r12,-1 - sbb r13,r14 - sbb r8,0 - mov rbp,r9 - sbb r9,r15 - sbb r10,0 - - cmovc r12,rbx - cmovc r13,rdx - mov QWORD[rdi],r12 - cmovc r8,rcx - mov QWORD[8+rdi],r13 - cmovc r9,rbp - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - - DB 0F3h,0C3h ;repret - - - - -ALIGN 32 -__ecp_nistz256_sqr_montx: - - mulx r10,r9,r14 - mulx r11,rcx,r15 - xor eax,eax - adc r10,rcx - mulx r12,rbp,r8 - mov rdx,r14 - adc r11,rbp - adc r12,0 - xor r13,r13 - - - mulx rbp,rcx,r15 - adcx r11,rcx - adox r12,rbp - - mulx rbp,rcx,r8 - mov rdx,r15 - adcx r12,rcx - adox r13,rbp - adc r13,0 - - - mulx r14,rcx,r8 - mov rdx,QWORD[((0+128))+rsi] - xor r15,r15 - adcx r9,r9 - adox r13,rcx - adcx r10,r10 - adox r14,r15 - - mulx rbp,r8,rdx - mov rdx,QWORD[((8+128))+rsi] - adcx r11,r11 - adox r9,rbp - adcx r12,r12 - mulx rax,rcx,rdx - mov rdx,QWORD[((16+128))+rsi] - adcx r13,r13 - adox r10,rcx - adcx r14,r14 -DB 0x67 - mulx rbp,rcx,rdx - mov rdx,QWORD[((24+128))+rsi] - adox r11,rax - adcx r15,r15 - adox r12,rcx - mov rsi,32 - adox r13,rbp -DB 0x67,0x67 - mulx rax,rcx,rdx - mov rdx,QWORD[(($L$poly+24))] - adox r14,rcx - shlx rcx,r8,rsi - adox r15,rax - shrx rax,r8,rsi - mov rbp,rdx - - - add r9,rcx - adc r10,rax - - mulx r8,rcx,r8 - adc r11,rcx - shlx rcx,r9,rsi - adc r8,0 - shrx rax,r9,rsi - - - add r10,rcx - adc r11,rax - - mulx r9,rcx,r9 - adc r8,rcx - shlx rcx,r10,rsi - adc r9,0 - shrx rax,r10,rsi - - - add r11,rcx - adc r8,rax - - mulx r10,rcx,r10 - adc r9,rcx - shlx rcx,r11,rsi - adc r10,0 - shrx rax,r11,rsi - - - add r8,rcx - adc r9,rax - - mulx r11,rcx,r11 - adc r10,rcx - adc r11,0 - - xor rdx,rdx - add r12,r8 - mov rsi,QWORD[(($L$poly+8))] - adc r13,r9 - mov r8,r12 - adc r14,r10 - adc r15,r11 - mov r9,r13 - adc rdx,0 - - sub r12,-1 - mov r10,r14 - sbb r13,rsi - sbb r14,0 - mov r11,r15 - sbb r15,rbp - sbb rdx,0 - - cmovc r12,r8 - cmovc r13,r9 - mov QWORD[rdi],r12 - cmovc r14,r10 - mov QWORD[8+rdi],r13 - cmovc r15,r11 - mov QWORD[16+rdi],r14 - mov QWORD[24+rdi],r15 - - DB 0F3h,0C3h ;repret - - - - - - - - -global ecp_nistz256_from_mont - -ALIGN 32 -ecp_nistz256_from_mont: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_from_mont: - mov rdi,rcx - mov rsi,rdx - - - - push r12 - - push r13 - -$L$from_body: - - mov rax,QWORD[rsi] - mov r13,QWORD[(($L$poly+24))] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - mov r8,rax - mov r12,QWORD[(($L$poly+8))] + mov rax,QWORD[rsi] + mov r13,QWORD[(($L$poly+24))] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + mov r8,rax + mov r12,QWORD[(($L$poly+8))] @@ -4951,9 +4145,6 @@ global ecp_nistz256_gather_w5 ALIGN 32 ecp_nistz256_gather_w5: - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - test eax,32 - jnz NEAR $L$avx2_gather_w5 lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_gather_w5: DB 0x48,0x8d,0x60,0xe0 @@ 
-5061,9 +4252,6 @@ global ecp_nistz256_gather_w7 ALIGN 32 ecp_nistz256_gather_w7: - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - test eax,32 - jnz NEAR $L$avx2_gather_w7 lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_gather_w7: DB 0x48,0x8d,0x60,0xe0 @@ -5102,1359 +4290,61 @@ $L$select_loop_sse_w7: pand xmm9,xmm15 pand xmm10,xmm15 por xmm2,xmm9 - pand xmm11,xmm15 - por xmm3,xmm10 - pand xmm12,xmm15 - por xmm4,xmm11 - prefetcht0 [255+rdx] - por xmm5,xmm12 - - dec rax - jnz NEAR $L$select_loop_sse_w7 - - movdqu XMMWORD[rcx],xmm2 - movdqu XMMWORD[16+rcx],xmm3 - movdqu XMMWORD[32+rcx],xmm4 - movdqu XMMWORD[48+rcx],xmm5 - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_gather_w7: - - - - -ALIGN 32 -ecp_nistz256_avx2_gather_w5: - -$L$avx2_gather_w5: - vzeroupper - lea rax,[((-136))+rsp] - mov r11,rsp -$L$SEH_begin_ecp_nistz256_avx2_gather_w5: -DB 0x48,0x8d,0x60,0xe0 -DB 0xc5,0xf8,0x29,0x70,0xe0 -DB 0xc5,0xf8,0x29,0x78,0xf0 -DB 0xc5,0x78,0x29,0x40,0x00 -DB 0xc5,0x78,0x29,0x48,0x10 -DB 0xc5,0x78,0x29,0x50,0x20 -DB 0xc5,0x78,0x29,0x58,0x30 -DB 0xc5,0x78,0x29,0x60,0x40 -DB 0xc5,0x78,0x29,0x68,0x50 -DB 0xc5,0x78,0x29,0x70,0x60 -DB 0xc5,0x78,0x29,0x78,0x70 - vmovdqa ymm0,YMMWORD[$L$Two] - - vpxor ymm2,ymm2,ymm2 - vpxor ymm3,ymm3,ymm3 - vpxor ymm4,ymm4,ymm4 - - vmovdqa ymm5,YMMWORD[$L$One] - vmovdqa ymm10,YMMWORD[$L$Two] - - vmovd xmm1,r8d - vpermd ymm1,ymm2,ymm1 - - mov rax,8 -$L$select_loop_avx2_w5: - - vmovdqa ymm6,YMMWORD[rdx] - vmovdqa ymm7,YMMWORD[32+rdx] - vmovdqa ymm8,YMMWORD[64+rdx] - - vmovdqa ymm11,YMMWORD[96+rdx] - vmovdqa ymm12,YMMWORD[128+rdx] - vmovdqa ymm13,YMMWORD[160+rdx] - - vpcmpeqd ymm9,ymm5,ymm1 - vpcmpeqd ymm14,ymm10,ymm1 - - vpaddd ymm5,ymm5,ymm0 - vpaddd ymm10,ymm10,ymm0 - lea rdx,[192+rdx] - - vpand ymm6,ymm6,ymm9 - vpand ymm7,ymm7,ymm9 - vpand ymm8,ymm8,ymm9 - vpand ymm11,ymm11,ymm14 - vpand ymm12,ymm12,ymm14 - vpand ymm13,ymm13,ymm14 - - vpxor ymm2,ymm2,ymm6 - vpxor ymm3,ymm3,ymm7 - vpxor ymm4,ymm4,ymm8 - vpxor ymm2,ymm2,ymm11 - vpxor ymm3,ymm3,ymm12 - vpxor ymm4,ymm4,ymm13 - - dec rax - jnz NEAR $L$select_loop_avx2_w5 - - vmovdqu YMMWORD[rcx],ymm2 - vmovdqu YMMWORD[32+rcx],ymm3 - vmovdqu YMMWORD[64+rcx],ymm4 - vzeroupper - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - lea rsp,[r11] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_avx2_gather_w5: - - - - -global ecp_nistz256_avx2_gather_w7 - -ALIGN 32 -ecp_nistz256_avx2_gather_w7: - -$L$avx2_gather_w7: - vzeroupper - mov r11,rsp - lea rax,[((-136))+rsp] -$L$SEH_begin_ecp_nistz256_avx2_gather_w7: -DB 0x48,0x8d,0x60,0xe0 -DB 0xc5,0xf8,0x29,0x70,0xe0 -DB 0xc5,0xf8,0x29,0x78,0xf0 -DB 0xc5,0x78,0x29,0x40,0x00 -DB 0xc5,0x78,0x29,0x48,0x10 -DB 0xc5,0x78,0x29,0x50,0x20 -DB 0xc5,0x78,0x29,0x58,0x30 -DB 0xc5,0x78,0x29,0x60,0x40 -DB 0xc5,0x78,0x29,0x68,0x50 -DB 0xc5,0x78,0x29,0x70,0x60 -DB 0xc5,0x78,0x29,0x78,0x70 - vmovdqa ymm0,YMMWORD[$L$Three] - - vpxor ymm2,ymm2,ymm2 - vpxor ymm3,ymm3,ymm3 - - vmovdqa ymm4,YMMWORD[$L$One] - vmovdqa ymm8,YMMWORD[$L$Two] - 
vmovdqa ymm12,YMMWORD[$L$Three] - - vmovd xmm1,r8d - vpermd ymm1,ymm2,ymm1 - - - mov rax,21 -$L$select_loop_avx2_w7: - - vmovdqa ymm5,YMMWORD[rdx] - vmovdqa ymm6,YMMWORD[32+rdx] - - vmovdqa ymm9,YMMWORD[64+rdx] - vmovdqa ymm10,YMMWORD[96+rdx] - - vmovdqa ymm13,YMMWORD[128+rdx] - vmovdqa ymm14,YMMWORD[160+rdx] - - vpcmpeqd ymm7,ymm4,ymm1 - vpcmpeqd ymm11,ymm8,ymm1 - vpcmpeqd ymm15,ymm12,ymm1 - - vpaddd ymm4,ymm4,ymm0 - vpaddd ymm8,ymm8,ymm0 - vpaddd ymm12,ymm12,ymm0 - lea rdx,[192+rdx] - - vpand ymm5,ymm5,ymm7 - vpand ymm6,ymm6,ymm7 - vpand ymm9,ymm9,ymm11 - vpand ymm10,ymm10,ymm11 - vpand ymm13,ymm13,ymm15 - vpand ymm14,ymm14,ymm15 - - vpxor ymm2,ymm2,ymm5 - vpxor ymm3,ymm3,ymm6 - vpxor ymm2,ymm2,ymm9 - vpxor ymm3,ymm3,ymm10 - vpxor ymm2,ymm2,ymm13 - vpxor ymm3,ymm3,ymm14 - - dec rax - jnz NEAR $L$select_loop_avx2_w7 - - - vmovdqa ymm5,YMMWORD[rdx] - vmovdqa ymm6,YMMWORD[32+rdx] - - vpcmpeqd ymm7,ymm4,ymm1 - - vpand ymm5,ymm5,ymm7 - vpand ymm6,ymm6,ymm7 - - vpxor ymm2,ymm2,ymm5 - vpxor ymm3,ymm3,ymm6 - - vmovdqu YMMWORD[rcx],ymm2 - vmovdqu YMMWORD[32+rcx],ymm3 - vzeroupper - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - lea rsp,[r11] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_avx2_gather_w7: - - -ALIGN 32 -__ecp_nistz256_add_toq: - - xor r11,r11 - add r12,QWORD[rbx] - adc r13,QWORD[8+rbx] - mov rax,r12 - adc r8,QWORD[16+rbx] - adc r9,QWORD[24+rbx] - mov rbp,r13 - adc r11,0 - - sub r12,-1 - mov rcx,r8 - sbb r13,r14 - sbb r8,0 - mov r10,r9 - sbb r9,r15 - sbb r11,0 - - cmovc r12,rax - cmovc r13,rbp - mov QWORD[rdi],r12 - cmovc r8,rcx - mov QWORD[8+rdi],r13 - cmovc r9,r10 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - - DB 0F3h,0C3h ;repret - - - - -ALIGN 32 -__ecp_nistz256_sub_fromq: - - sub r12,QWORD[rbx] - sbb r13,QWORD[8+rbx] - mov rax,r12 - sbb r8,QWORD[16+rbx] - sbb r9,QWORD[24+rbx] - mov rbp,r13 - sbb r11,r11 - - add r12,-1 - mov rcx,r8 - adc r13,r14 - adc r8,0 - mov r10,r9 - adc r9,r15 - test r11,r11 - - cmovz r12,rax - cmovz r13,rbp - mov QWORD[rdi],r12 - cmovz r8,rcx - mov QWORD[8+rdi],r13 - cmovz r9,r10 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - - DB 0F3h,0C3h ;repret - - - - -ALIGN 32 -__ecp_nistz256_subq: - - sub rax,r12 - sbb rbp,r13 - mov r12,rax - sbb rcx,r8 - sbb r10,r9 - mov r13,rbp - sbb r11,r11 - - add rax,-1 - mov r8,rcx - adc rbp,r14 - adc rcx,0 - mov r9,r10 - adc r10,r15 - test r11,r11 - - cmovnz r12,rax - cmovnz r13,rbp - cmovnz r8,rcx - cmovnz r9,r10 - - DB 0F3h,0C3h ;repret - - - - -ALIGN 32 -__ecp_nistz256_mul_by_2q: - - xor r11,r11 - add r12,r12 - adc r13,r13 - mov rax,r12 - adc r8,r8 - adc r9,r9 - mov rbp,r13 - adc r11,0 - - sub r12,-1 - mov rcx,r8 - sbb r13,r14 - sbb r8,0 - mov r10,r9 - sbb r9,r15 - sbb r11,0 - - cmovc r12,rax - cmovc r13,rbp - mov QWORD[rdi],r12 - cmovc r8,rcx - mov QWORD[8+rdi],r13 - cmovc r9,r10 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - - DB 0F3h,0C3h ;repret - - -global ecp_nistz256_point_double - -ALIGN 32 -ecp_nistz256_point_double: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_double: - mov rdi,rcx - mov rsi,rdx - - - - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] - cmp ecx,0x80100 - je NEAR $L$point_doublex - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub 
rsp,32*5+8 - -$L$point_doubleq_body: - -$L$point_double_shortcutq: - movdqu xmm0,XMMWORD[rsi] - mov rbx,rsi - movdqu xmm1,XMMWORD[16+rsi] - mov r12,QWORD[((32+0))+rsi] - mov r13,QWORD[((32+8))+rsi] - mov r8,QWORD[((32+16))+rsi] - mov r9,QWORD[((32+24))+rsi] - mov r14,QWORD[(($L$poly+8))] - mov r15,QWORD[(($L$poly+24))] - movdqa XMMWORD[96+rsp],xmm0 - movdqa XMMWORD[(96+16)+rsp],xmm1 - lea r10,[32+rdi] - lea r11,[64+rdi] -DB 102,72,15,110,199 -DB 102,73,15,110,202 -DB 102,73,15,110,211 - - lea rdi,[rsp] - call __ecp_nistz256_mul_by_2q - - mov rax,QWORD[((64+0))+rsi] - mov r14,QWORD[((64+8))+rsi] - mov r15,QWORD[((64+16))+rsi] - mov r8,QWORD[((64+24))+rsi] - lea rsi,[((64-0))+rsi] - lea rdi,[64+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[((0+0))+rsp] - mov r14,QWORD[((8+0))+rsp] - lea rsi,[((0+0))+rsp] - mov r15,QWORD[((16+0))+rsp] - mov r8,QWORD[((24+0))+rsp] - lea rdi,[rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[32+rbx] - mov r9,QWORD[((64+0))+rbx] - mov r10,QWORD[((64+8))+rbx] - mov r11,QWORD[((64+16))+rbx] - mov r12,QWORD[((64+24))+rbx] - lea rsi,[((64-0))+rbx] - lea rbx,[32+rbx] -DB 102,72,15,126,215 - call __ecp_nistz256_mul_montq - call __ecp_nistz256_mul_by_2q - - mov r12,QWORD[((96+0))+rsp] - mov r13,QWORD[((96+8))+rsp] - lea rbx,[64+rsp] - mov r8,QWORD[((96+16))+rsp] - mov r9,QWORD[((96+24))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_add_toq - - mov r12,QWORD[((96+0))+rsp] - mov r13,QWORD[((96+8))+rsp] - lea rbx,[64+rsp] - mov r8,QWORD[((96+16))+rsp] - mov r9,QWORD[((96+24))+rsp] - lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[((0+0))+rsp] - mov r14,QWORD[((8+0))+rsp] - lea rsi,[((0+0))+rsp] - mov r15,QWORD[((16+0))+rsp] - mov r8,QWORD[((24+0))+rsp] -DB 102,72,15,126,207 - call __ecp_nistz256_sqr_montq - xor r9,r9 - mov rax,r12 - add r12,-1 - mov r10,r13 - adc r13,rsi - mov rcx,r14 - adc r14,0 - mov r8,r15 - adc r15,rbp - adc r9,0 - xor rsi,rsi - test rax,1 - - cmovz r12,rax - cmovz r13,r10 - cmovz r14,rcx - cmovz r15,r8 - cmovz r9,rsi - - mov rax,r13 - shr r12,1 - shl rax,63 - mov r10,r14 - shr r13,1 - or r12,rax - shl r10,63 - mov rcx,r15 - shr r14,1 - or r13,r10 - shl rcx,63 - mov QWORD[rdi],r12 - shr r15,1 - mov QWORD[8+rdi],r13 - shl r9,63 - or r14,rcx - or r15,r9 - mov QWORD[16+rdi],r14 - mov QWORD[24+rdi],r15 - mov rax,QWORD[64+rsp] - lea rbx,[64+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_mul_montq - - lea rdi,[128+rsp] - call __ecp_nistz256_mul_by_2q - - lea rbx,[32+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_add_toq - - mov rax,QWORD[96+rsp] - lea rbx,[96+rsp] - mov r9,QWORD[((0+0))+rsp] - mov r10,QWORD[((8+0))+rsp] - lea rsi,[((0+0))+rsp] - mov r11,QWORD[((16+0))+rsp] - mov r12,QWORD[((24+0))+rsp] - lea rdi,[rsp] - call __ecp_nistz256_mul_montq - - lea rdi,[128+rsp] - call __ecp_nistz256_mul_by_2q - - mov rax,QWORD[((0+32))+rsp] - mov r14,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r15,QWORD[((16+32))+rsp] - mov r8,QWORD[((24+32))+rsp] -DB 102,72,15,126,199 - call __ecp_nistz256_sqr_montq - - lea rbx,[128+rsp] - mov r8,r14 - mov r9,r15 - mov r14,rsi - mov r15,rbp - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[((0+0))+rsp] - mov rbp,QWORD[((0+8))+rsp] - mov rcx,QWORD[((0+16))+rsp] - mov r10,QWORD[((0+24))+rsp] - lea rdi,[rsp] - call __ecp_nistz256_subq - - mov rax,QWORD[32+rsp] - lea rbx,[32+rsp] - mov r14,r12 - xor ecx,ecx - mov QWORD[((0+0))+rsp],r12 - mov r10,r13 - 
mov QWORD[((0+8))+rsp],r13 - cmovz r11,r8 - mov QWORD[((0+16))+rsp],r8 - lea rsi,[((0-0))+rsp] - cmovz r12,r9 - mov QWORD[((0+24))+rsp],r9 - mov r9,r14 - lea rdi,[rsp] - call __ecp_nistz256_mul_montq - -DB 102,72,15,126,203 -DB 102,72,15,126,207 - call __ecp_nistz256_sub_fromq - - lea rsi,[((160+56))+rsp] - - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbx,QWORD[((-16))+rsi] - - mov rbp,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$point_doubleq_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_point_double: -global ecp_nistz256_point_add - -ALIGN 32 -ecp_nistz256_point_add: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_add: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] - cmp ecx,0x80100 - je NEAR $L$point_addx - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,32*18+8 - -$L$point_addq_body: - - movdqu xmm0,XMMWORD[rsi] - movdqu xmm1,XMMWORD[16+rsi] - movdqu xmm2,XMMWORD[32+rsi] - movdqu xmm3,XMMWORD[48+rsi] - movdqu xmm4,XMMWORD[64+rsi] - movdqu xmm5,XMMWORD[80+rsi] - mov rbx,rsi - mov rsi,rdx - movdqa XMMWORD[384+rsp],xmm0 - movdqa XMMWORD[(384+16)+rsp],xmm1 - movdqa XMMWORD[416+rsp],xmm2 - movdqa XMMWORD[(416+16)+rsp],xmm3 - movdqa XMMWORD[448+rsp],xmm4 - movdqa XMMWORD[(448+16)+rsp],xmm5 - por xmm5,xmm4 - - movdqu xmm0,XMMWORD[rsi] - pshufd xmm3,xmm5,0xb1 - movdqu xmm1,XMMWORD[16+rsi] - movdqu xmm2,XMMWORD[32+rsi] - por xmm5,xmm3 - movdqu xmm3,XMMWORD[48+rsi] - mov rax,QWORD[((64+0))+rsi] - mov r14,QWORD[((64+8))+rsi] - mov r15,QWORD[((64+16))+rsi] - mov r8,QWORD[((64+24))+rsi] - movdqa XMMWORD[480+rsp],xmm0 - pshufd xmm4,xmm5,0x1e - movdqa XMMWORD[(480+16)+rsp],xmm1 - movdqu xmm0,XMMWORD[64+rsi] - movdqu xmm1,XMMWORD[80+rsi] - movdqa XMMWORD[512+rsp],xmm2 - movdqa XMMWORD[(512+16)+rsp],xmm3 - por xmm5,xmm4 - pxor xmm4,xmm4 - por xmm1,xmm0 -DB 102,72,15,110,199 - - lea rsi,[((64-0))+rsi] - mov QWORD[((544+0))+rsp],rax - mov QWORD[((544+8))+rsp],r14 - mov QWORD[((544+16))+rsp],r15 - mov QWORD[((544+24))+rsp],r8 - lea rdi,[96+rsp] - call __ecp_nistz256_sqr_montq - - pcmpeqd xmm5,xmm4 - pshufd xmm4,xmm1,0xb1 - por xmm4,xmm1 - pshufd xmm5,xmm5,0 - pshufd xmm3,xmm4,0x1e - por xmm4,xmm3 - pxor xmm3,xmm3 - pcmpeqd xmm4,xmm3 - pshufd xmm4,xmm4,0 - mov rax,QWORD[((64+0))+rbx] - mov r14,QWORD[((64+8))+rbx] - mov r15,QWORD[((64+16))+rbx] - mov r8,QWORD[((64+24))+rbx] -DB 102,72,15,110,203 - - lea rsi,[((64-0))+rbx] - lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[544+rsp] - lea rbx,[544+rsp] - mov r9,QWORD[((0+96))+rsp] - mov r10,QWORD[((8+96))+rsp] - lea rsi,[((0+96))+rsp] - mov r11,QWORD[((16+96))+rsp] - mov r12,QWORD[((24+96))+rsp] - lea rdi,[224+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[448+rsp] - lea rbx,[448+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[256+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[416+rsp] - lea rbx,[416+rsp] - mov r9,QWORD[((0+224))+rsp] - mov r10,QWORD[((8+224))+rsp] - lea rsi,[((0+224))+rsp] - mov r11,QWORD[((16+224))+rsp] - mov r12,QWORD[((24+224))+rsp] - lea rdi,[224+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[512+rsp] - lea rbx,[512+rsp] - mov r9,QWORD[((0+256))+rsp] - mov 
r10,QWORD[((8+256))+rsp] - lea rsi,[((0+256))+rsp] - mov r11,QWORD[((16+256))+rsp] - mov r12,QWORD[((24+256))+rsp] - lea rdi,[256+rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[224+rsp] - lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromq - - or r12,r13 - movdqa xmm2,xmm4 - or r12,r8 - or r12,r9 - por xmm2,xmm5 -DB 102,73,15,110,220 - - mov rax,QWORD[384+rsp] - lea rbx,[384+rsp] - mov r9,QWORD[((0+96))+rsp] - mov r10,QWORD[((8+96))+rsp] - lea rsi,[((0+96))+rsp] - mov r11,QWORD[((16+96))+rsp] - mov r12,QWORD[((24+96))+rsp] - lea rdi,[160+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[480+rsp] - lea rbx,[480+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[192+rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[160+rsp] - lea rdi,[rsp] - call __ecp_nistz256_sub_fromq - - or r12,r13 - or r12,r8 - or r12,r9 - -DB 102,73,15,126,208 -DB 102,73,15,126,217 - - or r12,r8 - or r12,r9 - - -DB 0x3e - jnz NEAR $L$add_proceedq - -$L$add_doubleq: -DB 102,72,15,126,206 -DB 102,72,15,126,199 - add rsp,416 - - jmp NEAR $L$point_double_shortcutq - - -ALIGN 32 -$L$add_proceedq: - mov rax,QWORD[((0+64))+rsp] - mov r14,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r15,QWORD[((16+64))+rsp] - mov r8,QWORD[((24+64))+rsp] - lea rdi,[96+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[448+rsp] - lea rbx,[448+rsp] - mov r9,QWORD[((0+0))+rsp] - mov r10,QWORD[((8+0))+rsp] - lea rsi,[((0+0))+rsp] - mov r11,QWORD[((16+0))+rsp] - mov r12,QWORD[((24+0))+rsp] - lea rdi,[352+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[((0+0))+rsp] - mov r14,QWORD[((8+0))+rsp] - lea rsi,[((0+0))+rsp] - mov r15,QWORD[((16+0))+rsp] - mov r8,QWORD[((24+0))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[544+rsp] - lea rbx,[544+rsp] - mov r9,QWORD[((0+352))+rsp] - mov r10,QWORD[((8+352))+rsp] - lea rsi,[((0+352))+rsp] - mov r11,QWORD[((16+352))+rsp] - mov r12,QWORD[((24+352))+rsp] - lea rdi,[352+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[rsp] - lea rbx,[rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[128+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[160+rsp] - lea rbx,[160+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[192+rsp] - call __ecp_nistz256_mul_montq - - - - - xor r11,r11 - add r12,r12 - lea rsi,[96+rsp] - adc r13,r13 - mov rax,r12 - adc r8,r8 - adc r9,r9 - mov rbp,r13 - adc r11,0 - - sub r12,-1 - mov rcx,r8 - sbb r13,r14 - sbb r8,0 - mov r10,r9 - sbb r9,r15 - sbb r11,0 - - cmovc r12,rax - mov rax,QWORD[rsi] - cmovc r13,rbp - mov rbp,QWORD[8+rsi] - cmovc r8,rcx - mov rcx,QWORD[16+rsi] - cmovc r9,r10 - mov r10,QWORD[24+rsi] - - call __ecp_nistz256_subq - - lea rbx,[128+rsp] - lea rdi,[288+rsp] - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[((192+0))+rsp] - mov rbp,QWORD[((192+8))+rsp] - mov rcx,QWORD[((192+16))+rsp] - mov r10,QWORD[((192+24))+rsp] - lea rdi,[320+rsp] - - call __ecp_nistz256_subq - - mov QWORD[rdi],r12 - mov QWORD[8+rdi],r13 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - mov rax,QWORD[128+rsp] - lea rbx,[128+rsp] - mov r9,QWORD[((0+224))+rsp] - mov r10,QWORD[((8+224))+rsp] - lea rsi,[((0+224))+rsp] - mov r11,QWORD[((16+224))+rsp] - mov r12,QWORD[((24+224))+rsp] - lea rdi,[256+rsp] - call 
__ecp_nistz256_mul_montq - - mov rax,QWORD[320+rsp] - lea rbx,[320+rsp] - mov r9,QWORD[((0+64))+rsp] - mov r10,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r11,QWORD[((16+64))+rsp] - mov r12,QWORD[((24+64))+rsp] - lea rdi,[320+rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[256+rsp] - lea rdi,[320+rsp] - call __ecp_nistz256_sub_fromq - -DB 102,72,15,126,199 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[352+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((352+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[544+rsp] - pand xmm3,XMMWORD[((544+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[448+rsp] - pand xmm3,XMMWORD[((448+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[64+rdi],xmm2 - movdqu XMMWORD[80+rdi],xmm3 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[288+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((288+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[480+rsp] - pand xmm3,XMMWORD[((480+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[384+rsp] - pand xmm3,XMMWORD[((384+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[rdi],xmm2 - movdqu XMMWORD[16+rdi],xmm3 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[320+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((320+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[512+rsp] - pand xmm3,XMMWORD[((512+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[416+rsp] - pand xmm3,XMMWORD[((416+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[32+rdi],xmm2 - movdqu XMMWORD[48+rdi],xmm3 - -$L$add_doneq: - lea rsi,[((576+56))+rsp] - - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbx,QWORD[((-16))+rsi] - - mov rbp,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$point_addq_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_ecp_nistz256_point_add: -global ecp_nistz256_point_add_affine - -ALIGN 32 -ecp_nistz256_point_add_affine: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_add_affine: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - - mov ecx,0x80100 - and ecx,DWORD[((OPENSSL_ia32cap_P+8))] - cmp ecx,0x80100 - je NEAR $L$point_add_affinex - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,32*15+8 - -$L$add_affineq_body: - - movdqu xmm0,XMMWORD[rsi] - mov rbx,rdx - movdqu xmm1,XMMWORD[16+rsi] - movdqu xmm2,XMMWORD[32+rsi] - movdqu xmm3,XMMWORD[48+rsi] - movdqu xmm4,XMMWORD[64+rsi] - movdqu xmm5,XMMWORD[80+rsi] - mov rax,QWORD[((64+0))+rsi] - mov r14,QWORD[((64+8))+rsi] - mov r15,QWORD[((64+16))+rsi] - mov r8,QWORD[((64+24))+rsi] - movdqa XMMWORD[320+rsp],xmm0 - movdqa XMMWORD[(320+16)+rsp],xmm1 - movdqa XMMWORD[352+rsp],xmm2 - movdqa XMMWORD[(352+16)+rsp],xmm3 - movdqa XMMWORD[384+rsp],xmm4 - movdqa XMMWORD[(384+16)+rsp],xmm5 - por xmm5,xmm4 - - movdqu xmm0,XMMWORD[rbx] - pshufd xmm3,xmm5,0xb1 - movdqu xmm1,XMMWORD[16+rbx] - movdqu xmm2,XMMWORD[32+rbx] - por xmm5,xmm3 - movdqu xmm3,XMMWORD[48+rbx] - movdqa XMMWORD[416+rsp],xmm0 - pshufd xmm4,xmm5,0x1e - movdqa 
XMMWORD[(416+16)+rsp],xmm1 - por xmm1,xmm0 -DB 102,72,15,110,199 - movdqa XMMWORD[448+rsp],xmm2 - movdqa XMMWORD[(448+16)+rsp],xmm3 - por xmm3,xmm2 - por xmm5,xmm4 - pxor xmm4,xmm4 - por xmm3,xmm1 - - lea rsi,[((64-0))+rsi] - lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montq - - pcmpeqd xmm5,xmm4 - pshufd xmm4,xmm3,0xb1 - mov rax,QWORD[rbx] - - mov r9,r12 - por xmm4,xmm3 - pshufd xmm5,xmm5,0 - pshufd xmm3,xmm4,0x1e - mov r10,r13 - por xmm4,xmm3 - pxor xmm3,xmm3 - mov r11,r14 - pcmpeqd xmm4,xmm3 - pshufd xmm4,xmm4,0 - - lea rsi,[((32-0))+rsp] - mov r12,r15 - lea rdi,[rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[320+rsp] - lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[384+rsp] - lea rbx,[384+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[384+rsp] - lea rbx,[384+rsp] - mov r9,QWORD[((0+64))+rsp] - mov r10,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r11,QWORD[((16+64))+rsp] - mov r12,QWORD[((24+64))+rsp] - lea rdi,[288+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[448+rsp] - lea rbx,[448+rsp] - mov r9,QWORD[((0+32))+rsp] - mov r10,QWORD[((8+32))+rsp] - lea rsi,[((0+32))+rsp] - mov r11,QWORD[((16+32))+rsp] - mov r12,QWORD[((24+32))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[352+rsp] - lea rdi,[96+rsp] - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[((0+64))+rsp] - mov r14,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r15,QWORD[((16+64))+rsp] - mov r8,QWORD[((24+64))+rsp] - lea rdi,[128+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[((0+96))+rsp] - mov r14,QWORD[((8+96))+rsp] - lea rsi,[((0+96))+rsp] - mov r15,QWORD[((16+96))+rsp] - mov r8,QWORD[((24+96))+rsp] - lea rdi,[192+rsp] - call __ecp_nistz256_sqr_montq - - mov rax,QWORD[128+rsp] - lea rbx,[128+rsp] - mov r9,QWORD[((0+64))+rsp] - mov r10,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r11,QWORD[((16+64))+rsp] - mov r12,QWORD[((24+64))+rsp] - lea rdi,[160+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[320+rsp] - lea rbx,[320+rsp] - mov r9,QWORD[((0+128))+rsp] - mov r10,QWORD[((8+128))+rsp] - lea rsi,[((0+128))+rsp] - mov r11,QWORD[((16+128))+rsp] - mov r12,QWORD[((24+128))+rsp] - lea rdi,[rsp] - call __ecp_nistz256_mul_montq - - - - - xor r11,r11 - add r12,r12 - lea rsi,[192+rsp] - adc r13,r13 - mov rax,r12 - adc r8,r8 - adc r9,r9 - mov rbp,r13 - adc r11,0 - - sub r12,-1 - mov rcx,r8 - sbb r13,r14 - sbb r8,0 - mov r10,r9 - sbb r9,r15 - sbb r11,0 - - cmovc r12,rax - mov rax,QWORD[rsi] - cmovc r13,rbp - mov rbp,QWORD[8+rsi] - cmovc r8,rcx - mov rcx,QWORD[16+rsi] - cmovc r9,r10 - mov r10,QWORD[24+rsi] - - call __ecp_nistz256_subq - - lea rbx,[160+rsp] - lea rdi,[224+rsp] - call __ecp_nistz256_sub_fromq - - mov rax,QWORD[((0+0))+rsp] - mov rbp,QWORD[((0+8))+rsp] - mov rcx,QWORD[((0+16))+rsp] - mov r10,QWORD[((0+24))+rsp] - lea rdi,[64+rsp] - - call __ecp_nistz256_subq - - mov QWORD[rdi],r12 - mov QWORD[8+rdi],r13 - mov QWORD[16+rdi],r8 - mov QWORD[24+rdi],r9 - mov rax,QWORD[352+rsp] - lea rbx,[352+rsp] - mov r9,QWORD[((0+160))+rsp] - mov r10,QWORD[((8+160))+rsp] - lea rsi,[((0+160))+rsp] - mov r11,QWORD[((16+160))+rsp] - mov r12,QWORD[((24+160))+rsp] - lea rdi,[32+rsp] - call __ecp_nistz256_mul_montq - - mov rax,QWORD[96+rsp] - lea rbx,[96+rsp] - mov r9,QWORD[((0+64))+rsp] - mov r10,QWORD[((8+64))+rsp] - lea rsi,[((0+64))+rsp] - mov r11,QWORD[((16+64))+rsp] - mov 
r12,QWORD[((24+64))+rsp] - lea rdi,[64+rsp] - call __ecp_nistz256_mul_montq - - lea rbx,[32+rsp] - lea rdi,[256+rsp] - call __ecp_nistz256_sub_fromq - -DB 102,72,15,126,199 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[288+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((288+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[$L$ONE_mont] - pand xmm3,XMMWORD[(($L$ONE_mont+16))] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[384+rsp] - pand xmm3,XMMWORD[((384+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[64+rdi],xmm2 - movdqu XMMWORD[80+rdi],xmm3 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[224+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((224+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[416+rsp] - pand xmm3,XMMWORD[((416+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[320+rsp] - pand xmm3,XMMWORD[((320+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[rdi],xmm2 - movdqu XMMWORD[16+rdi],xmm3 - - movdqa xmm0,xmm5 - movdqa xmm1,xmm5 - pandn xmm0,XMMWORD[256+rsp] - movdqa xmm2,xmm5 - pandn xmm1,XMMWORD[((256+16))+rsp] - movdqa xmm3,xmm5 - pand xmm2,XMMWORD[448+rsp] - pand xmm3,XMMWORD[((448+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - - movdqa xmm0,xmm4 - movdqa xmm1,xmm4 - pandn xmm0,xmm2 - movdqa xmm2,xmm4 - pandn xmm1,xmm3 - movdqa xmm3,xmm4 - pand xmm2,XMMWORD[352+rsp] - pand xmm3,XMMWORD[((352+16))+rsp] - por xmm2,xmm0 - por xmm3,xmm1 - movdqu XMMWORD[32+rdi],xmm2 - movdqu XMMWORD[48+rdi],xmm3 - - lea rsi,[((480+56))+rsp] + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 - mov r15,QWORD[((-48))+rsi] + dec rax + jnz NEAR $L$select_loop_sse_w7 - mov r14,QWORD[((-40))+rsi] + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + DB 0F3h,0C3h ;repret - mov r13,QWORD[((-32))+rsi] +$L$SEH_end_ecp_nistz256_gather_w7: - mov r12,QWORD[((-24))+rsi] +global ecp_nistz256_avx2_gather_w7 - mov rbx,QWORD[((-16))+rsi] +ALIGN 32 +ecp_nistz256_avx2_gather_w7: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_avx2_gather_w7: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 - mov rbp,QWORD[((-8))+rsi] - lea rsp,[rsi] -$L$add_affineq_epilogue: +DB 0x0f,0x0b mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_add_affine: +$L$SEH_end_ecp_nistz256_avx2_gather_w7: ALIGN 32 -__ecp_nistz256_add_tox: +__ecp_nistz256_add_toq: xor r11,r11 - adc r12,QWORD[rbx] + add r12,QWORD[rbx] adc r13,QWORD[8+rbx] mov rax,r12 adc r8,QWORD[16+rbx] @@ -6462,8 +4352,7 @@ __ecp_nistz256_add_tox: mov rbp,r13 adc r11,0 - xor r10,r10 - sbb r12,-1 + sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 @@ -6486,32 +4375,30 @@ __ecp_nistz256_add_tox: ALIGN 32 -__ecp_nistz256_sub_fromx: +__ecp_nistz256_sub_fromq: - xor r11,r11 - sbb r12,QWORD[rbx] + sub r12,QWORD[rbx] sbb r13,QWORD[8+rbx] mov rax,r12 sbb 
r8,QWORD[16+rbx] sbb r9,QWORD[24+rbx] mov rbp,r13 - sbb r11,0 + sbb r11,r11 - xor r10,r10 - adc r12,-1 + add r12,-1 mov rcx,r8 adc r13,r14 adc r8,0 mov r10,r9 adc r9,r15 + test r11,r11 - bt r11,0 - cmovnc r12,rax - cmovnc r13,rbp + cmovz r12,rax + cmovz r13,rbp mov QWORD[rdi],r12 - cmovnc r8,rcx + cmovz r8,rcx mov QWORD[8+rdi],r13 - cmovnc r9,r10 + cmovz r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 @@ -6521,30 +4408,28 @@ __ecp_nistz256_sub_fromx: ALIGN 32 -__ecp_nistz256_subx: +__ecp_nistz256_subq: - xor r11,r11 - sbb rax,r12 + sub rax,r12 sbb rbp,r13 mov r12,rax sbb rcx,r8 sbb r10,r9 mov r13,rbp - sbb r11,0 + sbb r11,r11 - xor r9,r9 - adc rax,-1 + add rax,-1 mov r8,rcx adc rbp,r14 adc rcx,0 mov r9,r10 adc r10,r15 + test r11,r11 - bt r11,0 - cmovc r12,rax - cmovc r13,rbp - cmovc r8,rcx - cmovc r9,r10 + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 DB 0F3h,0C3h ;repret @@ -6552,10 +4437,10 @@ __ecp_nistz256_subx: ALIGN 32 -__ecp_nistz256_mul_by_2x: +__ecp_nistz256_mul_by_2q: xor r11,r11 - adc r12,r12 + add r12,r12 adc r13,r13 mov rax,r12 adc r8,r8 @@ -6563,8 +4448,7 @@ __ecp_nistz256_mul_by_2x: mov rbp,r13 adc r11,0 - xor r10,r10 - sbb r12,-1 + sub r12,-1 mov rcx,r8 sbb r13,r14 sbb r8,0 @@ -6584,19 +4468,19 @@ __ecp_nistz256_mul_by_2x: DB 0F3h,0C3h ;repret +global ecp_nistz256_point_double ALIGN 32 -ecp_nistz256_point_doublex: +ecp_nistz256_point_double: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_doublex: +$L$SEH_begin_ecp_nistz256_point_double: mov rdi,rcx mov rsi,rdx -$L$point_doublex: push rbp push rbx @@ -6611,9 +4495,9 @@ $L$point_doublex: sub rsp,32*5+8 -$L$point_doublex_body: +$L$point_doubleq_body: -$L$point_double_shortcutx: +$L$point_double_shortcutq: movdqu xmm0,XMMWORD[rsi] mov rbx,rsi movdqu xmm1,XMMWORD[16+rsi] @@ -6632,34 +4516,34 @@ DB 102,73,15,110,202 DB 102,73,15,110,211 lea rdi,[rsp] - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q - mov rdx,QWORD[((64+0))+rsi] + mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] - lea rsi,[((64-128))+rsi] + lea rsi,[((64-0))+rsi] lea rdi,[64+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[((0+0))+rsp] + mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] - lea rsi,[((-128+0))+rsp] + lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[32+rbx] + mov rax,QWORD[32+rbx] mov r9,QWORD[((64+0))+rbx] mov r10,QWORD[((64+8))+rbx] mov r11,QWORD[((64+16))+rbx] mov r12,QWORD[((64+24))+rbx] - lea rsi,[((64-128))+rbx] + lea rsi,[((64-0))+rbx] lea rbx,[32+rbx] DB 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] @@ -6667,7 +4551,7 @@ DB 102,72,15,126,215 mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_add_tox + call __ecp_nistz256_add_toq mov r12,QWORD[((96+0))+rsp] mov r13,QWORD[((96+8))+rsp] @@ -6675,15 +4559,15 @@ DB 102,72,15,126,215 mov r8,QWORD[((96+16))+rsp] mov r9,QWORD[((96+24))+rsp] lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - mov rdx,QWORD[((0+0))+rsp] + mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] - lea rsi,[((-128+0))+rsp] + lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov 
r8,QWORD[((24+0))+rsp] DB 102,72,15,126,207 - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq xor r9,r9 mov rax,r12 add r12,-1 @@ -6722,59 +4606,59 @@ DB 102,72,15,126,207 or r15,r9 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 - mov rdx,QWORD[64+rsp] + mov rax,QWORD[64+rsp] lea rbx,[64+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rdi,[128+rsp] - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q lea rbx,[32+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_add_tox + call __ecp_nistz256_add_toq - mov rdx,QWORD[96+rsp] + mov rax,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] - lea rsi,[((-128+0))+rsp] + lea rsi,[((0+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rdi,[128+rsp] - call __ecp_nistz256_mul_by_2x + call __ecp_nistz256_mul_by_2q - mov rdx,QWORD[((0+32))+rsp] + mov rax,QWORD[((0+32))+rsp] mov r14,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r15,QWORD[((16+32))+rsp] mov r8,QWORD[((24+32))+rsp] DB 102,72,15,126,199 - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq lea rbx,[128+rsp] mov r8,r14 mov r9,r15 mov r14,rsi mov r15,rbp - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] mov rcx,QWORD[((0+16))+rsp] mov r10,QWORD[((0+24))+rsp] lea rdi,[rsp] - call __ecp_nistz256_subx + call __ecp_nistz256_subq - mov rdx,QWORD[32+rsp] + mov rax,QWORD[32+rsp] lea rbx,[32+rsp] mov r14,r12 xor ecx,ecx @@ -6783,16 +4667,16 @@ DB 102,72,15,126,199 mov QWORD[((0+8))+rsp],r13 cmovz r11,r8 mov QWORD[((0+16))+rsp],r8 - lea rsi,[((0-128))+rsp] + lea rsi,[((0-0))+rsp] cmovz r12,r9 mov QWORD[((0+24))+rsp],r9 mov r9,r14 lea rdi,[rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq DB 102,72,15,126,203 DB 102,72,15,126,207 - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq lea rsi,[((160+56))+rsp] @@ -6810,26 +4694,26 @@ DB 102,72,15,126,207 lea rsp,[rsi] -$L$point_doublex_epilogue: +$L$point_doubleq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_doublex: +$L$SEH_end_ecp_nistz256_point_double: +global ecp_nistz256_point_add ALIGN 32 -ecp_nistz256_point_addx: +ecp_nistz256_point_add: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_addx: +$L$SEH_begin_ecp_nistz256_point_add: mov rdi,rcx mov rsi,rdx mov rdx,r8 -$L$point_addx: push rbp push rbx @@ -6844,7 +4728,7 @@ $L$point_addx: sub rsp,32*18+8 -$L$point_addx_body: +$L$point_addq_body: movdqu xmm0,XMMWORD[rsi] movdqu xmm1,XMMWORD[16+rsi] @@ -6868,7 +4752,7 @@ $L$point_addx_body: movdqu xmm2,XMMWORD[32+rsi] por xmm5,xmm3 movdqu xmm3,XMMWORD[48+rsi] - mov rdx,QWORD[((64+0))+rsi] + mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] @@ -6884,13 +4768,13 @@ $L$point_addx_body: por xmm1,xmm0 DB 102,72,15,110,199 - lea rsi,[((64-128))+rsi] - mov QWORD[((544+0))+rsp],rdx + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax mov QWORD[((544+8))+rsp],r14 mov QWORD[((544+16))+rsp],r15 mov QWORD[((544+24))+rsp],r8 lea rdi,[96+rsp] - call __ecp_nistz256_sqr_montx + call 
__ecp_nistz256_sqr_montq pcmpeqd xmm5,xmm4 pshufd xmm4,xmm1,0xb1 @@ -6901,59 +4785,59 @@ DB 102,72,15,110,199 pxor xmm3,xmm3 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 - mov rdx,QWORD[((64+0))+rbx] + mov rax,QWORD[((64+0))+rbx] mov r14,QWORD[((64+8))+rbx] mov r15,QWORD[((64+16))+rbx] mov r8,QWORD[((64+24))+rbx] DB 102,72,15,110,203 - lea rsi,[((64-128))+rbx] + lea rsi,[((64-0))+rbx] lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[544+rsp] + mov rax,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+96))+rsp] mov r10,QWORD[((8+96))+rsp] - lea rsi,[((-128+96))+rsp] + lea rsi,[((0+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[224+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[448+rsp] + mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[256+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[416+rsp] + mov rax,QWORD[416+rsp] lea rbx,[416+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] - lea rsi,[((-128+224))+rsp] + lea rsi,[((0+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[224+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[512+rsp] + mov rax,QWORD[512+rsp] lea rbx,[512+rsp] mov r9,QWORD[((0+256))+rsp] mov r10,QWORD[((8+256))+rsp] - lea rsi,[((-128+256))+rsp] + lea rsi,[((0+256))+rsp] mov r11,QWORD[((16+256))+rsp] mov r12,QWORD[((24+256))+rsp] lea rdi,[256+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[224+rsp] lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq or r12,r13 movdqa xmm2,xmm4 @@ -6962,29 +4846,29 @@ DB 102,72,15,110,203 por xmm2,xmm5 DB 102,73,15,110,220 - mov rdx,QWORD[384+rsp] + mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+96))+rsp] mov r10,QWORD[((8+96))+rsp] - lea rsi,[((-128+96))+rsp] + lea rsi,[((0+96))+rsp] mov r11,QWORD[((16+96))+rsp] mov r12,QWORD[((24+96))+rsp] lea rdi,[160+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[480+rsp] + mov rax,QWORD[480+rsp] lea rbx,[480+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[160+rsp] lea rdi,[rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq or r12,r13 or r12,r8 @@ -6998,73 +4882,73 @@ DB 102,73,15,126,217 DB 0x3e - jnz NEAR $L$add_proceedx + jnz NEAR $L$add_proceedq -$L$add_doublex: +$L$add_doubleq: DB 102,72,15,126,206 DB 102,72,15,126,199 add rsp,416 - jmp NEAR $L$point_double_shortcutx + jmp NEAR $L$point_double_shortcutq ALIGN 32 -$L$add_proceedx: - mov rdx,QWORD[((0+64))+rsp] +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] mov r14,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[96+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[448+rsp] + mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+0))+rsp] mov r10,QWORD[((8+0))+rsp] - lea rsi,[((-128+0))+rsp] + lea rsi,[((0+0))+rsp] mov r11,QWORD[((16+0))+rsp] mov r12,QWORD[((24+0))+rsp] lea rdi,[352+rsp] - call 
__ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[((0+0))+rsp] + mov rax,QWORD[((0+0))+rsp] mov r14,QWORD[((8+0))+rsp] - lea rsi,[((-128+0))+rsp] + lea rsi,[((0+0))+rsp] mov r15,QWORD[((16+0))+rsp] mov r8,QWORD[((24+0))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[544+rsp] + mov rax,QWORD[544+rsp] lea rbx,[544+rsp] mov r9,QWORD[((0+352))+rsp] mov r10,QWORD[((8+352))+rsp] - lea rsi,[((-128+352))+rsp] + lea rsi,[((0+352))+rsp] mov r11,QWORD[((16+352))+rsp] mov r12,QWORD[((24+352))+rsp] lea rdi,[352+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[rsp] + mov rax,QWORD[rsp] lea rbx,[rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[128+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[160+rsp] + mov rax,QWORD[160+rsp] lea rbx,[160+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[192+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq @@ -7096,11 +4980,11 @@ $L$add_proceedx: cmovc r9,r10 mov r10,QWORD[24+rsi] - call __ecp_nistz256_subx + call __ecp_nistz256_subq lea rbx,[128+rsp] lea rdi,[288+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq mov rax,QWORD[((192+0))+rsp] mov rbp,QWORD[((192+8))+rsp] @@ -7108,35 +4992,35 @@ $L$add_proceedx: mov r10,QWORD[((192+24))+rsp] lea rdi,[320+rsp] - call __ecp_nistz256_subx + call __ecp_nistz256_subq mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 - mov rdx,QWORD[128+rsp] + mov rax,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+224))+rsp] mov r10,QWORD[((8+224))+rsp] - lea rsi,[((-128+224))+rsp] + lea rsi,[((0+224))+rsp] mov r11,QWORD[((16+224))+rsp] mov r12,QWORD[((24+224))+rsp] lea rdi,[256+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[320+rsp] + mov rax,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[320+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[256+rsp] lea rdi,[320+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq DB 102,72,15,126,199 @@ -7212,7 +5096,7 @@ DB 102,72,15,126,199 movdqu XMMWORD[32+rdi],xmm2 movdqu XMMWORD[48+rdi],xmm3 -$L$add_donex: +$L$add_doneq: lea rsi,[((576+56))+rsp] mov r15,QWORD[((-48))+rsi] @@ -7229,26 +5113,26 @@ $L$add_donex: lea rsp,[rsi] -$L$point_addx_epilogue: +$L$point_addq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_addx: +$L$SEH_end_ecp_nistz256_point_add: +global ecp_nistz256_point_add_affine ALIGN 32 -ecp_nistz256_point_add_affinex: +ecp_nistz256_point_add_affine: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ecp_nistz256_point_add_affinex: +$L$SEH_begin_ecp_nistz256_point_add_affine: mov rdi,rcx mov rsi,rdx mov rdx,r8 -$L$point_add_affinex: push rbp push rbx @@ -7263,7 +5147,7 @@ $L$point_add_affinex: sub rsp,32*15+8 -$L$add_affinex_body: +$L$add_affineq_body: movdqu xmm0,XMMWORD[rsi] mov rbx,rdx @@ -7272,7 +5156,7 @@ $L$add_affinex_body: movdqu xmm3,XMMWORD[48+rsi] movdqu 
xmm4,XMMWORD[64+rsi] movdqu xmm5,XMMWORD[80+rsi] - mov rdx,QWORD[((64+0))+rsi] + mov rax,QWORD[((64+0))+rsi] mov r14,QWORD[((64+8))+rsi] mov r15,QWORD[((64+16))+rsi] mov r8,QWORD[((64+24))+rsi] @@ -7302,13 +5186,13 @@ DB 102,72,15,110,199 pxor xmm4,xmm4 por xmm3,xmm1 - lea rsi,[((64-128))+rsi] + lea rsi,[((64-0))+rsi] lea rdi,[32+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq pcmpeqd xmm5,xmm4 pshufd xmm4,xmm3,0xb1 - mov rdx,QWORD[rbx] + mov rax,QWORD[rbx] mov r9,r12 por xmm4,xmm3 @@ -7321,84 +5205,84 @@ DB 102,72,15,110,199 pcmpeqd xmm4,xmm3 pshufd xmm4,xmm4,0 - lea rsi,[((32-128))+rsp] + lea rsi,[((32-0))+rsp] mov r12,r15 lea rdi,[rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[320+rsp] lea rdi,[64+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - mov rdx,QWORD[384+rsp] + mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[384+rsp] + mov rax,QWORD[384+rsp] lea rbx,[384+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[288+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[448+rsp] + mov rax,QWORD[448+rsp] lea rbx,[448+rsp] mov r9,QWORD[((0+32))+rsp] mov r10,QWORD[((8+32))+rsp] - lea rsi,[((-128+32))+rsp] + lea rsi,[((0+32))+rsp] mov r11,QWORD[((16+32))+rsp] mov r12,QWORD[((24+32))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[352+rsp] lea rdi,[96+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq - mov rdx,QWORD[((0+64))+rsp] + mov rax,QWORD[((0+64))+rsp] mov r14,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r15,QWORD[((16+64))+rsp] mov r8,QWORD[((24+64))+rsp] lea rdi,[128+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[((0+96))+rsp] + mov rax,QWORD[((0+96))+rsp] mov r14,QWORD[((8+96))+rsp] - lea rsi,[((-128+96))+rsp] + lea rsi,[((0+96))+rsp] mov r15,QWORD[((16+96))+rsp] mov r8,QWORD[((24+96))+rsp] lea rdi,[192+rsp] - call __ecp_nistz256_sqr_montx + call __ecp_nistz256_sqr_montq - mov rdx,QWORD[128+rsp] + mov rax,QWORD[128+rsp] lea rbx,[128+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[160+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[320+rsp] + mov rax,QWORD[320+rsp] lea rbx,[320+rsp] mov r9,QWORD[((0+128))+rsp] mov r10,QWORD[((8+128))+rsp] - lea rsi,[((-128+128))+rsp] + lea rsi,[((0+128))+rsp] mov r11,QWORD[((16+128))+rsp] mov r12,QWORD[((24+128))+rsp] lea rdi,[rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq @@ -7430,11 +5314,11 @@ DB 102,72,15,110,199 cmovc r9,r10 mov r10,QWORD[24+rsi] - call __ecp_nistz256_subx + call __ecp_nistz256_subq lea rbx,[160+rsp] lea rdi,[224+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq mov rax,QWORD[((0+0))+rsp] mov rbp,QWORD[((0+8))+rsp] @@ -7442,35 +5326,35 @@ DB 102,72,15,110,199 mov r10,QWORD[((0+24))+rsp] lea rdi,[64+rsp] - call __ecp_nistz256_subx + call __ecp_nistz256_subq mov QWORD[rdi],r12 mov QWORD[8+rdi],r13 mov 
QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 - mov rdx,QWORD[352+rsp] + mov rax,QWORD[352+rsp] lea rbx,[352+rsp] mov r9,QWORD[((0+160))+rsp] mov r10,QWORD[((8+160))+rsp] - lea rsi,[((-128+160))+rsp] + lea rsi,[((0+160))+rsp] mov r11,QWORD[((16+160))+rsp] mov r12,QWORD[((24+160))+rsp] lea rdi,[32+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq - mov rdx,QWORD[96+rsp] + mov rax,QWORD[96+rsp] lea rbx,[96+rsp] mov r9,QWORD[((0+64))+rsp] mov r10,QWORD[((8+64))+rsp] - lea rsi,[((-128+64))+rsp] + lea rsi,[((0+64))+rsp] mov r11,QWORD[((16+64))+rsp] mov r12,QWORD[((24+64))+rsp] lea rdi,[64+rsp] - call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_montq lea rbx,[32+rsp] lea rdi,[256+rsp] - call __ecp_nistz256_sub_fromx + call __ecp_nistz256_sub_fromq DB 102,72,15,126,199 @@ -7562,12 +5446,12 @@ DB 102,72,15,126,199 lea rsp,[rsi] -$L$add_affinex_epilogue: +$L$add_affineq_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_point_add_affinex: +$L$SEH_end_ecp_nistz256_point_add_affine: EXTERN __imp_RtlVirtualUnwind @@ -7733,13 +5617,6 @@ ALIGN 4 DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase - DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase - - DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_to_mont wrt ..imagebase DD $L$SEH_end_ecp_nistz256_to_mont wrt ..imagebase DD $L$SEH_info_ecp_nistz256_to_mont wrt ..imagebase @@ -7763,13 +5640,6 @@ ALIGN 4 DD $L$SEH_begin_ecp_nistz256_gather_w7 wrt ..imagebase DD $L$SEH_end_ecp_nistz256_gather_w7 wrt ..imagebase DD $L$SEH_info_ecp_nistz256_gather_wX wrt ..imagebase - DD $L$SEH_begin_ecp_nistz256_avx2_gather_w5 wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_avx2_gather_w5 wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase - - DD $L$SEH_begin_ecp_nistz256_avx2_gather_w7 wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_avx2_gather_w7 wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase @@ -7781,17 +5651,6 @@ ALIGN 4 DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase - DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase - - DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase - - DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase - DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase - DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase section .xdata rdata align=8 ALIGN 8 @@ -7829,16 +5688,6 @@ DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase DD 48,0 -$L$SEH_info_ecp_nistz256_ord_mul_montx: -DB 9,0,0,0 - DD 
full_handler wrt ..imagebase - DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase - DD 48,0 -$L$SEH_info_ecp_nistz256_ord_sqr_montx: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase - DD 48,0 $L$SEH_info_ecp_nistz256_to_mont: DB 9,0,0,0 DD full_handler wrt ..imagebase @@ -7872,21 +5721,6 @@ DB 0x0c,0x78,0x01,0x00 DB 0x08,0x68,0x00,0x00 DB 0x04,0x01,0x15,0x00 ALIGN 8 -$L$SEH_info_ecp_nistz256_avx2_gather_wX: -DB 0x01,0x36,0x17,0x0b -DB 0x36,0xf8,0x09,0x00 -DB 0x31,0xe8,0x08,0x00 -DB 0x2c,0xd8,0x07,0x00 -DB 0x27,0xc8,0x06,0x00 -DB 0x22,0xb8,0x05,0x00 -DB 0x1d,0xa8,0x04,0x00 -DB 0x18,0x98,0x03,0x00 -DB 0x13,0x88,0x02,0x00 -DB 0x0e,0x78,0x01,0x00 -DB 0x09,0x68,0x00,0x00 -DB 0x04,0x01,0x15,0x00 -DB 0x00,0xb3,0x00,0x00 -ALIGN 8 $L$SEH_info_ecp_nistz256_point_double: DB 9,0,0,0 DD full_handler wrt ..imagebase @@ -7902,19 +5736,3 @@ DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase DD 32*15+56,0 -ALIGN 8 -$L$SEH_info_ecp_nistz256_point_doublex: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase - DD 32*5+56,0 -$L$SEH_info_ecp_nistz256_point_addx: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase - DD 32*18+56,0 -$L$SEH_info_ecp_nistz256_point_add_affinex: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase - DD 32*15+56,0 diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm index d5dc6fbc47f..b3f4b8a434d 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm @@ -409,457 +409,34 @@ $L$fe51_mul121666_body: $L$fe51_mul121666_epilogue: $L$SEH_end_x25519_fe51_mul121666: -EXTERN OPENSSL_ia32cap_P global x25519_fe64_eligible ALIGN 32 x25519_fe64_eligible: - mov ecx,DWORD[((OPENSSL_ia32cap_P+8))] xor eax,eax - and ecx,0x80100 - cmp ecx,0x80100 - cmove eax,ecx DB 0F3h,0C3h ;repret global x25519_fe64_mul -ALIGN 32 -x25519_fe64_mul: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_mul: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - - lea rsp,[((-16))+rsp] - -$L$fe64_mul_body: - - mov rax,rdx - mov rbp,QWORD[rdx] - mov rdx,QWORD[rsi] - mov rcx,QWORD[8+rax] - mov r14,QWORD[16+rax] - mov r15,QWORD[24+rax] - - mulx rax,r8,rbp - xor edi,edi - mulx rbx,r9,rcx - adcx r9,rax - mulx rax,r10,r14 - adcx r10,rbx - mulx r12,r11,r15 - mov rdx,QWORD[8+rsi] - adcx r11,rax - mov QWORD[rsp],r14 - adcx r12,rdi - - mulx rbx,rax,rbp - adox r9,rax - adcx r10,rbx - mulx rbx,rax,rcx - adox r10,rax - adcx r11,rbx - mulx rbx,rax,r14 - adox r11,rax - adcx r12,rbx - mulx r13,rax,r15 - mov rdx,QWORD[16+rsi] - adox r12,rax - adcx r13,rdi - adox r13,rdi - - mulx rbx,rax,rbp - adcx r10,rax - adox r11,rbx - mulx rbx,rax,rcx - adcx r11,rax - adox r12,rbx - mulx rbx,rax,r14 - adcx r12,rax - adox r13,rbx - mulx r14,rax,r15 - mov rdx,QWORD[24+rsi] - adcx r13,rax - adox r14,rdi - adcx r14,rdi - - mulx rbx,rax,rbp - adox r11,rax - adcx r12,rbx - mulx rbx,rax,rcx - adox r12,rax - 
adcx r13,rbx - mulx rbx,rax,QWORD[rsp] - adox r13,rax - adcx r14,rbx - mulx r15,rax,r15 - mov edx,38 - adox r14,rax - adcx r15,rdi - adox r15,rdi - - jmp NEAR $L$reduce64 -$L$fe64_mul_epilogue: - -$L$SEH_end_x25519_fe64_mul: - global x25519_fe64_sqr - -ALIGN 32 -x25519_fe64_sqr: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_sqr: - mov rdi,rcx - mov rsi,rdx - - - - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - - lea rsp,[((-16))+rsp] - -$L$fe64_sqr_body: - - mov rdx,QWORD[rsi] - mov rcx,QWORD[8+rsi] - mov rbp,QWORD[16+rsi] - mov rsi,QWORD[24+rsi] - - - mulx r15,r8,rdx - mulx rax,r9,rcx - xor edi,edi - mulx rbx,r10,rbp - adcx r10,rax - mulx r12,r11,rsi - mov rdx,rcx - adcx r11,rbx - adcx r12,rdi - - - mulx rbx,rax,rbp - adox r11,rax - adcx r12,rbx - mulx r13,rax,rsi - mov rdx,rbp - adox r12,rax - adcx r13,rdi - - - mulx r14,rax,rsi - mov rdx,rcx - adox r13,rax - adcx r14,rdi - adox r14,rdi - - adcx r9,r9 - adox r9,r15 - adcx r10,r10 - mulx rbx,rax,rdx - mov rdx,rbp - adcx r11,r11 - adox r10,rax - adcx r12,r12 - adox r11,rbx - mulx rbx,rax,rdx - mov rdx,rsi - adcx r13,r13 - adox r12,rax - adcx r14,r14 - adox r13,rbx - mulx r15,rax,rdx - mov edx,38 - adox r14,rax - adcx r15,rdi - adox r15,rdi - jmp NEAR $L$reduce64 - -ALIGN 32 -$L$reduce64: - mulx rbx,rax,r12 - adcx r8,rax - adox r9,rbx - mulx rbx,rax,r13 - adcx r9,rax - adox r10,rbx - mulx rbx,rax,r14 - adcx r10,rax - adox r11,rbx - mulx r12,rax,r15 - adcx r11,rax - adox r12,rdi - adcx r12,rdi - - mov rdi,QWORD[16+rsp] - imul r12,rdx - - add r8,r12 - adc r9,0 - adc r10,0 - adc r11,0 - - sbb rax,rax - and rax,38 - - add r8,rax - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[rdi],r8 - - mov r15,QWORD[24+rsp] - - mov r14,QWORD[32+rsp] - - mov r13,QWORD[40+rsp] - - mov r12,QWORD[48+rsp] - - mov rbx,QWORD[56+rsp] - - mov rbp,QWORD[64+rsp] - - lea rsp,[72+rsp] - -$L$fe64_sqr_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_x25519_fe64_sqr: - global x25519_fe64_mul121666 - -ALIGN 32 -x25519_fe64_mul121666: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_mul121666: - mov rdi,rcx - mov rsi,rdx - - -$L$fe64_mul121666_body: - - mov edx,121666 - mulx rcx,r8,QWORD[rsi] - mulx rax,r9,QWORD[8+rsi] - add r9,rcx - mulx rcx,r10,QWORD[16+rsi] - adc r10,rax - mulx rax,r11,QWORD[24+rsi] - adc r11,rcx - adc rax,0 - - imul rax,rax,38 - - add r8,rax - adc r9,0 - adc r10,0 - adc r11,0 - - sbb rax,rax - and rax,38 - - add r8,rax - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[rdi],r8 - -$L$fe64_mul121666_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_x25519_fe64_mul121666: - global x25519_fe64_add - -ALIGN 32 -x25519_fe64_add: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_add: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -$L$fe64_add_body: - - mov r8,QWORD[rsi] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - - add r8,QWORD[rdx] - adc r9,QWORD[8+rdx] - adc r10,QWORD[16+rdx] - adc r11,QWORD[24+rdx] - - sbb rax,rax - and rax,38 - - add r8,rax - adc r9,0 - adc r10,0 - mov QWORD[8+rdi],r9 - adc r11,0 - mov QWORD[16+rdi],r10 - sbb rax,rax - mov QWORD[24+rdi],r11 - and rax,38 - - add r8,rax - mov QWORD[rdi],r8 - -$L$fe64_add_epilogue: - mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_x25519_fe64_add: - global x25519_fe64_sub - -ALIGN 32 -x25519_fe64_sub: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_sub: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -$L$fe64_sub_body: - - mov r8,QWORD[rsi] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - - sub r8,QWORD[rdx] - sbb r9,QWORD[8+rdx] - sbb r10,QWORD[16+rdx] - sbb r11,QWORD[24+rdx] - - sbb rax,rax - and rax,38 - - sub r8,rax - sbb r9,0 - sbb r10,0 - mov QWORD[8+rdi],r9 - sbb r11,0 - mov QWORD[16+rdi],r10 - sbb rax,rax - mov QWORD[24+rdi],r11 - and rax,38 - - sub r8,rax - mov QWORD[rdi],r8 - -$L$fe64_sub_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_x25519_fe64_sub: - global x25519_fe64_tobytes - -ALIGN 32 +x25519_fe64_mul: +x25519_fe64_sqr: +x25519_fe64_mul121666: +x25519_fe64_add: +x25519_fe64_sub: x25519_fe64_tobytes: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_x25519_fe64_tobytes: - mov rdi,rcx - mov rsi,rdx - -$L$fe64_to_body: - - mov r8,QWORD[rsi] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - - - lea rax,[r11*1+r11] - sar r11,63 - shr rax,1 - and r11,19 - add r11,19 - - add r8,r11 - adc r9,0 - adc r10,0 - adc rax,0 - - lea r11,[rax*1+rax] - sar rax,63 - shr r11,1 - not rax - and rax,19 - - sub r8,rax - sbb r9,0 - sbb r10,0 - sbb r11,0 - - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - -$L$fe64_to_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] +DB 0x0f,0x0b DB 0F3h,0C3h ;repret -$L$SEH_end_x25519_fe64_tobytes: + DB 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101 DB 115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 @@ -996,29 +573,6 @@ ALIGN 4 DD $L$SEH_begin_x25519_fe51_mul121666 wrt ..imagebase DD $L$SEH_end_x25519_fe51_mul121666 wrt ..imagebase DD $L$SEH_info_x25519_fe51_mul121666 wrt ..imagebase - DD $L$SEH_begin_x25519_fe64_mul wrt ..imagebase - DD $L$SEH_end_x25519_fe64_mul wrt ..imagebase - DD $L$SEH_info_x25519_fe64_mul wrt ..imagebase - - DD $L$SEH_begin_x25519_fe64_sqr wrt ..imagebase - DD $L$SEH_end_x25519_fe64_sqr wrt ..imagebase - DD $L$SEH_info_x25519_fe64_sqr wrt ..imagebase - - DD $L$SEH_begin_x25519_fe64_mul121666 wrt ..imagebase - DD $L$SEH_end_x25519_fe64_mul121666 wrt ..imagebase - DD $L$SEH_info_x25519_fe64_mul121666 wrt ..imagebase - - DD $L$SEH_begin_x25519_fe64_add wrt ..imagebase - DD $L$SEH_end_x25519_fe64_add wrt ..imagebase - DD $L$SEH_info_x25519_fe64_add wrt ..imagebase - - DD $L$SEH_begin_x25519_fe64_sub wrt ..imagebase - DD $L$SEH_end_x25519_fe64_sub wrt ..imagebase - DD $L$SEH_info_x25519_fe64_sub wrt ..imagebase - - DD $L$SEH_begin_x25519_fe64_tobytes wrt ..imagebase - DD $L$SEH_end_x25519_fe64_tobytes wrt ..imagebase - DD $L$SEH_info_x25519_fe64_tobytes wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_x25519_fe51_mul: @@ -1036,29 +590,3 @@ DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$fe51_mul121666_body wrt ..imagebase,$L$fe51_mul121666_epilogue wrt ..imagebase DD 88,0 -$L$SEH_info_x25519_fe64_mul: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$fe64_mul_body wrt ..imagebase,$L$fe64_mul_epilogue wrt ..imagebase - DD 72,0 -$L$SEH_info_x25519_fe64_sqr: -DB 9,0,0,0 - DD full_handler wrt ..imagebase - DD $L$fe64_sqr_body wrt 
..imagebase,$L$fe64_sqr_epilogue wrt ..imagebase - DD 72,0 -$L$SEH_info_x25519_fe64_mul121666: -DB 9,0,0,0 - DD short_handler wrt ..imagebase - DD $L$fe64_mul121666_body wrt ..imagebase,$L$fe64_mul121666_epilogue wrt ..imagebase -$L$SEH_info_x25519_fe64_add: -DB 9,0,0,0 - DD short_handler wrt ..imagebase - DD $L$fe64_add_body wrt ..imagebase,$L$fe64_add_epilogue wrt ..imagebase -$L$SEH_info_x25519_fe64_sub: -DB 9,0,0,0 - DD short_handler wrt ..imagebase - DD $L$fe64_sub_body wrt ..imagebase,$L$fe64_sub_epilogue wrt ..imagebase -$L$SEH_info_x25519_fe64_tobytes: -DB 9,0,0,0 - DD short_handler wrt ..imagebase - DD $L$fe64_to_body wrt ..imagebase,$L$fe64_to_epilogue wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm index cbc06ca5fae..d2812b08d12 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm @@ -5,977 +5,20 @@ default rel section .text code align=64 - -ALIGN 32 -_aesni_ctr32_ghash_6x: - - vmovdqu xmm2,XMMWORD[32+r11] - sub rdx,6 - vpxor xmm4,xmm4,xmm4 - vmovdqu xmm15,XMMWORD[((0-128))+rcx] - vpaddb xmm10,xmm1,xmm2 - vpaddb xmm11,xmm10,xmm2 - vpaddb xmm12,xmm11,xmm2 - vpaddb xmm13,xmm12,xmm2 - vpaddb xmm14,xmm13,xmm2 - vpxor xmm9,xmm1,xmm15 - vmovdqu XMMWORD[(16+8)+rsp],xmm4 - jmp NEAR $L$oop6x - -ALIGN 32 -$L$oop6x: - add ebx,100663296 - jc NEAR $L$handle_ctr32 - vmovdqu xmm3,XMMWORD[((0-32))+r9] - vpaddb xmm1,xmm14,xmm2 - vpxor xmm10,xmm10,xmm15 - vpxor xmm11,xmm11,xmm15 - -$L$resume_ctr32: - vmovdqu XMMWORD[r8],xmm1 - vpclmulqdq xmm5,xmm7,xmm3,0x10 - vpxor xmm12,xmm12,xmm15 - vmovups xmm2,XMMWORD[((16-128))+rcx] - vpclmulqdq xmm6,xmm7,xmm3,0x01 - xor r12,r12 - cmp r15,r14 - - vaesenc xmm9,xmm9,xmm2 - vmovdqu xmm0,XMMWORD[((48+8))+rsp] - vpxor xmm13,xmm13,xmm15 - vpclmulqdq xmm1,xmm7,xmm3,0x00 - vaesenc xmm10,xmm10,xmm2 - vpxor xmm14,xmm14,xmm15 - setnc r12b - vpclmulqdq xmm7,xmm7,xmm3,0x11 - vaesenc xmm11,xmm11,xmm2 - vmovdqu xmm3,XMMWORD[((16-32))+r9] - neg r12 - vaesenc xmm12,xmm12,xmm2 - vpxor xmm6,xmm6,xmm5 - vpclmulqdq xmm5,xmm0,xmm3,0x00 - vpxor xmm8,xmm8,xmm4 - vaesenc xmm13,xmm13,xmm2 - vpxor xmm4,xmm1,xmm5 - and r12,0x60 - vmovups xmm15,XMMWORD[((32-128))+rcx] - vpclmulqdq xmm1,xmm0,xmm3,0x10 - vaesenc xmm14,xmm14,xmm2 - - vpclmulqdq xmm2,xmm0,xmm3,0x01 - lea r14,[r12*1+r14] - vaesenc xmm9,xmm9,xmm15 - vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] - vpclmulqdq xmm3,xmm0,xmm3,0x11 - vmovdqu xmm0,XMMWORD[((64+8))+rsp] - vaesenc xmm10,xmm10,xmm15 - movbe r13,QWORD[88+r14] - vaesenc xmm11,xmm11,xmm15 - movbe r12,QWORD[80+r14] - vaesenc xmm12,xmm12,xmm15 - mov QWORD[((32+8))+rsp],r13 - vaesenc xmm13,xmm13,xmm15 - mov QWORD[((40+8))+rsp],r12 - vmovdqu xmm5,XMMWORD[((48-32))+r9] - vaesenc xmm14,xmm14,xmm15 - - vmovups xmm15,XMMWORD[((48-128))+rcx] - vpxor xmm6,xmm6,xmm1 - vpclmulqdq xmm1,xmm0,xmm5,0x00 - vaesenc xmm9,xmm9,xmm15 - vpxor xmm6,xmm6,xmm2 - vpclmulqdq xmm2,xmm0,xmm5,0x10 - vaesenc xmm10,xmm10,xmm15 - vpxor xmm7,xmm7,xmm3 - vpclmulqdq xmm3,xmm0,xmm5,0x01 - vaesenc xmm11,xmm11,xmm15 - vpclmulqdq xmm5,xmm0,xmm5,0x11 - vmovdqu xmm0,XMMWORD[((80+8))+rsp] - vaesenc xmm12,xmm12,xmm15 - vaesenc xmm13,xmm13,xmm15 - vpxor xmm4,xmm4,xmm1 - vmovdqu xmm1,XMMWORD[((64-32))+r9] - vaesenc xmm14,xmm14,xmm15 - - vmovups xmm15,XMMWORD[((64-128))+rcx] - vpxor xmm6,xmm6,xmm2 - vpclmulqdq xmm2,xmm0,xmm1,0x00 - vaesenc 
xmm9,xmm9,xmm15 - vpxor xmm6,xmm6,xmm3 - vpclmulqdq xmm3,xmm0,xmm1,0x10 - vaesenc xmm10,xmm10,xmm15 - movbe r13,QWORD[72+r14] - vpxor xmm7,xmm7,xmm5 - vpclmulqdq xmm5,xmm0,xmm1,0x01 - vaesenc xmm11,xmm11,xmm15 - movbe r12,QWORD[64+r14] - vpclmulqdq xmm1,xmm0,xmm1,0x11 - vmovdqu xmm0,XMMWORD[((96+8))+rsp] - vaesenc xmm12,xmm12,xmm15 - mov QWORD[((48+8))+rsp],r13 - vaesenc xmm13,xmm13,xmm15 - mov QWORD[((56+8))+rsp],r12 - vpxor xmm4,xmm4,xmm2 - vmovdqu xmm2,XMMWORD[((96-32))+r9] - vaesenc xmm14,xmm14,xmm15 - - vmovups xmm15,XMMWORD[((80-128))+rcx] - vpxor xmm6,xmm6,xmm3 - vpclmulqdq xmm3,xmm0,xmm2,0x00 - vaesenc xmm9,xmm9,xmm15 - vpxor xmm6,xmm6,xmm5 - vpclmulqdq xmm5,xmm0,xmm2,0x10 - vaesenc xmm10,xmm10,xmm15 - movbe r13,QWORD[56+r14] - vpxor xmm7,xmm7,xmm1 - vpclmulqdq xmm1,xmm0,xmm2,0x01 - vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] - vaesenc xmm11,xmm11,xmm15 - movbe r12,QWORD[48+r14] - vpclmulqdq xmm2,xmm0,xmm2,0x11 - vaesenc xmm12,xmm12,xmm15 - mov QWORD[((64+8))+rsp],r13 - vaesenc xmm13,xmm13,xmm15 - mov QWORD[((72+8))+rsp],r12 - vpxor xmm4,xmm4,xmm3 - vmovdqu xmm3,XMMWORD[((112-32))+r9] - vaesenc xmm14,xmm14,xmm15 - - vmovups xmm15,XMMWORD[((96-128))+rcx] - vpxor xmm6,xmm6,xmm5 - vpclmulqdq xmm5,xmm8,xmm3,0x10 - vaesenc xmm9,xmm9,xmm15 - vpxor xmm6,xmm6,xmm1 - vpclmulqdq xmm1,xmm8,xmm3,0x01 - vaesenc xmm10,xmm10,xmm15 - movbe r13,QWORD[40+r14] - vpxor xmm7,xmm7,xmm2 - vpclmulqdq xmm2,xmm8,xmm3,0x00 - vaesenc xmm11,xmm11,xmm15 - movbe r12,QWORD[32+r14] - vpclmulqdq xmm8,xmm8,xmm3,0x11 - vaesenc xmm12,xmm12,xmm15 - mov QWORD[((80+8))+rsp],r13 - vaesenc xmm13,xmm13,xmm15 - mov QWORD[((88+8))+rsp],r12 - vpxor xmm6,xmm6,xmm5 - vaesenc xmm14,xmm14,xmm15 - vpxor xmm6,xmm6,xmm1 - - vmovups xmm15,XMMWORD[((112-128))+rcx] - vpslldq xmm5,xmm6,8 - vpxor xmm4,xmm4,xmm2 - vmovdqu xmm3,XMMWORD[16+r11] - - vaesenc xmm9,xmm9,xmm15 - vpxor xmm7,xmm7,xmm8 - vaesenc xmm10,xmm10,xmm15 - vpxor xmm4,xmm4,xmm5 - movbe r13,QWORD[24+r14] - vaesenc xmm11,xmm11,xmm15 - movbe r12,QWORD[16+r14] - vpalignr xmm0,xmm4,xmm4,8 - vpclmulqdq xmm4,xmm4,xmm3,0x10 - mov QWORD[((96+8))+rsp],r13 - vaesenc xmm12,xmm12,xmm15 - mov QWORD[((104+8))+rsp],r12 - vaesenc xmm13,xmm13,xmm15 - vmovups xmm1,XMMWORD[((128-128))+rcx] - vaesenc xmm14,xmm14,xmm15 - - vaesenc xmm9,xmm9,xmm1 - vmovups xmm15,XMMWORD[((144-128))+rcx] - vaesenc xmm10,xmm10,xmm1 - vpsrldq xmm6,xmm6,8 - vaesenc xmm11,xmm11,xmm1 - vpxor xmm7,xmm7,xmm6 - vaesenc xmm12,xmm12,xmm1 - vpxor xmm4,xmm4,xmm0 - movbe r13,QWORD[8+r14] - vaesenc xmm13,xmm13,xmm1 - movbe r12,QWORD[r14] - vaesenc xmm14,xmm14,xmm1 - vmovups xmm1,XMMWORD[((160-128))+rcx] - cmp ebp,11 - jb NEAR $L$enc_tail - - vaesenc xmm9,xmm9,xmm15 - vaesenc xmm10,xmm10,xmm15 - vaesenc xmm11,xmm11,xmm15 - vaesenc xmm12,xmm12,xmm15 - vaesenc xmm13,xmm13,xmm15 - vaesenc xmm14,xmm14,xmm15 - - vaesenc xmm9,xmm9,xmm1 - vaesenc xmm10,xmm10,xmm1 - vaesenc xmm11,xmm11,xmm1 - vaesenc xmm12,xmm12,xmm1 - vaesenc xmm13,xmm13,xmm1 - vmovups xmm15,XMMWORD[((176-128))+rcx] - vaesenc xmm14,xmm14,xmm1 - vmovups xmm1,XMMWORD[((192-128))+rcx] - je NEAR $L$enc_tail - - vaesenc xmm9,xmm9,xmm15 - vaesenc xmm10,xmm10,xmm15 - vaesenc xmm11,xmm11,xmm15 - vaesenc xmm12,xmm12,xmm15 - vaesenc xmm13,xmm13,xmm15 - vaesenc xmm14,xmm14,xmm15 - - vaesenc xmm9,xmm9,xmm1 - vaesenc xmm10,xmm10,xmm1 - vaesenc xmm11,xmm11,xmm1 - vaesenc xmm12,xmm12,xmm1 - vaesenc xmm13,xmm13,xmm1 - vmovups xmm15,XMMWORD[((208-128))+rcx] - vaesenc xmm14,xmm14,xmm1 - vmovups xmm1,XMMWORD[((224-128))+rcx] - jmp NEAR $L$enc_tail - -ALIGN 32 -$L$handle_ctr32: - vmovdqu 
xmm0,XMMWORD[r11] - vpshufb xmm6,xmm1,xmm0 - vmovdqu xmm5,XMMWORD[48+r11] - vpaddd xmm10,xmm6,XMMWORD[64+r11] - vpaddd xmm11,xmm6,xmm5 - vmovdqu xmm3,XMMWORD[((0-32))+r9] - vpaddd xmm12,xmm10,xmm5 - vpshufb xmm10,xmm10,xmm0 - vpaddd xmm13,xmm11,xmm5 - vpshufb xmm11,xmm11,xmm0 - vpxor xmm10,xmm10,xmm15 - vpaddd xmm14,xmm12,xmm5 - vpshufb xmm12,xmm12,xmm0 - vpxor xmm11,xmm11,xmm15 - vpaddd xmm1,xmm13,xmm5 - vpshufb xmm13,xmm13,xmm0 - vpshufb xmm14,xmm14,xmm0 - vpshufb xmm1,xmm1,xmm0 - jmp NEAR $L$resume_ctr32 - -ALIGN 32 -$L$enc_tail: - vaesenc xmm9,xmm9,xmm15 - vmovdqu XMMWORD[(16+8)+rsp],xmm7 - vpalignr xmm8,xmm4,xmm4,8 - vaesenc xmm10,xmm10,xmm15 - vpclmulqdq xmm4,xmm4,xmm3,0x10 - vpxor xmm2,xmm1,XMMWORD[rdi] - vaesenc xmm11,xmm11,xmm15 - vpxor xmm0,xmm1,XMMWORD[16+rdi] - vaesenc xmm12,xmm12,xmm15 - vpxor xmm5,xmm1,XMMWORD[32+rdi] - vaesenc xmm13,xmm13,xmm15 - vpxor xmm6,xmm1,XMMWORD[48+rdi] - vaesenc xmm14,xmm14,xmm15 - vpxor xmm7,xmm1,XMMWORD[64+rdi] - vpxor xmm3,xmm1,XMMWORD[80+rdi] - vmovdqu xmm1,XMMWORD[r8] - - vaesenclast xmm9,xmm9,xmm2 - vmovdqu xmm2,XMMWORD[32+r11] - vaesenclast xmm10,xmm10,xmm0 - vpaddb xmm0,xmm1,xmm2 - mov QWORD[((112+8))+rsp],r13 - lea rdi,[96+rdi] - vaesenclast xmm11,xmm11,xmm5 - vpaddb xmm5,xmm0,xmm2 - mov QWORD[((120+8))+rsp],r12 - lea rsi,[96+rsi] - vmovdqu xmm15,XMMWORD[((0-128))+rcx] - vaesenclast xmm12,xmm12,xmm6 - vpaddb xmm6,xmm5,xmm2 - vaesenclast xmm13,xmm13,xmm7 - vpaddb xmm7,xmm6,xmm2 - vaesenclast xmm14,xmm14,xmm3 - vpaddb xmm3,xmm7,xmm2 - - add r10,0x60 - sub rdx,0x6 - jc NEAR $L$6x_done - - vmovups XMMWORD[(-96)+rsi],xmm9 - vpxor xmm9,xmm1,xmm15 - vmovups XMMWORD[(-80)+rsi],xmm10 - vmovdqa xmm10,xmm0 - vmovups XMMWORD[(-64)+rsi],xmm11 - vmovdqa xmm11,xmm5 - vmovups XMMWORD[(-48)+rsi],xmm12 - vmovdqa xmm12,xmm6 - vmovups XMMWORD[(-32)+rsi],xmm13 - vmovdqa xmm13,xmm7 - vmovups XMMWORD[(-16)+rsi],xmm14 - vmovdqa xmm14,xmm3 - vmovdqu xmm7,XMMWORD[((32+8))+rsp] - jmp NEAR $L$oop6x - -$L$6x_done: - vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] - vpxor xmm8,xmm8,xmm4 - - DB 0F3h,0C3h ;repret - - -global aesni_gcm_decrypt - -ALIGN 32 -aesni_gcm_decrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_gcm_decrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - xor r10,r10 - cmp rdx,0x60 - jb NEAR $L$gcm_dec_abort - - lea rax,[rsp] - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-168))+rsp] - movaps XMMWORD[(-216)+rax],xmm6 - movaps XMMWORD[(-200)+rax],xmm7 - movaps XMMWORD[(-184)+rax],xmm8 - movaps XMMWORD[(-168)+rax],xmm9 - movaps XMMWORD[(-152)+rax],xmm10 - movaps XMMWORD[(-136)+rax],xmm11 - movaps XMMWORD[(-120)+rax],xmm12 - movaps XMMWORD[(-104)+rax],xmm13 - movaps XMMWORD[(-88)+rax],xmm14 - movaps XMMWORD[(-72)+rax],xmm15 -$L$gcm_dec_body: - vzeroupper - - vmovdqu xmm1,XMMWORD[r8] - add rsp,-128 - mov ebx,DWORD[12+r8] - lea r11,[$L$bswap_mask] - lea r14,[((-128))+rcx] - mov r15,0xf80 - vmovdqu xmm8,XMMWORD[r9] - and rsp,-128 - vmovdqu xmm0,XMMWORD[r11] - lea rcx,[128+rcx] - lea r9,[((32+32))+r9] - mov ebp,DWORD[((240-128))+rcx] - vpshufb xmm8,xmm8,xmm0 - - and r14,r15 - and r15,rsp - sub r15,r14 - jc NEAR $L$dec_no_key_aliasing - cmp r15,768 - jnc NEAR $L$dec_no_key_aliasing - sub rsp,r15 -$L$dec_no_key_aliasing: - - vmovdqu xmm7,XMMWORD[80+rdi] - lea r14,[rdi] - vmovdqu xmm4,XMMWORD[64+rdi] - lea r15,[((-192))+rdx*1+rdi] - vmovdqu xmm5,XMMWORD[48+rdi] - shr rdx,4 - xor r10,r10 - vmovdqu 
xmm6,XMMWORD[32+rdi] - vpshufb xmm7,xmm7,xmm0 - vmovdqu xmm2,XMMWORD[16+rdi] - vpshufb xmm4,xmm4,xmm0 - vmovdqu xmm3,XMMWORD[rdi] - vpshufb xmm5,xmm5,xmm0 - vmovdqu XMMWORD[48+rsp],xmm4 - vpshufb xmm6,xmm6,xmm0 - vmovdqu XMMWORD[64+rsp],xmm5 - vpshufb xmm2,xmm2,xmm0 - vmovdqu XMMWORD[80+rsp],xmm6 - vpshufb xmm3,xmm3,xmm0 - vmovdqu XMMWORD[96+rsp],xmm2 - vmovdqu XMMWORD[112+rsp],xmm3 - - call _aesni_ctr32_ghash_6x - - vmovups XMMWORD[(-96)+rsi],xmm9 - vmovups XMMWORD[(-80)+rsi],xmm10 - vmovups XMMWORD[(-64)+rsi],xmm11 - vmovups XMMWORD[(-48)+rsi],xmm12 - vmovups XMMWORD[(-32)+rsi],xmm13 - vmovups XMMWORD[(-16)+rsi],xmm14 - - vpshufb xmm8,xmm8,XMMWORD[r11] - vmovdqu XMMWORD[(-64)+r9],xmm8 - - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$gcm_dec_abort: - mov rax,r10 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_aesni_gcm_decrypt: - -ALIGN 32 -_aesni_ctr32_6x: - - vmovdqu xmm4,XMMWORD[((0-128))+rcx] - vmovdqu xmm2,XMMWORD[32+r11] - lea r13,[((-1))+rbp] - vmovups xmm15,XMMWORD[((16-128))+rcx] - lea r12,[((32-128))+rcx] - vpxor xmm9,xmm1,xmm4 - add ebx,100663296 - jc NEAR $L$handle_ctr32_2 - vpaddb xmm10,xmm1,xmm2 - vpaddb xmm11,xmm10,xmm2 - vpxor xmm10,xmm10,xmm4 - vpaddb xmm12,xmm11,xmm2 - vpxor xmm11,xmm11,xmm4 - vpaddb xmm13,xmm12,xmm2 - vpxor xmm12,xmm12,xmm4 - vpaddb xmm14,xmm13,xmm2 - vpxor xmm13,xmm13,xmm4 - vpaddb xmm1,xmm14,xmm2 - vpxor xmm14,xmm14,xmm4 - jmp NEAR $L$oop_ctr32 - -ALIGN 16 -$L$oop_ctr32: - vaesenc xmm9,xmm9,xmm15 - vaesenc xmm10,xmm10,xmm15 - vaesenc xmm11,xmm11,xmm15 - vaesenc xmm12,xmm12,xmm15 - vaesenc xmm13,xmm13,xmm15 - vaesenc xmm14,xmm14,xmm15 - vmovups xmm15,XMMWORD[r12] - lea r12,[16+r12] - dec r13d - jnz NEAR $L$oop_ctr32 - - vmovdqu xmm3,XMMWORD[r12] - vaesenc xmm9,xmm9,xmm15 - vpxor xmm4,xmm3,XMMWORD[rdi] - vaesenc xmm10,xmm10,xmm15 - vpxor xmm5,xmm3,XMMWORD[16+rdi] - vaesenc xmm11,xmm11,xmm15 - vpxor xmm6,xmm3,XMMWORD[32+rdi] - vaesenc xmm12,xmm12,xmm15 - vpxor xmm8,xmm3,XMMWORD[48+rdi] - vaesenc xmm13,xmm13,xmm15 - vpxor xmm2,xmm3,XMMWORD[64+rdi] - vaesenc xmm14,xmm14,xmm15 - vpxor xmm3,xmm3,XMMWORD[80+rdi] - lea rdi,[96+rdi] - - vaesenclast xmm9,xmm9,xmm4 - vaesenclast xmm10,xmm10,xmm5 - vaesenclast xmm11,xmm11,xmm6 - vaesenclast xmm12,xmm12,xmm8 - vaesenclast xmm13,xmm13,xmm2 - vaesenclast xmm14,xmm14,xmm3 - vmovups XMMWORD[rsi],xmm9 - vmovups XMMWORD[16+rsi],xmm10 - vmovups XMMWORD[32+rsi],xmm11 - vmovups XMMWORD[48+rsi],xmm12 - vmovups XMMWORD[64+rsi],xmm13 - vmovups XMMWORD[80+rsi],xmm14 - lea rsi,[96+rsi] - - DB 0F3h,0C3h ;repret -ALIGN 32 -$L$handle_ctr32_2: - vpshufb xmm6,xmm1,xmm0 - vmovdqu xmm5,XMMWORD[48+r11] - vpaddd xmm10,xmm6,XMMWORD[64+r11] - vpaddd xmm11,xmm6,xmm5 - vpaddd xmm12,xmm10,xmm5 - vpshufb xmm10,xmm10,xmm0 - vpaddd xmm13,xmm11,xmm5 - vpshufb xmm11,xmm11,xmm0 - vpxor xmm10,xmm10,xmm4 - vpaddd xmm14,xmm12,xmm5 - vpshufb xmm12,xmm12,xmm0 - vpxor xmm11,xmm11,xmm4 - vpaddd xmm1,xmm13,xmm5 - vpshufb xmm13,xmm13,xmm0 - vpxor xmm12,xmm12,xmm4 - vpshufb 
xmm14,xmm14,xmm0 - vpxor xmm13,xmm13,xmm4 - vpshufb xmm1,xmm1,xmm0 - vpxor xmm14,xmm14,xmm4 - jmp NEAR $L$oop_ctr32 - - - global aesni_gcm_encrypt -ALIGN 32 aesni_gcm_encrypt: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_aesni_gcm_encrypt: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - - xor r10,r10 - cmp rdx,0x60*3 - jb NEAR $L$gcm_enc_abort - - lea rax,[rsp] - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-168))+rsp] - movaps XMMWORD[(-216)+rax],xmm6 - movaps XMMWORD[(-200)+rax],xmm7 - movaps XMMWORD[(-184)+rax],xmm8 - movaps XMMWORD[(-168)+rax],xmm9 - movaps XMMWORD[(-152)+rax],xmm10 - movaps XMMWORD[(-136)+rax],xmm11 - movaps XMMWORD[(-120)+rax],xmm12 - movaps XMMWORD[(-104)+rax],xmm13 - movaps XMMWORD[(-88)+rax],xmm14 - movaps XMMWORD[(-72)+rax],xmm15 -$L$gcm_enc_body: - vzeroupper - - vmovdqu xmm1,XMMWORD[r8] - add rsp,-128 - mov ebx,DWORD[12+r8] - lea r11,[$L$bswap_mask] - lea r14,[((-128))+rcx] - mov r15,0xf80 - lea rcx,[128+rcx] - vmovdqu xmm0,XMMWORD[r11] - and rsp,-128 - mov ebp,DWORD[((240-128))+rcx] - - and r14,r15 - and r15,rsp - sub r15,r14 - jc NEAR $L$enc_no_key_aliasing - cmp r15,768 - jnc NEAR $L$enc_no_key_aliasing - sub rsp,r15 -$L$enc_no_key_aliasing: - - lea r14,[rsi] - lea r15,[((-192))+rdx*1+rsi] - shr rdx,4 - - call _aesni_ctr32_6x - vpshufb xmm8,xmm9,xmm0 - vpshufb xmm2,xmm10,xmm0 - vmovdqu XMMWORD[112+rsp],xmm8 - vpshufb xmm4,xmm11,xmm0 - vmovdqu XMMWORD[96+rsp],xmm2 - vpshufb xmm5,xmm12,xmm0 - vmovdqu XMMWORD[80+rsp],xmm4 - vpshufb xmm6,xmm13,xmm0 - vmovdqu XMMWORD[64+rsp],xmm5 - vpshufb xmm7,xmm14,xmm0 - vmovdqu XMMWORD[48+rsp],xmm6 - - call _aesni_ctr32_6x - - vmovdqu xmm8,XMMWORD[r9] - lea r9,[((32+32))+r9] - sub rdx,12 - mov r10,0x60*2 - vpshufb xmm8,xmm8,xmm0 - - call _aesni_ctr32_ghash_6x - vmovdqu xmm7,XMMWORD[32+rsp] - vmovdqu xmm0,XMMWORD[r11] - vmovdqu xmm3,XMMWORD[((0-32))+r9] - vpunpckhqdq xmm1,xmm7,xmm7 - vmovdqu xmm15,XMMWORD[((32-32))+r9] - vmovups XMMWORD[(-96)+rsi],xmm9 - vpshufb xmm9,xmm9,xmm0 - vpxor xmm1,xmm1,xmm7 - vmovups XMMWORD[(-80)+rsi],xmm10 - vpshufb xmm10,xmm10,xmm0 - vmovups XMMWORD[(-64)+rsi],xmm11 - vpshufb xmm11,xmm11,xmm0 - vmovups XMMWORD[(-48)+rsi],xmm12 - vpshufb xmm12,xmm12,xmm0 - vmovups XMMWORD[(-32)+rsi],xmm13 - vpshufb xmm13,xmm13,xmm0 - vmovups XMMWORD[(-16)+rsi],xmm14 - vpshufb xmm14,xmm14,xmm0 - vmovdqu XMMWORD[16+rsp],xmm9 - vmovdqu xmm6,XMMWORD[48+rsp] - vmovdqu xmm0,XMMWORD[((16-32))+r9] - vpunpckhqdq xmm2,xmm6,xmm6 - vpclmulqdq xmm5,xmm7,xmm3,0x00 - vpxor xmm2,xmm2,xmm6 - vpclmulqdq xmm7,xmm7,xmm3,0x11 - vpclmulqdq xmm1,xmm1,xmm15,0x00 - - vmovdqu xmm9,XMMWORD[64+rsp] - vpclmulqdq xmm4,xmm6,xmm0,0x00 - vmovdqu xmm3,XMMWORD[((48-32))+r9] - vpxor xmm4,xmm4,xmm5 - vpunpckhqdq xmm5,xmm9,xmm9 - vpclmulqdq xmm6,xmm6,xmm0,0x11 - vpxor xmm5,xmm5,xmm9 - vpxor xmm6,xmm6,xmm7 - vpclmulqdq xmm2,xmm2,xmm15,0x10 - vmovdqu xmm15,XMMWORD[((80-32))+r9] - vpxor xmm2,xmm2,xmm1 - - vmovdqu xmm1,XMMWORD[80+rsp] - vpclmulqdq xmm7,xmm9,xmm3,0x00 - vmovdqu xmm0,XMMWORD[((64-32))+r9] - vpxor xmm7,xmm7,xmm4 - vpunpckhqdq xmm4,xmm1,xmm1 - vpclmulqdq xmm9,xmm9,xmm3,0x11 - vpxor xmm4,xmm4,xmm1 - vpxor xmm9,xmm9,xmm6 - vpclmulqdq xmm5,xmm5,xmm15,0x00 - vpxor xmm5,xmm5,xmm2 - - vmovdqu xmm2,XMMWORD[96+rsp] - vpclmulqdq xmm6,xmm1,xmm0,0x00 - vmovdqu xmm3,XMMWORD[((96-32))+r9] - vpxor xmm6,xmm6,xmm7 - vpunpckhqdq xmm7,xmm2,xmm2 - vpclmulqdq xmm1,xmm1,xmm0,0x11 - vpxor xmm7,xmm7,xmm2 - vpxor 
xmm1,xmm1,xmm9 - vpclmulqdq xmm4,xmm4,xmm15,0x10 - vmovdqu xmm15,XMMWORD[((128-32))+r9] - vpxor xmm4,xmm4,xmm5 - - vpxor xmm8,xmm8,XMMWORD[112+rsp] - vpclmulqdq xmm5,xmm2,xmm3,0x00 - vmovdqu xmm0,XMMWORD[((112-32))+r9] - vpunpckhqdq xmm9,xmm8,xmm8 - vpxor xmm5,xmm5,xmm6 - vpclmulqdq xmm2,xmm2,xmm3,0x11 - vpxor xmm9,xmm9,xmm8 - vpxor xmm2,xmm2,xmm1 - vpclmulqdq xmm7,xmm7,xmm15,0x00 - vpxor xmm4,xmm7,xmm4 - - vpclmulqdq xmm6,xmm8,xmm0,0x00 - vmovdqu xmm3,XMMWORD[((0-32))+r9] - vpunpckhqdq xmm1,xmm14,xmm14 - vpclmulqdq xmm8,xmm8,xmm0,0x11 - vpxor xmm1,xmm1,xmm14 - vpxor xmm5,xmm6,xmm5 - vpclmulqdq xmm9,xmm9,xmm15,0x10 - vmovdqu xmm15,XMMWORD[((32-32))+r9] - vpxor xmm7,xmm8,xmm2 - vpxor xmm6,xmm9,xmm4 - vmovdqu xmm0,XMMWORD[((16-32))+r9] - vpxor xmm9,xmm7,xmm5 - vpclmulqdq xmm4,xmm14,xmm3,0x00 - vpxor xmm6,xmm6,xmm9 - vpunpckhqdq xmm2,xmm13,xmm13 - vpclmulqdq xmm14,xmm14,xmm3,0x11 - vpxor xmm2,xmm2,xmm13 - vpslldq xmm9,xmm6,8 - vpclmulqdq xmm1,xmm1,xmm15,0x00 - vpxor xmm8,xmm5,xmm9 - vpsrldq xmm6,xmm6,8 - vpxor xmm7,xmm7,xmm6 - - vpclmulqdq xmm5,xmm13,xmm0,0x00 - vmovdqu xmm3,XMMWORD[((48-32))+r9] - vpxor xmm5,xmm5,xmm4 - vpunpckhqdq xmm9,xmm12,xmm12 - vpclmulqdq xmm13,xmm13,xmm0,0x11 - vpxor xmm9,xmm9,xmm12 - vpxor xmm13,xmm13,xmm14 - vpalignr xmm14,xmm8,xmm8,8 - vpclmulqdq xmm2,xmm2,xmm15,0x10 - vmovdqu xmm15,XMMWORD[((80-32))+r9] - vpxor xmm2,xmm2,xmm1 - - vpclmulqdq xmm4,xmm12,xmm3,0x00 - vmovdqu xmm0,XMMWORD[((64-32))+r9] - vpxor xmm4,xmm4,xmm5 - vpunpckhqdq xmm1,xmm11,xmm11 - vpclmulqdq xmm12,xmm12,xmm3,0x11 - vpxor xmm1,xmm1,xmm11 - vpxor xmm12,xmm12,xmm13 - vxorps xmm7,xmm7,XMMWORD[16+rsp] - vpclmulqdq xmm9,xmm9,xmm15,0x00 - vpxor xmm9,xmm9,xmm2 - - vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 - vxorps xmm8,xmm8,xmm14 - - vpclmulqdq xmm5,xmm11,xmm0,0x00 - vmovdqu xmm3,XMMWORD[((96-32))+r9] - vpxor xmm5,xmm5,xmm4 - vpunpckhqdq xmm2,xmm10,xmm10 - vpclmulqdq xmm11,xmm11,xmm0,0x11 - vpxor xmm2,xmm2,xmm10 - vpalignr xmm14,xmm8,xmm8,8 - vpxor xmm11,xmm11,xmm12 - vpclmulqdq xmm1,xmm1,xmm15,0x10 - vmovdqu xmm15,XMMWORD[((128-32))+r9] - vpxor xmm1,xmm1,xmm9 - - vxorps xmm14,xmm14,xmm7 - vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 - vxorps xmm8,xmm8,xmm14 - - vpclmulqdq xmm4,xmm10,xmm3,0x00 - vmovdqu xmm0,XMMWORD[((112-32))+r9] - vpxor xmm4,xmm4,xmm5 - vpunpckhqdq xmm9,xmm8,xmm8 - vpclmulqdq xmm10,xmm10,xmm3,0x11 - vpxor xmm9,xmm9,xmm8 - vpxor xmm10,xmm10,xmm11 - vpclmulqdq xmm2,xmm2,xmm15,0x00 - vpxor xmm2,xmm2,xmm1 - - vpclmulqdq xmm5,xmm8,xmm0,0x00 - vpclmulqdq xmm7,xmm8,xmm0,0x11 - vpxor xmm5,xmm5,xmm4 - vpclmulqdq xmm6,xmm9,xmm15,0x10 - vpxor xmm7,xmm7,xmm10 - vpxor xmm6,xmm6,xmm2 - - vpxor xmm4,xmm7,xmm5 - vpxor xmm6,xmm6,xmm4 - vpslldq xmm1,xmm6,8 - vmovdqu xmm3,XMMWORD[16+r11] - vpsrldq xmm6,xmm6,8 - vpxor xmm8,xmm5,xmm1 - vpxor xmm7,xmm7,xmm6 - - vpalignr xmm2,xmm8,xmm8,8 - vpclmulqdq xmm8,xmm8,xmm3,0x10 - vpxor xmm8,xmm8,xmm2 - - vpalignr xmm2,xmm8,xmm8,8 - vpclmulqdq xmm8,xmm8,xmm3,0x10 - vpxor xmm2,xmm2,xmm7 - vpxor xmm8,xmm8,xmm2 - vpshufb xmm8,xmm8,XMMWORD[r11] - vmovdqu XMMWORD[(-64)+r9],xmm8 - - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - 
- mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$gcm_enc_abort: - mov rax,r10 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] + xor eax,eax DB 0F3h,0C3h ;repret -$L$SEH_end_aesni_gcm_encrypt: -ALIGN 64 -$L$bswap_mask: -DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -$L$poly: -DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -$L$one_msb: -DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -$L$two_lsb: -DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -$L$one_lsb: -DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 -DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 -DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -ALIGN 64 -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -gcm_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - mov rax,QWORD[120+r8] - - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - mov QWORD[240+r8],r15 - mov QWORD[232+r8],r14 - mov QWORD[224+r8],r13 - mov QWORD[216+r8],r12 - mov QWORD[160+r8],rbp - mov QWORD[144+r8],rbx - - lea rsi,[((-216))+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - -$L$common_seh_tail: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc +global aesni_gcm_decrypt - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] +aesni_gcm_decrypt: - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi + xor eax,eax DB 0F3h,0C3h ;repret -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase - DD $L$SEH_end_aesni_gcm_decrypt wrt ..imagebase - DD $L$SEH_gcm_dec_info wrt ..imagebase - - DD $L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase - DD $L$SEH_end_aesni_gcm_encrypt wrt ..imagebase - DD $L$SEH_gcm_enc_info wrt ..imagebase -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_gcm_dec_info: -DB 9,0,0,0 - DD gcm_se_handler wrt ..imagebase - DD $L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase -$L$SEH_gcm_enc_info: -DB 9,0,0,0 - DD gcm_se_handler wrt ..imagebase - DD $L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm index e70f90841bc..47f3b1fbead 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm @@ -1354,115 +1354,7 @@ global gcm_init_avx ALIGN 32 gcm_init_avx: -$L$SEH_begin_gcm_init_avx: - -DB 0x48,0x83,0xec,0x18 -DB 0x0f,0x29,0x34,0x24 - vzeroupper - - vmovdqu xmm2,XMMWORD[rdx] - 
vpshufd xmm2,xmm2,78 - - - vpshufd xmm4,xmm2,255 - vpsrlq xmm3,xmm2,63 - vpsllq xmm2,xmm2,1 - vpxor xmm5,xmm5,xmm5 - vpcmpgtd xmm5,xmm5,xmm4 - vpslldq xmm3,xmm3,8 - vpor xmm2,xmm2,xmm3 - - - vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] - vpxor xmm2,xmm2,xmm5 - - vpunpckhqdq xmm6,xmm2,xmm2 - vmovdqa xmm0,xmm2 - vpxor xmm6,xmm6,xmm2 - mov r10,4 - jmp NEAR $L$init_start_avx -ALIGN 32 -$L$init_loop_avx: - vpalignr xmm5,xmm4,xmm3,8 - vmovdqu XMMWORD[(-16)+rcx],xmm5 - vpunpckhqdq xmm3,xmm0,xmm0 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm1,xmm0,xmm2,0x11 - vpclmulqdq xmm0,xmm0,xmm2,0x00 - vpclmulqdq xmm3,xmm3,xmm6,0x00 - vpxor xmm4,xmm1,xmm0 - vpxor xmm3,xmm3,xmm4 - - vpslldq xmm4,xmm3,8 - vpsrldq xmm3,xmm3,8 - vpxor xmm0,xmm0,xmm4 - vpxor xmm1,xmm1,xmm3 - vpsllq xmm3,xmm0,57 - vpsllq xmm4,xmm0,62 - vpxor xmm4,xmm4,xmm3 - vpsllq xmm3,xmm0,63 - vpxor xmm4,xmm4,xmm3 - vpslldq xmm3,xmm4,8 - vpsrldq xmm4,xmm4,8 - vpxor xmm0,xmm0,xmm3 - vpxor xmm1,xmm1,xmm4 - - vpsrlq xmm4,xmm0,1 - vpxor xmm1,xmm1,xmm0 - vpxor xmm0,xmm0,xmm4 - vpsrlq xmm4,xmm4,5 - vpxor xmm0,xmm0,xmm4 - vpsrlq xmm0,xmm0,1 - vpxor xmm0,xmm0,xmm1 -$L$init_start_avx: - vmovdqa xmm5,xmm0 - vpunpckhqdq xmm3,xmm0,xmm0 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm1,xmm0,xmm2,0x11 - vpclmulqdq xmm0,xmm0,xmm2,0x00 - vpclmulqdq xmm3,xmm3,xmm6,0x00 - vpxor xmm4,xmm1,xmm0 - vpxor xmm3,xmm3,xmm4 - - vpslldq xmm4,xmm3,8 - vpsrldq xmm3,xmm3,8 - vpxor xmm0,xmm0,xmm4 - vpxor xmm1,xmm1,xmm3 - vpsllq xmm3,xmm0,57 - vpsllq xmm4,xmm0,62 - vpxor xmm4,xmm4,xmm3 - vpsllq xmm3,xmm0,63 - vpxor xmm4,xmm4,xmm3 - vpslldq xmm3,xmm4,8 - vpsrldq xmm4,xmm4,8 - vpxor xmm0,xmm0,xmm3 - vpxor xmm1,xmm1,xmm4 - - vpsrlq xmm4,xmm0,1 - vpxor xmm1,xmm1,xmm0 - vpxor xmm0,xmm0,xmm4 - vpsrlq xmm4,xmm4,5 - vpxor xmm0,xmm0,xmm4 - vpsrlq xmm0,xmm0,1 - vpxor xmm0,xmm0,xmm1 - vpshufd xmm3,xmm5,78 - vpshufd xmm4,xmm0,78 - vpxor xmm3,xmm3,xmm5 - vmovdqu XMMWORD[rcx],xmm5 - vpxor xmm4,xmm4,xmm0 - vmovdqu XMMWORD[16+rcx],xmm0 - lea rcx,[48+rcx] - sub r10,1 - jnz NEAR $L$init_loop_avx - - vpalignr xmm5,xmm3,xmm4,8 - vmovdqu XMMWORD[(-16)+rcx],xmm5 - - vzeroupper - movaps xmm6,XMMWORD[rsp] - lea rsp,[24+rsp] -$L$SEH_end_gcm_init_avx: - DB 0F3h,0C3h ;repret + jmp NEAR $L$_init_clmul global gcm_gmult_avx @@ -1480,403 +1372,7 @@ ALIGN 32 gcm_ghash_avx: DB 243,15,30,250 - lea rax,[((-136))+rsp] -$L$SEH_begin_gcm_ghash_avx: - -DB 0x48,0x8d,0x60,0xe0 -DB 0x0f,0x29,0x70,0xe0 -DB 0x0f,0x29,0x78,0xf0 -DB 0x44,0x0f,0x29,0x00 -DB 0x44,0x0f,0x29,0x48,0x10 -DB 0x44,0x0f,0x29,0x50,0x20 -DB 0x44,0x0f,0x29,0x58,0x30 -DB 0x44,0x0f,0x29,0x60,0x40 -DB 0x44,0x0f,0x29,0x68,0x50 -DB 0x44,0x0f,0x29,0x70,0x60 -DB 0x44,0x0f,0x29,0x78,0x70 - vzeroupper - - vmovdqu xmm10,XMMWORD[rcx] - lea r10,[$L$0x1c2_polynomial] - lea rdx,[64+rdx] - vmovdqu xmm13,XMMWORD[$L$bswap_mask] - vpshufb xmm10,xmm10,xmm13 - cmp r9,0x80 - jb NEAR $L$short_avx - sub r9,0x80 - - vmovdqu xmm14,XMMWORD[112+r8] - vmovdqu xmm6,XMMWORD[((0-64))+rdx] - vpshufb xmm14,xmm14,xmm13 - vmovdqu xmm7,XMMWORD[((32-64))+rdx] - - vpunpckhqdq xmm9,xmm14,xmm14 - vmovdqu xmm15,XMMWORD[96+r8] - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpxor xmm9,xmm9,xmm14 - vpshufb xmm15,xmm15,xmm13 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((16-64))+rdx] - vpunpckhqdq xmm8,xmm15,xmm15 - vmovdqu xmm14,XMMWORD[80+r8] - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vpxor xmm8,xmm8,xmm15 - - vpshufb xmm14,xmm14,xmm13 - vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpunpckhqdq xmm9,xmm14,xmm14 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((48-64))+rdx] - vpxor xmm9,xmm9,xmm14 - vmovdqu 
xmm15,XMMWORD[64+r8] - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((80-64))+rdx] - - vpshufb xmm15,xmm15,xmm13 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpxor xmm4,xmm4,xmm1 - vpunpckhqdq xmm8,xmm15,xmm15 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((64-64))+rdx] - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vpxor xmm8,xmm8,xmm15 - - vmovdqu xmm14,XMMWORD[48+r8] - vpxor xmm0,xmm0,xmm3 - vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpxor xmm1,xmm1,xmm4 - vpshufb xmm14,xmm14,xmm13 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((96-64))+rdx] - vpxor xmm2,xmm2,xmm5 - vpunpckhqdq xmm9,xmm14,xmm14 - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((128-64))+rdx] - vpxor xmm9,xmm9,xmm14 - - vmovdqu xmm15,XMMWORD[32+r8] - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpxor xmm4,xmm4,xmm1 - vpshufb xmm15,xmm15,xmm13 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((112-64))+rdx] - vpxor xmm5,xmm5,xmm2 - vpunpckhqdq xmm8,xmm15,xmm15 - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vpxor xmm8,xmm8,xmm15 - - vmovdqu xmm14,XMMWORD[16+r8] - vpxor xmm0,xmm0,xmm3 - vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpxor xmm1,xmm1,xmm4 - vpshufb xmm14,xmm14,xmm13 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((144-64))+rdx] - vpxor xmm2,xmm2,xmm5 - vpunpckhqdq xmm9,xmm14,xmm14 - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((176-64))+rdx] - vpxor xmm9,xmm9,xmm14 - - vmovdqu xmm15,XMMWORD[r8] - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpxor xmm4,xmm4,xmm1 - vpshufb xmm15,xmm15,xmm13 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((160-64))+rdx] - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm9,xmm7,0x10 - - lea r8,[128+r8] - cmp r9,0x80 - jb NEAR $L$tail_avx - - vpxor xmm15,xmm15,xmm10 - sub r9,0x80 - jmp NEAR $L$oop8x_avx - -ALIGN 32 -$L$oop8x_avx: - vpunpckhqdq xmm8,xmm15,xmm15 - vmovdqu xmm14,XMMWORD[112+r8] - vpxor xmm3,xmm3,xmm0 - vpxor xmm8,xmm8,xmm15 - vpclmulqdq xmm10,xmm15,xmm6,0x00 - vpshufb xmm14,xmm14,xmm13 - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm11,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((0-64))+rdx] - vpunpckhqdq xmm9,xmm14,xmm14 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm12,xmm8,xmm7,0x00 - vmovdqu xmm7,XMMWORD[((32-64))+rdx] - vpxor xmm9,xmm9,xmm14 - - vmovdqu xmm15,XMMWORD[96+r8] - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpxor xmm10,xmm10,xmm3 - vpshufb xmm15,xmm15,xmm13 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vxorps xmm11,xmm11,xmm4 - vmovdqu xmm6,XMMWORD[((16-64))+rdx] - vpunpckhqdq xmm8,xmm15,xmm15 - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vpxor xmm12,xmm12,xmm5 - vxorps xmm8,xmm8,xmm15 - - vmovdqu xmm14,XMMWORD[80+r8] - vpxor xmm12,xmm12,xmm10 - vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpxor xmm12,xmm12,xmm11 - vpslldq xmm9,xmm12,8 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vpsrldq xmm12,xmm12,8 - vpxor xmm10,xmm10,xmm9 - vmovdqu xmm6,XMMWORD[((48-64))+rdx] - vpshufb xmm14,xmm14,xmm13 - vxorps xmm11,xmm11,xmm12 - vpxor xmm4,xmm4,xmm1 - vpunpckhqdq xmm9,xmm14,xmm14 - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((80-64))+rdx] - vpxor xmm9,xmm9,xmm14 - vpxor xmm5,xmm5,xmm2 - - vmovdqu xmm15,XMMWORD[64+r8] - vpalignr xmm12,xmm10,xmm10,8 - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpshufb xmm15,xmm15,xmm13 - vpxor xmm0,xmm0,xmm3 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((64-64))+rdx] - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm1,xmm1,xmm4 - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vxorps xmm8,xmm8,xmm15 - vpxor xmm2,xmm2,xmm5 - - vmovdqu xmm14,XMMWORD[48+r8] - vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 - 
vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpshufb xmm14,xmm14,xmm13 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((96-64))+rdx] - vpunpckhqdq xmm9,xmm14,xmm14 - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((128-64))+rdx] - vpxor xmm9,xmm9,xmm14 - vpxor xmm5,xmm5,xmm2 - - vmovdqu xmm15,XMMWORD[32+r8] - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpshufb xmm15,xmm15,xmm13 - vpxor xmm0,xmm0,xmm3 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((112-64))+rdx] - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm1,xmm1,xmm4 - vpclmulqdq xmm2,xmm9,xmm7,0x00 - vpxor xmm8,xmm8,xmm15 - vpxor xmm2,xmm2,xmm5 - vxorps xmm10,xmm10,xmm12 - - vmovdqu xmm14,XMMWORD[16+r8] - vpalignr xmm12,xmm10,xmm10,8 - vpclmulqdq xmm3,xmm15,xmm6,0x00 - vpshufb xmm14,xmm14,xmm13 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm4,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((144-64))+rdx] - vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 - vxorps xmm12,xmm12,xmm11 - vpunpckhqdq xmm9,xmm14,xmm14 - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm5,xmm8,xmm7,0x10 - vmovdqu xmm7,XMMWORD[((176-64))+rdx] - vpxor xmm9,xmm9,xmm14 - vpxor xmm5,xmm5,xmm2 - - vmovdqu xmm15,XMMWORD[r8] - vpclmulqdq xmm0,xmm14,xmm6,0x00 - vpshufb xmm15,xmm15,xmm13 - vpclmulqdq xmm1,xmm14,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((160-64))+rdx] - vpxor xmm15,xmm15,xmm12 - vpclmulqdq xmm2,xmm9,xmm7,0x10 - vpxor xmm15,xmm15,xmm10 - - lea r8,[128+r8] - sub r9,0x80 - jnc NEAR $L$oop8x_avx - - add r9,0x80 - jmp NEAR $L$tail_no_xor_avx - -ALIGN 32 -$L$short_avx: - vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] - lea r8,[r9*1+r8] - vmovdqu xmm6,XMMWORD[((0-64))+rdx] - vmovdqu xmm7,XMMWORD[((32-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - - vmovdqa xmm3,xmm0 - vmovdqa xmm4,xmm1 - vmovdqa xmm5,xmm2 - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-32))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((16-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vpsrldq xmm7,xmm7,8 - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-48))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((48-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vmovdqu xmm7,XMMWORD[((80-64))+rdx] - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-64))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((64-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vpsrldq xmm7,xmm7,8 - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-80))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((96-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vmovdqu xmm7,XMMWORD[((128-64))+rdx] - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-96))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq 
xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((112-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vpsrldq xmm7,xmm7,8 - sub r9,0x10 - jz NEAR $L$tail_avx - - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vmovdqu xmm14,XMMWORD[((-112))+r8] - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vmovdqu xmm6,XMMWORD[((144-64))+rdx] - vpshufb xmm15,xmm14,xmm13 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - vmovq xmm7,QWORD[((184-64))+rdx] - sub r9,0x10 - jmp NEAR $L$tail_avx - -ALIGN 32 -$L$tail_avx: - vpxor xmm15,xmm15,xmm10 -$L$tail_no_xor_avx: - vpunpckhqdq xmm8,xmm15,xmm15 - vpxor xmm3,xmm3,xmm0 - vpclmulqdq xmm0,xmm15,xmm6,0x00 - vpxor xmm8,xmm8,xmm15 - vpxor xmm4,xmm4,xmm1 - vpclmulqdq xmm1,xmm15,xmm6,0x11 - vpxor xmm5,xmm5,xmm2 - vpclmulqdq xmm2,xmm8,xmm7,0x00 - - vmovdqu xmm12,XMMWORD[r10] - - vpxor xmm10,xmm3,xmm0 - vpxor xmm11,xmm4,xmm1 - vpxor xmm5,xmm5,xmm2 - - vpxor xmm5,xmm5,xmm10 - vpxor xmm5,xmm5,xmm11 - vpslldq xmm9,xmm5,8 - vpsrldq xmm5,xmm5,8 - vpxor xmm10,xmm10,xmm9 - vpxor xmm11,xmm11,xmm5 - - vpclmulqdq xmm9,xmm10,xmm12,0x10 - vpalignr xmm10,xmm10,xmm10,8 - vpxor xmm10,xmm10,xmm9 - - vpclmulqdq xmm9,xmm10,xmm12,0x10 - vpalignr xmm10,xmm10,xmm10,8 - vpxor xmm10,xmm10,xmm11 - vpxor xmm10,xmm10,xmm9 - - cmp r9,0 - jne NEAR $L$short_avx - - vpshufb xmm10,xmm10,xmm13 - vmovdqu XMMWORD[rcx],xmm10 - vzeroupper - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - lea rsp,[168+rsp] -$L$SEH_end_gcm_ghash_avx: - DB 0F3h,0C3h ;repret + jmp NEAR $L$_ghash_clmul ALIGN 64 @@ -2040,13 +1536,6 @@ ALIGN 4 DD $L$SEH_begin_gcm_ghash_clmul wrt ..imagebase DD $L$SEH_end_gcm_ghash_clmul wrt ..imagebase DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase - DD $L$SEH_begin_gcm_init_avx wrt ..imagebase - DD $L$SEH_end_gcm_init_avx wrt ..imagebase - DD $L$SEH_info_gcm_init_clmul wrt ..imagebase - - DD $L$SEH_begin_gcm_ghash_avx wrt ..imagebase - DD $L$SEH_end_gcm_ghash_avx wrt ..imagebase - DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_gcm_gmult_4bit: diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm index 9018065f8dd..003b9229a98 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm @@ -24,8 +24,6 @@ $L$SEH_begin_sha1_multi_block: mov rcx,QWORD[((OPENSSL_ia32cap_P+4))] bt rcx,61 jc NEAR _shaext_shortcut - test ecx,268435456 - jnz NEAR _avx_shortcut mov rax,rsp push rbx @@ -3019,4407 +3017,6 @@ $L$epilogue_shaext: $L$SEH_end_sha1_multi_block_shaext: -ALIGN 32 -sha1_multi_block_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha1_multi_block_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -_avx_shortcut: - shr rcx,32 - cmp edx,2 - jb NEAR $L$avx - test ecx,32 - jnz NEAR _avx2_shortcut - jmp NEAR $L$avx -ALIGN 32 -$L$avx: - mov rax,rsp - - push rbx - - push rbp - - lea rsp,[((-168))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps 
XMMWORD[48+rsp],xmm9 - movaps XMMWORD[(-120)+rax],xmm10 - movaps XMMWORD[(-104)+rax],xmm11 - movaps XMMWORD[(-88)+rax],xmm12 - movaps XMMWORD[(-72)+rax],xmm13 - movaps XMMWORD[(-56)+rax],xmm14 - movaps XMMWORD[(-40)+rax],xmm15 - sub rsp,288 - and rsp,-256 - mov QWORD[272+rsp],rax - -$L$body_avx: - lea rbp,[K_XX_XX] - lea rbx,[256+rsp] - - vzeroupper -$L$oop_grande_avx: - mov DWORD[280+rsp],edx - xor edx,edx - - mov r8,QWORD[rsi] - - mov ecx,DWORD[8+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[rbx],ecx - cmovle r8,rbp - - mov r9,QWORD[16+rsi] - - mov ecx,DWORD[24+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[4+rbx],ecx - cmovle r9,rbp - - mov r10,QWORD[32+rsi] - - mov ecx,DWORD[40+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[8+rbx],ecx - cmovle r10,rbp - - mov r11,QWORD[48+rsi] - - mov ecx,DWORD[56+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[12+rbx],ecx - cmovle r11,rbp - test edx,edx - jz NEAR $L$done_avx - - vmovdqu xmm10,XMMWORD[rdi] - lea rax,[128+rsp] - vmovdqu xmm11,XMMWORD[32+rdi] - vmovdqu xmm12,XMMWORD[64+rdi] - vmovdqu xmm13,XMMWORD[96+rdi] - vmovdqu xmm14,XMMWORD[128+rdi] - vmovdqu xmm5,XMMWORD[96+rbp] - jmp NEAR $L$oop_avx - -ALIGN 32 -$L$oop_avx: - vmovdqa xmm15,XMMWORD[((-32))+rbp] - vmovd xmm0,DWORD[r8] - lea r8,[64+r8] - vmovd xmm2,DWORD[r9] - lea r9,[64+r9] - vpinsrd xmm0,xmm0,DWORD[r10],1 - lea r10,[64+r10] - vpinsrd xmm2,xmm2,DWORD[r11],1 - lea r11,[64+r11] - vmovd xmm1,DWORD[((-60))+r8] - vpunpckldq xmm0,xmm0,xmm2 - vmovd xmm9,DWORD[((-60))+r9] - vpshufb xmm0,xmm0,xmm5 - vpinsrd xmm1,xmm1,DWORD[((-60))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-60))+r11],1 - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpandn xmm7,xmm11,xmm13 - vpand xmm6,xmm11,xmm12 - - vmovdqa XMMWORD[(0-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpunpckldq xmm1,xmm1,xmm9 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm2,DWORD[((-56))+r8] - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-56))+r9] - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpshufb xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpinsrd xmm2,xmm2,DWORD[((-56))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-56))+r11],1 - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpandn xmm7,xmm10,xmm12 - vpand xmm6,xmm10,xmm11 - - vmovdqa XMMWORD[(16-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpunpckldq xmm2,xmm2,xmm9 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm3,DWORD[((-52))+r8] - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-52))+r9] - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpshufb xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpinsrd xmm3,xmm3,DWORD[((-52))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-52))+r11],1 - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpandn xmm7,xmm14,xmm11 - vpand xmm6,xmm14,xmm10 - - vmovdqa XMMWORD[(32-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpunpckldq xmm3,xmm3,xmm9 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm4,DWORD[((-48))+r8] - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-48))+r9] - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpshufb xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpinsrd xmm4,xmm4,DWORD[((-48))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-48))+r11],1 - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpandn xmm7,xmm13,xmm10 - vpand xmm6,xmm13,xmm14 - - vmovdqa XMMWORD[(48-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpunpckldq xmm4,xmm4,xmm9 - vpsrld xmm9,xmm12,27 - vpxor 
xmm6,xmm6,xmm7 - vmovd xmm0,DWORD[((-44))+r8] - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-44))+r9] - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpshufb xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpinsrd xmm0,xmm0,DWORD[((-44))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-44))+r11],1 - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpandn xmm7,xmm12,xmm14 - vpand xmm6,xmm12,xmm13 - - vmovdqa XMMWORD[(64-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpunpckldq xmm0,xmm0,xmm9 - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm1,DWORD[((-40))+r8] - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-40))+r9] - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpshufb xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpinsrd xmm1,xmm1,DWORD[((-40))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-40))+r11],1 - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpandn xmm7,xmm11,xmm13 - vpand xmm6,xmm11,xmm12 - - vmovdqa XMMWORD[(80-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpunpckldq xmm1,xmm1,xmm9 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm2,DWORD[((-36))+r8] - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-36))+r9] - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpshufb xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpinsrd xmm2,xmm2,DWORD[((-36))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-36))+r11],1 - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpandn xmm7,xmm10,xmm12 - vpand xmm6,xmm10,xmm11 - - vmovdqa XMMWORD[(96-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpunpckldq xmm2,xmm2,xmm9 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm3,DWORD[((-32))+r8] - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-32))+r9] - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpshufb xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpinsrd xmm3,xmm3,DWORD[((-32))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-32))+r11],1 - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpandn xmm7,xmm14,xmm11 - vpand xmm6,xmm14,xmm10 - - vmovdqa XMMWORD[(112-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpunpckldq xmm3,xmm3,xmm9 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm4,DWORD[((-28))+r8] - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-28))+r9] - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpshufb xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpinsrd xmm4,xmm4,DWORD[((-28))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-28))+r11],1 - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpandn xmm7,xmm13,xmm10 - vpand xmm6,xmm13,xmm14 - - vmovdqa XMMWORD[(128-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpunpckldq xmm4,xmm4,xmm9 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm0,DWORD[((-24))+r8] - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-24))+r9] - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpshufb xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpinsrd xmm0,xmm0,DWORD[((-24))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-24))+r11],1 - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpandn xmm7,xmm12,xmm14 - vpand xmm6,xmm12,xmm13 - - vmovdqa XMMWORD[(144-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpunpckldq xmm0,xmm0,xmm9 - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm1,DWORD[((-20))+r8] - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-20))+r9] - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpshufb 
xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpinsrd xmm1,xmm1,DWORD[((-20))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-20))+r11],1 - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpandn xmm7,xmm11,xmm13 - vpand xmm6,xmm11,xmm12 - - vmovdqa XMMWORD[(160-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpunpckldq xmm1,xmm1,xmm9 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm2,DWORD[((-16))+r8] - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-16))+r9] - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpshufb xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpinsrd xmm2,xmm2,DWORD[((-16))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-16))+r11],1 - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpandn xmm7,xmm10,xmm12 - vpand xmm6,xmm10,xmm11 - - vmovdqa XMMWORD[(176-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpunpckldq xmm2,xmm2,xmm9 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm3,DWORD[((-12))+r8] - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-12))+r9] - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpshufb xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpinsrd xmm3,xmm3,DWORD[((-12))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-12))+r11],1 - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpandn xmm7,xmm14,xmm11 - vpand xmm6,xmm14,xmm10 - - vmovdqa XMMWORD[(192-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpunpckldq xmm3,xmm3,xmm9 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm4,DWORD[((-8))+r8] - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-8))+r9] - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpshufb xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpinsrd xmm4,xmm4,DWORD[((-8))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-8))+r11],1 - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpandn xmm7,xmm13,xmm10 - vpand xmm6,xmm13,xmm14 - - vmovdqa XMMWORD[(208-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpunpckldq xmm4,xmm4,xmm9 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm7 - vmovd xmm0,DWORD[((-4))+r8] - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vmovd xmm9,DWORD[((-4))+r9] - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpshufb xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vmovdqa xmm1,XMMWORD[((0-128))+rax] - vpinsrd xmm0,xmm0,DWORD[((-4))+r10],1 - vpinsrd xmm9,xmm9,DWORD[((-4))+r11],1 - vpaddd xmm10,xmm10,xmm15 - prefetcht0 [63+r8] - vpslld xmm8,xmm11,5 - vpandn xmm7,xmm12,xmm14 - vpand xmm6,xmm12,xmm13 - - vmovdqa XMMWORD[(224-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpunpckldq xmm0,xmm0,xmm9 - vpsrld xmm9,xmm11,27 - prefetcht0 [63+r9] - vpxor xmm6,xmm6,xmm7 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - prefetcht0 [63+r10] - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - prefetcht0 [63+r11] - vpshufb xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vmovdqa xmm2,XMMWORD[((16-128))+rax] - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((32-128))+rax] - - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpandn xmm7,xmm11,xmm13 - - vpand xmm6,xmm11,xmm12 - - vmovdqa XMMWORD[(240-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((128-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm7 - vpxor xmm1,xmm1,xmm3 - - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((48-128))+rax] - - 
vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpandn xmm7,xmm10,xmm12 - - vpand xmm6,xmm10,xmm11 - - vmovdqa XMMWORD[(0-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((144-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm7 - vpxor xmm2,xmm2,xmm4 - - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((64-128))+rax] - - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpandn xmm7,xmm14,xmm11 - - vpand xmm6,xmm14,xmm10 - - vmovdqa XMMWORD[(16-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((160-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm7 - vpxor xmm3,xmm3,xmm0 - - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((80-128))+rax] - - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpandn xmm7,xmm13,xmm10 - - vpand xmm6,xmm13,xmm14 - - vmovdqa XMMWORD[(32-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((176-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm7 - vpxor xmm4,xmm4,xmm1 - - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((96-128))+rax] - - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpandn xmm7,xmm12,xmm14 - - vpand xmm6,xmm12,xmm13 - - vmovdqa XMMWORD[(48-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((192-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm7 - vpxor xmm0,xmm0,xmm2 - - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vmovdqa xmm15,XMMWORD[rbp] - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((112-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(64-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((208-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((128-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(80-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((224-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((144-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(96-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((240-128))+rax] - vpsrld xmm9,xmm13,27 - 
vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((160-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(112-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((0-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((176-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(128-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((16-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((192-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(144-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((32-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((208-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(160-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((48-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((224-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(176-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((64-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((240-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(192-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((80-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((0-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd 
xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(208-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((96-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((16-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(224-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((112-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((32-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(240-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((128-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((48-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(0-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((144-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((64-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(16-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((160-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((80-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(32-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((176-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((96-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(48-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((192-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd 
xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((112-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(64-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((208-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((128-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(80-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((224-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((144-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(96-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((240-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((160-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(112-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((0-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vmovdqa xmm15,XMMWORD[32+rbp] - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((176-128))+rax] - - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpand xmm7,xmm13,xmm12 - vpxor xmm1,xmm1,XMMWORD[((16-128))+rax] - - vpaddd xmm14,xmm14,xmm7 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm13,xmm12 - vpxor xmm1,xmm1,xmm3 - - vmovdqu XMMWORD[(128-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm1,31 - vpand xmm6,xmm6,xmm11 - vpaddd xmm1,xmm1,xmm1 - - vpslld xmm7,xmm11,30 - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((192-128))+rax] - - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpand xmm7,xmm12,xmm11 - vpxor xmm2,xmm2,XMMWORD[((32-128))+rax] - - vpaddd xmm13,xmm13,xmm7 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm12,xmm11 - vpxor xmm2,xmm2,xmm4 - - vmovdqu XMMWORD[(144-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm2,31 - vpand xmm6,xmm6,xmm10 - vpaddd xmm2,xmm2,xmm2 - - vpslld xmm7,xmm10,30 - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((208-128))+rax] - - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpand xmm7,xmm11,xmm10 - vpxor 
xmm3,xmm3,XMMWORD[((48-128))+rax] - - vpaddd xmm12,xmm12,xmm7 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm11,xmm10 - vpxor xmm3,xmm3,xmm0 - - vmovdqu XMMWORD[(160-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm3,31 - vpand xmm6,xmm6,xmm14 - vpaddd xmm3,xmm3,xmm3 - - vpslld xmm7,xmm14,30 - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((224-128))+rax] - - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpand xmm7,xmm10,xmm14 - vpxor xmm4,xmm4,XMMWORD[((64-128))+rax] - - vpaddd xmm11,xmm11,xmm7 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm10,xmm14 - vpxor xmm4,xmm4,xmm1 - - vmovdqu XMMWORD[(176-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm4,31 - vpand xmm6,xmm6,xmm13 - vpaddd xmm4,xmm4,xmm4 - - vpslld xmm7,xmm13,30 - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((240-128))+rax] - - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpand xmm7,xmm14,xmm13 - vpxor xmm0,xmm0,XMMWORD[((80-128))+rax] - - vpaddd xmm10,xmm10,xmm7 - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm14,xmm13 - vpxor xmm0,xmm0,xmm2 - - vmovdqu XMMWORD[(192-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm0,31 - vpand xmm6,xmm6,xmm12 - vpaddd xmm0,xmm0,xmm0 - - vpslld xmm7,xmm12,30 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((0-128))+rax] - - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpand xmm7,xmm13,xmm12 - vpxor xmm1,xmm1,XMMWORD[((96-128))+rax] - - vpaddd xmm14,xmm14,xmm7 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm13,xmm12 - vpxor xmm1,xmm1,xmm3 - - vmovdqu XMMWORD[(208-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm1,31 - vpand xmm6,xmm6,xmm11 - vpaddd xmm1,xmm1,xmm1 - - vpslld xmm7,xmm11,30 - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((16-128))+rax] - - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpand xmm7,xmm12,xmm11 - vpxor xmm2,xmm2,XMMWORD[((112-128))+rax] - - vpaddd xmm13,xmm13,xmm7 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm12,xmm11 - vpxor xmm2,xmm2,xmm4 - - vmovdqu XMMWORD[(224-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm2,31 - vpand xmm6,xmm6,xmm10 - vpaddd xmm2,xmm2,xmm2 - - vpslld xmm7,xmm10,30 - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((32-128))+rax] - - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpand xmm7,xmm11,xmm10 - vpxor xmm3,xmm3,XMMWORD[((128-128))+rax] - - vpaddd xmm12,xmm12,xmm7 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm11,xmm10 - vpxor xmm3,xmm3,xmm0 - - vmovdqu XMMWORD[(240-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm3,31 - vpand xmm6,xmm6,xmm14 - vpaddd xmm3,xmm3,xmm3 - - vpslld xmm7,xmm14,30 - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((48-128))+rax] - - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpand xmm7,xmm10,xmm14 - vpxor xmm4,xmm4,XMMWORD[((144-128))+rax] - - 
vpaddd xmm11,xmm11,xmm7 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm10,xmm14 - vpxor xmm4,xmm4,xmm1 - - vmovdqu XMMWORD[(0-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm4,31 - vpand xmm6,xmm6,xmm13 - vpaddd xmm4,xmm4,xmm4 - - vpslld xmm7,xmm13,30 - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((64-128))+rax] - - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpand xmm7,xmm14,xmm13 - vpxor xmm0,xmm0,XMMWORD[((160-128))+rax] - - vpaddd xmm10,xmm10,xmm7 - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm14,xmm13 - vpxor xmm0,xmm0,xmm2 - - vmovdqu XMMWORD[(16-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm0,31 - vpand xmm6,xmm6,xmm12 - vpaddd xmm0,xmm0,xmm0 - - vpslld xmm7,xmm12,30 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((80-128))+rax] - - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpand xmm7,xmm13,xmm12 - vpxor xmm1,xmm1,XMMWORD[((176-128))+rax] - - vpaddd xmm14,xmm14,xmm7 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm13,xmm12 - vpxor xmm1,xmm1,xmm3 - - vmovdqu XMMWORD[(32-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm1,31 - vpand xmm6,xmm6,xmm11 - vpaddd xmm1,xmm1,xmm1 - - vpslld xmm7,xmm11,30 - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((96-128))+rax] - - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpand xmm7,xmm12,xmm11 - vpxor xmm2,xmm2,XMMWORD[((192-128))+rax] - - vpaddd xmm13,xmm13,xmm7 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm12,xmm11 - vpxor xmm2,xmm2,xmm4 - - vmovdqu XMMWORD[(48-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm2,31 - vpand xmm6,xmm6,xmm10 - vpaddd xmm2,xmm2,xmm2 - - vpslld xmm7,xmm10,30 - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((112-128))+rax] - - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpand xmm7,xmm11,xmm10 - vpxor xmm3,xmm3,XMMWORD[((208-128))+rax] - - vpaddd xmm12,xmm12,xmm7 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm11,xmm10 - vpxor xmm3,xmm3,xmm0 - - vmovdqu XMMWORD[(64-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm3,31 - vpand xmm6,xmm6,xmm14 - vpaddd xmm3,xmm3,xmm3 - - vpslld xmm7,xmm14,30 - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((128-128))+rax] - - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpand xmm7,xmm10,xmm14 - vpxor xmm4,xmm4,XMMWORD[((224-128))+rax] - - vpaddd xmm11,xmm11,xmm7 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm10,xmm14 - vpxor xmm4,xmm4,xmm1 - - vmovdqu XMMWORD[(80-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm4,31 - vpand xmm6,xmm6,xmm13 - vpaddd xmm4,xmm4,xmm4 - - vpslld xmm7,xmm13,30 - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((144-128))+rax] - - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpand xmm7,xmm14,xmm13 - vpxor xmm0,xmm0,XMMWORD[((240-128))+rax] - - vpaddd xmm10,xmm10,xmm7 - vpsrld 
xmm9,xmm11,27 - vpxor xmm6,xmm14,xmm13 - vpxor xmm0,xmm0,xmm2 - - vmovdqu XMMWORD[(96-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm0,31 - vpand xmm6,xmm6,xmm12 - vpaddd xmm0,xmm0,xmm0 - - vpslld xmm7,xmm12,30 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((160-128))+rax] - - vpaddd xmm14,xmm14,xmm15 - vpslld xmm8,xmm10,5 - vpand xmm7,xmm13,xmm12 - vpxor xmm1,xmm1,XMMWORD[((0-128))+rax] - - vpaddd xmm14,xmm14,xmm7 - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm13,xmm12 - vpxor xmm1,xmm1,xmm3 - - vmovdqu XMMWORD[(112-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm1,31 - vpand xmm6,xmm6,xmm11 - vpaddd xmm1,xmm1,xmm1 - - vpslld xmm7,xmm11,30 - vpaddd xmm14,xmm14,xmm6 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((176-128))+rax] - - vpaddd xmm13,xmm13,xmm15 - vpslld xmm8,xmm14,5 - vpand xmm7,xmm12,xmm11 - vpxor xmm2,xmm2,XMMWORD[((16-128))+rax] - - vpaddd xmm13,xmm13,xmm7 - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm12,xmm11 - vpxor xmm2,xmm2,xmm4 - - vmovdqu XMMWORD[(128-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm2,31 - vpand xmm6,xmm6,xmm10 - vpaddd xmm2,xmm2,xmm2 - - vpslld xmm7,xmm10,30 - vpaddd xmm13,xmm13,xmm6 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((192-128))+rax] - - vpaddd xmm12,xmm12,xmm15 - vpslld xmm8,xmm13,5 - vpand xmm7,xmm11,xmm10 - vpxor xmm3,xmm3,XMMWORD[((32-128))+rax] - - vpaddd xmm12,xmm12,xmm7 - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm11,xmm10 - vpxor xmm3,xmm3,xmm0 - - vmovdqu XMMWORD[(144-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm3,31 - vpand xmm6,xmm6,xmm14 - vpaddd xmm3,xmm3,xmm3 - - vpslld xmm7,xmm14,30 - vpaddd xmm12,xmm12,xmm6 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((208-128))+rax] - - vpaddd xmm11,xmm11,xmm15 - vpslld xmm8,xmm12,5 - vpand xmm7,xmm10,xmm14 - vpxor xmm4,xmm4,XMMWORD[((48-128))+rax] - - vpaddd xmm11,xmm11,xmm7 - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm10,xmm14 - vpxor xmm4,xmm4,xmm1 - - vmovdqu XMMWORD[(160-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm4,31 - vpand xmm6,xmm6,xmm13 - vpaddd xmm4,xmm4,xmm4 - - vpslld xmm7,xmm13,30 - vpaddd xmm11,xmm11,xmm6 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((224-128))+rax] - - vpaddd xmm10,xmm10,xmm15 - vpslld xmm8,xmm11,5 - vpand xmm7,xmm14,xmm13 - vpxor xmm0,xmm0,XMMWORD[((64-128))+rax] - - vpaddd xmm10,xmm10,xmm7 - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm14,xmm13 - vpxor xmm0,xmm0,xmm2 - - vmovdqu XMMWORD[(176-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpor xmm8,xmm8,xmm9 - vpsrld xmm5,xmm0,31 - vpand xmm6,xmm6,xmm12 - vpaddd xmm0,xmm0,xmm0 - - vpslld xmm7,xmm12,30 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vmovdqa xmm15,XMMWORD[64+rbp] - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((240-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(192-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor 
xmm1,xmm1,XMMWORD[((80-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((0-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(208-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((96-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((16-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(224-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((112-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((32-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(240-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((128-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((48-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(0-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((144-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((64-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(16-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((160-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((80-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(32-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((176-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa 
xmm0,XMMWORD[((96-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vmovdqa XMMWORD[(48-128)+rax],xmm2 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((192-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((112-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vmovdqa XMMWORD[(64-128)+rax],xmm3 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((208-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((128-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vmovdqa XMMWORD[(80-128)+rax],xmm4 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((224-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((144-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vmovdqa XMMWORD[(96-128)+rax],xmm0 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((240-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((160-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vmovdqa XMMWORD[(112-128)+rax],xmm1 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((0-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((176-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((16-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((192-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((32-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd 
xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpxor xmm0,xmm0,xmm2 - vmovdqa xmm2,XMMWORD[((208-128))+rax] - - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm0,xmm0,XMMWORD[((48-128))+rax] - vpsrld xmm9,xmm11,27 - vpxor xmm6,xmm6,xmm13 - vpxor xmm0,xmm0,xmm2 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - vpsrld xmm5,xmm0,31 - vpaddd xmm0,xmm0,xmm0 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm0,xmm0,xmm5 - vpor xmm12,xmm12,xmm7 - vpxor xmm1,xmm1,xmm3 - vmovdqa xmm3,XMMWORD[((224-128))+rax] - - vpslld xmm8,xmm10,5 - vpaddd xmm14,xmm14,xmm15 - vpxor xmm6,xmm13,xmm11 - vpaddd xmm14,xmm14,xmm0 - vpxor xmm1,xmm1,XMMWORD[((64-128))+rax] - vpsrld xmm9,xmm10,27 - vpxor xmm6,xmm6,xmm12 - vpxor xmm1,xmm1,xmm3 - - vpslld xmm7,xmm11,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm14,xmm14,xmm6 - vpsrld xmm5,xmm1,31 - vpaddd xmm1,xmm1,xmm1 - - vpsrld xmm11,xmm11,2 - vpaddd xmm14,xmm14,xmm8 - vpor xmm1,xmm1,xmm5 - vpor xmm11,xmm11,xmm7 - vpxor xmm2,xmm2,xmm4 - vmovdqa xmm4,XMMWORD[((240-128))+rax] - - vpslld xmm8,xmm14,5 - vpaddd xmm13,xmm13,xmm15 - vpxor xmm6,xmm12,xmm10 - vpaddd xmm13,xmm13,xmm1 - vpxor xmm2,xmm2,XMMWORD[((80-128))+rax] - vpsrld xmm9,xmm14,27 - vpxor xmm6,xmm6,xmm11 - vpxor xmm2,xmm2,xmm4 - - vpslld xmm7,xmm10,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm13,xmm13,xmm6 - vpsrld xmm5,xmm2,31 - vpaddd xmm2,xmm2,xmm2 - - vpsrld xmm10,xmm10,2 - vpaddd xmm13,xmm13,xmm8 - vpor xmm2,xmm2,xmm5 - vpor xmm10,xmm10,xmm7 - vpxor xmm3,xmm3,xmm0 - vmovdqa xmm0,XMMWORD[((0-128))+rax] - - vpslld xmm8,xmm13,5 - vpaddd xmm12,xmm12,xmm15 - vpxor xmm6,xmm11,xmm14 - vpaddd xmm12,xmm12,xmm2 - vpxor xmm3,xmm3,XMMWORD[((96-128))+rax] - vpsrld xmm9,xmm13,27 - vpxor xmm6,xmm6,xmm10 - vpxor xmm3,xmm3,xmm0 - - vpslld xmm7,xmm14,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm12,xmm12,xmm6 - vpsrld xmm5,xmm3,31 - vpaddd xmm3,xmm3,xmm3 - - vpsrld xmm14,xmm14,2 - vpaddd xmm12,xmm12,xmm8 - vpor xmm3,xmm3,xmm5 - vpor xmm14,xmm14,xmm7 - vpxor xmm4,xmm4,xmm1 - vmovdqa xmm1,XMMWORD[((16-128))+rax] - - vpslld xmm8,xmm12,5 - vpaddd xmm11,xmm11,xmm15 - vpxor xmm6,xmm10,xmm13 - vpaddd xmm11,xmm11,xmm3 - vpxor xmm4,xmm4,XMMWORD[((112-128))+rax] - vpsrld xmm9,xmm12,27 - vpxor xmm6,xmm6,xmm14 - vpxor xmm4,xmm4,xmm1 - - vpslld xmm7,xmm13,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm11,xmm11,xmm6 - vpsrld xmm5,xmm4,31 - vpaddd xmm4,xmm4,xmm4 - - vpsrld xmm13,xmm13,2 - vpaddd xmm11,xmm11,xmm8 - vpor xmm4,xmm4,xmm5 - vpor xmm13,xmm13,xmm7 - vpslld xmm8,xmm11,5 - vpaddd xmm10,xmm10,xmm15 - vpxor xmm6,xmm14,xmm12 - - vpsrld xmm9,xmm11,27 - vpaddd xmm10,xmm10,xmm4 - vpxor xmm6,xmm6,xmm13 - - vpslld xmm7,xmm12,30 - vpor xmm8,xmm8,xmm9 - vpaddd xmm10,xmm10,xmm6 - - vpsrld xmm12,xmm12,2 - vpaddd xmm10,xmm10,xmm8 - vpor xmm12,xmm12,xmm7 - mov ecx,1 - cmp ecx,DWORD[rbx] - cmovge r8,rbp - cmp ecx,DWORD[4+rbx] - cmovge r9,rbp - cmp ecx,DWORD[8+rbx] - cmovge r10,rbp - cmp ecx,DWORD[12+rbx] - cmovge r11,rbp - vmovdqu xmm6,XMMWORD[rbx] - vpxor xmm8,xmm8,xmm8 - vmovdqa xmm7,xmm6 - vpcmpgtd xmm7,xmm7,xmm8 - vpaddd xmm6,xmm6,xmm7 - - vpand xmm10,xmm10,xmm7 - vpand xmm11,xmm11,xmm7 - vpaddd xmm10,xmm10,XMMWORD[rdi] - vpand xmm12,xmm12,xmm7 - vpaddd xmm11,xmm11,XMMWORD[32+rdi] - vpand xmm13,xmm13,xmm7 - vpaddd xmm12,xmm12,XMMWORD[64+rdi] - vpand xmm14,xmm14,xmm7 - vpaddd xmm13,xmm13,XMMWORD[96+rdi] - vpaddd xmm14,xmm14,XMMWORD[128+rdi] - vmovdqu XMMWORD[rdi],xmm10 - vmovdqu XMMWORD[32+rdi],xmm11 - vmovdqu XMMWORD[64+rdi],xmm12 - vmovdqu XMMWORD[96+rdi],xmm13 - 
vmovdqu XMMWORD[128+rdi],xmm14 - - vmovdqu XMMWORD[rbx],xmm6 - vmovdqu xmm5,XMMWORD[96+rbp] - dec edx - jnz NEAR $L$oop_avx - - mov edx,DWORD[280+rsp] - lea rdi,[16+rdi] - lea rsi,[64+rsi] - dec edx - jnz NEAR $L$oop_grande_avx - -$L$done_avx: - mov rax,QWORD[272+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((-184))+rax] - movaps xmm7,XMMWORD[((-168))+rax] - movaps xmm8,XMMWORD[((-152))+rax] - movaps xmm9,XMMWORD[((-136))+rax] - movaps xmm10,XMMWORD[((-120))+rax] - movaps xmm11,XMMWORD[((-104))+rax] - movaps xmm12,XMMWORD[((-88))+rax] - movaps xmm13,XMMWORD[((-72))+rax] - movaps xmm14,XMMWORD[((-56))+rax] - movaps xmm15,XMMWORD[((-40))+rax] - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha1_multi_block_avx: - -ALIGN 32 -sha1_multi_block_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha1_multi_block_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -_avx2_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-168))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[64+rsp],xmm10 - movaps XMMWORD[80+rsp],xmm11 - movaps XMMWORD[(-120)+rax],xmm12 - movaps XMMWORD[(-104)+rax],xmm13 - movaps XMMWORD[(-88)+rax],xmm14 - movaps XMMWORD[(-72)+rax],xmm15 - sub rsp,576 - and rsp,-256 - mov QWORD[544+rsp],rax - -$L$body_avx2: - lea rbp,[K_XX_XX] - shr edx,1 - - vzeroupper -$L$oop_grande_avx2: - mov DWORD[552+rsp],edx - xor edx,edx - lea rbx,[512+rsp] - - mov r12,QWORD[rsi] - - mov ecx,DWORD[8+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[rbx],ecx - cmovle r12,rbp - - mov r13,QWORD[16+rsi] - - mov ecx,DWORD[24+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[4+rbx],ecx - cmovle r13,rbp - - mov r14,QWORD[32+rsi] - - mov ecx,DWORD[40+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[8+rbx],ecx - cmovle r14,rbp - - mov r15,QWORD[48+rsi] - - mov ecx,DWORD[56+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[12+rbx],ecx - cmovle r15,rbp - - mov r8,QWORD[64+rsi] - - mov ecx,DWORD[72+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[16+rbx],ecx - cmovle r8,rbp - - mov r9,QWORD[80+rsi] - - mov ecx,DWORD[88+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[20+rbx],ecx - cmovle r9,rbp - - mov r10,QWORD[96+rsi] - - mov ecx,DWORD[104+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[24+rbx],ecx - cmovle r10,rbp - - mov r11,QWORD[112+rsi] - - mov ecx,DWORD[120+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[28+rbx],ecx - cmovle r11,rbp - vmovdqu ymm0,YMMWORD[rdi] - lea rax,[128+rsp] - vmovdqu ymm1,YMMWORD[32+rdi] - lea rbx,[((256+128))+rsp] - vmovdqu ymm2,YMMWORD[64+rdi] - vmovdqu ymm3,YMMWORD[96+rdi] - vmovdqu ymm4,YMMWORD[128+rdi] - vmovdqu ymm9,YMMWORD[96+rbp] - jmp NEAR $L$oop_avx2 - -ALIGN 32 -$L$oop_avx2: - vmovdqa ymm15,YMMWORD[((-32))+rbp] - vmovd xmm10,DWORD[r12] - lea r12,[64+r12] - vmovd xmm12,DWORD[r8] - lea r8,[64+r8] - vmovd xmm7,DWORD[r13] - lea r13,[64+r13] - vmovd xmm6,DWORD[r9] - lea r9,[64+r9] - vpinsrd xmm10,xmm10,DWORD[r14],1 - lea r14,[64+r14] - vpinsrd xmm12,xmm12,DWORD[r10],1 - lea r10,[64+r10] - vpinsrd xmm7,xmm7,DWORD[r15],1 - lea r15,[64+r15] - vpunpckldq ymm10,ymm10,ymm7 - vpinsrd xmm6,xmm6,DWORD[r11],1 - lea r11,[64+r11] - vpunpckldq 
ymm12,ymm12,ymm6 - vmovd xmm11,DWORD[((-60))+r12] - vinserti128 ymm10,ymm10,xmm12,1 - vmovd xmm8,DWORD[((-60))+r8] - vpshufb ymm10,ymm10,ymm9 - vmovd xmm7,DWORD[((-60))+r13] - vmovd xmm6,DWORD[((-60))+r9] - vpinsrd xmm11,xmm11,DWORD[((-60))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-60))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-60))+r15],1 - vpunpckldq ymm11,ymm11,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-60))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpandn ymm6,ymm1,ymm3 - vpand ymm5,ymm1,ymm2 - - vmovdqa YMMWORD[(0-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vinserti128 ymm11,ymm11,xmm8,1 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm12,DWORD[((-56))+r12] - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-56))+r8] - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpshufb ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vmovd xmm7,DWORD[((-56))+r13] - vmovd xmm6,DWORD[((-56))+r9] - vpinsrd xmm12,xmm12,DWORD[((-56))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-56))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-56))+r15],1 - vpunpckldq ymm12,ymm12,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-56))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpandn ymm6,ymm0,ymm2 - vpand ymm5,ymm0,ymm1 - - vmovdqa YMMWORD[(32-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vinserti128 ymm12,ymm12,xmm8,1 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm13,DWORD[((-52))+r12] - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-52))+r8] - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpshufb ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vmovd xmm7,DWORD[((-52))+r13] - vmovd xmm6,DWORD[((-52))+r9] - vpinsrd xmm13,xmm13,DWORD[((-52))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-52))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-52))+r15],1 - vpunpckldq ymm13,ymm13,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-52))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpandn ymm6,ymm4,ymm1 - vpand ymm5,ymm4,ymm0 - - vmovdqa YMMWORD[(64-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vinserti128 ymm13,ymm13,xmm8,1 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm14,DWORD[((-48))+r12] - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-48))+r8] - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpshufb ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vmovd xmm7,DWORD[((-48))+r13] - vmovd xmm6,DWORD[((-48))+r9] - vpinsrd xmm14,xmm14,DWORD[((-48))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-48))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-48))+r15],1 - vpunpckldq ymm14,ymm14,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-48))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpandn ymm6,ymm3,ymm0 - vpand ymm5,ymm3,ymm4 - - vmovdqa YMMWORD[(96-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vinserti128 ymm14,ymm14,xmm8,1 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm10,DWORD[((-44))+r12] - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-44))+r8] - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpshufb ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vmovd xmm7,DWORD[((-44))+r13] - vmovd xmm6,DWORD[((-44))+r9] - vpinsrd xmm10,xmm10,DWORD[((-44))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-44))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-44))+r15],1 - vpunpckldq ymm10,ymm10,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-44))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpandn ymm6,ymm2,ymm4 - vpand ymm5,ymm2,ymm3 - - vmovdqa 
YMMWORD[(128-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vinserti128 ymm10,ymm10,xmm8,1 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm11,DWORD[((-40))+r12] - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-40))+r8] - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpshufb ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovd xmm7,DWORD[((-40))+r13] - vmovd xmm6,DWORD[((-40))+r9] - vpinsrd xmm11,xmm11,DWORD[((-40))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-40))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-40))+r15],1 - vpunpckldq ymm11,ymm11,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-40))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpandn ymm6,ymm1,ymm3 - vpand ymm5,ymm1,ymm2 - - vmovdqa YMMWORD[(160-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vinserti128 ymm11,ymm11,xmm8,1 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm12,DWORD[((-36))+r12] - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-36))+r8] - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpshufb ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vmovd xmm7,DWORD[((-36))+r13] - vmovd xmm6,DWORD[((-36))+r9] - vpinsrd xmm12,xmm12,DWORD[((-36))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-36))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-36))+r15],1 - vpunpckldq ymm12,ymm12,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-36))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpandn ymm6,ymm0,ymm2 - vpand ymm5,ymm0,ymm1 - - vmovdqa YMMWORD[(192-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vinserti128 ymm12,ymm12,xmm8,1 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm13,DWORD[((-32))+r12] - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-32))+r8] - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpshufb ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vmovd xmm7,DWORD[((-32))+r13] - vmovd xmm6,DWORD[((-32))+r9] - vpinsrd xmm13,xmm13,DWORD[((-32))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-32))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-32))+r15],1 - vpunpckldq ymm13,ymm13,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-32))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpandn ymm6,ymm4,ymm1 - vpand ymm5,ymm4,ymm0 - - vmovdqa YMMWORD[(224-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vinserti128 ymm13,ymm13,xmm8,1 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm14,DWORD[((-28))+r12] - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-28))+r8] - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpshufb ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vmovd xmm7,DWORD[((-28))+r13] - vmovd xmm6,DWORD[((-28))+r9] - vpinsrd xmm14,xmm14,DWORD[((-28))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-28))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-28))+r15],1 - vpunpckldq ymm14,ymm14,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-28))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpandn ymm6,ymm3,ymm0 - vpand ymm5,ymm3,ymm4 - - vmovdqa YMMWORD[(256-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vinserti128 ymm14,ymm14,xmm8,1 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm10,DWORD[((-24))+r12] - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-24))+r8] - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpshufb ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vmovd xmm7,DWORD[((-24))+r13] - vmovd xmm6,DWORD[((-24))+r9] - vpinsrd xmm10,xmm10,DWORD[((-24))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-24))+r10],1 - vpinsrd 
xmm7,xmm7,DWORD[((-24))+r15],1 - vpunpckldq ymm10,ymm10,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-24))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpandn ymm6,ymm2,ymm4 - vpand ymm5,ymm2,ymm3 - - vmovdqa YMMWORD[(288-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vinserti128 ymm10,ymm10,xmm8,1 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm11,DWORD[((-20))+r12] - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-20))+r8] - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpshufb ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovd xmm7,DWORD[((-20))+r13] - vmovd xmm6,DWORD[((-20))+r9] - vpinsrd xmm11,xmm11,DWORD[((-20))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-20))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-20))+r15],1 - vpunpckldq ymm11,ymm11,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-20))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpandn ymm6,ymm1,ymm3 - vpand ymm5,ymm1,ymm2 - - vmovdqa YMMWORD[(320-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vinserti128 ymm11,ymm11,xmm8,1 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm12,DWORD[((-16))+r12] - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-16))+r8] - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpshufb ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vmovd xmm7,DWORD[((-16))+r13] - vmovd xmm6,DWORD[((-16))+r9] - vpinsrd xmm12,xmm12,DWORD[((-16))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-16))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-16))+r15],1 - vpunpckldq ymm12,ymm12,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-16))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpandn ymm6,ymm0,ymm2 - vpand ymm5,ymm0,ymm1 - - vmovdqa YMMWORD[(352-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vinserti128 ymm12,ymm12,xmm8,1 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm13,DWORD[((-12))+r12] - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-12))+r8] - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpshufb ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vmovd xmm7,DWORD[((-12))+r13] - vmovd xmm6,DWORD[((-12))+r9] - vpinsrd xmm13,xmm13,DWORD[((-12))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-12))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-12))+r15],1 - vpunpckldq ymm13,ymm13,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-12))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpandn ymm6,ymm4,ymm1 - vpand ymm5,ymm4,ymm0 - - vmovdqa YMMWORD[(384-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vinserti128 ymm13,ymm13,xmm8,1 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm14,DWORD[((-8))+r12] - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-8))+r8] - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpshufb ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vmovd xmm7,DWORD[((-8))+r13] - vmovd xmm6,DWORD[((-8))+r9] - vpinsrd xmm14,xmm14,DWORD[((-8))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-8))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-8))+r15],1 - vpunpckldq ymm14,ymm14,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-8))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpandn ymm6,ymm3,ymm0 - vpand ymm5,ymm3,ymm4 - - vmovdqa YMMWORD[(416-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vinserti128 ymm14,ymm14,xmm8,1 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm6 - vmovd xmm10,DWORD[((-4))+r12] - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vmovd xmm8,DWORD[((-4))+r8] - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - 
vpaddd ymm1,ymm1,ymm7 - vpshufb ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vmovdqa ymm11,YMMWORD[((0-128))+rax] - vmovd xmm7,DWORD[((-4))+r13] - vmovd xmm6,DWORD[((-4))+r9] - vpinsrd xmm10,xmm10,DWORD[((-4))+r14],1 - vpinsrd xmm8,xmm8,DWORD[((-4))+r10],1 - vpinsrd xmm7,xmm7,DWORD[((-4))+r15],1 - vpunpckldq ymm10,ymm10,ymm7 - vpinsrd xmm6,xmm6,DWORD[((-4))+r11],1 - vpunpckldq ymm8,ymm8,ymm6 - vpaddd ymm0,ymm0,ymm15 - prefetcht0 [63+r12] - vpslld ymm7,ymm1,5 - vpandn ymm6,ymm2,ymm4 - vpand ymm5,ymm2,ymm3 - - vmovdqa YMMWORD[(448-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vinserti128 ymm10,ymm10,xmm8,1 - vpsrld ymm8,ymm1,27 - prefetcht0 [63+r13] - vpxor ymm5,ymm5,ymm6 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - prefetcht0 [63+r14] - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - prefetcht0 [63+r15] - vpshufb ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovdqa ymm12,YMMWORD[((32-128))+rax] - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((64-128))+rax] - - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpandn ymm6,ymm1,ymm3 - prefetcht0 [63+r8] - vpand ymm5,ymm1,ymm2 - - vmovdqa YMMWORD[(480-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((256-256-128))+rbx] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm6 - vpxor ymm11,ymm11,ymm13 - prefetcht0 [63+r9] - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - prefetcht0 [63+r10] - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - prefetcht0 [63+r11] - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((96-128))+rax] - - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpandn ymm6,ymm0,ymm2 - - vpand ymm5,ymm0,ymm1 - - vmovdqa YMMWORD[(0-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((288-256-128))+rbx] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm6 - vpxor ymm12,ymm12,ymm14 - - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((128-128))+rax] - - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpandn ymm6,ymm4,ymm1 - - vpand ymm5,ymm4,ymm0 - - vmovdqa YMMWORD[(32-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((320-256-128))+rbx] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm6 - vpxor ymm13,ymm13,ymm10 - - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((160-128))+rax] - - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpandn ymm6,ymm3,ymm0 - - vpand ymm5,ymm3,ymm4 - - vmovdqa YMMWORD[(64-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((352-256-128))+rbx] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm6 - vpxor ymm14,ymm14,ymm11 - - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((192-128))+rax] - - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpandn ymm6,ymm2,ymm4 - - vpand ymm5,ymm2,ymm3 - - vmovdqa YMMWORD[(96-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor 
ymm10,ymm10,YMMWORD[((384-256-128))+rbx] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm6 - vpxor ymm10,ymm10,ymm12 - - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovdqa ymm15,YMMWORD[rbp] - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((224-128))+rax] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(128-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((416-256-128))+rbx] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((256-256-128))+rbx] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(160-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((448-256-128))+rbx] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((288-256-128))+rbx] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(192-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((480-256-128))+rbx] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((320-256-128))+rbx] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(224-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((0-128))+rax] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((352-256-128))+rbx] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(256-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((32-128))+rax] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((384-256-128))+rbx] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(288-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((64-128))+rax] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor 
ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((416-256-128))+rbx] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(320-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((96-128))+rax] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((448-256-128))+rbx] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(352-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((128-128))+rax] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((480-256-128))+rbx] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(384-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((160-128))+rax] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((0-128))+rax] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(416-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((192-128))+rax] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((32-128))+rax] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(448-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((224-128))+rax] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((64-128))+rax] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(480-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((256-256-128))+rbx] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((96-128))+rax] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(0-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((288-256-128))+rbx] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - 
vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((128-128))+rax] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(32-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((320-256-128))+rbx] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((160-128))+rax] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(64-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((352-256-128))+rbx] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((192-128))+rax] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(96-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((384-256-128))+rbx] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((224-128))+rax] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(128-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((416-256-128))+rbx] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((256-256-128))+rbx] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(160-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((448-256-128))+rbx] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((288-256-128))+rbx] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(192-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((480-256-128))+rbx] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((320-256-128))+rbx] - - vpslld ymm7,ymm1,5 - vpaddd 
ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(224-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((0-128))+rax] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovdqa ymm15,YMMWORD[32+rbp] - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((352-256-128))+rbx] - - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpand ymm6,ymm3,ymm2 - vpxor ymm11,ymm11,YMMWORD[((32-128))+rax] - - vpaddd ymm4,ymm4,ymm6 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm3,ymm2 - vpxor ymm11,ymm11,ymm13 - - vmovdqu YMMWORD[(256-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm11,31 - vpand ymm5,ymm5,ymm1 - vpaddd ymm11,ymm11,ymm11 - - vpslld ymm6,ymm1,30 - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((384-256-128))+rbx] - - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpand ymm6,ymm2,ymm1 - vpxor ymm12,ymm12,YMMWORD[((64-128))+rax] - - vpaddd ymm3,ymm3,ymm6 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm2,ymm1 - vpxor ymm12,ymm12,ymm14 - - vmovdqu YMMWORD[(288-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm12,31 - vpand ymm5,ymm5,ymm0 - vpaddd ymm12,ymm12,ymm12 - - vpslld ymm6,ymm0,30 - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((416-256-128))+rbx] - - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpand ymm6,ymm1,ymm0 - vpxor ymm13,ymm13,YMMWORD[((96-128))+rax] - - vpaddd ymm2,ymm2,ymm6 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm1,ymm0 - vpxor ymm13,ymm13,ymm10 - - vmovdqu YMMWORD[(320-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm13,31 - vpand ymm5,ymm5,ymm4 - vpaddd ymm13,ymm13,ymm13 - - vpslld ymm6,ymm4,30 - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((448-256-128))+rbx] - - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpand ymm6,ymm0,ymm4 - vpxor ymm14,ymm14,YMMWORD[((128-128))+rax] - - vpaddd ymm1,ymm1,ymm6 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm0,ymm4 - vpxor ymm14,ymm14,ymm11 - - vmovdqu YMMWORD[(352-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm14,31 - vpand ymm5,ymm5,ymm3 - vpaddd ymm14,ymm14,ymm14 - - vpslld ymm6,ymm3,30 - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((480-256-128))+rbx] - - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpand ymm6,ymm4,ymm3 - vpxor ymm10,ymm10,YMMWORD[((160-128))+rax] - - vpaddd ymm0,ymm0,ymm6 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm4,ymm3 - vpxor ymm10,ymm10,ymm12 - - vmovdqu YMMWORD[(384-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm10,31 - vpand ymm5,ymm5,ymm2 - vpaddd ymm10,ymm10,ymm10 - - vpslld ymm6,ymm2,30 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((0-128))+rax] - - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpand ymm6,ymm3,ymm2 - vpxor 
ymm11,ymm11,YMMWORD[((192-128))+rax] - - vpaddd ymm4,ymm4,ymm6 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm3,ymm2 - vpxor ymm11,ymm11,ymm13 - - vmovdqu YMMWORD[(416-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm11,31 - vpand ymm5,ymm5,ymm1 - vpaddd ymm11,ymm11,ymm11 - - vpslld ymm6,ymm1,30 - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((32-128))+rax] - - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpand ymm6,ymm2,ymm1 - vpxor ymm12,ymm12,YMMWORD[((224-128))+rax] - - vpaddd ymm3,ymm3,ymm6 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm2,ymm1 - vpxor ymm12,ymm12,ymm14 - - vmovdqu YMMWORD[(448-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm12,31 - vpand ymm5,ymm5,ymm0 - vpaddd ymm12,ymm12,ymm12 - - vpslld ymm6,ymm0,30 - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((64-128))+rax] - - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpand ymm6,ymm1,ymm0 - vpxor ymm13,ymm13,YMMWORD[((256-256-128))+rbx] - - vpaddd ymm2,ymm2,ymm6 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm1,ymm0 - vpxor ymm13,ymm13,ymm10 - - vmovdqu YMMWORD[(480-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm13,31 - vpand ymm5,ymm5,ymm4 - vpaddd ymm13,ymm13,ymm13 - - vpslld ymm6,ymm4,30 - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((96-128))+rax] - - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpand ymm6,ymm0,ymm4 - vpxor ymm14,ymm14,YMMWORD[((288-256-128))+rbx] - - vpaddd ymm1,ymm1,ymm6 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm0,ymm4 - vpxor ymm14,ymm14,ymm11 - - vmovdqu YMMWORD[(0-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm14,31 - vpand ymm5,ymm5,ymm3 - vpaddd ymm14,ymm14,ymm14 - - vpslld ymm6,ymm3,30 - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((128-128))+rax] - - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpand ymm6,ymm4,ymm3 - vpxor ymm10,ymm10,YMMWORD[((320-256-128))+rbx] - - vpaddd ymm0,ymm0,ymm6 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm4,ymm3 - vpxor ymm10,ymm10,ymm12 - - vmovdqu YMMWORD[(32-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm10,31 - vpand ymm5,ymm5,ymm2 - vpaddd ymm10,ymm10,ymm10 - - vpslld ymm6,ymm2,30 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((160-128))+rax] - - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpand ymm6,ymm3,ymm2 - vpxor ymm11,ymm11,YMMWORD[((352-256-128))+rbx] - - vpaddd ymm4,ymm4,ymm6 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm3,ymm2 - vpxor ymm11,ymm11,ymm13 - - vmovdqu YMMWORD[(64-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm11,31 - vpand ymm5,ymm5,ymm1 - vpaddd ymm11,ymm11,ymm11 - - vpslld ymm6,ymm1,30 - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((192-128))+rax] - - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpand ymm6,ymm2,ymm1 - vpxor ymm12,ymm12,YMMWORD[((384-256-128))+rbx] 
- - vpaddd ymm3,ymm3,ymm6 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm2,ymm1 - vpxor ymm12,ymm12,ymm14 - - vmovdqu YMMWORD[(96-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm12,31 - vpand ymm5,ymm5,ymm0 - vpaddd ymm12,ymm12,ymm12 - - vpslld ymm6,ymm0,30 - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((224-128))+rax] - - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpand ymm6,ymm1,ymm0 - vpxor ymm13,ymm13,YMMWORD[((416-256-128))+rbx] - - vpaddd ymm2,ymm2,ymm6 - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm1,ymm0 - vpxor ymm13,ymm13,ymm10 - - vmovdqu YMMWORD[(128-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm13,31 - vpand ymm5,ymm5,ymm4 - vpaddd ymm13,ymm13,ymm13 - - vpslld ymm6,ymm4,30 - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((256-256-128))+rbx] - - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpand ymm6,ymm0,ymm4 - vpxor ymm14,ymm14,YMMWORD[((448-256-128))+rbx] - - vpaddd ymm1,ymm1,ymm6 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm0,ymm4 - vpxor ymm14,ymm14,ymm11 - - vmovdqu YMMWORD[(160-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm14,31 - vpand ymm5,ymm5,ymm3 - vpaddd ymm14,ymm14,ymm14 - - vpslld ymm6,ymm3,30 - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((288-256-128))+rbx] - - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpand ymm6,ymm4,ymm3 - vpxor ymm10,ymm10,YMMWORD[((480-256-128))+rbx] - - vpaddd ymm0,ymm0,ymm6 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm4,ymm3 - vpxor ymm10,ymm10,ymm12 - - vmovdqu YMMWORD[(192-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm10,31 - vpand ymm5,ymm5,ymm2 - vpaddd ymm10,ymm10,ymm10 - - vpslld ymm6,ymm2,30 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((320-256-128))+rbx] - - vpaddd ymm4,ymm4,ymm15 - vpslld ymm7,ymm0,5 - vpand ymm6,ymm3,ymm2 - vpxor ymm11,ymm11,YMMWORD[((0-128))+rax] - - vpaddd ymm4,ymm4,ymm6 - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm3,ymm2 - vpxor ymm11,ymm11,ymm13 - - vmovdqu YMMWORD[(224-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm11,31 - vpand ymm5,ymm5,ymm1 - vpaddd ymm11,ymm11,ymm11 - - vpslld ymm6,ymm1,30 - vpaddd ymm4,ymm4,ymm5 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((352-256-128))+rbx] - - vpaddd ymm3,ymm3,ymm15 - vpslld ymm7,ymm4,5 - vpand ymm6,ymm2,ymm1 - vpxor ymm12,ymm12,YMMWORD[((32-128))+rax] - - vpaddd ymm3,ymm3,ymm6 - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm2,ymm1 - vpxor ymm12,ymm12,ymm14 - - vmovdqu YMMWORD[(256-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm12,31 - vpand ymm5,ymm5,ymm0 - vpaddd ymm12,ymm12,ymm12 - - vpslld ymm6,ymm0,30 - vpaddd ymm3,ymm3,ymm5 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((384-256-128))+rbx] - - vpaddd ymm2,ymm2,ymm15 - vpslld ymm7,ymm3,5 - vpand ymm6,ymm1,ymm0 - vpxor ymm13,ymm13,YMMWORD[((64-128))+rax] - - vpaddd ymm2,ymm2,ymm6 - 
vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm1,ymm0 - vpxor ymm13,ymm13,ymm10 - - vmovdqu YMMWORD[(288-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm13,31 - vpand ymm5,ymm5,ymm4 - vpaddd ymm13,ymm13,ymm13 - - vpslld ymm6,ymm4,30 - vpaddd ymm2,ymm2,ymm5 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((416-256-128))+rbx] - - vpaddd ymm1,ymm1,ymm15 - vpslld ymm7,ymm2,5 - vpand ymm6,ymm0,ymm4 - vpxor ymm14,ymm14,YMMWORD[((96-128))+rax] - - vpaddd ymm1,ymm1,ymm6 - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm0,ymm4 - vpxor ymm14,ymm14,ymm11 - - vmovdqu YMMWORD[(320-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm14,31 - vpand ymm5,ymm5,ymm3 - vpaddd ymm14,ymm14,ymm14 - - vpslld ymm6,ymm3,30 - vpaddd ymm1,ymm1,ymm5 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((448-256-128))+rbx] - - vpaddd ymm0,ymm0,ymm15 - vpslld ymm7,ymm1,5 - vpand ymm6,ymm4,ymm3 - vpxor ymm10,ymm10,YMMWORD[((128-128))+rax] - - vpaddd ymm0,ymm0,ymm6 - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm4,ymm3 - vpxor ymm10,ymm10,ymm12 - - vmovdqu YMMWORD[(352-256-128)+rbx],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpor ymm7,ymm7,ymm8 - vpsrld ymm9,ymm10,31 - vpand ymm5,ymm5,ymm2 - vpaddd ymm10,ymm10,ymm10 - - vpslld ymm6,ymm2,30 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vmovdqa ymm15,YMMWORD[64+rbp] - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((480-256-128))+rbx] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(384-256-128)+rbx],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((160-128))+rax] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((0-128))+rax] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(416-256-128)+rbx],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((192-128))+rax] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((32-128))+rax] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(448-256-128)+rbx],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((224-128))+rax] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((64-128))+rax] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(480-256-128)+rbx],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((256-256-128))+rbx] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor 
ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((96-128))+rax] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(0-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((288-256-128))+rbx] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((128-128))+rax] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa YMMWORD[(32-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((320-256-128))+rbx] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((160-128))+rax] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(64-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((352-256-128))+rbx] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((192-128))+rax] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vmovdqa YMMWORD[(96-128)+rax],ymm12 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((384-256-128))+rbx] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((224-128))+rax] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vmovdqa YMMWORD[(128-128)+rax],ymm13 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((416-256-128))+rbx] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((256-256-128))+rbx] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vmovdqa YMMWORD[(160-128)+rax],ymm14 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((448-256-128))+rbx] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((288-256-128))+rbx] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vmovdqa 
YMMWORD[(192-128)+rax],ymm10 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((480-256-128))+rbx] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((320-256-128))+rbx] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vmovdqa YMMWORD[(224-128)+rax],ymm11 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((0-128))+rax] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((352-256-128))+rbx] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vpaddd ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((32-128))+rax] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((384-256-128))+rbx] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((64-128))+rax] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpxor ymm10,ymm10,ymm12 - vmovdqa ymm12,YMMWORD[((416-256-128))+rbx] - - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm10,ymm10,YMMWORD[((96-128))+rax] - vpsrld ymm8,ymm1,27 - vpxor ymm5,ymm5,ymm3 - vpxor ymm10,ymm10,ymm12 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - vpsrld ymm9,ymm10,31 - vpaddd ymm10,ymm10,ymm10 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm10,ymm10,ymm9 - vpor ymm2,ymm2,ymm6 - vpxor ymm11,ymm11,ymm13 - vmovdqa ymm13,YMMWORD[((448-256-128))+rbx] - - vpslld ymm7,ymm0,5 - vpaddd ymm4,ymm4,ymm15 - vpxor ymm5,ymm3,ymm1 - vpaddd ymm4,ymm4,ymm10 - vpxor ymm11,ymm11,YMMWORD[((128-128))+rax] - vpsrld ymm8,ymm0,27 - vpxor ymm5,ymm5,ymm2 - vpxor ymm11,ymm11,ymm13 - - vpslld ymm6,ymm1,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm4,ymm4,ymm5 - vpsrld ymm9,ymm11,31 - vpaddd ymm11,ymm11,ymm11 - - vpsrld ymm1,ymm1,2 - vpaddd ymm4,ymm4,ymm7 - vpor ymm11,ymm11,ymm9 - vpor ymm1,ymm1,ymm6 - vpxor ymm12,ymm12,ymm14 - vmovdqa ymm14,YMMWORD[((480-256-128))+rbx] - - vpslld ymm7,ymm4,5 - vpaddd ymm3,ymm3,ymm15 - vpxor ymm5,ymm2,ymm0 - vpaddd ymm3,ymm3,ymm11 - vpxor ymm12,ymm12,YMMWORD[((160-128))+rax] - vpsrld ymm8,ymm4,27 - vpxor ymm5,ymm5,ymm1 - vpxor ymm12,ymm12,ymm14 - - vpslld ymm6,ymm0,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm3,ymm3,ymm5 - vpsrld ymm9,ymm12,31 - vpaddd ymm12,ymm12,ymm12 - - vpsrld ymm0,ymm0,2 - vpaddd ymm3,ymm3,ymm7 - vpor ymm12,ymm12,ymm9 - vpor ymm0,ymm0,ymm6 - vpxor ymm13,ymm13,ymm10 - vmovdqa ymm10,YMMWORD[((0-128))+rax] - - vpslld ymm7,ymm3,5 - vpaddd ymm2,ymm2,ymm15 - vpxor ymm5,ymm1,ymm4 - vpaddd 
ymm2,ymm2,ymm12 - vpxor ymm13,ymm13,YMMWORD[((192-128))+rax] - vpsrld ymm8,ymm3,27 - vpxor ymm5,ymm5,ymm0 - vpxor ymm13,ymm13,ymm10 - - vpslld ymm6,ymm4,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm2,ymm2,ymm5 - vpsrld ymm9,ymm13,31 - vpaddd ymm13,ymm13,ymm13 - - vpsrld ymm4,ymm4,2 - vpaddd ymm2,ymm2,ymm7 - vpor ymm13,ymm13,ymm9 - vpor ymm4,ymm4,ymm6 - vpxor ymm14,ymm14,ymm11 - vmovdqa ymm11,YMMWORD[((32-128))+rax] - - vpslld ymm7,ymm2,5 - vpaddd ymm1,ymm1,ymm15 - vpxor ymm5,ymm0,ymm3 - vpaddd ymm1,ymm1,ymm13 - vpxor ymm14,ymm14,YMMWORD[((224-128))+rax] - vpsrld ymm8,ymm2,27 - vpxor ymm5,ymm5,ymm4 - vpxor ymm14,ymm14,ymm11 - - vpslld ymm6,ymm3,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm1,ymm1,ymm5 - vpsrld ymm9,ymm14,31 - vpaddd ymm14,ymm14,ymm14 - - vpsrld ymm3,ymm3,2 - vpaddd ymm1,ymm1,ymm7 - vpor ymm14,ymm14,ymm9 - vpor ymm3,ymm3,ymm6 - vpslld ymm7,ymm1,5 - vpaddd ymm0,ymm0,ymm15 - vpxor ymm5,ymm4,ymm2 - - vpsrld ymm8,ymm1,27 - vpaddd ymm0,ymm0,ymm14 - vpxor ymm5,ymm5,ymm3 - - vpslld ymm6,ymm2,30 - vpor ymm7,ymm7,ymm8 - vpaddd ymm0,ymm0,ymm5 - - vpsrld ymm2,ymm2,2 - vpaddd ymm0,ymm0,ymm7 - vpor ymm2,ymm2,ymm6 - mov ecx,1 - lea rbx,[512+rsp] - cmp ecx,DWORD[rbx] - cmovge r12,rbp - cmp ecx,DWORD[4+rbx] - cmovge r13,rbp - cmp ecx,DWORD[8+rbx] - cmovge r14,rbp - cmp ecx,DWORD[12+rbx] - cmovge r15,rbp - cmp ecx,DWORD[16+rbx] - cmovge r8,rbp - cmp ecx,DWORD[20+rbx] - cmovge r9,rbp - cmp ecx,DWORD[24+rbx] - cmovge r10,rbp - cmp ecx,DWORD[28+rbx] - cmovge r11,rbp - vmovdqu ymm5,YMMWORD[rbx] - vpxor ymm7,ymm7,ymm7 - vmovdqa ymm6,ymm5 - vpcmpgtd ymm6,ymm6,ymm7 - vpaddd ymm5,ymm5,ymm6 - - vpand ymm0,ymm0,ymm6 - vpand ymm1,ymm1,ymm6 - vpaddd ymm0,ymm0,YMMWORD[rdi] - vpand ymm2,ymm2,ymm6 - vpaddd ymm1,ymm1,YMMWORD[32+rdi] - vpand ymm3,ymm3,ymm6 - vpaddd ymm2,ymm2,YMMWORD[64+rdi] - vpand ymm4,ymm4,ymm6 - vpaddd ymm3,ymm3,YMMWORD[96+rdi] - vpaddd ymm4,ymm4,YMMWORD[128+rdi] - vmovdqu YMMWORD[rdi],ymm0 - vmovdqu YMMWORD[32+rdi],ymm1 - vmovdqu YMMWORD[64+rdi],ymm2 - vmovdqu YMMWORD[96+rdi],ymm3 - vmovdqu YMMWORD[128+rdi],ymm4 - - vmovdqu YMMWORD[rbx],ymm5 - lea rbx,[((256+128))+rsp] - vmovdqu ymm9,YMMWORD[96+rbp] - dec edx - jnz NEAR $L$oop_avx2 - - - - - - - -$L$done_avx2: - mov rax,QWORD[544+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha1_multi_block_avx2: - ALIGN 256 DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -7522,60 +3119,6 @@ $L$in_prologue: pop rsi DB 0F3h,0C3h ;repret - -ALIGN 16 -avx2_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov 
rax,QWORD[544+r8] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - lea rsi,[((-56-160))+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - - jmp NEAR $L$in_prologue - section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_sha1_multi_block wrt ..imagebase @@ -7584,12 +3127,6 @@ ALIGN 4 DD $L$SEH_begin_sha1_multi_block_shaext wrt ..imagebase DD $L$SEH_end_sha1_multi_block_shaext wrt ..imagebase DD $L$SEH_info_sha1_multi_block_shaext wrt ..imagebase - DD $L$SEH_begin_sha1_multi_block_avx wrt ..imagebase - DD $L$SEH_end_sha1_multi_block_avx wrt ..imagebase - DD $L$SEH_info_sha1_multi_block_avx wrt ..imagebase - DD $L$SEH_begin_sha1_multi_block_avx2 wrt ..imagebase - DD $L$SEH_end_sha1_multi_block_avx2 wrt ..imagebase - DD $L$SEH_info_sha1_multi_block_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha1_multi_block: @@ -7600,11 +3137,3 @@ $L$SEH_info_sha1_multi_block_shaext: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase -$L$SEH_info_sha1_multi_block_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_sha1_multi_block_avx2: -DB 9,0,0,0 - DD avx2_handler wrt ..imagebase - DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm index 9d1f10e1ee6..e25a29d3951 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm @@ -27,14 +27,6 @@ $L$SEH_begin_sha1_block_data_order: jz NEAR $L$ialu test r10d,536870912 jnz NEAR _shaext_shortcut - and r10d,296 - cmp r10d,296 - je NEAR _avx2_shortcut - and r8d,268435456 - and r9d,1073741824 - or r8d,r9d - cmp r8d,1342177280 - je NEAR _avx_shortcut jmp NEAR _ssse3_shortcut ALIGN 16 @@ -2675,2876 +2667,6 @@ $L$epilogue_ssse3: DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_ssse3: - -ALIGN 16 -sha1_block_data_order_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha1_block_data_order_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -_avx_shortcut: - - mov r11,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - lea rsp,[((-160))+rsp] - vzeroupper - vmovaps XMMWORD[(-40-96)+r11],xmm6 - vmovaps XMMWORD[(-40-80)+r11],xmm7 - vmovaps XMMWORD[(-40-64)+r11],xmm8 - vmovaps XMMWORD[(-40-48)+r11],xmm9 - vmovaps XMMWORD[(-40-32)+r11],xmm10 - vmovaps XMMWORD[(-40-16)+r11],xmm11 -$L$prologue_avx: - and rsp,-64 - mov r8,rdi - mov r9,rsi - mov r10,rdx - - shl r10,6 - add r10,r9 - lea r14,[((K_XX_XX+64))] - - mov eax,DWORD[r8] - mov ebx,DWORD[4+r8] - mov ecx,DWORD[8+r8] - mov edx,DWORD[12+r8] - mov esi,ebx - mov ebp,DWORD[16+r8] - mov edi,ecx - xor edi,edx - and esi,edi - - vmovdqa xmm6,XMMWORD[64+r14] - vmovdqa xmm11,XMMWORD[((-64))+r14] - vmovdqu xmm0,XMMWORD[r9] - vmovdqu xmm1,XMMWORD[16+r9] - vmovdqu xmm2,XMMWORD[32+r9] - vmovdqu xmm3,XMMWORD[48+r9] - vpshufb xmm0,xmm0,xmm6 - add r9,64 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm11 - 
vpaddd xmm5,xmm1,xmm11 - vpaddd xmm6,xmm2,xmm11 - vmovdqa XMMWORD[rsp],xmm4 - vmovdqa XMMWORD[16+rsp],xmm5 - vmovdqa XMMWORD[32+rsp],xmm6 - jmp NEAR $L$oop_avx -ALIGN 16 -$L$oop_avx: - shrd ebx,ebx,2 - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov edi,eax - add ebp,DWORD[rsp] - vpaddd xmm9,xmm11,xmm3 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm8,xmm3,4 - add ebp,esi - and edi,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add ebp,eax - vpxor xmm8,xmm8,xmm2 - shrd eax,eax,7 - xor edi,ecx - mov esi,ebp - add edx,DWORD[4+rsp] - vpxor xmm4,xmm4,xmm8 - xor eax,ebx - shld ebp,ebp,5 - vmovdqa XMMWORD[48+rsp],xmm9 - add edx,edi - and esi,eax - vpsrld xmm8,xmm4,31 - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor esi,ebx - vpslldq xmm10,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov edi,edx - add ecx,DWORD[8+rsp] - xor ebp,eax - shld edx,edx,5 - vpsrld xmm9,xmm10,30 - vpor xmm4,xmm4,xmm8 - add ecx,esi - and edi,ebp - xor ebp,eax - add ecx,edx - vpslld xmm10,xmm10,2 - vpxor xmm4,xmm4,xmm9 - shrd edx,edx,7 - xor edi,eax - mov esi,ecx - add ebx,DWORD[12+rsp] - vpxor xmm4,xmm4,xmm10 - xor edx,ebp - shld ecx,ecx,5 - add ebx,edi - and esi,edx - xor edx,ebp - add ebx,ecx - shrd ecx,ecx,7 - xor esi,ebp - vpalignr xmm5,xmm2,xmm1,8 - mov edi,ebx - add eax,DWORD[16+rsp] - vpaddd xmm9,xmm11,xmm4 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm8,xmm4,4 - add eax,esi - and edi,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm8,xmm8,xmm3 - shrd ebx,ebx,7 - xor edi,edx - mov esi,eax - add ebp,DWORD[20+rsp] - vpxor xmm5,xmm5,xmm8 - xor ebx,ecx - shld eax,eax,5 - vmovdqa XMMWORD[rsp],xmm9 - add ebp,edi - and esi,ebx - vpsrld xmm8,xmm5,31 - xor ebx,ecx - add ebp,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm10,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov edi,ebp - add edx,DWORD[24+rsp] - xor eax,ebx - shld ebp,ebp,5 - vpsrld xmm9,xmm10,30 - vpor xmm5,xmm5,xmm8 - add edx,esi - and edi,eax - xor eax,ebx - add edx,ebp - vpslld xmm10,xmm10,2 - vpxor xmm5,xmm5,xmm9 - shrd ebp,ebp,7 - xor edi,ebx - mov esi,edx - add ecx,DWORD[28+rsp] - vpxor xmm5,xmm5,xmm10 - xor ebp,eax - shld edx,edx,5 - vmovdqa xmm11,XMMWORD[((-32))+r14] - add ecx,edi - and esi,ebp - xor ebp,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov edi,ecx - add ebx,DWORD[32+rsp] - vpaddd xmm9,xmm11,xmm5 - xor edx,ebp - shld ecx,ecx,5 - vpsrldq xmm8,xmm5,4 - add ebx,esi - and edi,edx - vpxor xmm6,xmm6,xmm2 - xor edx,ebp - add ebx,ecx - vpxor xmm8,xmm8,xmm4 - shrd ecx,ecx,7 - xor edi,ebp - mov esi,ebx - add eax,DWORD[36+rsp] - vpxor xmm6,xmm6,xmm8 - xor ecx,edx - shld ebx,ebx,5 - vmovdqa XMMWORD[16+rsp],xmm9 - add eax,edi - and esi,ecx - vpsrld xmm8,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - xor esi,edx - vpslldq xmm10,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov edi,eax - add ebp,DWORD[40+rsp] - xor ebx,ecx - shld eax,eax,5 - vpsrld xmm9,xmm10,30 - vpor xmm6,xmm6,xmm8 - add ebp,esi - and edi,ebx - xor ebx,ecx - add ebp,eax - vpslld xmm10,xmm10,2 - vpxor xmm6,xmm6,xmm9 - shrd eax,eax,7 - xor edi,ecx - mov esi,ebp - add edx,DWORD[44+rsp] - vpxor xmm6,xmm6,xmm10 - xor eax,ebx - shld ebp,ebp,5 - add edx,edi - and esi,eax - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov edi,edx - add ecx,DWORD[48+rsp] - vpaddd xmm9,xmm11,xmm6 - xor ebp,eax - shld edx,edx,5 - vpsrldq xmm8,xmm6,4 - add ecx,esi - and edi,ebp - vpxor xmm7,xmm7,xmm3 - xor ebp,eax - add ecx,edx - vpxor xmm8,xmm8,xmm5 - shrd edx,edx,7 - xor edi,eax - mov esi,ecx - add ebx,DWORD[52+rsp] - vpxor xmm7,xmm7,xmm8 - xor edx,ebp - shld 
ecx,ecx,5 - vmovdqa XMMWORD[32+rsp],xmm9 - add ebx,edi - and esi,edx - vpsrld xmm8,xmm7,31 - xor edx,ebp - add ebx,ecx - shrd ecx,ecx,7 - xor esi,ebp - vpslldq xmm10,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov edi,ebx - add eax,DWORD[56+rsp] - xor ecx,edx - shld ebx,ebx,5 - vpsrld xmm9,xmm10,30 - vpor xmm7,xmm7,xmm8 - add eax,esi - and edi,ecx - xor ecx,edx - add eax,ebx - vpslld xmm10,xmm10,2 - vpxor xmm7,xmm7,xmm9 - shrd ebx,ebx,7 - xor edi,edx - mov esi,eax - add ebp,DWORD[60+rsp] - vpxor xmm7,xmm7,xmm10 - xor ebx,ecx - shld eax,eax,5 - add ebp,edi - and esi,ebx - xor ebx,ecx - add ebp,eax - vpalignr xmm8,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov edi,ebp - add edx,DWORD[rsp] - vpxor xmm0,xmm0,xmm1 - xor eax,ebx - shld ebp,ebp,5 - vpaddd xmm9,xmm11,xmm7 - add edx,esi - and edi,eax - vpxor xmm0,xmm0,xmm8 - xor eax,ebx - add edx,ebp - shrd ebp,ebp,7 - xor edi,ebx - vpsrld xmm8,xmm0,30 - vmovdqa XMMWORD[48+rsp],xmm9 - mov esi,edx - add ecx,DWORD[4+rsp] - xor ebp,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,edi - and esi,ebp - xor ebp,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov edi,ecx - add ebx,DWORD[8+rsp] - vpor xmm0,xmm0,xmm8 - xor edx,ebp - shld ecx,ecx,5 - add ebx,esi - and edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[12+rsp] - xor edi,ebp - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm8,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ebp,DWORD[16+rsp] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - add ebp,esi - xor edi,ecx - vpaddd xmm9,xmm11,xmm0 - shrd ebx,ebx,7 - add ebp,eax - vpxor xmm1,xmm1,xmm8 - add edx,DWORD[20+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - vpsrld xmm8,xmm1,30 - vmovdqa XMMWORD[rsp],xmm9 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpslld xmm1,xmm1,2 - add ecx,DWORD[24+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vpor xmm1,xmm1,xmm8 - add ebx,DWORD[28+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - vpalignr xmm8,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD[32+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - add eax,esi - xor edi,edx - vpaddd xmm9,xmm11,xmm1 - vmovdqa xmm11,XMMWORD[r14] - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm8 - add ebp,DWORD[36+rsp] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - vpsrld xmm8,xmm2,30 - vmovdqa XMMWORD[16+rsp],xmm9 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpslld xmm2,xmm2,2 - add edx,DWORD[40+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - vpor xmm2,xmm2,xmm8 - add ecx,DWORD[44+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - vpalignr xmm8,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD[48+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - add ebx,esi - xor edi,ebp - vpaddd xmm9,xmm11,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm8 - add eax,DWORD[52+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm8,xmm3,30 - vmovdqa XMMWORD[32+rsp],xmm9 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add ebp,DWORD[56+rsp] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpor xmm3,xmm3,xmm8 - add edx,DWORD[60+rsp] - xor edi,ebx - mov esi,ebp - shld 
ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpalignr xmm8,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD[rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - vpxor xmm4,xmm4,xmm5 - add ecx,esi - xor edi,eax - vpaddd xmm9,xmm11,xmm3 - shrd ebp,ebp,7 - add ecx,edx - vpxor xmm4,xmm4,xmm8 - add ebx,DWORD[4+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm8,xmm4,30 - vmovdqa XMMWORD[48+rsp],xmm9 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD[8+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm8 - add ebp,DWORD[12+rsp] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - vpalignr xmm8,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD[16+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - vpxor xmm5,xmm5,xmm6 - add edx,esi - xor edi,ebx - vpaddd xmm9,xmm11,xmm4 - shrd eax,eax,7 - add edx,ebp - vpxor xmm5,xmm5,xmm8 - add ecx,DWORD[20+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm8,xmm5,30 - vmovdqa XMMWORD[rsp],xmm9 - add ecx,edi - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD[24+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm8 - add eax,DWORD[28+rsp] - shrd ecx,ecx,7 - mov esi,ebx - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm8,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add ebp,DWORD[32+rsp] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - mov edi,eax - xor esi,ecx - vpaddd xmm9,xmm11,xmm5 - shld eax,eax,5 - add ebp,esi - vpxor xmm6,xmm6,xmm8 - xor edi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[36+rsp] - vpsrld xmm8,xmm6,30 - vmovdqa XMMWORD[16+rsp],xmm9 - and edi,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,ebp - vpslld xmm6,xmm6,2 - xor edi,ebx - shld ebp,ebp,5 - add edx,edi - xor esi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[40+rsp] - and esi,eax - vpor xmm6,xmm6,xmm8 - xor eax,ebx - shrd ebp,ebp,7 - mov edi,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor edi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[44+rsp] - and edi,ebp - xor ebp,eax - shrd edx,edx,7 - mov esi,ecx - xor edi,ebp - shld ecx,ecx,5 - add ebx,edi - xor esi,edx - xor edx,ebp - add ebx,ecx - vpalignr xmm8,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD[48+rsp] - and esi,edx - xor edx,ebp - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - mov edi,ebx - xor esi,edx - vpaddd xmm9,xmm11,xmm6 - vmovdqa xmm11,XMMWORD[32+r14] - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm8 - xor edi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[52+rsp] - vpsrld xmm8,xmm7,30 - vmovdqa XMMWORD[32+rsp],xmm9 - and edi,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor edi,ecx - shld eax,eax,5 - add ebp,edi - xor esi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[56+rsp] - and esi,ebx - vpor xmm7,xmm7,xmm8 - xor ebx,ecx - shrd eax,eax,7 - mov edi,ebp - xor esi,ebx - shld ebp,ebp,5 - add edx,esi - xor edi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[60+rsp] - and edi,eax - xor eax,ebx - shrd ebp,ebp,7 - mov esi,edx - xor edi,eax - shld edx,edx,5 - add ecx,edi - xor esi,ebp - xor ebp,eax - add ecx,edx - vpalignr xmm8,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD[rsp] - and esi,ebp - xor ebp,eax - shrd edx,edx,7 - vpxor xmm0,xmm0,xmm1 - mov edi,ecx - xor esi,ebp - vpaddd 
xmm9,xmm11,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm8 - xor edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[4+rsp] - vpsrld xmm8,xmm0,30 - vmovdqa XMMWORD[48+rsp],xmm9 - and edi,edx - xor edx,ebp - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[8+rsp] - and esi,ecx - vpor xmm0,xmm0,xmm8 - xor ecx,edx - shrd ebx,ebx,7 - mov edi,eax - xor esi,ecx - shld eax,eax,5 - add ebp,esi - xor edi,ebx - xor ebx,ecx - add ebp,eax - add edx,DWORD[12+rsp] - and edi,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,ebp - xor edi,ebx - shld ebp,ebp,5 - add edx,edi - xor esi,eax - xor eax,ebx - add edx,ebp - vpalignr xmm8,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD[16+rsp] - and esi,eax - xor eax,ebx - shrd ebp,ebp,7 - vpxor xmm1,xmm1,xmm2 - mov edi,edx - xor esi,eax - vpaddd xmm9,xmm11,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm8 - xor edi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[20+rsp] - vpsrld xmm8,xmm1,30 - vmovdqa XMMWORD[rsp],xmm9 - and edi,ebp - xor ebp,eax - shrd edx,edx,7 - mov esi,ecx - vpslld xmm1,xmm1,2 - xor edi,ebp - shld ecx,ecx,5 - add ebx,edi - xor esi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[24+rsp] - and esi,edx - vpor xmm1,xmm1,xmm8 - xor edx,ebp - shrd ecx,ecx,7 - mov edi,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor edi,ecx - xor ecx,edx - add eax,ebx - add ebp,DWORD[28+rsp] - and edi,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor edi,ecx - shld eax,eax,5 - add ebp,edi - xor esi,ebx - xor ebx,ecx - add ebp,eax - vpalignr xmm8,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD[32+rsp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - mov edi,ebp - xor esi,ebx - vpaddd xmm9,xmm11,xmm1 - shld ebp,ebp,5 - add edx,esi - vpxor xmm2,xmm2,xmm8 - xor edi,eax - xor eax,ebx - add edx,ebp - add ecx,DWORD[36+rsp] - vpsrld xmm8,xmm2,30 - vmovdqa XMMWORD[16+rsp],xmm9 - and edi,eax - xor eax,ebx - shrd ebp,ebp,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor edi,eax - shld edx,edx,5 - add ecx,edi - xor esi,ebp - xor ebp,eax - add ecx,edx - add ebx,DWORD[40+rsp] - and esi,ebp - vpor xmm2,xmm2,xmm8 - xor ebp,eax - shrd edx,edx,7 - mov edi,ecx - xor esi,ebp - shld ecx,ecx,5 - add ebx,esi - xor edi,edx - xor edx,ebp - add ebx,ecx - add eax,DWORD[44+rsp] - and edi,edx - xor edx,ebp - shrd ecx,ecx,7 - mov esi,ebx - xor edi,edx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - add eax,ebx - vpalignr xmm8,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebp,DWORD[48+rsp] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - add ebp,esi - xor edi,ecx - vpaddd xmm9,xmm11,xmm2 - shrd ebx,ebx,7 - add ebp,eax - vpxor xmm3,xmm3,xmm8 - add edx,DWORD[52+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - vpsrld xmm8,xmm3,30 - vmovdqa XMMWORD[32+rsp],xmm9 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - vpslld xmm3,xmm3,2 - add ecx,DWORD[56+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vpor xmm3,xmm3,xmm8 - add ebx,DWORD[60+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[rsp] - vpaddd xmm9,xmm11,xmm3 - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa XMMWORD[48+rsp],xmm9 - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[4+rsp] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add 
edx,DWORD[8+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[12+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - cmp r9,r10 - je NEAR $L$done_avx - vmovdqa xmm6,XMMWORD[64+r14] - vmovdqa xmm11,XMMWORD[((-64))+r14] - vmovdqu xmm0,XMMWORD[r9] - vmovdqu xmm1,XMMWORD[16+r9] - vmovdqu xmm2,XMMWORD[32+r9] - vmovdqu xmm3,XMMWORD[48+r9] - vpshufb xmm0,xmm0,xmm6 - add r9,64 - add ebx,DWORD[16+rsp] - xor esi,ebp - vpshufb xmm1,xmm1,xmm6 - mov edi,ecx - shld ecx,ecx,5 - vpaddd xmm4,xmm0,xmm11 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - vmovdqa XMMWORD[rsp],xmm4 - add eax,DWORD[20+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[24+rsp] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[28+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[32+rsp] - xor esi,eax - vpshufb xmm2,xmm2,xmm6 - mov edi,edx - shld edx,edx,5 - vpaddd xmm5,xmm1,xmm11 - add ecx,esi - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - vmovdqa XMMWORD[16+rsp],xmm5 - add ebx,DWORD[36+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[40+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[44+rsp] - xor edi,ecx - mov esi,eax - shld eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[48+rsp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm6 - mov edi,ebp - shld ebp,ebp,5 - vpaddd xmm6,xmm2,xmm11 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - vmovdqa XMMWORD[32+rsp],xmm6 - add ecx,DWORD[52+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[56+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[60+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - shrd ecx,ecx,7 - add eax,ebx - add eax,DWORD[r8] - add esi,DWORD[4+r8] - add ecx,DWORD[8+r8] - add edx,DWORD[12+r8] - mov DWORD[r8],eax - add ebp,DWORD[16+r8] - mov DWORD[4+r8],esi - mov ebx,esi - mov DWORD[8+r8],ecx - mov edi,ecx - mov DWORD[12+r8],edx - xor edi,edx - mov DWORD[16+r8],ebp - and esi,edi - jmp NEAR $L$oop_avx - -ALIGN 16 -$L$done_avx: - add ebx,DWORD[16+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[20+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[24+rsp] - xor esi,ecx - mov edi,eax - shld eax,eax,5 - add ebp,esi - xor edi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[28+rsp] - xor edi,ebx - mov esi,ebp - shld ebp,ebp,5 - add edx,edi - xor esi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[32+rsp] - xor esi,eax - mov edi,edx - shld edx,edx,5 - add ecx,esi - xor edi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[36+rsp] - xor edi,ebp - mov esi,ecx - shld ecx,ecx,5 - add ebx,edi - xor esi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[40+rsp] - xor esi,edx - mov edi,ebx - shld ebx,ebx,5 - add eax,esi - xor edi,edx - shrd ecx,ecx,7 - add eax,ebx - add ebp,DWORD[44+rsp] - xor edi,ecx - mov esi,eax - shld 
eax,eax,5 - add ebp,edi - xor esi,ecx - shrd ebx,ebx,7 - add ebp,eax - add edx,DWORD[48+rsp] - xor esi,ebx - mov edi,ebp - shld ebp,ebp,5 - add edx,esi - xor edi,ebx - shrd eax,eax,7 - add edx,ebp - add ecx,DWORD[52+rsp] - xor edi,eax - mov esi,edx - shld edx,edx,5 - add ecx,edi - xor esi,eax - shrd ebp,ebp,7 - add ecx,edx - add ebx,DWORD[56+rsp] - xor esi,ebp - mov edi,ecx - shld ecx,ecx,5 - add ebx,esi - xor edi,ebp - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD[60+rsp] - xor edi,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,edi - shrd ecx,ecx,7 - add eax,ebx - vzeroupper - - add eax,DWORD[r8] - add esi,DWORD[4+r8] - add ecx,DWORD[8+r8] - mov DWORD[r8],eax - add edx,DWORD[12+r8] - mov DWORD[4+r8],esi - add ebp,DWORD[16+r8] - mov DWORD[8+r8],ecx - mov DWORD[12+r8],edx - mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r11] - movaps xmm7,XMMWORD[((-40-80))+r11] - movaps xmm8,XMMWORD[((-40-64))+r11] - movaps xmm9,XMMWORD[((-40-48))+r11] - movaps xmm10,XMMWORD[((-40-32))+r11] - movaps xmm11,XMMWORD[((-40-16))+r11] - mov r14,QWORD[((-40))+r11] - - mov r13,QWORD[((-32))+r11] - - mov r12,QWORD[((-24))+r11] - - mov rbp,QWORD[((-16))+r11] - - mov rbx,QWORD[((-8))+r11] - - lea rsp,[r11] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha1_block_data_order_avx: - -ALIGN 16 -sha1_block_data_order_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha1_block_data_order_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - -_avx2_shortcut: - - mov r11,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - vzeroupper - lea rsp,[((-96))+rsp] - vmovaps XMMWORD[(-40-96)+r11],xmm6 - vmovaps XMMWORD[(-40-80)+r11],xmm7 - vmovaps XMMWORD[(-40-64)+r11],xmm8 - vmovaps XMMWORD[(-40-48)+r11],xmm9 - vmovaps XMMWORD[(-40-32)+r11],xmm10 - vmovaps XMMWORD[(-40-16)+r11],xmm11 -$L$prologue_avx2: - mov r8,rdi - mov r9,rsi - mov r10,rdx - - lea rsp,[((-640))+rsp] - shl r10,6 - lea r13,[64+r9] - and rsp,-128 - add r10,r9 - lea r14,[((K_XX_XX+64))] - - mov eax,DWORD[r8] - cmp r13,r10 - cmovae r13,r9 - mov ebp,DWORD[4+r8] - mov ecx,DWORD[8+r8] - mov edx,DWORD[12+r8] - mov esi,DWORD[16+r8] - vmovdqu ymm6,YMMWORD[64+r14] - - vmovdqu xmm0,XMMWORD[r9] - vmovdqu xmm1,XMMWORD[16+r9] - vmovdqu xmm2,XMMWORD[32+r9] - vmovdqu xmm3,XMMWORD[48+r9] - lea r9,[64+r9] - vinserti128 ymm0,ymm0,XMMWORD[r13],1 - vinserti128 ymm1,ymm1,XMMWORD[16+r13],1 - vpshufb ymm0,ymm0,ymm6 - vinserti128 ymm2,ymm2,XMMWORD[32+r13],1 - vpshufb ymm1,ymm1,ymm6 - vinserti128 ymm3,ymm3,XMMWORD[48+r13],1 - vpshufb ymm2,ymm2,ymm6 - vmovdqu ymm11,YMMWORD[((-64))+r14] - vpshufb ymm3,ymm3,ymm6 - - vpaddd ymm4,ymm0,ymm11 - vpaddd ymm5,ymm1,ymm11 - vmovdqu YMMWORD[rsp],ymm4 - vpaddd ymm6,ymm2,ymm11 - vmovdqu YMMWORD[32+rsp],ymm5 - vpaddd ymm7,ymm3,ymm11 - vmovdqu YMMWORD[64+rsp],ymm6 - vmovdqu YMMWORD[96+rsp],ymm7 - vpalignr ymm4,ymm1,ymm0,8 - vpsrldq ymm8,ymm3,4 - vpxor ymm4,ymm4,ymm0 - vpxor ymm8,ymm8,ymm2 - vpxor ymm4,ymm4,ymm8 - vpsrld ymm8,ymm4,31 - vpslldq ymm10,ymm4,12 - vpaddd ymm4,ymm4,ymm4 - vpsrld ymm9,ymm10,30 - vpor ymm4,ymm4,ymm8 - vpslld ymm10,ymm10,2 - vpxor ymm4,ymm4,ymm9 - vpxor ymm4,ymm4,ymm10 - vpaddd ymm9,ymm4,ymm11 - vmovdqu YMMWORD[128+rsp],ymm9 - vpalignr ymm5,ymm2,ymm1,8 - vpsrldq ymm8,ymm4,4 - vpxor ymm5,ymm5,ymm1 - vpxor ymm8,ymm8,ymm3 - vpxor ymm5,ymm5,ymm8 - vpsrld ymm8,ymm5,31 - vmovdqu ymm11,YMMWORD[((-32))+r14] - vpslldq ymm10,ymm5,12 - vpaddd ymm5,ymm5,ymm5 - vpsrld ymm9,ymm10,30 - vpor ymm5,ymm5,ymm8 - 
vpslld ymm10,ymm10,2 - vpxor ymm5,ymm5,ymm9 - vpxor ymm5,ymm5,ymm10 - vpaddd ymm9,ymm5,ymm11 - vmovdqu YMMWORD[160+rsp],ymm9 - vpalignr ymm6,ymm3,ymm2,8 - vpsrldq ymm8,ymm5,4 - vpxor ymm6,ymm6,ymm2 - vpxor ymm8,ymm8,ymm4 - vpxor ymm6,ymm6,ymm8 - vpsrld ymm8,ymm6,31 - vpslldq ymm10,ymm6,12 - vpaddd ymm6,ymm6,ymm6 - vpsrld ymm9,ymm10,30 - vpor ymm6,ymm6,ymm8 - vpslld ymm10,ymm10,2 - vpxor ymm6,ymm6,ymm9 - vpxor ymm6,ymm6,ymm10 - vpaddd ymm9,ymm6,ymm11 - vmovdqu YMMWORD[192+rsp],ymm9 - vpalignr ymm7,ymm4,ymm3,8 - vpsrldq ymm8,ymm6,4 - vpxor ymm7,ymm7,ymm3 - vpxor ymm8,ymm8,ymm5 - vpxor ymm7,ymm7,ymm8 - vpsrld ymm8,ymm7,31 - vpslldq ymm10,ymm7,12 - vpaddd ymm7,ymm7,ymm7 - vpsrld ymm9,ymm10,30 - vpor ymm7,ymm7,ymm8 - vpslld ymm10,ymm10,2 - vpxor ymm7,ymm7,ymm9 - vpxor ymm7,ymm7,ymm10 - vpaddd ymm9,ymm7,ymm11 - vmovdqu YMMWORD[224+rsp],ymm9 - lea r13,[128+rsp] - jmp NEAR $L$oop_avx2 -ALIGN 32 -$L$oop_avx2: - rorx ebx,ebp,2 - andn edi,ebp,edx - and ebp,ecx - xor ebp,edi - jmp NEAR $L$align32_1 -ALIGN 32 -$L$align32_1: - vpalignr ymm8,ymm7,ymm6,8 - vpxor ymm0,ymm0,ymm4 - add esi,DWORD[((-128))+r13] - andn edi,eax,ecx - vpxor ymm0,ymm0,ymm1 - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - vpxor ymm0,ymm0,ymm8 - and eax,ebx - add esi,r12d - xor eax,edi - vpsrld ymm8,ymm0,30 - vpslld ymm0,ymm0,2 - add edx,DWORD[((-124))+r13] - andn edi,esi,ebx - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - vpor ymm0,ymm0,ymm8 - add edx,r12d - xor esi,edi - add ecx,DWORD[((-120))+r13] - andn edi,edx,ebp - vpaddd ymm9,ymm0,ymm11 - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - vmovdqu YMMWORD[256+rsp],ymm9 - add ecx,r12d - xor edx,edi - add ebx,DWORD[((-116))+r13] - andn edi,ecx,eax - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - and ecx,esi - add ebx,r12d - xor ecx,edi - add ebp,DWORD[((-96))+r13] - andn edi,ebx,esi - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - and ebx,edx - add ebp,r12d - xor ebx,edi - vpalignr ymm8,ymm0,ymm7,8 - vpxor ymm1,ymm1,ymm5 - add eax,DWORD[((-92))+r13] - andn edi,ebp,edx - vpxor ymm1,ymm1,ymm2 - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - vpxor ymm1,ymm1,ymm8 - and ebp,ecx - add eax,r12d - xor ebp,edi - vpsrld ymm8,ymm1,30 - vpslld ymm1,ymm1,2 - add esi,DWORD[((-88))+r13] - andn edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - vpor ymm1,ymm1,ymm8 - add esi,r12d - xor eax,edi - add edx,DWORD[((-84))+r13] - andn edi,esi,ebx - vpaddd ymm9,ymm1,ymm11 - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - vmovdqu YMMWORD[288+rsp],ymm9 - add edx,r12d - xor esi,edi - add ecx,DWORD[((-64))+r13] - andn edi,edx,ebp - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - add ecx,r12d - xor edx,edi - add ebx,DWORD[((-60))+r13] - andn edi,ecx,eax - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - and ecx,esi - add ebx,r12d - xor ecx,edi - vpalignr ymm8,ymm1,ymm0,8 - vpxor ymm2,ymm2,ymm6 - add ebp,DWORD[((-56))+r13] - andn edi,ebx,esi - vpxor ymm2,ymm2,ymm3 - vmovdqu ymm11,YMMWORD[r14] - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - vpxor ymm2,ymm2,ymm8 - and ebx,edx - add ebp,r12d - xor ebx,edi - vpsrld ymm8,ymm2,30 - vpslld ymm2,ymm2,2 - add eax,DWORD[((-52))+r13] - andn edi,ebp,edx - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - and ebp,ecx - vpor ymm2,ymm2,ymm8 - add eax,r12d - xor ebp,edi - add esi,DWORD[((-32))+r13] - andn edi,eax,ecx - vpaddd ymm9,ymm2,ymm11 - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - vmovdqu YMMWORD[320+rsp],ymm9 - add esi,r12d - xor eax,edi - 
add edx,DWORD[((-28))+r13] - andn edi,esi,ebx - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - add edx,r12d - xor esi,edi - add ecx,DWORD[((-24))+r13] - andn edi,edx,ebp - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - add ecx,r12d - xor edx,edi - vpalignr ymm8,ymm2,ymm1,8 - vpxor ymm3,ymm3,ymm7 - add ebx,DWORD[((-20))+r13] - andn edi,ecx,eax - vpxor ymm3,ymm3,ymm4 - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - vpxor ymm3,ymm3,ymm8 - and ecx,esi - add ebx,r12d - xor ecx,edi - vpsrld ymm8,ymm3,30 - vpslld ymm3,ymm3,2 - add ebp,DWORD[r13] - andn edi,ebx,esi - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - and ebx,edx - vpor ymm3,ymm3,ymm8 - add ebp,r12d - xor ebx,edi - add eax,DWORD[4+r13] - andn edi,ebp,edx - vpaddd ymm9,ymm3,ymm11 - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - and ebp,ecx - vmovdqu YMMWORD[352+rsp],ymm9 - add eax,r12d - xor ebp,edi - add esi,DWORD[8+r13] - andn edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - add esi,r12d - xor eax,edi - add edx,DWORD[12+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - vpalignr ymm8,ymm3,ymm2,8 - vpxor ymm4,ymm4,ymm0 - add ecx,DWORD[32+r13] - lea ecx,[rsi*1+rcx] - vpxor ymm4,ymm4,ymm5 - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - vpxor ymm4,ymm4,ymm8 - add ecx,r12d - xor edx,ebp - add ebx,DWORD[36+r13] - vpsrld ymm8,ymm4,30 - vpslld ymm4,ymm4,2 - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - vpor ymm4,ymm4,ymm8 - add ebp,DWORD[40+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - vpaddd ymm9,ymm4,ymm11 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[44+r13] - vmovdqu YMMWORD[384+rsp],ymm9 - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[64+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - vpalignr ymm8,ymm4,ymm3,8 - vpxor ymm5,ymm5,ymm1 - add edx,DWORD[68+r13] - lea edx,[rax*1+rdx] - vpxor ymm5,ymm5,ymm6 - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - vpxor ymm5,ymm5,ymm8 - add edx,r12d - xor esi,ebx - add ecx,DWORD[72+r13] - vpsrld ymm8,ymm5,30 - vpslld ymm5,ymm5,2 - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - vpor ymm5,ymm5,ymm8 - add ebx,DWORD[76+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - vpaddd ymm9,ymm5,ymm11 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[96+r13] - vmovdqu YMMWORD[416+rsp],ymm9 - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[100+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - vpalignr ymm8,ymm5,ymm4,8 - vpxor ymm6,ymm6,ymm2 - add esi,DWORD[104+r13] - lea esi,[rbp*1+rsi] - vpxor ymm6,ymm6,ymm7 - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - vpxor ymm6,ymm6,ymm8 - add esi,r12d - xor eax,ecx - add edx,DWORD[108+r13] - lea r13,[256+r13] - vpsrld ymm8,ymm6,30 - vpslld ymm6,ymm6,2 - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - vpor ymm6,ymm6,ymm8 - add ecx,DWORD[((-128))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - vpaddd ymm9,ymm6,ymm11 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-124))+r13] - vmovdqu YMMWORD[448+rsp],ymm9 - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - 
rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-120))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - vpalignr ymm8,ymm6,ymm5,8 - vpxor ymm7,ymm7,ymm3 - add eax,DWORD[((-116))+r13] - lea eax,[rbx*1+rax] - vpxor ymm7,ymm7,ymm0 - vmovdqu ymm11,YMMWORD[32+r14] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - vpxor ymm7,ymm7,ymm8 - add eax,r12d - xor ebp,edx - add esi,DWORD[((-96))+r13] - vpsrld ymm8,ymm7,30 - vpslld ymm7,ymm7,2 - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - vpor ymm7,ymm7,ymm8 - add edx,DWORD[((-92))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - vpaddd ymm9,ymm7,ymm11 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[((-88))+r13] - vmovdqu YMMWORD[480+rsp],ymm9 - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-84))+r13] - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - and ecx,edi - jmp NEAR $L$align32_2 -ALIGN 32 -$L$align32_2: - vpalignr ymm8,ymm7,ymm6,8 - vpxor ymm0,ymm0,ymm4 - add ebp,DWORD[((-64))+r13] - xor ecx,esi - vpxor ymm0,ymm0,ymm1 - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - vpxor ymm0,ymm0,ymm8 - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - vpsrld ymm8,ymm0,30 - vpslld ymm0,ymm0,2 - add ebp,r12d - and ebx,edi - add eax,DWORD[((-60))+r13] - xor ebx,edx - mov edi,ecx - xor edi,edx - vpor ymm0,ymm0,ymm8 - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - vpaddd ymm9,ymm0,ymm11 - add eax,r12d - and ebp,edi - add esi,DWORD[((-56))+r13] - xor ebp,ecx - vmovdqu YMMWORD[512+rsp],ymm9 - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - add edx,DWORD[((-52))+r13] - xor eax,ebx - mov edi,ebp - xor edi,ebx - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - and esi,edi - add ecx,DWORD[((-32))+r13] - xor esi,ebp - mov edi,eax - xor edi,ebp - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - and edx,edi - vpalignr ymm8,ymm0,ymm7,8 - vpxor ymm1,ymm1,ymm5 - add ebx,DWORD[((-28))+r13] - xor edx,eax - vpxor ymm1,ymm1,ymm2 - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - vpxor ymm1,ymm1,ymm8 - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - vpsrld ymm8,ymm1,30 - vpslld ymm1,ymm1,2 - add ebx,r12d - and ecx,edi - add ebp,DWORD[((-24))+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - vpor ymm1,ymm1,ymm8 - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - vpaddd ymm9,ymm1,ymm11 - add ebp,r12d - and ebx,edi - add eax,DWORD[((-20))+r13] - xor ebx,edx - vmovdqu YMMWORD[544+rsp],ymm9 - mov edi,ecx - xor edi,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - and ebp,edi - add esi,DWORD[r13] - xor ebp,ecx - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - add edx,DWORD[4+r13] - xor eax,ebx - mov edi,ebp - xor edi,ebx - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - and esi,edi - vpalignr ymm8,ymm1,ymm0,8 - vpxor ymm2,ymm2,ymm6 - add ecx,DWORD[8+r13] - xor esi,ebp - vpxor ymm2,ymm2,ymm3 - mov edi,eax - xor edi,ebp - lea ecx,[rsi*1+rcx] - vpxor ymm2,ymm2,ymm8 - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - vpsrld 
ymm8,ymm2,30 - vpslld ymm2,ymm2,2 - add ecx,r12d - and edx,edi - add ebx,DWORD[12+r13] - xor edx,eax - mov edi,esi - xor edi,eax - vpor ymm2,ymm2,ymm8 - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - vpaddd ymm9,ymm2,ymm11 - add ebx,r12d - and ecx,edi - add ebp,DWORD[32+r13] - xor ecx,esi - vmovdqu YMMWORD[576+rsp],ymm9 - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[36+r13] - xor ebx,edx - mov edi,ecx - xor edi,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - and ebp,edi - add esi,DWORD[40+r13] - xor ebp,ecx - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - vpalignr ymm8,ymm2,ymm1,8 - vpxor ymm3,ymm3,ymm7 - add edx,DWORD[44+r13] - xor eax,ebx - vpxor ymm3,ymm3,ymm4 - mov edi,ebp - xor edi,ebx - lea edx,[rax*1+rdx] - vpxor ymm3,ymm3,ymm8 - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - vpsrld ymm8,ymm3,30 - vpslld ymm3,ymm3,2 - add edx,r12d - and esi,edi - add ecx,DWORD[64+r13] - xor esi,ebp - mov edi,eax - xor edi,ebp - vpor ymm3,ymm3,ymm8 - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - vpaddd ymm9,ymm3,ymm11 - add ecx,r12d - and edx,edi - add ebx,DWORD[68+r13] - xor edx,eax - vmovdqu YMMWORD[608+rsp],ymm9 - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - and ecx,edi - add ebp,DWORD[72+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[76+r13] - xor ebx,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[96+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[100+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[104+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[108+r13] - lea r13,[256+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-128))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[((-124))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[((-120))+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[((-116))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[((-96))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-92))+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-88))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[((-84))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[((-64))+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add 
edx,DWORD[((-60))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[((-56))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-52))+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-32))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[((-28))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[((-24))+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[((-20))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - add edx,r12d - lea r13,[128+r9] - lea rdi,[128+r9] - cmp r13,r10 - cmovae r13,r9 - - - add edx,DWORD[r8] - add esi,DWORD[4+r8] - add ebp,DWORD[8+r8] - mov DWORD[r8],edx - add ebx,DWORD[12+r8] - mov DWORD[4+r8],esi - mov eax,edx - add ecx,DWORD[16+r8] - mov r12d,ebp - mov DWORD[8+r8],ebp - mov edx,ebx - - mov DWORD[12+r8],ebx - mov ebp,esi - mov DWORD[16+r8],ecx - - mov esi,ecx - mov ecx,r12d - - - cmp r9,r10 - je NEAR $L$done_avx2 - vmovdqu ymm6,YMMWORD[64+r14] - cmp rdi,r10 - ja NEAR $L$ast_avx2 - - vmovdqu xmm0,XMMWORD[((-64))+rdi] - vmovdqu xmm1,XMMWORD[((-48))+rdi] - vmovdqu xmm2,XMMWORD[((-32))+rdi] - vmovdqu xmm3,XMMWORD[((-16))+rdi] - vinserti128 ymm0,ymm0,XMMWORD[r13],1 - vinserti128 ymm1,ymm1,XMMWORD[16+r13],1 - vinserti128 ymm2,ymm2,XMMWORD[32+r13],1 - vinserti128 ymm3,ymm3,XMMWORD[48+r13],1 - jmp NEAR $L$ast_avx2 - -ALIGN 32 -$L$ast_avx2: - lea r13,[((128+16))+rsp] - rorx ebx,ebp,2 - andn edi,ebp,edx - and ebp,ecx - xor ebp,edi - sub r9,-128 - add esi,DWORD[((-128))+r13] - andn edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - add esi,r12d - xor eax,edi - add edx,DWORD[((-124))+r13] - andn edi,esi,ebx - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - add edx,r12d - xor esi,edi - add ecx,DWORD[((-120))+r13] - andn edi,edx,ebp - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - add ecx,r12d - xor edx,edi - add ebx,DWORD[((-116))+r13] - andn edi,ecx,eax - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - and ecx,esi - add ebx,r12d - xor ecx,edi - add ebp,DWORD[((-96))+r13] - andn edi,ebx,esi - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - and ebx,edx - add ebp,r12d - xor ebx,edi - add eax,DWORD[((-92))+r13] - andn edi,ebp,edx - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - and ebp,ecx - add eax,r12d - xor ebp,edi - add esi,DWORD[((-88))+r13] - andn edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - add esi,r12d - xor eax,edi - add edx,DWORD[((-84))+r13] - andn edi,esi,ebx - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - add edx,r12d - xor esi,edi - add ecx,DWORD[((-64))+r13] - andn edi,edx,ebp - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - add ecx,r12d - xor edx,edi - add ebx,DWORD[((-60))+r13] - andn edi,ecx,eax - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - and ecx,esi - add ebx,r12d - xor ecx,edi - add ebp,DWORD[((-56))+r13] - andn edi,ebx,esi - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - and ebx,edx - add ebp,r12d - xor ebx,edi - add eax,DWORD[((-52))+r13] - andn edi,ebp,edx - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - and ebp,ecx - add eax,r12d - xor ebp,edi - add esi,DWORD[((-32))+r13] - andn 
edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - add esi,r12d - xor eax,edi - add edx,DWORD[((-28))+r13] - andn edi,esi,ebx - add edx,eax - rorx r12d,esi,27 - rorx eax,esi,2 - and esi,ebp - add edx,r12d - xor esi,edi - add ecx,DWORD[((-24))+r13] - andn edi,edx,ebp - add ecx,esi - rorx r12d,edx,27 - rorx esi,edx,2 - and edx,eax - add ecx,r12d - xor edx,edi - add ebx,DWORD[((-20))+r13] - andn edi,ecx,eax - add ebx,edx - rorx r12d,ecx,27 - rorx edx,ecx,2 - and ecx,esi - add ebx,r12d - xor ecx,edi - add ebp,DWORD[r13] - andn edi,ebx,esi - add ebp,ecx - rorx r12d,ebx,27 - rorx ecx,ebx,2 - and ebx,edx - add ebp,r12d - xor ebx,edi - add eax,DWORD[4+r13] - andn edi,ebp,edx - add eax,ebx - rorx r12d,ebp,27 - rorx ebx,ebp,2 - and ebp,ecx - add eax,r12d - xor ebp,edi - add esi,DWORD[8+r13] - andn edi,eax,ecx - add esi,ebp - rorx r12d,eax,27 - rorx ebp,eax,2 - and eax,ebx - add esi,r12d - xor eax,edi - add edx,DWORD[12+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[32+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[36+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[40+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[44+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[64+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - vmovdqu ymm11,YMMWORD[((-64))+r14] - vpshufb ymm0,ymm0,ymm6 - add edx,DWORD[68+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[72+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[76+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[96+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[100+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - vpshufb ymm1,ymm1,ymm6 - vpaddd ymm8,ymm0,ymm11 - add esi,DWORD[104+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[108+r13] - lea r13,[256+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - add ecx,DWORD[((-128))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-124))+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-120))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - vmovdqu YMMWORD[rsp],ymm8 - vpshufb ymm2,ymm2,ymm6 - vpaddd ymm9,ymm1,ymm11 - add eax,DWORD[((-116))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[((-96))+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[((-92))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - xor esi,ebx - 
add ecx,DWORD[((-88))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-84))+r13] - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - and ecx,edi - vmovdqu YMMWORD[32+rsp],ymm9 - vpshufb ymm3,ymm3,ymm6 - vpaddd ymm6,ymm2,ymm11 - add ebp,DWORD[((-64))+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[((-60))+r13] - xor ebx,edx - mov edi,ecx - xor edi,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - and ebp,edi - add esi,DWORD[((-56))+r13] - xor ebp,ecx - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - add edx,DWORD[((-52))+r13] - xor eax,ebx - mov edi,ebp - xor edi,ebx - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - and esi,edi - add ecx,DWORD[((-32))+r13] - xor esi,ebp - mov edi,eax - xor edi,ebp - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - and edx,edi - jmp NEAR $L$align32_3 -ALIGN 32 -$L$align32_3: - vmovdqu YMMWORD[64+rsp],ymm6 - vpaddd ymm7,ymm3,ymm11 - add ebx,DWORD[((-28))+r13] - xor edx,eax - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - and ecx,edi - add ebp,DWORD[((-24))+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[((-20))+r13] - xor ebx,edx - mov edi,ecx - xor edi,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - and ebp,edi - add esi,DWORD[r13] - xor ebp,ecx - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - add edx,DWORD[4+r13] - xor eax,ebx - mov edi,ebp - xor edi,ebx - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - and esi,edi - vmovdqu YMMWORD[96+rsp],ymm7 - add ecx,DWORD[8+r13] - xor esi,ebp - mov edi,eax - xor edi,ebp - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - and edx,edi - add ebx,DWORD[12+r13] - xor edx,eax - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - and ecx,edi - add ebp,DWORD[32+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[36+r13] - xor ebx,edx - mov edi,ecx - xor edi,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - and ebp,edi - add esi,DWORD[40+r13] - xor ebp,ecx - mov edi,ebx - xor edi,ecx - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - and eax,edi - vpalignr ymm4,ymm1,ymm0,8 - add edx,DWORD[44+r13] - xor eax,ebx - mov edi,ebp - xor edi,ebx - vpsrldq ymm8,ymm3,4 - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - vpxor ymm4,ymm4,ymm0 - vpxor ymm8,ymm8,ymm2 - xor esi,ebp - add edx,r12d - vpxor ymm4,ymm4,ymm8 - and esi,edi - add ecx,DWORD[64+r13] - xor esi,ebp - mov edi,eax - vpsrld ymm8,ymm4,31 - xor edi,ebp - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - vpslldq ymm10,ymm4,12 - vpaddd ymm4,ymm4,ymm4 - rorx esi,edx,2 - xor edx,eax - vpsrld 
ymm9,ymm10,30 - vpor ymm4,ymm4,ymm8 - add ecx,r12d - and edx,edi - vpslld ymm10,ymm10,2 - vpxor ymm4,ymm4,ymm9 - add ebx,DWORD[68+r13] - xor edx,eax - vpxor ymm4,ymm4,ymm10 - mov edi,esi - xor edi,eax - lea ebx,[rdx*1+rbx] - vpaddd ymm9,ymm4,ymm11 - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - vmovdqu YMMWORD[128+rsp],ymm9 - add ebx,r12d - and ecx,edi - add ebp,DWORD[72+r13] - xor ecx,esi - mov edi,edx - xor edi,esi - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - and ebx,edi - add eax,DWORD[76+r13] - xor ebx,edx - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - vpalignr ymm5,ymm2,ymm1,8 - add esi,DWORD[96+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - vpsrldq ymm8,ymm4,4 - xor eax,ebx - add esi,r12d - xor eax,ecx - vpxor ymm5,ymm5,ymm1 - vpxor ymm8,ymm8,ymm3 - add edx,DWORD[100+r13] - lea edx,[rax*1+rdx] - vpxor ymm5,ymm5,ymm8 - rorx r12d,esi,27 - rorx eax,esi,2 - xor esi,ebp - add edx,r12d - vpsrld ymm8,ymm5,31 - vmovdqu ymm11,YMMWORD[((-32))+r14] - xor esi,ebx - add ecx,DWORD[104+r13] - lea ecx,[rsi*1+rcx] - vpslldq ymm10,ymm5,12 - vpaddd ymm5,ymm5,ymm5 - rorx r12d,edx,27 - rorx esi,edx,2 - vpsrld ymm9,ymm10,30 - vpor ymm5,ymm5,ymm8 - xor edx,eax - add ecx,r12d - vpslld ymm10,ymm10,2 - vpxor ymm5,ymm5,ymm9 - xor edx,ebp - add ebx,DWORD[108+r13] - lea r13,[256+r13] - vpxor ymm5,ymm5,ymm10 - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - vpaddd ymm9,ymm5,ymm11 - xor ecx,esi - add ebx,r12d - xor ecx,eax - vmovdqu YMMWORD[160+rsp],ymm9 - add ebp,DWORD[((-128))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - vpalignr ymm6,ymm3,ymm2,8 - add eax,DWORD[((-124))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - vpsrldq ymm8,ymm5,4 - xor ebp,ecx - add eax,r12d - xor ebp,edx - vpxor ymm6,ymm6,ymm2 - vpxor ymm8,ymm8,ymm4 - add esi,DWORD[((-120))+r13] - lea esi,[rbp*1+rsi] - vpxor ymm6,ymm6,ymm8 - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - vpsrld ymm8,ymm6,31 - xor eax,ecx - add edx,DWORD[((-116))+r13] - lea edx,[rax*1+rdx] - vpslldq ymm10,ymm6,12 - vpaddd ymm6,ymm6,ymm6 - rorx r12d,esi,27 - rorx eax,esi,2 - vpsrld ymm9,ymm10,30 - vpor ymm6,ymm6,ymm8 - xor esi,ebp - add edx,r12d - vpslld ymm10,ymm10,2 - vpxor ymm6,ymm6,ymm9 - xor esi,ebx - add ecx,DWORD[((-96))+r13] - vpxor ymm6,ymm6,ymm10 - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - vpaddd ymm9,ymm6,ymm11 - xor edx,eax - add ecx,r12d - xor edx,ebp - vmovdqu YMMWORD[192+rsp],ymm9 - add ebx,DWORD[((-92))+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - vpalignr ymm7,ymm4,ymm3,8 - add ebp,DWORD[((-88))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - vpsrldq ymm8,ymm6,4 - xor ebx,edx - add ebp,r12d - xor ebx,esi - vpxor ymm7,ymm7,ymm3 - vpxor ymm8,ymm8,ymm5 - add eax,DWORD[((-84))+r13] - lea eax,[rbx*1+rax] - vpxor ymm7,ymm7,ymm8 - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - vpsrld ymm8,ymm7,31 - xor ebp,edx - add esi,DWORD[((-64))+r13] - lea esi,[rbp*1+rsi] - vpslldq ymm10,ymm7,12 - vpaddd ymm7,ymm7,ymm7 - rorx r12d,eax,27 - rorx ebp,eax,2 - vpsrld ymm9,ymm10,30 - vpor ymm7,ymm7,ymm8 - xor eax,ebx - add esi,r12d - vpslld ymm10,ymm10,2 - vpxor ymm7,ymm7,ymm9 - xor eax,ecx - add edx,DWORD[((-60))+r13] - vpxor ymm7,ymm7,ymm10 - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - rorx eax,esi,2 - vpaddd ymm9,ymm7,ymm11 - xor esi,ebp - 
add edx,r12d - xor esi,ebx - vmovdqu YMMWORD[224+rsp],ymm9 - add ecx,DWORD[((-56))+r13] - lea ecx,[rsi*1+rcx] - rorx r12d,edx,27 - rorx esi,edx,2 - xor edx,eax - add ecx,r12d - xor edx,ebp - add ebx,DWORD[((-52))+r13] - lea ebx,[rdx*1+rbx] - rorx r12d,ecx,27 - rorx edx,ecx,2 - xor ecx,esi - add ebx,r12d - xor ecx,eax - add ebp,DWORD[((-32))+r13] - lea ebp,[rbp*1+rcx] - rorx r12d,ebx,27 - rorx ecx,ebx,2 - xor ebx,edx - add ebp,r12d - xor ebx,esi - add eax,DWORD[((-28))+r13] - lea eax,[rbx*1+rax] - rorx r12d,ebp,27 - rorx ebx,ebp,2 - xor ebp,ecx - add eax,r12d - xor ebp,edx - add esi,DWORD[((-24))+r13] - lea esi,[rbp*1+rsi] - rorx r12d,eax,27 - rorx ebp,eax,2 - xor eax,ebx - add esi,r12d - xor eax,ecx - add edx,DWORD[((-20))+r13] - lea edx,[rax*1+rdx] - rorx r12d,esi,27 - add edx,r12d - lea r13,[128+rsp] - - - add edx,DWORD[r8] - add esi,DWORD[4+r8] - add ebp,DWORD[8+r8] - mov DWORD[r8],edx - add ebx,DWORD[12+r8] - mov DWORD[4+r8],esi - mov eax,edx - add ecx,DWORD[16+r8] - mov r12d,ebp - mov DWORD[8+r8],ebp - mov edx,ebx - - mov DWORD[12+r8],ebx - mov ebp,esi - mov DWORD[16+r8],ecx - - mov esi,ecx - mov ecx,r12d - - - cmp r9,r10 - jbe NEAR $L$oop_avx2 - -$L$done_avx2: - vzeroupper - movaps xmm6,XMMWORD[((-40-96))+r11] - movaps xmm7,XMMWORD[((-40-80))+r11] - movaps xmm8,XMMWORD[((-40-64))+r11] - movaps xmm9,XMMWORD[((-40-48))+r11] - movaps xmm10,XMMWORD[((-40-32))+r11] - movaps xmm11,XMMWORD[((-40-16))+r11] - mov r14,QWORD[((-40))+r11] - - mov r13,QWORD[((-32))+r11] - - mov r12,QWORD[((-24))+r11] - - mov rbp,QWORD[((-16))+r11] - - mov rbx,QWORD[((-8))+r11] - - lea rsp,[r11] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha1_block_data_order_avx2: ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -5738,12 +2860,6 @@ ALIGN 4 DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase - DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase - DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase - DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase - DD $L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha1_block_data_order: @@ -5756,11 +2872,3 @@ $L$SEH_info_sha1_block_data_order_ssse3: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase -$L$SEH_info_sha1_block_data_order_avx: -DB 9,0,0,0 - DD ssse3_handler wrt ..imagebase - DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_sha1_block_data_order_avx2: -DB 9,0,0,0 - DD ssse3_handler wrt ..imagebase - DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm index 58c00d6b92c..c4fd2666ab0 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm @@ -24,8 +24,6 @@ $L$SEH_begin_sha256_multi_block: mov rcx,QWORD[((OPENSSL_ia32cap_P+4))] bt rcx,61 jc NEAR _shaext_shortcut - test ecx,268435456 - jnz NEAR _avx_shortcut mov rax,rsp push rbx @@ -3206,4764 +3204,6 @@ 
$L$epilogue_shaext: DB 0F3h,0C3h ;repret $L$SEH_end_sha256_multi_block_shaext: - -ALIGN 32 -sha256_multi_block_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha256_multi_block_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -_avx_shortcut: - shr rcx,32 - cmp edx,2 - jb NEAR $L$avx - test ecx,32 - jnz NEAR _avx2_shortcut - jmp NEAR $L$avx -ALIGN 32 -$L$avx: - mov rax,rsp - - push rbx - - push rbp - - lea rsp,[((-168))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - movaps XMMWORD[32+rsp],xmm8 - movaps XMMWORD[48+rsp],xmm9 - movaps XMMWORD[(-120)+rax],xmm10 - movaps XMMWORD[(-104)+rax],xmm11 - movaps XMMWORD[(-88)+rax],xmm12 - movaps XMMWORD[(-72)+rax],xmm13 - movaps XMMWORD[(-56)+rax],xmm14 - movaps XMMWORD[(-40)+rax],xmm15 - sub rsp,288 - and rsp,-256 - mov QWORD[272+rsp],rax - -$L$body_avx: - lea rbp,[((K256+128))] - lea rbx,[256+rsp] - lea rdi,[128+rdi] - -$L$oop_grande_avx: - mov DWORD[280+rsp],edx - xor edx,edx - - mov r8,QWORD[rsi] - - mov ecx,DWORD[8+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[rbx],ecx - cmovle r8,rbp - - mov r9,QWORD[16+rsi] - - mov ecx,DWORD[24+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[4+rbx],ecx - cmovle r9,rbp - - mov r10,QWORD[32+rsi] - - mov ecx,DWORD[40+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[8+rbx],ecx - cmovle r10,rbp - - mov r11,QWORD[48+rsi] - - mov ecx,DWORD[56+rsi] - cmp ecx,edx - cmovg edx,ecx - test ecx,ecx - mov DWORD[12+rbx],ecx - cmovle r11,rbp - test edx,edx - jz NEAR $L$done_avx - - vmovdqu xmm8,XMMWORD[((0-128))+rdi] - lea rax,[128+rsp] - vmovdqu xmm9,XMMWORD[((32-128))+rdi] - vmovdqu xmm10,XMMWORD[((64-128))+rdi] - vmovdqu xmm11,XMMWORD[((96-128))+rdi] - vmovdqu xmm12,XMMWORD[((128-128))+rdi] - vmovdqu xmm13,XMMWORD[((160-128))+rdi] - vmovdqu xmm14,XMMWORD[((192-128))+rdi] - vmovdqu xmm15,XMMWORD[((224-128))+rdi] - vmovdqu xmm6,XMMWORD[$L$pbswap] - jmp NEAR $L$oop_avx - -ALIGN 32 -$L$oop_avx: - vpxor xmm4,xmm10,xmm9 - vmovd xmm5,DWORD[r8] - vmovd xmm0,DWORD[r9] - vpinsrd xmm5,xmm5,DWORD[r10],1 - vpinsrd xmm0,xmm0,DWORD[r11],1 - vpunpckldq xmm5,xmm5,xmm0 - vpshufb xmm5,xmm5,xmm6 - vpsrld xmm7,xmm12,6 - vpslld xmm2,xmm12,26 - vmovdqu XMMWORD[(0-128)+rax],xmm5 - vpaddd xmm5,xmm5,xmm15 - - vpsrld xmm1,xmm12,11 - vpxor xmm7,xmm7,xmm2 - vpslld xmm2,xmm12,21 - vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp] - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm1,xmm12,25 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm12,7 - vpandn xmm0,xmm12,xmm14 - vpand xmm3,xmm12,xmm13 - - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm15,xmm8,2 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm1,xmm8,30 - vpxor xmm0,xmm0,xmm3 - vpxor xmm3,xmm9,xmm8 - - vpxor xmm15,xmm15,xmm1 - vpaddd xmm5,xmm5,xmm7 - - vpsrld xmm1,xmm8,13 - - vpslld xmm2,xmm8,19 - vpaddd xmm5,xmm5,xmm0 - vpand xmm4,xmm4,xmm3 - - vpxor xmm7,xmm15,xmm1 - - vpsrld xmm1,xmm8,22 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm8,10 - vpxor xmm15,xmm9,xmm4 - vpaddd xmm11,xmm11,xmm5 - - vpxor xmm7,xmm7,xmm1 - vpxor xmm7,xmm7,xmm2 - - vpaddd xmm15,xmm15,xmm5 - vpaddd xmm15,xmm15,xmm7 - vmovd xmm5,DWORD[4+r8] - vmovd xmm0,DWORD[4+r9] - vpinsrd xmm5,xmm5,DWORD[4+r10],1 - vpinsrd xmm0,xmm0,DWORD[4+r11],1 - vpunpckldq xmm5,xmm5,xmm0 - vpshufb xmm5,xmm5,xmm6 - vpsrld xmm7,xmm11,6 - vpslld xmm2,xmm11,26 - vmovdqu XMMWORD[(16-128)+rax],xmm5 - vpaddd xmm5,xmm5,xmm14 - - vpsrld xmm1,xmm11,11 - vpxor xmm7,xmm7,xmm2 - vpslld xmm2,xmm11,21 - vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp] - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm1,xmm11,25 - vpxor xmm7,xmm7,xmm2 - - 
vpslld xmm2,xmm11,7 - vpandn xmm0,xmm11,xmm13 - vpand xmm4,xmm11,xmm12 - - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm14,xmm15,2 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm1,xmm15,30 - vpxor xmm0,xmm0,xmm4 - vpxor xmm4,xmm8,xmm15 - - vpxor xmm14,xmm14,xmm1 - vpaddd xmm5,xmm5,xmm7 - - vpsrld xmm1,xmm15,13 - - vpslld xmm2,xmm15,19 - vpaddd xmm5,xmm5,xmm0 - vpand xmm3,xmm3,xmm4 - - vpxor xmm7,xmm14,xmm1 - - vpsrld xmm1,xmm15,22 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm15,10 - vpxor xmm14,xmm8,xmm3 - vpaddd xmm10,xmm10,xmm5 - - vpxor xmm7,xmm7,xmm1 - vpxor xmm7,xmm7,xmm2 - - vpaddd xmm14,xmm14,xmm5 - vpaddd xmm14,xmm14,xmm7 - vmovd xmm5,DWORD[8+r8] - vmovd xmm0,DWORD[8+r9] - vpinsrd xmm5,xmm5,DWORD[8+r10],1 - vpinsrd xmm0,xmm0,DWORD[8+r11],1 - vpunpckldq xmm5,xmm5,xmm0 - vpshufb xmm5,xmm5,xmm6 - vpsrld xmm7,xmm10,6 - vpslld xmm2,xmm10,26 - vmovdqu XMMWORD[(32-128)+rax],xmm5 - vpaddd xmm5,xmm5,xmm13 - - vpsrld xmm1,xmm10,11 - vpxor xmm7,xmm7,xmm2 - vpslld xmm2,xmm10,21 - vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp] - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm1,xmm10,25 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm10,7 - vpandn xmm0,xmm10,xmm12 - vpand xmm3,xmm10,xmm11 - - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm13,xmm14,2 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm1,xmm14,30 - vpxor xmm0,xmm0,xmm3 - vpxor xmm3,xmm15,xmm14 - - vpxor xmm13,xmm13,xmm1 - vpaddd xmm5,xmm5,xmm7 - - vpsrld xmm1,xmm14,13 - - vpslld xmm2,xmm14,19 - vpaddd xmm5,xmm5,xmm0 - vpand xmm4,xmm4,xmm3 - - vpxor xmm7,xmm13,xmm1 - - vpsrld xmm1,xmm14,22 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm14,10 - vpxor xmm13,xmm15,xmm4 - vpaddd xmm9,xmm9,xmm5 - - vpxor xmm7,xmm7,xmm1 - vpxor xmm7,xmm7,xmm2 - - vpaddd xmm13,xmm13,xmm5 - vpaddd xmm13,xmm13,xmm7 - vmovd xmm5,DWORD[12+r8] - vmovd xmm0,DWORD[12+r9] - vpinsrd xmm5,xmm5,DWORD[12+r10],1 - vpinsrd xmm0,xmm0,DWORD[12+r11],1 - vpunpckldq xmm5,xmm5,xmm0 - vpshufb xmm5,xmm5,xmm6 - vpsrld xmm7,xmm9,6 - vpslld xmm2,xmm9,26 - vmovdqu XMMWORD[(48-128)+rax],xmm5 - vpaddd xmm5,xmm5,xmm12 - - vpsrld xmm1,xmm9,11 - vpxor xmm7,xmm7,xmm2 - vpslld xmm2,xmm9,21 - vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp] - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm1,xmm9,25 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm9,7 - vpandn xmm0,xmm9,xmm11 - vpand xmm4,xmm9,xmm10 - - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm12,xmm13,2 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm1,xmm13,30 - vpxor xmm0,xmm0,xmm4 - vpxor xmm4,xmm14,xmm13 - - vpxor xmm12,xmm12,xmm1 - vpaddd xmm5,xmm5,xmm7 - - vpsrld xmm1,xmm13,13 - - vpslld xmm2,xmm13,19 - vpaddd xmm5,xmm5,xmm0 - vpand xmm3,xmm3,xmm4 - - vpxor xmm7,xmm12,xmm1 - - vpsrld xmm1,xmm13,22 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm13,10 - vpxor xmm12,xmm14,xmm3 - vpaddd xmm8,xmm8,xmm5 - - vpxor xmm7,xmm7,xmm1 - vpxor xmm7,xmm7,xmm2 - - vpaddd xmm12,xmm12,xmm5 - vpaddd xmm12,xmm12,xmm7 - vmovd xmm5,DWORD[16+r8] - vmovd xmm0,DWORD[16+r9] - vpinsrd xmm5,xmm5,DWORD[16+r10],1 - vpinsrd xmm0,xmm0,DWORD[16+r11],1 - vpunpckldq xmm5,xmm5,xmm0 - vpshufb xmm5,xmm5,xmm6 - vpsrld xmm7,xmm8,6 - vpslld xmm2,xmm8,26 - vmovdqu XMMWORD[(64-128)+rax],xmm5 - vpaddd xmm5,xmm5,xmm11 - - vpsrld xmm1,xmm8,11 - vpxor xmm7,xmm7,xmm2 - vpslld xmm2,xmm8,21 - vpaddd xmm5,xmm5,XMMWORD[rbp] - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm1,xmm8,25 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm2,xmm8,7 - vpandn xmm0,xmm8,xmm10 - vpand xmm3,xmm8,xmm9 - - vpxor xmm7,xmm7,xmm1 - - vpsrld xmm11,xmm12,2 - vpxor xmm7,xmm7,xmm2 - - vpslld xmm1,xmm12,30 - vpxor xmm0,xmm0,xmm3 - vpxor xmm3,xmm13,xmm12 - - vpxor xmm11,xmm11,xmm1 - vpaddd xmm5,xmm5,xmm7 - - vpsrld xmm1,xmm12,13 - - vpslld xmm2,xmm12,19 - vpaddd 
[... deleted hunk continues: the remainder of the OpenSSL-generated AVX multi-block SHA-256 rounds (per-lane message load and byte-swap via vmovd/vpinsrd/vpshufb, the Sigma1/Ch and Sigma0/Maj vpsrld/vpslld/vpxor/vpandn sequences against the K256 table at rbp, and the $L$oop_16_xx_avx message-schedule loop), the per-lane digest update/store back through rdi, and the $L$done_avx / $L$epilogue_avx WIN64 epilogue ending at $L$SEH_end_sha256_multi_block_avx ...]

[... deleted hunk continues: the entire sha256_multi_block_avx2 routine — the $L$SEH_begin_sha256_multi_block_avx2 / _avx2_shortcut prologue with its xmm6-xmm15 spills and $L$body_avx2 setup, the $L$oop_grande_avx2 eight-lane input pointer/length setup, the ymm-register $L$oop_avx2 round bodies using vinserti128/vpshufb gathers and the $L$pbswap mask, and the start of the $L$oop_16_xx_avx2 message-schedule loop ...]
vpxor ymm15,ymm9,ymm4 - vpaddd ymm11,ymm11,ymm5 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm15,ymm15,ymm5 - vpaddd ymm15,ymm15,ymm7 - vmovdqu ymm5,YMMWORD[((320-256-128))+rbx] - vpaddd ymm6,ymm6,YMMWORD[((64-128))+rax] - - vpsrld ymm7,ymm5,3 - vpsrld ymm1,ymm5,7 - vpslld ymm2,ymm5,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm5,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm5,14 - vmovdqu ymm0,YMMWORD[((224-128))+rax] - vpsrld ymm4,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm6,ymm6,ymm7 - vpxor ymm7,ymm4,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm6,ymm6,ymm7 - vpsrld ymm7,ymm11,6 - vpslld ymm2,ymm11,26 - vmovdqu YMMWORD[(288-256-128)+rbx],ymm6 - vpaddd ymm6,ymm6,ymm14 - - vpsrld ymm1,ymm11,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm11,21 - vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm11,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm11,7 - vpandn ymm0,ymm11,ymm13 - vpand ymm4,ymm11,ymm12 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm14,ymm15,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm15,30 - vpxor ymm0,ymm0,ymm4 - vpxor ymm4,ymm8,ymm15 - - vpxor ymm14,ymm14,ymm1 - vpaddd ymm6,ymm6,ymm7 - - vpsrld ymm1,ymm15,13 - - vpslld ymm2,ymm15,19 - vpaddd ymm6,ymm6,ymm0 - vpand ymm3,ymm3,ymm4 - - vpxor ymm7,ymm14,ymm1 - - vpsrld ymm1,ymm15,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm15,10 - vpxor ymm14,ymm8,ymm3 - vpaddd ymm10,ymm10,ymm6 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm14,ymm14,ymm6 - vpaddd ymm14,ymm14,ymm7 - vmovdqu ymm6,YMMWORD[((352-256-128))+rbx] - vpaddd ymm5,ymm5,YMMWORD[((96-128))+rax] - - vpsrld ymm7,ymm6,3 - vpsrld ymm1,ymm6,7 - vpslld ymm2,ymm6,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm6,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm6,14 - vmovdqu ymm0,YMMWORD[((256-256-128))+rbx] - vpsrld ymm3,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm5,ymm5,ymm7 - vpxor ymm7,ymm3,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm5,ymm5,ymm7 - vpsrld ymm7,ymm10,6 - vpslld ymm2,ymm10,26 - vmovdqu YMMWORD[(320-256-128)+rbx],ymm5 - vpaddd ymm5,ymm5,ymm13 - - vpsrld ymm1,ymm10,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm10,21 - vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm10,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm10,7 - vpandn ymm0,ymm10,ymm12 - vpand ymm3,ymm10,ymm11 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm13,ymm14,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm14,30 - vpxor ymm0,ymm0,ymm3 - vpxor ymm3,ymm15,ymm14 - - vpxor ymm13,ymm13,ymm1 - vpaddd ymm5,ymm5,ymm7 - - vpsrld ymm1,ymm14,13 - - vpslld ymm2,ymm14,19 - vpaddd ymm5,ymm5,ymm0 - vpand ymm4,ymm4,ymm3 - - vpxor ymm7,ymm13,ymm1 - - vpsrld ymm1,ymm14,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm14,10 - vpxor ymm13,ymm15,ymm4 - vpaddd ymm9,ymm9,ymm5 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm13,ymm13,ymm5 - vpaddd ymm13,ymm13,ymm7 - vmovdqu ymm5,YMMWORD[((384-256-128))+rbx] - vpaddd ymm6,ymm6,YMMWORD[((128-128))+rax] - - vpsrld ymm7,ymm5,3 - vpsrld ymm1,ymm5,7 - vpslld ymm2,ymm5,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm5,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm5,14 - vmovdqu ymm0,YMMWORD[((288-256-128))+rbx] - vpsrld ymm4,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd 
ymm6,ymm6,ymm7 - vpxor ymm7,ymm4,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm6,ymm6,ymm7 - vpsrld ymm7,ymm9,6 - vpslld ymm2,ymm9,26 - vmovdqu YMMWORD[(352-256-128)+rbx],ymm6 - vpaddd ymm6,ymm6,ymm12 - - vpsrld ymm1,ymm9,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm9,21 - vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm9,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm9,7 - vpandn ymm0,ymm9,ymm11 - vpand ymm4,ymm9,ymm10 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm12,ymm13,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm13,30 - vpxor ymm0,ymm0,ymm4 - vpxor ymm4,ymm14,ymm13 - - vpxor ymm12,ymm12,ymm1 - vpaddd ymm6,ymm6,ymm7 - - vpsrld ymm1,ymm13,13 - - vpslld ymm2,ymm13,19 - vpaddd ymm6,ymm6,ymm0 - vpand ymm3,ymm3,ymm4 - - vpxor ymm7,ymm12,ymm1 - - vpsrld ymm1,ymm13,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm13,10 - vpxor ymm12,ymm14,ymm3 - vpaddd ymm8,ymm8,ymm6 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm12,ymm12,ymm6 - vpaddd ymm12,ymm12,ymm7 - vmovdqu ymm6,YMMWORD[((416-256-128))+rbx] - vpaddd ymm5,ymm5,YMMWORD[((160-128))+rax] - - vpsrld ymm7,ymm6,3 - vpsrld ymm1,ymm6,7 - vpslld ymm2,ymm6,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm6,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm6,14 - vmovdqu ymm0,YMMWORD[((320-256-128))+rbx] - vpsrld ymm3,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm5,ymm5,ymm7 - vpxor ymm7,ymm3,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm5,ymm5,ymm7 - vpsrld ymm7,ymm8,6 - vpslld ymm2,ymm8,26 - vmovdqu YMMWORD[(384-256-128)+rbx],ymm5 - vpaddd ymm5,ymm5,ymm11 - - vpsrld ymm1,ymm8,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm8,21 - vpaddd ymm5,ymm5,YMMWORD[rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm8,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm8,7 - vpandn ymm0,ymm8,ymm10 - vpand ymm3,ymm8,ymm9 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm11,ymm12,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm12,30 - vpxor ymm0,ymm0,ymm3 - vpxor ymm3,ymm13,ymm12 - - vpxor ymm11,ymm11,ymm1 - vpaddd ymm5,ymm5,ymm7 - - vpsrld ymm1,ymm12,13 - - vpslld ymm2,ymm12,19 - vpaddd ymm5,ymm5,ymm0 - vpand ymm4,ymm4,ymm3 - - vpxor ymm7,ymm11,ymm1 - - vpsrld ymm1,ymm12,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm12,10 - vpxor ymm11,ymm13,ymm4 - vpaddd ymm15,ymm15,ymm5 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm11,ymm11,ymm5 - vpaddd ymm11,ymm11,ymm7 - vmovdqu ymm5,YMMWORD[((448-256-128))+rbx] - vpaddd ymm6,ymm6,YMMWORD[((192-128))+rax] - - vpsrld ymm7,ymm5,3 - vpsrld ymm1,ymm5,7 - vpslld ymm2,ymm5,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm5,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm5,14 - vmovdqu ymm0,YMMWORD[((352-256-128))+rbx] - vpsrld ymm4,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm6,ymm6,ymm7 - vpxor ymm7,ymm4,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm6,ymm6,ymm7 - vpsrld ymm7,ymm15,6 - vpslld ymm2,ymm15,26 - vmovdqu YMMWORD[(416-256-128)+rbx],ymm6 - vpaddd ymm6,ymm6,ymm10 - - vpsrld ymm1,ymm15,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm15,21 - vpaddd ymm6,ymm6,YMMWORD[32+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm15,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm15,7 - vpandn ymm0,ymm15,ymm9 - vpand ymm4,ymm15,ymm8 - - vpxor ymm7,ymm7,ymm1 - - vpsrld 
ymm10,ymm11,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm11,30 - vpxor ymm0,ymm0,ymm4 - vpxor ymm4,ymm12,ymm11 - - vpxor ymm10,ymm10,ymm1 - vpaddd ymm6,ymm6,ymm7 - - vpsrld ymm1,ymm11,13 - - vpslld ymm2,ymm11,19 - vpaddd ymm6,ymm6,ymm0 - vpand ymm3,ymm3,ymm4 - - vpxor ymm7,ymm10,ymm1 - - vpsrld ymm1,ymm11,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm11,10 - vpxor ymm10,ymm12,ymm3 - vpaddd ymm14,ymm14,ymm6 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm10,ymm10,ymm6 - vpaddd ymm10,ymm10,ymm7 - vmovdqu ymm6,YMMWORD[((480-256-128))+rbx] - vpaddd ymm5,ymm5,YMMWORD[((224-128))+rax] - - vpsrld ymm7,ymm6,3 - vpsrld ymm1,ymm6,7 - vpslld ymm2,ymm6,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm6,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm6,14 - vmovdqu ymm0,YMMWORD[((384-256-128))+rbx] - vpsrld ymm3,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm5,ymm5,ymm7 - vpxor ymm7,ymm3,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm5,ymm5,ymm7 - vpsrld ymm7,ymm14,6 - vpslld ymm2,ymm14,26 - vmovdqu YMMWORD[(448-256-128)+rbx],ymm5 - vpaddd ymm5,ymm5,ymm9 - - vpsrld ymm1,ymm14,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm14,21 - vpaddd ymm5,ymm5,YMMWORD[64+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm14,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm14,7 - vpandn ymm0,ymm14,ymm8 - vpand ymm3,ymm14,ymm15 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm9,ymm10,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm10,30 - vpxor ymm0,ymm0,ymm3 - vpxor ymm3,ymm11,ymm10 - - vpxor ymm9,ymm9,ymm1 - vpaddd ymm5,ymm5,ymm7 - - vpsrld ymm1,ymm10,13 - - vpslld ymm2,ymm10,19 - vpaddd ymm5,ymm5,ymm0 - vpand ymm4,ymm4,ymm3 - - vpxor ymm7,ymm9,ymm1 - - vpsrld ymm1,ymm10,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm10,10 - vpxor ymm9,ymm11,ymm4 - vpaddd ymm13,ymm13,ymm5 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm9,ymm9,ymm5 - vpaddd ymm9,ymm9,ymm7 - vmovdqu ymm5,YMMWORD[((0-128))+rax] - vpaddd ymm6,ymm6,YMMWORD[((256-256-128))+rbx] - - vpsrld ymm7,ymm5,3 - vpsrld ymm1,ymm5,7 - vpslld ymm2,ymm5,25 - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm5,18 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm5,14 - vmovdqu ymm0,YMMWORD[((416-256-128))+rbx] - vpsrld ymm4,ymm0,10 - - vpxor ymm7,ymm7,ymm1 - vpsrld ymm1,ymm0,17 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,15 - vpaddd ymm6,ymm6,ymm7 - vpxor ymm7,ymm4,ymm1 - vpsrld ymm1,ymm0,19 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm0,13 - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - vpaddd ymm6,ymm6,ymm7 - vpsrld ymm7,ymm13,6 - vpslld ymm2,ymm13,26 - vmovdqu YMMWORD[(480-256-128)+rbx],ymm6 - vpaddd ymm6,ymm6,ymm8 - - vpsrld ymm1,ymm13,11 - vpxor ymm7,ymm7,ymm2 - vpslld ymm2,ymm13,21 - vpaddd ymm6,ymm6,YMMWORD[96+rbp] - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm1,ymm13,25 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm13,7 - vpandn ymm0,ymm13,ymm15 - vpand ymm4,ymm13,ymm14 - - vpxor ymm7,ymm7,ymm1 - - vpsrld ymm8,ymm9,2 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm1,ymm9,30 - vpxor ymm0,ymm0,ymm4 - vpxor ymm4,ymm10,ymm9 - - vpxor ymm8,ymm8,ymm1 - vpaddd ymm6,ymm6,ymm7 - - vpsrld ymm1,ymm9,13 - - vpslld ymm2,ymm9,19 - vpaddd ymm6,ymm6,ymm0 - vpand ymm3,ymm3,ymm4 - - vpxor ymm7,ymm8,ymm1 - - vpsrld ymm1,ymm9,22 - vpxor ymm7,ymm7,ymm2 - - vpslld ymm2,ymm9,10 - vpxor ymm8,ymm10,ymm3 - vpaddd ymm12,ymm12,ymm6 - - vpxor ymm7,ymm7,ymm1 - vpxor ymm7,ymm7,ymm2 - - vpaddd ymm8,ymm8,ymm6 - vpaddd ymm8,ymm8,ymm7 - add rbp,256 - dec ecx - jnz NEAR $L$oop_16_xx_avx2 - - mov ecx,1 - lea 
rbx,[512+rsp] - lea rbp,[((K256+128))] - cmp ecx,DWORD[rbx] - cmovge r12,rbp - cmp ecx,DWORD[4+rbx] - cmovge r13,rbp - cmp ecx,DWORD[8+rbx] - cmovge r14,rbp - cmp ecx,DWORD[12+rbx] - cmovge r15,rbp - cmp ecx,DWORD[16+rbx] - cmovge r8,rbp - cmp ecx,DWORD[20+rbx] - cmovge r9,rbp - cmp ecx,DWORD[24+rbx] - cmovge r10,rbp - cmp ecx,DWORD[28+rbx] - cmovge r11,rbp - vmovdqa ymm7,YMMWORD[rbx] - vpxor ymm0,ymm0,ymm0 - vmovdqa ymm6,ymm7 - vpcmpgtd ymm6,ymm6,ymm0 - vpaddd ymm7,ymm7,ymm6 - - vmovdqu ymm0,YMMWORD[((0-128))+rdi] - vpand ymm8,ymm8,ymm6 - vmovdqu ymm1,YMMWORD[((32-128))+rdi] - vpand ymm9,ymm9,ymm6 - vmovdqu ymm2,YMMWORD[((64-128))+rdi] - vpand ymm10,ymm10,ymm6 - vmovdqu ymm5,YMMWORD[((96-128))+rdi] - vpand ymm11,ymm11,ymm6 - vpaddd ymm8,ymm8,ymm0 - vmovdqu ymm0,YMMWORD[((128-128))+rdi] - vpand ymm12,ymm12,ymm6 - vpaddd ymm9,ymm9,ymm1 - vmovdqu ymm1,YMMWORD[((160-128))+rdi] - vpand ymm13,ymm13,ymm6 - vpaddd ymm10,ymm10,ymm2 - vmovdqu ymm2,YMMWORD[((192-128))+rdi] - vpand ymm14,ymm14,ymm6 - vpaddd ymm11,ymm11,ymm5 - vmovdqu ymm5,YMMWORD[((224-128))+rdi] - vpand ymm15,ymm15,ymm6 - vpaddd ymm12,ymm12,ymm0 - vpaddd ymm13,ymm13,ymm1 - vmovdqu YMMWORD[(0-128)+rdi],ymm8 - vpaddd ymm14,ymm14,ymm2 - vmovdqu YMMWORD[(32-128)+rdi],ymm9 - vpaddd ymm15,ymm15,ymm5 - vmovdqu YMMWORD[(64-128)+rdi],ymm10 - vmovdqu YMMWORD[(96-128)+rdi],ymm11 - vmovdqu YMMWORD[(128-128)+rdi],ymm12 - vmovdqu YMMWORD[(160-128)+rdi],ymm13 - vmovdqu YMMWORD[(192-128)+rdi],ymm14 - vmovdqu YMMWORD[(224-128)+rdi],ymm15 - - vmovdqu YMMWORD[rbx],ymm7 - lea rbx,[((256+128))+rsp] - vmovdqu ymm6,YMMWORD[$L$pbswap] - dec edx - jnz NEAR $L$oop_avx2 - - - - - - - -$L$done_avx2: - mov rax,QWORD[544+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((-216))+rax] - movaps xmm7,XMMWORD[((-200))+rax] - movaps xmm8,XMMWORD[((-184))+rax] - movaps xmm9,XMMWORD[((-168))+rax] - movaps xmm10,XMMWORD[((-152))+rax] - movaps xmm11,XMMWORD[((-136))+rax] - movaps xmm12,XMMWORD[((-120))+rax] - movaps xmm13,XMMWORD[((-104))+rax] - movaps xmm14,XMMWORD[((-88))+rax] - movaps xmm15,XMMWORD[((-72))+rax] - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbp,QWORD[((-16))+rax] - - mov rbx,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha256_multi_block_avx2: ALIGN 256 K256: DD 1116352408,1116352408,1116352408,1116352408 @@ -8203,60 +3443,6 @@ $L$in_prologue: pop rsi DB 0F3h,0C3h ;repret - -ALIGN 16 -avx2_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov rax,QWORD[544+r8] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - lea rsi,[((-56-160))+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - - jmp NEAR $L$in_prologue - section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_sha256_multi_block wrt ..imagebase @@ -8265,12 +3451,6 @@ ALIGN 4 DD 
$L$SEH_begin_sha256_multi_block_shaext wrt ..imagebase DD $L$SEH_end_sha256_multi_block_shaext wrt ..imagebase DD $L$SEH_info_sha256_multi_block_shaext wrt ..imagebase - DD $L$SEH_begin_sha256_multi_block_avx wrt ..imagebase - DD $L$SEH_end_sha256_multi_block_avx wrt ..imagebase - DD $L$SEH_info_sha256_multi_block_avx wrt ..imagebase - DD $L$SEH_begin_sha256_multi_block_avx2 wrt ..imagebase - DD $L$SEH_end_sha256_multi_block_avx2 wrt ..imagebase - DD $L$SEH_info_sha256_multi_block_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_multi_block: @@ -8281,11 +3461,3 @@ $L$SEH_info_sha256_multi_block_shaext: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase -$L$SEH_info_sha256_multi_block_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_sha256_multi_block_avx2: -DB 9,0,0,0 - DD avx2_handler wrt ..imagebase - DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm index 8238c4e4636..c20586762eb 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm @@ -26,14 +26,6 @@ $L$SEH_begin_sha256_block_data_order: mov r11d,DWORD[8+r11] test r11d,536870912 jnz NEAR _shaext_shortcut - and r11d,296 - cmp r11d,296 - je NEAR $L$avx2_shortcut - and r9d,1073741824 - and r10d,268435968 - or r10d,r9d - cmp r10d,1342177792 - je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut mov rax,rsp @@ -3157,2385 +3149,6 @@ $L$epilogue_ssse3: DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order_ssse3: - -ALIGN 64 -sha256_block_data_order_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha256_block_data_order_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$avx_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - shl rdx,4 - sub rsp,160 - lea rdx,[rdx*4+rsi] - and rsp,-64 - mov QWORD[((64+0))+rsp],rdi - mov QWORD[((64+8))+rsp],rsi - mov QWORD[((64+16))+rsp],rdx - mov QWORD[88+rsp],rax - - movaps XMMWORD[(64+32)+rsp],xmm6 - movaps XMMWORD[(64+48)+rsp],xmm7 - movaps XMMWORD[(64+64)+rsp],xmm8 - movaps XMMWORD[(64+80)+rsp],xmm9 -$L$prologue_avx: - - vzeroupper - mov eax,DWORD[rdi] - mov ebx,DWORD[4+rdi] - mov ecx,DWORD[8+rdi] - mov edx,DWORD[12+rdi] - mov r8d,DWORD[16+rdi] - mov r9d,DWORD[20+rdi] - mov r10d,DWORD[24+rdi] - mov r11d,DWORD[28+rdi] - vmovdqa xmm8,XMMWORD[((K256+512+32))] - vmovdqa xmm9,XMMWORD[((K256+512+64))] - jmp NEAR $L$loop_avx -ALIGN 16 -$L$loop_avx: - vmovdqa xmm7,XMMWORD[((K256+512))] - vmovdqu xmm0,XMMWORD[rsi] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm0,xmm0,xmm7 - lea rbp,[K256] - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,XMMWORD[rbp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,XMMWORD[32+rbp] - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - vpaddd xmm7,xmm3,XMMWORD[96+rbp] - vmovdqa XMMWORD[rsp],xmm4 - mov r14d,eax - vmovdqa XMMWORD[16+rsp],xmm5 - mov edi,ebx - vmovdqa XMMWORD[32+rsp],xmm6 - xor edi,ecx - vmovdqa XMMWORD[48+rsp],xmm7 - mov r13d,r8d - jmp NEAR $L$avx_00_47 - -ALIGN 16 -$L$avx_00_47: - sub rbp,-128 - vpalignr xmm4,xmm1,xmm0,4 - shrd 
r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - vpalignr xmm7,xmm3,xmm2,4 - shrd r14d,r14d,9 - xor r13d,r8d - xor r12d,r10d - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vpaddd xmm0,xmm0,xmm7 - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax - vpsrld xmm7,xmm4,3 - xor r12d,r10d - shrd r14d,r14d,11 - xor r15d,ebx - vpslld xmm5,xmm4,14 - add r11d,r12d - shrd r13d,r13d,6 - and edi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,eax - add r11d,r13d - xor edi,ebx - vpshufd xmm7,xmm3,250 - shrd r14d,r14d,2 - add edx,r11d - add r11d,edi - vpsrld xmm6,xmm6,11 - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov r11d,r14d - mov r12d,r8d - shrd r14d,r14d,9 - vpslld xmm5,xmm5,11 - xor r13d,edx - xor r12d,r9d - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,r11d - and r12d,edx - xor r13d,edx - vpsrld xmm6,xmm7,10 - add r10d,DWORD[4+rsp] - mov edi,r11d - xor r12d,r9d - vpxor xmm4,xmm4,xmm5 - shrd r14d,r14d,11 - xor edi,eax - add r10d,r12d - vpsrlq xmm7,xmm7,17 - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r11d - vpaddd xmm0,xmm0,xmm4 - add r10d,r13d - xor r15d,eax - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - vpsrlq xmm7,xmm7,2 - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,edx - shrd r14d,r14d,9 - xor r13d,ecx - vpshufb xmm6,xmm6,xmm8 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - vpaddd xmm0,xmm0,xmm6 - and r12d,ecx - xor r13d,ecx - add r9d,DWORD[8+rsp] - vpshufd xmm7,xmm0,80 - mov r15d,r10d - xor r12d,r8d - shrd r14d,r14d,11 - vpsrld xmm6,xmm7,10 - xor r15d,r11d - add r9d,r12d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - and edi,r15d - xor r14d,r10d - add r9d,r13d - vpxor xmm6,xmm6,xmm7 - xor edi,r11d - shrd r14d,r14d,2 - add ebx,r9d - vpsrlq xmm7,xmm7,2 - add r9d,edi - mov r13d,ebx - add r14d,r9d - vpxor xmm6,xmm6,xmm7 - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - vpshufb xmm6,xmm6,xmm9 - shrd r14d,r14d,9 - xor r13d,ebx - xor r12d,edx - vpaddd xmm0,xmm0,xmm6 - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vpaddd xmm6,xmm0,XMMWORD[rbp] - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov edi,r9d - xor r12d,edx - shrd r14d,r14d,11 - xor edi,r10d - add r8d,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - shrd r14d,r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[rsp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - vpalignr xmm7,xmm0,xmm3,4 - shrd r14d,r14d,9 - xor r13d,eax - xor r12d,ecx - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vpaddd xmm1,xmm1,xmm7 - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d - vpsrld xmm7,xmm4,3 - xor r12d,ecx - shrd r14d,r14d,11 - xor r15d,r9d - vpslld xmm5,xmm4,14 - add edx,r12d - shrd r13d,r13d,6 - and edi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,r8d - add edx,r13d - xor edi,r9d - vpshufd xmm7,xmm0,250 - shrd r14d,r14d,2 - add r11d,edx - add edx,edi - vpsrld xmm6,xmm6,11 - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov edx,r14d - mov r12d,eax - shrd r14d,r14d,9 - vpslld xmm5,xmm5,11 - xor r13d,r11d - xor r12d,ebx - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - vpsrld xmm6,xmm7,10 - add ecx,DWORD[20+rsp] - mov edi,edx - xor r12d,ebx - vpxor xmm4,xmm4,xmm5 - shrd r14d,r14d,11 - xor edi,r8d - add ecx,r12d - vpsrlq xmm7,xmm7,17 - shrd r13d,r13d,6 - and r15d,edi - xor r14d,edx - vpaddd xmm1,xmm1,xmm4 - add ecx,r13d - xor r15d,r8d - shrd 
r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - vpsrlq xmm7,xmm7,2 - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,r11d - shrd r14d,r14d,9 - xor r13d,r10d - vpshufb xmm6,xmm6,xmm8 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - vpaddd xmm1,xmm1,xmm6 - and r12d,r10d - xor r13d,r10d - add ebx,DWORD[24+rsp] - vpshufd xmm7,xmm1,80 - mov r15d,ecx - xor r12d,eax - shrd r14d,r14d,11 - vpsrld xmm6,xmm7,10 - xor r15d,edx - add ebx,r12d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - and edi,r15d - xor r14d,ecx - add ebx,r13d - vpxor xmm6,xmm6,xmm7 - xor edi,edx - shrd r14d,r14d,2 - add r9d,ebx - vpsrlq xmm7,xmm7,2 - add ebx,edi - mov r13d,r9d - add r14d,ebx - vpxor xmm6,xmm6,xmm7 - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - vpshufb xmm6,xmm6,xmm9 - shrd r14d,r14d,9 - xor r13d,r9d - xor r12d,r11d - vpaddd xmm1,xmm1,xmm6 - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vpaddd xmm6,xmm1,XMMWORD[32+rbp] - xor r13d,r9d - add eax,DWORD[28+rsp] - mov edi,ebx - xor r12d,r11d - shrd r14d,r14d,11 - xor edi,ecx - add eax,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - shrd r14d,r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[16+rsp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - vpalignr xmm7,xmm1,xmm0,4 - shrd r14d,r14d,9 - xor r13d,r8d - xor r12d,r10d - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - vpaddd xmm2,xmm2,xmm7 - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax - vpsrld xmm7,xmm4,3 - xor r12d,r10d - shrd r14d,r14d,11 - xor r15d,ebx - vpslld xmm5,xmm4,14 - add r11d,r12d - shrd r13d,r13d,6 - and edi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,eax - add r11d,r13d - xor edi,ebx - vpshufd xmm7,xmm1,250 - shrd r14d,r14d,2 - add edx,r11d - add r11d,edi - vpsrld xmm6,xmm6,11 - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov r11d,r14d - mov r12d,r8d - shrd r14d,r14d,9 - vpslld xmm5,xmm5,11 - xor r13d,edx - xor r12d,r9d - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,r11d - and r12d,edx - xor r13d,edx - vpsrld xmm6,xmm7,10 - add r10d,DWORD[36+rsp] - mov edi,r11d - xor r12d,r9d - vpxor xmm4,xmm4,xmm5 - shrd r14d,r14d,11 - xor edi,eax - add r10d,r12d - vpsrlq xmm7,xmm7,17 - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r11d - vpaddd xmm2,xmm2,xmm4 - add r10d,r13d - xor r15d,eax - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - vpsrlq xmm7,xmm7,2 - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,edx - shrd r14d,r14d,9 - xor r13d,ecx - vpshufb xmm6,xmm6,xmm8 - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - vpaddd xmm2,xmm2,xmm6 - and r12d,ecx - xor r13d,ecx - add r9d,DWORD[40+rsp] - vpshufd xmm7,xmm2,80 - mov r15d,r10d - xor r12d,r8d - shrd r14d,r14d,11 - vpsrld xmm6,xmm7,10 - xor r15d,r11d - add r9d,r12d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - and edi,r15d - xor r14d,r10d - add r9d,r13d - vpxor xmm6,xmm6,xmm7 - xor edi,r11d - shrd r14d,r14d,2 - add ebx,r9d - vpsrlq xmm7,xmm7,2 - add r9d,edi - mov r13d,ebx - add r14d,r9d - vpxor xmm6,xmm6,xmm7 - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - vpshufb xmm6,xmm6,xmm9 - shrd r14d,r14d,9 - xor r13d,ebx - xor r12d,edx - vpaddd xmm2,xmm2,xmm6 - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - vpaddd xmm6,xmm2,XMMWORD[64+rbp] - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov edi,r9d - xor r12d,edx - shrd r14d,r14d,11 - xor edi,r10d - add r8d,r12d - 
shrd r13d,r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - shrd r14d,r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - vmovdqa XMMWORD[32+rsp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - vpalignr xmm7,xmm2,xmm1,4 - shrd r14d,r14d,9 - xor r13d,eax - xor r12d,ecx - vpsrld xmm6,xmm4,7 - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - vpaddd xmm3,xmm3,xmm7 - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d - vpsrld xmm7,xmm4,3 - xor r12d,ecx - shrd r14d,r14d,11 - xor r15d,r9d - vpslld xmm5,xmm4,14 - add edx,r12d - shrd r13d,r13d,6 - and edi,r15d - vpxor xmm4,xmm7,xmm6 - xor r14d,r8d - add edx,r13d - xor edi,r9d - vpshufd xmm7,xmm2,250 - shrd r14d,r14d,2 - add r11d,edx - add edx,edi - vpsrld xmm6,xmm6,11 - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - vpxor xmm4,xmm4,xmm5 - mov edx,r14d - mov r12d,eax - shrd r14d,r14d,9 - vpslld xmm5,xmm5,11 - xor r13d,r11d - xor r12d,ebx - shrd r13d,r13d,5 - vpxor xmm4,xmm4,xmm6 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - vpsrld xmm6,xmm7,10 - add ecx,DWORD[52+rsp] - mov edi,edx - xor r12d,ebx - vpxor xmm4,xmm4,xmm5 - shrd r14d,r14d,11 - xor edi,r8d - add ecx,r12d - vpsrlq xmm7,xmm7,17 - shrd r13d,r13d,6 - and r15d,edi - xor r14d,edx - vpaddd xmm3,xmm3,xmm4 - add ecx,r13d - xor r15d,r8d - shrd r14d,r14d,2 - vpxor xmm6,xmm6,xmm7 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - vpsrlq xmm7,xmm7,2 - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - vpxor xmm6,xmm6,xmm7 - mov r12d,r11d - shrd r14d,r14d,9 - xor r13d,r10d - vpshufb xmm6,xmm6,xmm8 - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - vpaddd xmm3,xmm3,xmm6 - and r12d,r10d - xor r13d,r10d - add ebx,DWORD[56+rsp] - vpshufd xmm7,xmm3,80 - mov r15d,ecx - xor r12d,eax - shrd r14d,r14d,11 - vpsrld xmm6,xmm7,10 - xor r15d,edx - add ebx,r12d - shrd r13d,r13d,6 - vpsrlq xmm7,xmm7,17 - and edi,r15d - xor r14d,ecx - add ebx,r13d - vpxor xmm6,xmm6,xmm7 - xor edi,edx - shrd r14d,r14d,2 - add r9d,ebx - vpsrlq xmm7,xmm7,2 - add ebx,edi - mov r13d,r9d - add r14d,ebx - vpxor xmm6,xmm6,xmm7 - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - vpshufb xmm6,xmm6,xmm9 - shrd r14d,r14d,9 - xor r13d,r9d - xor r12d,r11d - vpaddd xmm3,xmm3,xmm6 - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - vpaddd xmm6,xmm3,XMMWORD[96+rbp] - xor r13d,r9d - add eax,DWORD[60+rsp] - mov edi,ebx - xor r12d,r11d - shrd r14d,r14d,11 - xor edi,ecx - add eax,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - shrd r14d,r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - vmovdqa XMMWORD[48+rsp],xmm6 - cmp BYTE[131+rbp],0 - jne NEAR $L$avx_00_47 - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - shrd r14d,r14d,9 - xor r13d,r8d - xor r12d,r10d - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - xor r13d,r8d - add r11d,DWORD[rsp] - mov r15d,eax - xor r12d,r10d - shrd r14d,r14d,11 - xor r15d,ebx - add r11d,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,eax - add r11d,r13d - xor edi,ebx - shrd r14d,r14d,2 - add edx,r11d - add r11d,edi - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - mov r11d,r14d - mov r12d,r8d - shrd r14d,r14d,9 - xor r13d,edx - xor r12d,r9d - shrd r13d,r13d,5 - xor r14d,r11d - and r12d,edx - xor r13d,edx - add r10d,DWORD[4+rsp] - mov edi,r11d - xor r12d,r9d - shrd r14d,r14d,11 - xor edi,eax - add r10d,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - shrd r14d,r14d,2 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - add r14d,r10d - shrd r13d,r13d,14 - 
mov r10d,r14d - mov r12d,edx - shrd r14d,r14d,9 - xor r13d,ecx - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - and r12d,ecx - xor r13d,ecx - add r9d,DWORD[8+rsp] - mov r15d,r10d - xor r12d,r8d - shrd r14d,r14d,11 - xor r15d,r11d - add r9d,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,r10d - add r9d,r13d - xor edi,r11d - shrd r14d,r14d,2 - add ebx,r9d - add r9d,edi - mov r13d,ebx - add r14d,r9d - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - shrd r14d,r14d,9 - xor r13d,ebx - xor r12d,edx - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - xor r13d,ebx - add r8d,DWORD[12+rsp] - mov edi,r9d - xor r12d,edx - shrd r14d,r14d,11 - xor edi,r10d - add r8d,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - shrd r14d,r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - shrd r14d,r14d,9 - xor r13d,eax - xor r12d,ecx - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - xor r13d,eax - add edx,DWORD[16+rsp] - mov r15d,r8d - xor r12d,ecx - shrd r14d,r14d,11 - xor r15d,r9d - add edx,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,r8d - add edx,r13d - xor edi,r9d - shrd r14d,r14d,2 - add r11d,edx - add edx,edi - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - mov edx,r14d - mov r12d,eax - shrd r14d,r14d,9 - xor r13d,r11d - xor r12d,ebx - shrd r13d,r13d,5 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - add ecx,DWORD[20+rsp] - mov edi,edx - xor r12d,ebx - shrd r14d,r14d,11 - xor edi,r8d - add ecx,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - shrd r14d,r14d,2 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - mov r12d,r11d - shrd r14d,r14d,9 - xor r13d,r10d - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - and r12d,r10d - xor r13d,r10d - add ebx,DWORD[24+rsp] - mov r15d,ecx - xor r12d,eax - shrd r14d,r14d,11 - xor r15d,edx - add ebx,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,ecx - add ebx,r13d - xor edi,edx - shrd r14d,r14d,2 - add r9d,ebx - add ebx,edi - mov r13d,r9d - add r14d,ebx - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - shrd r14d,r14d,9 - xor r13d,r9d - xor r12d,r11d - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - xor r13d,r9d - add eax,DWORD[28+rsp] - mov edi,ebx - xor r12d,r11d - shrd r14d,r14d,11 - xor edi,ecx - add eax,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - shrd r14d,r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - shrd r13d,r13d,14 - mov eax,r14d - mov r12d,r9d - shrd r14d,r14d,9 - xor r13d,r8d - xor r12d,r10d - shrd r13d,r13d,5 - xor r14d,eax - and r12d,r8d - xor r13d,r8d - add r11d,DWORD[32+rsp] - mov r15d,eax - xor r12d,r10d - shrd r14d,r14d,11 - xor r15d,ebx - add r11d,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,eax - add r11d,r13d - xor edi,ebx - shrd r14d,r14d,2 - add edx,r11d - add r11d,edi - mov r13d,edx - add r14d,r11d - shrd r13d,r13d,14 - mov r11d,r14d - mov r12d,r8d - shrd r14d,r14d,9 - xor r13d,edx - xor r12d,r9d - shrd r13d,r13d,5 - xor r14d,r11d - and r12d,edx - xor r13d,edx - add r10d,DWORD[36+rsp] - mov edi,r11d - xor r12d,r9d - shrd r14d,r14d,11 - xor edi,eax - add r10d,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - shrd r14d,r14d,2 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - add r14d,r10d - shrd r13d,r13d,14 - mov r10d,r14d - mov r12d,edx - shrd r14d,r14d,9 - xor r13d,ecx - xor r12d,r8d - shrd r13d,r13d,5 - xor r14d,r10d - and r12d,ecx - xor 
r13d,ecx - add r9d,DWORD[40+rsp] - mov r15d,r10d - xor r12d,r8d - shrd r14d,r14d,11 - xor r15d,r11d - add r9d,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,r10d - add r9d,r13d - xor edi,r11d - shrd r14d,r14d,2 - add ebx,r9d - add r9d,edi - mov r13d,ebx - add r14d,r9d - shrd r13d,r13d,14 - mov r9d,r14d - mov r12d,ecx - shrd r14d,r14d,9 - xor r13d,ebx - xor r12d,edx - shrd r13d,r13d,5 - xor r14d,r9d - and r12d,ebx - xor r13d,ebx - add r8d,DWORD[44+rsp] - mov edi,r9d - xor r12d,edx - shrd r14d,r14d,11 - xor edi,r10d - add r8d,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - shrd r14d,r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - shrd r13d,r13d,14 - mov r8d,r14d - mov r12d,ebx - shrd r14d,r14d,9 - xor r13d,eax - xor r12d,ecx - shrd r13d,r13d,5 - xor r14d,r8d - and r12d,eax - xor r13d,eax - add edx,DWORD[48+rsp] - mov r15d,r8d - xor r12d,ecx - shrd r14d,r14d,11 - xor r15d,r9d - add edx,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,r8d - add edx,r13d - xor edi,r9d - shrd r14d,r14d,2 - add r11d,edx - add edx,edi - mov r13d,r11d - add r14d,edx - shrd r13d,r13d,14 - mov edx,r14d - mov r12d,eax - shrd r14d,r14d,9 - xor r13d,r11d - xor r12d,ebx - shrd r13d,r13d,5 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - add ecx,DWORD[52+rsp] - mov edi,edx - xor r12d,ebx - shrd r14d,r14d,11 - xor edi,r8d - add ecx,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - shrd r14d,r14d,2 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - add r14d,ecx - shrd r13d,r13d,14 - mov ecx,r14d - mov r12d,r11d - shrd r14d,r14d,9 - xor r13d,r10d - xor r12d,eax - shrd r13d,r13d,5 - xor r14d,ecx - and r12d,r10d - xor r13d,r10d - add ebx,DWORD[56+rsp] - mov r15d,ecx - xor r12d,eax - shrd r14d,r14d,11 - xor r15d,edx - add ebx,r12d - shrd r13d,r13d,6 - and edi,r15d - xor r14d,ecx - add ebx,r13d - xor edi,edx - shrd r14d,r14d,2 - add r9d,ebx - add ebx,edi - mov r13d,r9d - add r14d,ebx - shrd r13d,r13d,14 - mov ebx,r14d - mov r12d,r10d - shrd r14d,r14d,9 - xor r13d,r9d - xor r12d,r11d - shrd r13d,r13d,5 - xor r14d,ebx - and r12d,r9d - xor r13d,r9d - add eax,DWORD[60+rsp] - mov edi,ebx - xor r12d,r11d - shrd r14d,r14d,11 - xor edi,ecx - add eax,r12d - shrd r13d,r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - shrd r14d,r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - mov rdi,QWORD[((64+0))+rsp] - mov eax,r14d - - add eax,DWORD[rdi] - lea rsi,[64+rsi] - add ebx,DWORD[4+rdi] - add ecx,DWORD[8+rdi] - add edx,DWORD[12+rdi] - add r8d,DWORD[16+rdi] - add r9d,DWORD[20+rdi] - add r10d,DWORD[24+rdi] - add r11d,DWORD[28+rdi] - - cmp rsi,QWORD[((64+16))+rsp] - - mov DWORD[rdi],eax - mov DWORD[4+rdi],ebx - mov DWORD[8+rdi],ecx - mov DWORD[12+rdi],edx - mov DWORD[16+rdi],r8d - mov DWORD[20+rdi],r9d - mov DWORD[24+rdi],r10d - mov DWORD[28+rdi],r11d - jb NEAR $L$loop_avx - - mov rsi,QWORD[88+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((64+32))+rsp] - movaps xmm7,XMMWORD[((64+48))+rsp] - movaps xmm8,XMMWORD[((64+64))+rsp] - movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha256_block_data_order_avx: - -ALIGN 64 -sha256_block_data_order_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov 
rax,rsp -$L$SEH_begin_sha256_block_data_order_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$avx2_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,608 - shl rdx,4 - and rsp,-256*4 - lea rdx,[rdx*4+rsi] - add rsp,448 - mov QWORD[((64+0))+rsp],rdi - mov QWORD[((64+8))+rsp],rsi - mov QWORD[((64+16))+rsp],rdx - mov QWORD[88+rsp],rax - - movaps XMMWORD[(64+32)+rsp],xmm6 - movaps XMMWORD[(64+48)+rsp],xmm7 - movaps XMMWORD[(64+64)+rsp],xmm8 - movaps XMMWORD[(64+80)+rsp],xmm9 -$L$prologue_avx2: - - vzeroupper - sub rsi,-16*4 - mov eax,DWORD[rdi] - mov r12,rsi - mov ebx,DWORD[4+rdi] - cmp rsi,rdx - mov ecx,DWORD[8+rdi] - cmove r12,rsp - mov edx,DWORD[12+rdi] - mov r8d,DWORD[16+rdi] - mov r9d,DWORD[20+rdi] - mov r10d,DWORD[24+rdi] - mov r11d,DWORD[28+rdi] - vmovdqa ymm8,YMMWORD[((K256+512+32))] - vmovdqa ymm9,YMMWORD[((K256+512+64))] - jmp NEAR $L$oop_avx2 -ALIGN 16 -$L$oop_avx2: - vmovdqa ymm7,YMMWORD[((K256+512))] - vmovdqu xmm0,XMMWORD[((-64+0))+rsi] - vmovdqu xmm1,XMMWORD[((-64+16))+rsi] - vmovdqu xmm2,XMMWORD[((-64+32))+rsi] - vmovdqu xmm3,XMMWORD[((-64+48))+rsi] - - vinserti128 ymm0,ymm0,XMMWORD[r12],1 - vinserti128 ymm1,ymm1,XMMWORD[16+r12],1 - vpshufb ymm0,ymm0,ymm7 - vinserti128 ymm2,ymm2,XMMWORD[32+r12],1 - vpshufb ymm1,ymm1,ymm7 - vinserti128 ymm3,ymm3,XMMWORD[48+r12],1 - - lea rbp,[K256] - vpshufb ymm2,ymm2,ymm7 - vpaddd ymm4,ymm0,YMMWORD[rbp] - vpshufb ymm3,ymm3,ymm7 - vpaddd ymm5,ymm1,YMMWORD[32+rbp] - vpaddd ymm6,ymm2,YMMWORD[64+rbp] - vpaddd ymm7,ymm3,YMMWORD[96+rbp] - vmovdqa YMMWORD[rsp],ymm4 - xor r14d,r14d - vmovdqa YMMWORD[32+rsp],ymm5 - lea rsp,[((-64))+rsp] - mov edi,ebx - vmovdqa YMMWORD[rsp],ymm6 - xor edi,ecx - vmovdqa YMMWORD[32+rsp],ymm7 - mov r12d,r9d - sub rbp,-16*2*4 - jmp NEAR $L$avx2_00_47 - -ALIGN 16 -$L$avx2_00_47: - lea rsp,[((-64))+rsp] - vpalignr ymm4,ymm1,ymm0,4 - add r11d,DWORD[((0+128))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - vpalignr ymm7,ymm3,ymm2,4 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - vpsrld ymm6,ymm4,7 - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - vpaddd ymm0,ymm0,ymm7 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - vpsrld ymm7,ymm4,3 - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - vpslld ymm5,ymm4,14 - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - vpxor ymm4,ymm7,ymm6 - and edi,r15d - xor r14d,r12d - xor edi,ebx - vpshufd ymm7,ymm3,250 - xor r14d,r13d - lea r11d,[rdi*1+r11] - mov r12d,r8d - vpsrld ymm6,ymm6,11 - add r10d,DWORD[((4+128))+rsp] - and r12d,edx - rorx r13d,edx,25 - vpxor ymm4,ymm4,ymm5 - rorx edi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - vpslld ymm5,ymm5,11 - andn r12d,edx,r9d - xor r13d,edi - rorx r14d,edx,6 - vpxor ymm4,ymm4,ymm6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov edi,r11d - vpsrld ymm6,ymm7,10 - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor edi,eax - vpxor ymm4,ymm4,ymm5 - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - vpsrlq ymm7,ymm7,17 - and r15d,edi - xor r14d,r12d - xor r15d,eax - vpaddd ymm0,ymm0,ymm4 - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - vpxor ymm6,ymm6,ymm7 - add r9d,DWORD[((8+128))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - vpxor ymm6,ymm6,ymm7 - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - vpshufb ymm6,ymm6,ymm8 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - vpaddd ymm0,ymm0,ymm6 - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor 
r15d,r11d - vpshufd ymm7,ymm0,80 - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - vpsrld ymm6,ymm7,10 - and edi,r15d - xor r14d,r12d - xor edi,r11d - vpsrlq ymm7,ymm7,17 - xor r14d,r13d - lea r9d,[rdi*1+r9] - mov r12d,ecx - vpxor ymm6,ymm6,ymm7 - add r8d,DWORD[((12+128))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - vpsrlq ymm7,ymm7,2 - rorx edi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - vpxor ymm6,ymm6,ymm7 - andn r12d,ebx,edx - xor r13d,edi - rorx r14d,ebx,6 - vpshufb ymm6,ymm6,ymm9 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov edi,r9d - vpaddd ymm0,ymm0,ymm6 - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor edi,r10d - vpaddd ymm6,ymm0,YMMWORD[rbp] - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,edi - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - vmovdqa YMMWORD[rsp],ymm6 - vpalignr ymm4,ymm2,ymm1,4 - add edx,DWORD[((32+128))+rsp] - and r12d,eax - rorx r13d,eax,25 - vpalignr ymm7,ymm0,ymm3,4 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - vpsrld ymm6,ymm4,7 - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - vpaddd ymm1,ymm1,ymm7 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - vpsrld ymm7,ymm4,3 - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - vpslld ymm5,ymm4,14 - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - vpxor ymm4,ymm7,ymm6 - and edi,r15d - xor r14d,r12d - xor edi,r9d - vpshufd ymm7,ymm0,250 - xor r14d,r13d - lea edx,[rdi*1+rdx] - mov r12d,eax - vpsrld ymm6,ymm6,11 - add ecx,DWORD[((36+128))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - vpxor ymm4,ymm4,ymm5 - rorx edi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - vpslld ymm5,ymm5,11 - andn r12d,r11d,ebx - xor r13d,edi - rorx r14d,r11d,6 - vpxor ymm4,ymm4,ymm6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov edi,edx - vpsrld ymm6,ymm7,10 - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor edi,r8d - vpxor ymm4,ymm4,ymm5 - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - vpsrlq ymm7,ymm7,17 - and r15d,edi - xor r14d,r12d - xor r15d,r8d - vpaddd ymm1,ymm1,ymm4 - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - vpxor ymm6,ymm6,ymm7 - add ebx,DWORD[((40+128))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - vpxor ymm6,ymm6,ymm7 - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - vpshufb ymm6,ymm6,ymm8 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - vpaddd ymm1,ymm1,ymm6 - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - vpshufd ymm7,ymm1,80 - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - vpsrld ymm6,ymm7,10 - and edi,r15d - xor r14d,r12d - xor edi,edx - vpsrlq ymm7,ymm7,17 - xor r14d,r13d - lea ebx,[rdi*1+rbx] - mov r12d,r10d - vpxor ymm6,ymm6,ymm7 - add eax,DWORD[((44+128))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - vpsrlq ymm7,ymm7,2 - rorx edi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - vpxor ymm6,ymm6,ymm7 - andn r12d,r9d,r11d - xor r13d,edi - rorx r14d,r9d,6 - vpshufb ymm6,ymm6,ymm9 - lea eax,[r12*1+rax] - xor r13d,r14d - mov edi,ebx - vpaddd ymm1,ymm1,ymm6 - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor edi,ecx - vpaddd ymm6,ymm1,YMMWORD[32+rbp] - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,edi - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vmovdqa YMMWORD[32+rsp],ymm6 - lea rsp,[((-64))+rsp] - vpalignr ymm4,ymm3,ymm2,4 - add r11d,DWORD[((0+128))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - vpalignr ymm7,ymm1,ymm0,4 - rorx 
r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - vpsrld ymm6,ymm4,7 - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - vpaddd ymm2,ymm2,ymm7 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - vpsrld ymm7,ymm4,3 - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - vpslld ymm5,ymm4,14 - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - vpxor ymm4,ymm7,ymm6 - and edi,r15d - xor r14d,r12d - xor edi,ebx - vpshufd ymm7,ymm1,250 - xor r14d,r13d - lea r11d,[rdi*1+r11] - mov r12d,r8d - vpsrld ymm6,ymm6,11 - add r10d,DWORD[((4+128))+rsp] - and r12d,edx - rorx r13d,edx,25 - vpxor ymm4,ymm4,ymm5 - rorx edi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - vpslld ymm5,ymm5,11 - andn r12d,edx,r9d - xor r13d,edi - rorx r14d,edx,6 - vpxor ymm4,ymm4,ymm6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov edi,r11d - vpsrld ymm6,ymm7,10 - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor edi,eax - vpxor ymm4,ymm4,ymm5 - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - vpsrlq ymm7,ymm7,17 - and r15d,edi - xor r14d,r12d - xor r15d,eax - vpaddd ymm2,ymm2,ymm4 - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - vpxor ymm6,ymm6,ymm7 - add r9d,DWORD[((8+128))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - vpxor ymm6,ymm6,ymm7 - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - vpshufb ymm6,ymm6,ymm8 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - vpaddd ymm2,ymm2,ymm6 - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - vpshufd ymm7,ymm2,80 - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - vpsrld ymm6,ymm7,10 - and edi,r15d - xor r14d,r12d - xor edi,r11d - vpsrlq ymm7,ymm7,17 - xor r14d,r13d - lea r9d,[rdi*1+r9] - mov r12d,ecx - vpxor ymm6,ymm6,ymm7 - add r8d,DWORD[((12+128))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - vpsrlq ymm7,ymm7,2 - rorx edi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - vpxor ymm6,ymm6,ymm7 - andn r12d,ebx,edx - xor r13d,edi - rorx r14d,ebx,6 - vpshufb ymm6,ymm6,ymm9 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov edi,r9d - vpaddd ymm2,ymm2,ymm6 - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor edi,r10d - vpaddd ymm6,ymm2,YMMWORD[64+rbp] - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,edi - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - vmovdqa YMMWORD[rsp],ymm6 - vpalignr ymm4,ymm0,ymm3,4 - add edx,DWORD[((32+128))+rsp] - and r12d,eax - rorx r13d,eax,25 - vpalignr ymm7,ymm2,ymm1,4 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - vpsrld ymm6,ymm4,7 - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - vpaddd ymm3,ymm3,ymm7 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - vpsrld ymm7,ymm4,3 - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - vpslld ymm5,ymm4,14 - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - vpxor ymm4,ymm7,ymm6 - and edi,r15d - xor r14d,r12d - xor edi,r9d - vpshufd ymm7,ymm2,250 - xor r14d,r13d - lea edx,[rdi*1+rdx] - mov r12d,eax - vpsrld ymm6,ymm6,11 - add ecx,DWORD[((36+128))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - vpxor ymm4,ymm4,ymm5 - rorx edi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - vpslld ymm5,ymm5,11 - andn r12d,r11d,ebx - xor r13d,edi - rorx r14d,r11d,6 - vpxor ymm4,ymm4,ymm6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov edi,edx - vpsrld ymm6,ymm7,10 - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor edi,r8d - vpxor ymm4,ymm4,ymm5 - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - vpsrlq 
ymm7,ymm7,17 - and r15d,edi - xor r14d,r12d - xor r15d,r8d - vpaddd ymm3,ymm3,ymm4 - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - vpxor ymm6,ymm6,ymm7 - add ebx,DWORD[((40+128))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - vpsrlq ymm7,ymm7,2 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - vpxor ymm6,ymm6,ymm7 - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - vpshufb ymm6,ymm6,ymm8 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - vpaddd ymm3,ymm3,ymm6 - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - vpshufd ymm7,ymm3,80 - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - vpsrld ymm6,ymm7,10 - and edi,r15d - xor r14d,r12d - xor edi,edx - vpsrlq ymm7,ymm7,17 - xor r14d,r13d - lea ebx,[rdi*1+rbx] - mov r12d,r10d - vpxor ymm6,ymm6,ymm7 - add eax,DWORD[((44+128))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - vpsrlq ymm7,ymm7,2 - rorx edi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - vpxor ymm6,ymm6,ymm7 - andn r12d,r9d,r11d - xor r13d,edi - rorx r14d,r9d,6 - vpshufb ymm6,ymm6,ymm9 - lea eax,[r12*1+rax] - xor r13d,r14d - mov edi,ebx - vpaddd ymm3,ymm3,ymm6 - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor edi,ecx - vpaddd ymm6,ymm3,YMMWORD[96+rbp] - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,edi - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - vmovdqa YMMWORD[32+rsp],ymm6 - lea rbp,[128+rbp] - cmp BYTE[3+rbp],0 - jne NEAR $L$avx2_00_47 - add r11d,DWORD[((0+64))+rsp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and edi,r15d - xor r14d,r12d - xor edi,ebx - xor r14d,r13d - lea r11d,[rdi*1+r11] - mov r12d,r8d - add r10d,DWORD[((4+64))+rsp] - and r12d,edx - rorx r13d,edx,25 - rorx edi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,edi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov edi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor edi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,edi - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[((8+64))+rsp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and edi,r15d - xor r14d,r12d - xor edi,r11d - xor r14d,r13d - lea r9d,[rdi*1+r9] - mov r12d,ecx - add r8d,DWORD[((12+64))+rsp] - and r12d,ebx - rorx r13d,ebx,25 - rorx edi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,edi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov edi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor edi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,edi - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[((32+64))+rsp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d 
- rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and edi,r15d - xor r14d,r12d - xor edi,r9d - xor r14d,r13d - lea edx,[rdi*1+rdx] - mov r12d,eax - add ecx,DWORD[((36+64))+rsp] - and r12d,r11d - rorx r13d,r11d,25 - rorx edi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,edi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov edi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor edi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,edi - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[((40+64))+rsp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and edi,r15d - xor r14d,r12d - xor edi,edx - xor r14d,r13d - lea ebx,[rdi*1+rbx] - mov r12d,r10d - add eax,DWORD[((44+64))+rsp] - and r12d,r9d - rorx r13d,r9d,25 - rorx edi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,edi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov edi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor edi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,edi - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - add r11d,DWORD[rsp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and edi,r15d - xor r14d,r12d - xor edi,ebx - xor r14d,r13d - lea r11d,[rdi*1+r11] - mov r12d,r8d - add r10d,DWORD[4+rsp] - and r12d,edx - rorx r13d,edx,25 - rorx edi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,edi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov edi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor edi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,edi - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[8+rsp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and edi,r15d - xor r14d,r12d - xor edi,r11d - xor r14d,r13d - lea r9d,[rdi*1+r9] - mov r12d,ecx - add r8d,DWORD[12+rsp] - and r12d,ebx - rorx r13d,ebx,25 - rorx edi,ebx,11 - lea r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,edi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov edi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor edi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,edi - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[32+rsp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor 
r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and edi,r15d - xor r14d,r12d - xor edi,r9d - xor r14d,r13d - lea edx,[rdi*1+rdx] - mov r12d,eax - add ecx,DWORD[36+rsp] - and r12d,r11d - rorx r13d,r11d,25 - rorx edi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,edi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov edi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor edi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,edi - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[40+rsp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and edi,r15d - xor r14d,r12d - xor edi,edx - xor r14d,r13d - lea ebx,[rdi*1+rbx] - mov r12d,r10d - add eax,DWORD[44+rsp] - and r12d,r9d - rorx r13d,r9d,25 - rorx edi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,edi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov edi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor edi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,edi - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - mov rdi,QWORD[512+rsp] - add eax,r14d - - lea rbp,[448+rsp] - - add eax,DWORD[rdi] - add ebx,DWORD[4+rdi] - add ecx,DWORD[8+rdi] - add edx,DWORD[12+rdi] - add r8d,DWORD[16+rdi] - add r9d,DWORD[20+rdi] - add r10d,DWORD[24+rdi] - add r11d,DWORD[28+rdi] - - mov DWORD[rdi],eax - mov DWORD[4+rdi],ebx - mov DWORD[8+rdi],ecx - mov DWORD[12+rdi],edx - mov DWORD[16+rdi],r8d - mov DWORD[20+rdi],r9d - mov DWORD[24+rdi],r10d - mov DWORD[28+rdi],r11d - - cmp rsi,QWORD[80+rbp] - je NEAR $L$done_avx2 - - xor r14d,r14d - mov edi,ebx - xor edi,ecx - mov r12d,r9d - jmp NEAR $L$ower_avx2 -ALIGN 16 -$L$ower_avx2: - add r11d,DWORD[((0+16))+rbp] - and r12d,r8d - rorx r13d,r8d,25 - rorx r15d,r8d,11 - lea eax,[r14*1+rax] - lea r11d,[r12*1+r11] - andn r12d,r8d,r10d - xor r13d,r15d - rorx r14d,r8d,6 - lea r11d,[r12*1+r11] - xor r13d,r14d - mov r15d,eax - rorx r12d,eax,22 - lea r11d,[r13*1+r11] - xor r15d,ebx - rorx r14d,eax,13 - rorx r13d,eax,2 - lea edx,[r11*1+rdx] - and edi,r15d - xor r14d,r12d - xor edi,ebx - xor r14d,r13d - lea r11d,[rdi*1+r11] - mov r12d,r8d - add r10d,DWORD[((4+16))+rbp] - and r12d,edx - rorx r13d,edx,25 - rorx edi,edx,11 - lea r11d,[r14*1+r11] - lea r10d,[r12*1+r10] - andn r12d,edx,r9d - xor r13d,edi - rorx r14d,edx,6 - lea r10d,[r12*1+r10] - xor r13d,r14d - mov edi,r11d - rorx r12d,r11d,22 - lea r10d,[r13*1+r10] - xor edi,eax - rorx r14d,r11d,13 - rorx r13d,r11d,2 - lea ecx,[r10*1+rcx] - and r15d,edi - xor r14d,r12d - xor r15d,eax - xor r14d,r13d - lea r10d,[r15*1+r10] - mov r12d,edx - add r9d,DWORD[((8+16))+rbp] - and r12d,ecx - rorx r13d,ecx,25 - rorx r15d,ecx,11 - lea r10d,[r14*1+r10] - lea r9d,[r12*1+r9] - andn r12d,ecx,r8d - xor r13d,r15d - rorx r14d,ecx,6 - lea r9d,[r12*1+r9] - xor r13d,r14d - mov r15d,r10d - rorx r12d,r10d,22 - lea r9d,[r13*1+r9] - xor r15d,r11d - rorx r14d,r10d,13 - rorx r13d,r10d,2 - lea ebx,[r9*1+rbx] - and edi,r15d - xor r14d,r12d - xor edi,r11d - xor r14d,r13d - lea r9d,[rdi*1+r9] - mov r12d,ecx - add r8d,DWORD[((12+16))+rbp] - and r12d,ebx - rorx r13d,ebx,25 - rorx edi,ebx,11 - lea 
r9d,[r14*1+r9] - lea r8d,[r12*1+r8] - andn r12d,ebx,edx - xor r13d,edi - rorx r14d,ebx,6 - lea r8d,[r12*1+r8] - xor r13d,r14d - mov edi,r9d - rorx r12d,r9d,22 - lea r8d,[r13*1+r8] - xor edi,r10d - rorx r14d,r9d,13 - rorx r13d,r9d,2 - lea eax,[r8*1+rax] - and r15d,edi - xor r14d,r12d - xor r15d,r10d - xor r14d,r13d - lea r8d,[r15*1+r8] - mov r12d,ebx - add edx,DWORD[((32+16))+rbp] - and r12d,eax - rorx r13d,eax,25 - rorx r15d,eax,11 - lea r8d,[r14*1+r8] - lea edx,[r12*1+rdx] - andn r12d,eax,ecx - xor r13d,r15d - rorx r14d,eax,6 - lea edx,[r12*1+rdx] - xor r13d,r14d - mov r15d,r8d - rorx r12d,r8d,22 - lea edx,[r13*1+rdx] - xor r15d,r9d - rorx r14d,r8d,13 - rorx r13d,r8d,2 - lea r11d,[rdx*1+r11] - and edi,r15d - xor r14d,r12d - xor edi,r9d - xor r14d,r13d - lea edx,[rdi*1+rdx] - mov r12d,eax - add ecx,DWORD[((36+16))+rbp] - and r12d,r11d - rorx r13d,r11d,25 - rorx edi,r11d,11 - lea edx,[r14*1+rdx] - lea ecx,[r12*1+rcx] - andn r12d,r11d,ebx - xor r13d,edi - rorx r14d,r11d,6 - lea ecx,[r12*1+rcx] - xor r13d,r14d - mov edi,edx - rorx r12d,edx,22 - lea ecx,[r13*1+rcx] - xor edi,r8d - rorx r14d,edx,13 - rorx r13d,edx,2 - lea r10d,[rcx*1+r10] - and r15d,edi - xor r14d,r12d - xor r15d,r8d - xor r14d,r13d - lea ecx,[r15*1+rcx] - mov r12d,r11d - add ebx,DWORD[((40+16))+rbp] - and r12d,r10d - rorx r13d,r10d,25 - rorx r15d,r10d,11 - lea ecx,[r14*1+rcx] - lea ebx,[r12*1+rbx] - andn r12d,r10d,eax - xor r13d,r15d - rorx r14d,r10d,6 - lea ebx,[r12*1+rbx] - xor r13d,r14d - mov r15d,ecx - rorx r12d,ecx,22 - lea ebx,[r13*1+rbx] - xor r15d,edx - rorx r14d,ecx,13 - rorx r13d,ecx,2 - lea r9d,[rbx*1+r9] - and edi,r15d - xor r14d,r12d - xor edi,edx - xor r14d,r13d - lea ebx,[rdi*1+rbx] - mov r12d,r10d - add eax,DWORD[((44+16))+rbp] - and r12d,r9d - rorx r13d,r9d,25 - rorx edi,r9d,11 - lea ebx,[r14*1+rbx] - lea eax,[r12*1+rax] - andn r12d,r9d,r11d - xor r13d,edi - rorx r14d,r9d,6 - lea eax,[r12*1+rax] - xor r13d,r14d - mov edi,ebx - rorx r12d,ebx,22 - lea eax,[r13*1+rax] - xor edi,ecx - rorx r14d,ebx,13 - rorx r13d,ebx,2 - lea r8d,[rax*1+r8] - and r15d,edi - xor r14d,r12d - xor r15d,ecx - xor r14d,r13d - lea eax,[r15*1+rax] - mov r12d,r9d - lea rbp,[((-64))+rbp] - cmp rbp,rsp - jae NEAR $L$ower_avx2 - - mov rdi,QWORD[512+rsp] - add eax,r14d - - lea rsp,[448+rsp] - - - - add eax,DWORD[rdi] - add ebx,DWORD[4+rdi] - add ecx,DWORD[8+rdi] - add edx,DWORD[12+rdi] - add r8d,DWORD[16+rdi] - add r9d,DWORD[20+rdi] - lea rsi,[128+rsi] - add r10d,DWORD[24+rdi] - mov r12,rsi - add r11d,DWORD[28+rdi] - cmp rsi,QWORD[((64+16))+rsp] - - mov DWORD[rdi],eax - cmove r12,rsp - mov DWORD[4+rdi],ebx - mov DWORD[8+rdi],ecx - mov DWORD[12+rdi],edx - mov DWORD[16+rdi],r8d - mov DWORD[20+rdi],r9d - mov DWORD[24+rdi],r10d - mov DWORD[28+rdi],r11d - - jbe NEAR $L$oop_avx2 - lea rbp,[rsp] - - - - -$L$done_avx2: - mov rsi,QWORD[88+rbp] - - vzeroupper - movaps xmm6,XMMWORD[((64+32))+rbp] - movaps xmm7,XMMWORD[((64+48))+rbp] - movaps xmm8,XMMWORD[((64+64))+rbp] - movaps xmm9,XMMWORD[((64+80))+rbp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha256_block_data_order_avx2: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -5568,13 +3181,6 @@ se_handler: lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue - lea r10,[$L$avx2_shortcut] - cmp rbx,r10 - jb NEAR $L$not_in_avx2 - - and 
rax,-256*4 - add rax,448 -$L$not_in_avx2: mov rsi,rax mov rax,QWORD[((64+24))+rax] @@ -5682,12 +3288,6 @@ ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase - DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase - DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase - DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase - DD $L$SEH_begin_sha256_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_end_sha256_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_info_sha256_block_data_order_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order: @@ -5701,11 +3301,3 @@ $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase -$L$SEH_info_sha256_block_data_order_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_sha256_block_data_order_avx2: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm index 5ddba53d1c5..f75e7fe2629 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm @@ -20,20 +20,6 @@ $L$SEH_begin_sha512_block_data_order: - lea r11,[OPENSSL_ia32cap_P] - mov r9d,DWORD[r11] - mov r10d,DWORD[4+r11] - mov r11d,DWORD[8+r11] - test r10d,2048 - jnz NEAR $L$xop_shortcut - and r11d,296 - cmp r11d,296 - je NEAR $L$avx2_shortcut - and r9d,1073741824 - and r10d,268435968 - or r10d,r9d - cmp r10d,1342177792 - je NEAR $L$avx_shortcut mov rax,rsp push rbx @@ -1833,3833 +1819,110 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 +EXTERN __imp_RtlVirtualUnwind -ALIGN 64 -sha512_block_data_order_xop: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_xop: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] -$L$xop_shortcut: - mov rax,rsp + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue - push rbx + mov rax,QWORD[152+r8] - push rbp + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] - push r12 + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 - push r13 + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue - push r14 + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc - push r15 +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax 
+ mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi - shl rdx,4 - sub rsp,256 - lea rdx,[rdx*8+rsi] - and rsp,-64 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[152+rsp],rax + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_xop: + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] - vzeroupper - mov rax,QWORD[rdi] - mov rbx,QWORD[8+rdi] - mov rcx,QWORD[16+rdi] - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$loop_xop -ALIGN 16 -$L$loop_xop: - vmovdqa xmm11,XMMWORD[((K512+1280))] - vmovdqu xmm0,XMMWORD[rsi] - lea rbp,[((K512+128))] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vpshufb xmm0,xmm0,xmm11 - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm1,xmm1,xmm11 - vmovdqu xmm4,XMMWORD[64+rsi] - vpshufb xmm2,xmm2,xmm11 - vmovdqu xmm5,XMMWORD[80+rsi] - vpshufb xmm3,xmm3,xmm11 - vmovdqu xmm6,XMMWORD[96+rsi] - vpshufb xmm4,xmm4,xmm11 - vmovdqu xmm7,XMMWORD[112+rsi] - vpshufb xmm5,xmm5,xmm11 - vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] - vpshufb xmm6,xmm6,xmm11 - vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] - vpshufb xmm7,xmm7,xmm11 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] - vmovdqa XMMWORD[rsp],xmm8 - vpaddq xmm8,xmm4,XMMWORD[rbp] - vmovdqa XMMWORD[16+rsp],xmm9 - vpaddq xmm9,xmm5,XMMWORD[32+rbp] - vmovdqa XMMWORD[32+rsp],xmm10 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - vmovdqa XMMWORD[48+rsp],xmm11 - vpaddq xmm11,xmm7,XMMWORD[96+rbp] - vmovdqa XMMWORD[64+rsp],xmm8 - mov r14,rax - vmovdqa XMMWORD[80+rsp],xmm9 - mov rdi,rbx - vmovdqa XMMWORD[96+rsp],xmm10 - xor rdi,rcx - vmovdqa XMMWORD[112+rsp],xmm11 - mov r13,r8 - jmp NEAR $L$xop_00_47 - -ALIGN 16 -$L$xop_00_47: - add rbp,256 - vpalignr xmm8,xmm1,xmm0,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm5,xmm4,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm0,xmm0,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,223,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm7,6 - add rdx,r11 - add r11,rdi - vpaddq xmm0,xmm0,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm0,xmm0,xmm11 - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[rsp],xmm10 - vpalignr xmm8,xmm2,xmm1,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm6,xmm5,8 - mov r12,rdx - ror r14,5 -DB 
143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm1,xmm1,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,216,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm0,6 - add rbx,r9 - add r9,rdi - vpaddq xmm1,xmm1,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm1,xmm1,xmm11 - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[16+rsp],xmm10 - vpalignr xmm8,xmm3,xmm2,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm7,xmm6,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm2,xmm2,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,217,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm1,6 - add r11,rdx - add rdx,rdi - vpaddq xmm2,xmm2,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm2,xmm2,xmm11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[32+rsp],xmm10 - vpalignr xmm8,xmm4,xmm3,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm0,xmm7,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm3,xmm3,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,218,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm2,6 - add r9,rbx - add rbx,rdi - vpaddq xmm3,xmm3,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm3,xmm3,xmm11 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[48+rsp],xmm10 - vpalignr xmm8,xmm5,xmm4,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm1,xmm0,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - 
vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm4,xmm4,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,219,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm3,6 - add rdx,r11 - add r11,rdi - vpaddq xmm4,xmm4,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm4,xmm4,xmm11 - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm4,XMMWORD[rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[64+rsp],xmm10 - vpalignr xmm8,xmm6,xmm5,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm2,xmm1,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm5,xmm5,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,220,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm4,6 - add rbx,r9 - add r9,rdi - vpaddq xmm5,xmm5,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm5,xmm5,xmm11 - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm5,XMMWORD[32+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[80+rsp],xmm10 - vpalignr xmm8,xmm7,xmm6,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm3,xmm2,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm6,xmm6,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,221,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm5,6 - add r11,rdx - add rdx,rdi - vpaddq xmm6,xmm6,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm6,xmm6,xmm11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[96+rsp],xmm10 - vpalignr xmm8,xmm0,xmm7,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm4,xmm3,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq 
xmm7,xmm7,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,222,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm6,6 - add r9,rbx - add rbx,rdi - vpaddq xmm7,xmm7,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm7,xmm7,xmm11 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm7,XMMWORD[96+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[112+rsp],xmm10 - cmp BYTE[135+rbp],0 - jne NEAR $L$xop_00_47 - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - 
ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - mov rdi,QWORD[((128+0))+rsp] - mov rax,r14 - - add rax,QWORD[rdi] - lea rsi,[128+rsi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov 
QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - jb NEAR $L$loop_xop - - mov rsi,QWORD[152+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rsp] - movaps xmm7,XMMWORD[((128+48))+rsp] - movaps xmm8,XMMWORD[((128+64))+rsp] - movaps xmm9,XMMWORD[((128+80))+rsp] - movaps xmm10,XMMWORD[((128+96))+rsp] - movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_xop: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha512_block_data_order_xop: - -ALIGN 64 -sha512_block_data_order_avx: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_avx: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$avx_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - shl rdx,4 - sub rsp,256 - lea rdx,[rdx*8+rsi] - and rsp,-64 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[152+rsp],rax - - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_avx: - - vzeroupper - mov rax,QWORD[rdi] - mov rbx,QWORD[8+rdi] - mov rcx,QWORD[16+rdi] - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$loop_avx -ALIGN 16 -$L$loop_avx: - vmovdqa xmm11,XMMWORD[((K512+1280))] - vmovdqu xmm0,XMMWORD[rsi] - lea rbp,[((K512+128))] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vpshufb xmm0,xmm0,xmm11 - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm1,xmm1,xmm11 - vmovdqu xmm4,XMMWORD[64+rsi] - vpshufb xmm2,xmm2,xmm11 - vmovdqu xmm5,XMMWORD[80+rsi] - vpshufb xmm3,xmm3,xmm11 - vmovdqu xmm6,XMMWORD[96+rsi] - vpshufb xmm4,xmm4,xmm11 - vmovdqu xmm7,XMMWORD[112+rsi] - vpshufb xmm5,xmm5,xmm11 - vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] - vpshufb xmm6,xmm6,xmm11 - vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] - vpshufb xmm7,xmm7,xmm11 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] - vmovdqa XMMWORD[rsp],xmm8 - vpaddq xmm8,xmm4,XMMWORD[rbp] - vmovdqa XMMWORD[16+rsp],xmm9 - vpaddq xmm9,xmm5,XMMWORD[32+rbp] - vmovdqa XMMWORD[32+rsp],xmm10 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - vmovdqa XMMWORD[48+rsp],xmm11 - vpaddq xmm11,xmm7,XMMWORD[96+rbp] - vmovdqa XMMWORD[64+rsp],xmm8 - mov r14,rax - vmovdqa XMMWORD[80+rsp],xmm9 - mov rdi,rbx - vmovdqa XMMWORD[96+rsp],xmm10 - xor rdi,rcx - vmovdqa XMMWORD[112+rsp],xmm11 - mov r13,r8 - jmp NEAR $L$avx_00_47 - -ALIGN 16 -$L$avx_00_47: - add rbp,256 - vpalignr xmm8,xmm1,xmm0,8 - shrd r13,r13,23 - mov rax,r14 - vpalignr xmm11,xmm5,xmm4,8 - mov r12,r9 - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,r8 - xor r12,r10 - vpaddq xmm0,xmm0,xmm11 - shrd r13,r13,4 - xor r14,rax - vpsrlq xmm11,xmm8,7 - and r12,r8 - xor r13,r8 - vpsllq xmm9,xmm8,56 - add r11,QWORD[rsp] - mov r15,rax - vpxor xmm8,xmm11,xmm10 - xor r12,r10 - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,rbx - add r11,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - shrd 
r14,r14,28 - vpsrlq xmm11,xmm7,6 - add rdx,r11 - add r11,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,rdx - add r14,r11 - vpsllq xmm10,xmm7,3 - shrd r13,r13,23 - mov r11,r14 - vpaddq xmm0,xmm0,xmm8 - mov r12,r8 - shrd r14,r14,5 - vpsrlq xmm9,xmm7,19 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,r11 - vpsllq xmm10,xmm10,42 - and r12,rdx - xor r13,rdx - vpxor xmm11,xmm11,xmm9 - add r10,QWORD[8+rsp] - mov rdi,r11 - vpsrlq xmm9,xmm9,42 - xor r12,r9 - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,rax - add r10,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm0,xmm0,xmm11 - xor r14,r11 - add r10,r13 - vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] - xor r15,rax - shrd r14,r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[rsp],xmm10 - vpalignr xmm8,xmm2,xmm1,8 - shrd r13,r13,23 - mov r10,r14 - vpalignr xmm11,xmm6,xmm5,8 - mov r12,rdx - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,rcx - xor r12,r8 - vpaddq xmm1,xmm1,xmm11 - shrd r13,r13,4 - xor r14,r10 - vpsrlq xmm11,xmm8,7 - and r12,rcx - xor r13,rcx - vpsllq xmm9,xmm8,56 - add r9,QWORD[16+rsp] - mov r15,r10 - vpxor xmm8,xmm11,xmm10 - xor r12,r8 - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,r11 - add r9,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - shrd r14,r14,28 - vpsrlq xmm11,xmm0,6 - add rbx,r9 - add r9,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,rbx - add r14,r9 - vpsllq xmm10,xmm0,3 - shrd r13,r13,23 - mov r9,r14 - vpaddq xmm1,xmm1,xmm8 - mov r12,rcx - shrd r14,r14,5 - vpsrlq xmm9,xmm0,19 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,r9 - vpsllq xmm10,xmm10,42 - and r12,rbx - xor r13,rbx - vpxor xmm11,xmm11,xmm9 - add r8,QWORD[24+rsp] - mov rdi,r9 - vpsrlq xmm9,xmm9,42 - xor r12,rdx - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,r10 - add r8,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm1,xmm1,xmm11 - xor r14,r9 - add r8,r13 - vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] - xor r15,r10 - shrd r14,r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[16+rsp],xmm10 - vpalignr xmm8,xmm3,xmm2,8 - shrd r13,r13,23 - mov r8,r14 - vpalignr xmm11,xmm7,xmm6,8 - mov r12,rbx - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,rax - xor r12,rcx - vpaddq xmm2,xmm2,xmm11 - shrd r13,r13,4 - xor r14,r8 - vpsrlq xmm11,xmm8,7 - and r12,rax - xor r13,rax - vpsllq xmm9,xmm8,56 - add rdx,QWORD[32+rsp] - mov r15,r8 - vpxor xmm8,xmm11,xmm10 - xor r12,rcx - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,r9 - add rdx,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - shrd r14,r14,28 - vpsrlq xmm11,xmm1,6 - add r11,rdx - add rdx,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,r11 - add r14,rdx - vpsllq xmm10,xmm1,3 - shrd r13,r13,23 - mov rdx,r14 - vpaddq xmm2,xmm2,xmm8 - mov r12,rax - shrd r14,r14,5 - vpsrlq xmm9,xmm1,19 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,rdx - vpsllq xmm10,xmm10,42 - and r12,r11 - xor r13,r11 - vpxor xmm11,xmm11,xmm9 - add rcx,QWORD[40+rsp] - mov rdi,rdx - vpsrlq xmm9,xmm9,42 - xor r12,rbx - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,r8 - add rcx,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm2,xmm2,xmm11 - xor r14,rdx - add rcx,r13 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - xor r15,r8 - shrd r14,r14,28 - add r10,rcx - add rcx,r15 
- mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[32+rsp],xmm10 - vpalignr xmm8,xmm4,xmm3,8 - shrd r13,r13,23 - mov rcx,r14 - vpalignr xmm11,xmm0,xmm7,8 - mov r12,r11 - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,r10 - xor r12,rax - vpaddq xmm3,xmm3,xmm11 - shrd r13,r13,4 - xor r14,rcx - vpsrlq xmm11,xmm8,7 - and r12,r10 - xor r13,r10 - vpsllq xmm9,xmm8,56 - add rbx,QWORD[48+rsp] - mov r15,rcx - vpxor xmm8,xmm11,xmm10 - xor r12,rax - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,rdx - add rbx,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - shrd r14,r14,28 - vpsrlq xmm11,xmm2,6 - add r9,rbx - add rbx,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,r9 - add r14,rbx - vpsllq xmm10,xmm2,3 - shrd r13,r13,23 - mov rbx,r14 - vpaddq xmm3,xmm3,xmm8 - mov r12,r10 - shrd r14,r14,5 - vpsrlq xmm9,xmm2,19 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,rbx - vpsllq xmm10,xmm10,42 - and r12,r9 - xor r13,r9 - vpxor xmm11,xmm11,xmm9 - add rax,QWORD[56+rsp] - mov rdi,rbx - vpsrlq xmm9,xmm9,42 - xor r12,r11 - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,rcx - add rax,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm3,xmm3,xmm11 - xor r14,rbx - add rax,r13 - vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] - xor r15,rcx - shrd r14,r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[48+rsp],xmm10 - vpalignr xmm8,xmm5,xmm4,8 - shrd r13,r13,23 - mov rax,r14 - vpalignr xmm11,xmm1,xmm0,8 - mov r12,r9 - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,r8 - xor r12,r10 - vpaddq xmm4,xmm4,xmm11 - shrd r13,r13,4 - xor r14,rax - vpsrlq xmm11,xmm8,7 - and r12,r8 - xor r13,r8 - vpsllq xmm9,xmm8,56 - add r11,QWORD[64+rsp] - mov r15,rax - vpxor xmm8,xmm11,xmm10 - xor r12,r10 - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,rbx - add r11,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - shrd r14,r14,28 - vpsrlq xmm11,xmm3,6 - add rdx,r11 - add r11,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,rdx - add r14,r11 - vpsllq xmm10,xmm3,3 - shrd r13,r13,23 - mov r11,r14 - vpaddq xmm4,xmm4,xmm8 - mov r12,r8 - shrd r14,r14,5 - vpsrlq xmm9,xmm3,19 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,r11 - vpsllq xmm10,xmm10,42 - and r12,rdx - xor r13,rdx - vpxor xmm11,xmm11,xmm9 - add r10,QWORD[72+rsp] - mov rdi,r11 - vpsrlq xmm9,xmm9,42 - xor r12,r9 - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,rax - add r10,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm4,xmm4,xmm11 - xor r14,r11 - add r10,r13 - vpaddq xmm10,xmm4,XMMWORD[rbp] - xor r15,rax - shrd r14,r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[64+rsp],xmm10 - vpalignr xmm8,xmm6,xmm5,8 - shrd r13,r13,23 - mov r10,r14 - vpalignr xmm11,xmm2,xmm1,8 - mov r12,rdx - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,rcx - xor r12,r8 - vpaddq xmm5,xmm5,xmm11 - shrd r13,r13,4 - xor r14,r10 - vpsrlq xmm11,xmm8,7 - and r12,rcx - xor r13,rcx - vpsllq xmm9,xmm8,56 - add r9,QWORD[80+rsp] - mov r15,r10 - vpxor xmm8,xmm11,xmm10 - xor r12,r8 - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,r11 - add r9,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - shrd r14,r14,28 - vpsrlq xmm11,xmm4,6 - add rbx,r9 - add r9,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,rbx - add 
r14,r9 - vpsllq xmm10,xmm4,3 - shrd r13,r13,23 - mov r9,r14 - vpaddq xmm5,xmm5,xmm8 - mov r12,rcx - shrd r14,r14,5 - vpsrlq xmm9,xmm4,19 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,r9 - vpsllq xmm10,xmm10,42 - and r12,rbx - xor r13,rbx - vpxor xmm11,xmm11,xmm9 - add r8,QWORD[88+rsp] - mov rdi,r9 - vpsrlq xmm9,xmm9,42 - xor r12,rdx - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,r10 - add r8,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm5,xmm5,xmm11 - xor r14,r9 - add r8,r13 - vpaddq xmm10,xmm5,XMMWORD[32+rbp] - xor r15,r10 - shrd r14,r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[80+rsp],xmm10 - vpalignr xmm8,xmm7,xmm6,8 - shrd r13,r13,23 - mov r8,r14 - vpalignr xmm11,xmm3,xmm2,8 - mov r12,rbx - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,rax - xor r12,rcx - vpaddq xmm6,xmm6,xmm11 - shrd r13,r13,4 - xor r14,r8 - vpsrlq xmm11,xmm8,7 - and r12,rax - xor r13,rax - vpsllq xmm9,xmm8,56 - add rdx,QWORD[96+rsp] - mov r15,r8 - vpxor xmm8,xmm11,xmm10 - xor r12,rcx - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,r9 - add rdx,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - shrd r14,r14,28 - vpsrlq xmm11,xmm5,6 - add r11,rdx - add rdx,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,r11 - add r14,rdx - vpsllq xmm10,xmm5,3 - shrd r13,r13,23 - mov rdx,r14 - vpaddq xmm6,xmm6,xmm8 - mov r12,rax - shrd r14,r14,5 - vpsrlq xmm9,xmm5,19 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,rdx - vpsllq xmm10,xmm10,42 - and r12,r11 - xor r13,r11 - vpxor xmm11,xmm11,xmm9 - add rcx,QWORD[104+rsp] - mov rdi,rdx - vpsrlq xmm9,xmm9,42 - xor r12,rbx - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,r8 - add rcx,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm6,xmm6,xmm11 - xor r14,rdx - add rcx,r13 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - xor r15,r8 - shrd r14,r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[96+rsp],xmm10 - vpalignr xmm8,xmm0,xmm7,8 - shrd r13,r13,23 - mov rcx,r14 - vpalignr xmm11,xmm4,xmm3,8 - mov r12,r11 - shrd r14,r14,5 - vpsrlq xmm10,xmm8,1 - xor r13,r10 - xor r12,rax - vpaddq xmm7,xmm7,xmm11 - shrd r13,r13,4 - xor r14,rcx - vpsrlq xmm11,xmm8,7 - and r12,r10 - xor r13,r10 - vpsllq xmm9,xmm8,56 - add rbx,QWORD[112+rsp] - mov r15,rcx - vpxor xmm8,xmm11,xmm10 - xor r12,rax - shrd r14,r14,6 - vpsrlq xmm10,xmm10,7 - xor r15,rdx - add rbx,r12 - vpxor xmm8,xmm8,xmm9 - shrd r13,r13,14 - and rdi,r15 - vpsllq xmm9,xmm9,7 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - shrd r14,r14,28 - vpsrlq xmm11,xmm6,6 - add r9,rbx - add rbx,rdi - vpxor xmm8,xmm8,xmm9 - mov r13,r9 - add r14,rbx - vpsllq xmm10,xmm6,3 - shrd r13,r13,23 - mov rbx,r14 - vpaddq xmm7,xmm7,xmm8 - mov r12,r10 - shrd r14,r14,5 - vpsrlq xmm9,xmm6,19 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm10 - shrd r13,r13,4 - xor r14,rbx - vpsllq xmm10,xmm10,42 - and r12,r9 - xor r13,r9 - vpxor xmm11,xmm11,xmm9 - add rax,QWORD[120+rsp] - mov rdi,rbx - vpsrlq xmm9,xmm9,42 - xor r12,r11 - shrd r14,r14,6 - vpxor xmm11,xmm11,xmm10 - xor rdi,rcx - add rax,r12 - vpxor xmm11,xmm11,xmm9 - shrd r13,r13,14 - and r15,rdi - vpaddq xmm7,xmm7,xmm11 - xor r14,rbx - add rax,r13 - vpaddq xmm10,xmm7,XMMWORD[96+rbp] - xor r15,rcx - shrd r14,r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[112+rsp],xmm10 - cmp BYTE[135+rbp],0 - jne NEAR $L$avx_00_47 - 
shrd r13,r13,23 - mov rax,r14 - mov r12,r9 - shrd r14,r14,5 - xor r13,r8 - xor r12,r10 - shrd r13,r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax - xor r12,r10 - shrd r14,r14,6 - xor r15,rbx - add r11,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - shrd r14,r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - shrd r13,r13,23 - mov r11,r14 - mov r12,r8 - shrd r14,r14,5 - xor r13,rdx - xor r12,r9 - shrd r13,r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - shrd r14,r14,6 - xor rdi,rax - add r10,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - shrd r14,r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - shrd r13,r13,23 - mov r10,r14 - mov r12,rdx - shrd r14,r14,5 - xor r13,rcx - xor r12,r8 - shrd r13,r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 - xor r12,r8 - shrd r14,r14,6 - xor r15,r11 - add r9,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - shrd r14,r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - shrd r13,r13,23 - mov r9,r14 - mov r12,rcx - shrd r14,r14,5 - xor r13,rbx - xor r12,rdx - shrd r13,r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - shrd r14,r14,6 - xor rdi,r10 - add r8,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - shrd r14,r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - shrd r13,r13,23 - mov r8,r14 - mov r12,rbx - shrd r14,r14,5 - xor r13,rax - xor r12,rcx - shrd r13,r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 - xor r12,rcx - shrd r14,r14,6 - xor r15,r9 - add rdx,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - shrd r14,r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - shrd r13,r13,23 - mov rdx,r14 - mov r12,rax - shrd r14,r14,5 - xor r13,r11 - xor r12,rbx - shrd r13,r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - shrd r14,r14,6 - xor rdi,r8 - add rcx,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - shrd r14,r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - shrd r13,r13,23 - mov rcx,r14 - mov r12,r11 - shrd r14,r14,5 - xor r13,r10 - xor r12,rax - shrd r13,r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx - xor r12,rax - shrd r14,r14,6 - xor r15,rdx - add rbx,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - shrd r14,r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - shrd r13,r13,23 - mov rbx,r14 - mov r12,r10 - shrd r14,r14,5 - xor r13,r9 - xor r12,r11 - shrd r13,r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - shrd r14,r14,6 - xor rdi,rcx - add rax,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - shrd r14,r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - shrd r13,r13,23 - mov rax,r14 - mov r12,r9 - shrd r14,r14,5 - xor r13,r8 - xor r12,r10 - shrd r13,r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax - xor r12,r10 - shrd r14,r14,6 - xor r15,rbx - add r11,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - shrd r14,r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - shrd r13,r13,23 - mov r11,r14 - mov r12,r8 - 
shrd r14,r14,5 - xor r13,rdx - xor r12,r9 - shrd r13,r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - shrd r14,r14,6 - xor rdi,rax - add r10,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - shrd r14,r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - shrd r13,r13,23 - mov r10,r14 - mov r12,rdx - shrd r14,r14,5 - xor r13,rcx - xor r12,r8 - shrd r13,r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 - xor r12,r8 - shrd r14,r14,6 - xor r15,r11 - add r9,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - shrd r14,r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - shrd r13,r13,23 - mov r9,r14 - mov r12,rcx - shrd r14,r14,5 - xor r13,rbx - xor r12,rdx - shrd r13,r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - shrd r14,r14,6 - xor rdi,r10 - add r8,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - shrd r14,r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - shrd r13,r13,23 - mov r8,r14 - mov r12,rbx - shrd r14,r14,5 - xor r13,rax - xor r12,rcx - shrd r13,r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 - xor r12,rcx - shrd r14,r14,6 - xor r15,r9 - add rdx,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - shrd r14,r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - shrd r13,r13,23 - mov rdx,r14 - mov r12,rax - shrd r14,r14,5 - xor r13,r11 - xor r12,rbx - shrd r13,r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - shrd r14,r14,6 - xor rdi,r8 - add rcx,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - shrd r14,r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - shrd r13,r13,23 - mov rcx,r14 - mov r12,r11 - shrd r14,r14,5 - xor r13,r10 - xor r12,rax - shrd r13,r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx - xor r12,rax - shrd r14,r14,6 - xor r15,rdx - add rbx,r12 - shrd r13,r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - shrd r14,r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - shrd r13,r13,23 - mov rbx,r14 - mov r12,r10 - shrd r14,r14,5 - xor r13,r9 - xor r12,r11 - shrd r13,r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - shrd r14,r14,6 - xor rdi,rcx - add rax,r12 - shrd r13,r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - shrd r14,r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - mov rdi,QWORD[((128+0))+rsp] - mov rax,r14 - - add rax,QWORD[rdi] - lea rsi,[128+rsi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - jb NEAR $L$loop_avx - - mov rsi,QWORD[152+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rsp] - movaps xmm7,XMMWORD[((128+48))+rsp] - movaps xmm8,XMMWORD[((128+64))+rsp] - movaps xmm9,XMMWORD[((128+80))+rsp] - movaps xmm10,XMMWORD[((128+96))+rsp] - movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov 
r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha512_block_data_order_avx: - -ALIGN 64 -sha512_block_data_order_avx2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_avx2: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$avx2_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,1408 - shl rdx,4 - and rsp,-256*8 - lea rdx,[rdx*8+rsi] - add rsp,1152 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[152+rsp],rax - - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_avx2: - - vzeroupper - sub rsi,-16*8 - mov rax,QWORD[rdi] - mov r12,rsi - mov rbx,QWORD[8+rdi] - cmp rsi,rdx - mov rcx,QWORD[16+rdi] - cmove r12,rsp - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$oop_avx2 -ALIGN 16 -$L$oop_avx2: - vmovdqu xmm0,XMMWORD[((-128))+rsi] - vmovdqu xmm1,XMMWORD[((-128+16))+rsi] - vmovdqu xmm2,XMMWORD[((-128+32))+rsi] - lea rbp,[((K512+128))] - vmovdqu xmm3,XMMWORD[((-128+48))+rsi] - vmovdqu xmm4,XMMWORD[((-128+64))+rsi] - vmovdqu xmm5,XMMWORD[((-128+80))+rsi] - vmovdqu xmm6,XMMWORD[((-128+96))+rsi] - vmovdqu xmm7,XMMWORD[((-128+112))+rsi] - - vmovdqa ymm10,YMMWORD[1152+rbp] - vinserti128 ymm0,ymm0,XMMWORD[r12],1 - vinserti128 ymm1,ymm1,XMMWORD[16+r12],1 - vpshufb ymm0,ymm0,ymm10 - vinserti128 ymm2,ymm2,XMMWORD[32+r12],1 - vpshufb ymm1,ymm1,ymm10 - vinserti128 ymm3,ymm3,XMMWORD[48+r12],1 - vpshufb ymm2,ymm2,ymm10 - vinserti128 ymm4,ymm4,XMMWORD[64+r12],1 - vpshufb ymm3,ymm3,ymm10 - vinserti128 ymm5,ymm5,XMMWORD[80+r12],1 - vpshufb ymm4,ymm4,ymm10 - vinserti128 ymm6,ymm6,XMMWORD[96+r12],1 - vpshufb ymm5,ymm5,ymm10 - vinserti128 ymm7,ymm7,XMMWORD[112+r12],1 - - vpaddq ymm8,ymm0,YMMWORD[((-128))+rbp] - vpshufb ymm6,ymm6,ymm10 - vpaddq ymm9,ymm1,YMMWORD[((-96))+rbp] - vpshufb ymm7,ymm7,ymm10 - vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp] - vpaddq ymm11,ymm3,YMMWORD[((-32))+rbp] - vmovdqa YMMWORD[rsp],ymm8 - vpaddq ymm8,ymm4,YMMWORD[rbp] - vmovdqa YMMWORD[32+rsp],ymm9 - vpaddq ymm9,ymm5,YMMWORD[32+rbp] - vmovdqa YMMWORD[64+rsp],ymm10 - vpaddq ymm10,ymm6,YMMWORD[64+rbp] - vmovdqa YMMWORD[96+rsp],ymm11 - lea rsp,[((-128))+rsp] - vpaddq ymm11,ymm7,YMMWORD[96+rbp] - vmovdqa YMMWORD[rsp],ymm8 - xor r14,r14 - vmovdqa YMMWORD[32+rsp],ymm9 - mov rdi,rbx - vmovdqa YMMWORD[64+rsp],ymm10 - xor rdi,rcx - vmovdqa YMMWORD[96+rsp],ymm11 - mov r12,r9 - add rbp,16*2*8 - jmp NEAR $L$avx2_00_47 - -ALIGN 16 -$L$avx2_00_47: - lea rsp,[((-128))+rsp] - vpalignr ymm8,ymm1,ymm0,8 - add r11,QWORD[((0+256))+rsp] - and r12,r8 - rorx r13,r8,41 - vpalignr ymm11,ymm5,ymm4,8 - rorx r15,r8,18 - lea rax,[r14*1+rax] - lea r11,[r12*1+r11] - vpsrlq ymm10,ymm8,1 - andn r12,r8,r10 - xor r13,r15 - rorx r14,r8,14 - vpaddq ymm0,ymm0,ymm11 - vpsrlq ymm11,ymm8,7 - lea r11,[r12*1+r11] - xor r13,r14 - mov r15,rax - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,rax,39 - lea r11,[r13*1+r11] - xor r15,rbx - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,rax,34 - rorx r13,rax,28 - lea rdx,[r11*1+rdx] - vpsllq ymm9,ymm9,7 - vpxor 
ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,rbx - vpsrlq ymm11,ymm7,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea r11,[rdi*1+r11] - mov r12,r8 - vpsllq ymm10,ymm7,3 - vpaddq ymm0,ymm0,ymm8 - add r10,QWORD[((8+256))+rsp] - and r12,rdx - rorx r13,rdx,41 - vpsrlq ymm9,ymm7,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,rdx,18 - lea r11,[r14*1+r11] - lea r10,[r12*1+r10] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,rdx,r9 - xor r13,rdi - rorx r14,rdx,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea r10,[r12*1+r10] - xor r13,r14 - mov rdi,r11 - vpxor ymm11,ymm11,ymm9 - rorx r12,r11,39 - lea r10,[r13*1+r10] - xor rdi,rax - vpaddq ymm0,ymm0,ymm11 - rorx r14,r11,34 - rorx r13,r11,28 - lea rcx,[r10*1+rcx] - vpaddq ymm10,ymm0,YMMWORD[((-128))+rbp] - and r15,rdi - xor r14,r12 - xor r15,rax - xor r14,r13 - lea r10,[r15*1+r10] - mov r12,rdx - vmovdqa YMMWORD[rsp],ymm10 - vpalignr ymm8,ymm2,ymm1,8 - add r9,QWORD[((32+256))+rsp] - and r12,rcx - rorx r13,rcx,41 - vpalignr ymm11,ymm6,ymm5,8 - rorx r15,rcx,18 - lea r10,[r14*1+r10] - lea r9,[r12*1+r9] - vpsrlq ymm10,ymm8,1 - andn r12,rcx,r8 - xor r13,r15 - rorx r14,rcx,14 - vpaddq ymm1,ymm1,ymm11 - vpsrlq ymm11,ymm8,7 - lea r9,[r12*1+r9] - xor r13,r14 - mov r15,r10 - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,r10,39 - lea r9,[r13*1+r9] - xor r15,r11 - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,r10,34 - rorx r13,r10,28 - lea rbx,[r9*1+rbx] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,r11 - vpsrlq ymm11,ymm0,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea r9,[rdi*1+r9] - mov r12,rcx - vpsllq ymm10,ymm0,3 - vpaddq ymm1,ymm1,ymm8 - add r8,QWORD[((40+256))+rsp] - and r12,rbx - rorx r13,rbx,41 - vpsrlq ymm9,ymm0,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,rbx,18 - lea r9,[r14*1+r9] - lea r8,[r12*1+r8] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,rbx,rdx - xor r13,rdi - rorx r14,rbx,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea r8,[r12*1+r8] - xor r13,r14 - mov rdi,r9 - vpxor ymm11,ymm11,ymm9 - rorx r12,r9,39 - lea r8,[r13*1+r8] - xor rdi,r10 - vpaddq ymm1,ymm1,ymm11 - rorx r14,r9,34 - rorx r13,r9,28 - lea rax,[r8*1+rax] - vpaddq ymm10,ymm1,YMMWORD[((-96))+rbp] - and r15,rdi - xor r14,r12 - xor r15,r10 - xor r14,r13 - lea r8,[r15*1+r8] - mov r12,rbx - vmovdqa YMMWORD[32+rsp],ymm10 - vpalignr ymm8,ymm3,ymm2,8 - add rdx,QWORD[((64+256))+rsp] - and r12,rax - rorx r13,rax,41 - vpalignr ymm11,ymm7,ymm6,8 - rorx r15,rax,18 - lea r8,[r14*1+r8] - lea rdx,[r12*1+rdx] - vpsrlq ymm10,ymm8,1 - andn r12,rax,rcx - xor r13,r15 - rorx r14,rax,14 - vpaddq ymm2,ymm2,ymm11 - vpsrlq ymm11,ymm8,7 - lea rdx,[r12*1+rdx] - xor r13,r14 - mov r15,r8 - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,r8,39 - lea rdx,[r13*1+rdx] - xor r15,r9 - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,r8,34 - rorx r13,r8,28 - lea r11,[rdx*1+r11] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,r9 - vpsrlq ymm11,ymm1,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea rdx,[rdi*1+rdx] - mov r12,rax - vpsllq ymm10,ymm1,3 - vpaddq ymm2,ymm2,ymm8 - add rcx,QWORD[((72+256))+rsp] - and r12,r11 - rorx r13,r11,41 - vpsrlq ymm9,ymm1,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,r11,18 - lea rdx,[r14*1+rdx] - lea rcx,[r12*1+rcx] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,r11,rbx - xor r13,rdi - rorx r14,r11,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea rcx,[r12*1+rcx] - xor r13,r14 - mov rdi,rdx - vpxor ymm11,ymm11,ymm9 - rorx r12,rdx,39 - lea 
rcx,[r13*1+rcx] - xor rdi,r8 - vpaddq ymm2,ymm2,ymm11 - rorx r14,rdx,34 - rorx r13,rdx,28 - lea r10,[rcx*1+r10] - vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp] - and r15,rdi - xor r14,r12 - xor r15,r8 - xor r14,r13 - lea rcx,[r15*1+rcx] - mov r12,r11 - vmovdqa YMMWORD[64+rsp],ymm10 - vpalignr ymm8,ymm4,ymm3,8 - add rbx,QWORD[((96+256))+rsp] - and r12,r10 - rorx r13,r10,41 - vpalignr ymm11,ymm0,ymm7,8 - rorx r15,r10,18 - lea rcx,[r14*1+rcx] - lea rbx,[r12*1+rbx] - vpsrlq ymm10,ymm8,1 - andn r12,r10,rax - xor r13,r15 - rorx r14,r10,14 - vpaddq ymm3,ymm3,ymm11 - vpsrlq ymm11,ymm8,7 - lea rbx,[r12*1+rbx] - xor r13,r14 - mov r15,rcx - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,rcx,39 - lea rbx,[r13*1+rbx] - xor r15,rdx - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,rcx,34 - rorx r13,rcx,28 - lea r9,[rbx*1+r9] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,rdx - vpsrlq ymm11,ymm2,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea rbx,[rdi*1+rbx] - mov r12,r10 - vpsllq ymm10,ymm2,3 - vpaddq ymm3,ymm3,ymm8 - add rax,QWORD[((104+256))+rsp] - and r12,r9 - rorx r13,r9,41 - vpsrlq ymm9,ymm2,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,r9,18 - lea rbx,[r14*1+rbx] - lea rax,[r12*1+rax] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,r9,r11 - xor r13,rdi - rorx r14,r9,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea rax,[r12*1+rax] - xor r13,r14 - mov rdi,rbx - vpxor ymm11,ymm11,ymm9 - rorx r12,rbx,39 - lea rax,[r13*1+rax] - xor rdi,rcx - vpaddq ymm3,ymm3,ymm11 - rorx r14,rbx,34 - rorx r13,rbx,28 - lea r8,[rax*1+r8] - vpaddq ymm10,ymm3,YMMWORD[((-32))+rbp] - and r15,rdi - xor r14,r12 - xor r15,rcx - xor r14,r13 - lea rax,[r15*1+rax] - mov r12,r9 - vmovdqa YMMWORD[96+rsp],ymm10 - lea rsp,[((-128))+rsp] - vpalignr ymm8,ymm5,ymm4,8 - add r11,QWORD[((0+256))+rsp] - and r12,r8 - rorx r13,r8,41 - vpalignr ymm11,ymm1,ymm0,8 - rorx r15,r8,18 - lea rax,[r14*1+rax] - lea r11,[r12*1+r11] - vpsrlq ymm10,ymm8,1 - andn r12,r8,r10 - xor r13,r15 - rorx r14,r8,14 - vpaddq ymm4,ymm4,ymm11 - vpsrlq ymm11,ymm8,7 - lea r11,[r12*1+r11] - xor r13,r14 - mov r15,rax - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,rax,39 - lea r11,[r13*1+r11] - xor r15,rbx - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,rax,34 - rorx r13,rax,28 - lea rdx,[r11*1+rdx] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,rbx - vpsrlq ymm11,ymm3,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea r11,[rdi*1+r11] - mov r12,r8 - vpsllq ymm10,ymm3,3 - vpaddq ymm4,ymm4,ymm8 - add r10,QWORD[((8+256))+rsp] - and r12,rdx - rorx r13,rdx,41 - vpsrlq ymm9,ymm3,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,rdx,18 - lea r11,[r14*1+r11] - lea r10,[r12*1+r10] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,rdx,r9 - xor r13,rdi - rorx r14,rdx,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea r10,[r12*1+r10] - xor r13,r14 - mov rdi,r11 - vpxor ymm11,ymm11,ymm9 - rorx r12,r11,39 - lea r10,[r13*1+r10] - xor rdi,rax - vpaddq ymm4,ymm4,ymm11 - rorx r14,r11,34 - rorx r13,r11,28 - lea rcx,[r10*1+rcx] - vpaddq ymm10,ymm4,YMMWORD[rbp] - and r15,rdi - xor r14,r12 - xor r15,rax - xor r14,r13 - lea r10,[r15*1+r10] - mov r12,rdx - vmovdqa YMMWORD[rsp],ymm10 - vpalignr ymm8,ymm6,ymm5,8 - add r9,QWORD[((32+256))+rsp] - and r12,rcx - rorx r13,rcx,41 - vpalignr ymm11,ymm2,ymm1,8 - rorx r15,rcx,18 - lea r10,[r14*1+r10] - lea r9,[r12*1+r9] - vpsrlq ymm10,ymm8,1 - andn r12,rcx,r8 - xor r13,r15 - rorx r14,rcx,14 - vpaddq ymm5,ymm5,ymm11 - vpsrlq ymm11,ymm8,7 - lea 
r9,[r12*1+r9] - xor r13,r14 - mov r15,r10 - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,r10,39 - lea r9,[r13*1+r9] - xor r15,r11 - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,r10,34 - rorx r13,r10,28 - lea rbx,[r9*1+rbx] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,r11 - vpsrlq ymm11,ymm4,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea r9,[rdi*1+r9] - mov r12,rcx - vpsllq ymm10,ymm4,3 - vpaddq ymm5,ymm5,ymm8 - add r8,QWORD[((40+256))+rsp] - and r12,rbx - rorx r13,rbx,41 - vpsrlq ymm9,ymm4,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,rbx,18 - lea r9,[r14*1+r9] - lea r8,[r12*1+r8] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,rbx,rdx - xor r13,rdi - rorx r14,rbx,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea r8,[r12*1+r8] - xor r13,r14 - mov rdi,r9 - vpxor ymm11,ymm11,ymm9 - rorx r12,r9,39 - lea r8,[r13*1+r8] - xor rdi,r10 - vpaddq ymm5,ymm5,ymm11 - rorx r14,r9,34 - rorx r13,r9,28 - lea rax,[r8*1+rax] - vpaddq ymm10,ymm5,YMMWORD[32+rbp] - and r15,rdi - xor r14,r12 - xor r15,r10 - xor r14,r13 - lea r8,[r15*1+r8] - mov r12,rbx - vmovdqa YMMWORD[32+rsp],ymm10 - vpalignr ymm8,ymm7,ymm6,8 - add rdx,QWORD[((64+256))+rsp] - and r12,rax - rorx r13,rax,41 - vpalignr ymm11,ymm3,ymm2,8 - rorx r15,rax,18 - lea r8,[r14*1+r8] - lea rdx,[r12*1+rdx] - vpsrlq ymm10,ymm8,1 - andn r12,rax,rcx - xor r13,r15 - rorx r14,rax,14 - vpaddq ymm6,ymm6,ymm11 - vpsrlq ymm11,ymm8,7 - lea rdx,[r12*1+rdx] - xor r13,r14 - mov r15,r8 - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,r8,39 - lea rdx,[r13*1+rdx] - xor r15,r9 - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,r8,34 - rorx r13,r8,28 - lea r11,[rdx*1+r11] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,r9 - vpsrlq ymm11,ymm5,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea rdx,[rdi*1+rdx] - mov r12,rax - vpsllq ymm10,ymm5,3 - vpaddq ymm6,ymm6,ymm8 - add rcx,QWORD[((72+256))+rsp] - and r12,r11 - rorx r13,r11,41 - vpsrlq ymm9,ymm5,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,r11,18 - lea rdx,[r14*1+rdx] - lea rcx,[r12*1+rcx] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,r11,rbx - xor r13,rdi - rorx r14,r11,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea rcx,[r12*1+rcx] - xor r13,r14 - mov rdi,rdx - vpxor ymm11,ymm11,ymm9 - rorx r12,rdx,39 - lea rcx,[r13*1+rcx] - xor rdi,r8 - vpaddq ymm6,ymm6,ymm11 - rorx r14,rdx,34 - rorx r13,rdx,28 - lea r10,[rcx*1+r10] - vpaddq ymm10,ymm6,YMMWORD[64+rbp] - and r15,rdi - xor r14,r12 - xor r15,r8 - xor r14,r13 - lea rcx,[r15*1+rcx] - mov r12,r11 - vmovdqa YMMWORD[64+rsp],ymm10 - vpalignr ymm8,ymm0,ymm7,8 - add rbx,QWORD[((96+256))+rsp] - and r12,r10 - rorx r13,r10,41 - vpalignr ymm11,ymm4,ymm3,8 - rorx r15,r10,18 - lea rcx,[r14*1+rcx] - lea rbx,[r12*1+rbx] - vpsrlq ymm10,ymm8,1 - andn r12,r10,rax - xor r13,r15 - rorx r14,r10,14 - vpaddq ymm7,ymm7,ymm11 - vpsrlq ymm11,ymm8,7 - lea rbx,[r12*1+rbx] - xor r13,r14 - mov r15,rcx - vpsllq ymm9,ymm8,56 - vpxor ymm8,ymm11,ymm10 - rorx r12,rcx,39 - lea rbx,[r13*1+rbx] - xor r15,rdx - vpsrlq ymm10,ymm10,7 - vpxor ymm8,ymm8,ymm9 - rorx r14,rcx,34 - rorx r13,rcx,28 - lea r9,[rbx*1+r9] - vpsllq ymm9,ymm9,7 - vpxor ymm8,ymm8,ymm10 - and rdi,r15 - xor r14,r12 - xor rdi,rdx - vpsrlq ymm11,ymm6,6 - vpxor ymm8,ymm8,ymm9 - xor r14,r13 - lea rbx,[rdi*1+rbx] - mov r12,r10 - vpsllq ymm10,ymm6,3 - vpaddq ymm7,ymm7,ymm8 - add rax,QWORD[((104+256))+rsp] - and r12,r9 - rorx r13,r9,41 - vpsrlq ymm9,ymm6,19 - vpxor ymm11,ymm11,ymm10 - rorx rdi,r9,18 - lea rbx,[r14*1+rbx] - 
lea rax,[r12*1+rax] - vpsllq ymm10,ymm10,42 - vpxor ymm11,ymm11,ymm9 - andn r12,r9,r11 - xor r13,rdi - rorx r14,r9,14 - vpsrlq ymm9,ymm9,42 - vpxor ymm11,ymm11,ymm10 - lea rax,[r12*1+rax] - xor r13,r14 - mov rdi,rbx - vpxor ymm11,ymm11,ymm9 - rorx r12,rbx,39 - lea rax,[r13*1+rax] - xor rdi,rcx - vpaddq ymm7,ymm7,ymm11 - rorx r14,rbx,34 - rorx r13,rbx,28 - lea r8,[rax*1+r8] - vpaddq ymm10,ymm7,YMMWORD[96+rbp] - and r15,rdi - xor r14,r12 - xor r15,rcx - xor r14,r13 - lea rax,[r15*1+rax] - mov r12,r9 - vmovdqa YMMWORD[96+rsp],ymm10 - lea rbp,[256+rbp] - cmp BYTE[((-121))+rbp],0 - jne NEAR $L$avx2_00_47 - add r11,QWORD[((0+128))+rsp] - and r12,r8 - rorx r13,r8,41 - rorx r15,r8,18 - lea rax,[r14*1+rax] - lea r11,[r12*1+r11] - andn r12,r8,r10 - xor r13,r15 - rorx r14,r8,14 - lea r11,[r12*1+r11] - xor r13,r14 - mov r15,rax - rorx r12,rax,39 - lea r11,[r13*1+r11] - xor r15,rbx - rorx r14,rax,34 - rorx r13,rax,28 - lea rdx,[r11*1+rdx] - and rdi,r15 - xor r14,r12 - xor rdi,rbx - xor r14,r13 - lea r11,[rdi*1+r11] - mov r12,r8 - add r10,QWORD[((8+128))+rsp] - and r12,rdx - rorx r13,rdx,41 - rorx rdi,rdx,18 - lea r11,[r14*1+r11] - lea r10,[r12*1+r10] - andn r12,rdx,r9 - xor r13,rdi - rorx r14,rdx,14 - lea r10,[r12*1+r10] - xor r13,r14 - mov rdi,r11 - rorx r12,r11,39 - lea r10,[r13*1+r10] - xor rdi,rax - rorx r14,r11,34 - rorx r13,r11,28 - lea rcx,[r10*1+rcx] - and r15,rdi - xor r14,r12 - xor r15,rax - xor r14,r13 - lea r10,[r15*1+r10] - mov r12,rdx - add r9,QWORD[((32+128))+rsp] - and r12,rcx - rorx r13,rcx,41 - rorx r15,rcx,18 - lea r10,[r14*1+r10] - lea r9,[r12*1+r9] - andn r12,rcx,r8 - xor r13,r15 - rorx r14,rcx,14 - lea r9,[r12*1+r9] - xor r13,r14 - mov r15,r10 - rorx r12,r10,39 - lea r9,[r13*1+r9] - xor r15,r11 - rorx r14,r10,34 - rorx r13,r10,28 - lea rbx,[r9*1+rbx] - and rdi,r15 - xor r14,r12 - xor rdi,r11 - xor r14,r13 - lea r9,[rdi*1+r9] - mov r12,rcx - add r8,QWORD[((40+128))+rsp] - and r12,rbx - rorx r13,rbx,41 - rorx rdi,rbx,18 - lea r9,[r14*1+r9] - lea r8,[r12*1+r8] - andn r12,rbx,rdx - xor r13,rdi - rorx r14,rbx,14 - lea r8,[r12*1+r8] - xor r13,r14 - mov rdi,r9 - rorx r12,r9,39 - lea r8,[r13*1+r8] - xor rdi,r10 - rorx r14,r9,34 - rorx r13,r9,28 - lea rax,[r8*1+rax] - and r15,rdi - xor r14,r12 - xor r15,r10 - xor r14,r13 - lea r8,[r15*1+r8] - mov r12,rbx - add rdx,QWORD[((64+128))+rsp] - and r12,rax - rorx r13,rax,41 - rorx r15,rax,18 - lea r8,[r14*1+r8] - lea rdx,[r12*1+rdx] - andn r12,rax,rcx - xor r13,r15 - rorx r14,rax,14 - lea rdx,[r12*1+rdx] - xor r13,r14 - mov r15,r8 - rorx r12,r8,39 - lea rdx,[r13*1+rdx] - xor r15,r9 - rorx r14,r8,34 - rorx r13,r8,28 - lea r11,[rdx*1+r11] - and rdi,r15 - xor r14,r12 - xor rdi,r9 - xor r14,r13 - lea rdx,[rdi*1+rdx] - mov r12,rax - add rcx,QWORD[((72+128))+rsp] - and r12,r11 - rorx r13,r11,41 - rorx rdi,r11,18 - lea rdx,[r14*1+rdx] - lea rcx,[r12*1+rcx] - andn r12,r11,rbx - xor r13,rdi - rorx r14,r11,14 - lea rcx,[r12*1+rcx] - xor r13,r14 - mov rdi,rdx - rorx r12,rdx,39 - lea rcx,[r13*1+rcx] - xor rdi,r8 - rorx r14,rdx,34 - rorx r13,rdx,28 - lea r10,[rcx*1+r10] - and r15,rdi - xor r14,r12 - xor r15,r8 - xor r14,r13 - lea rcx,[r15*1+rcx] - mov r12,r11 - add rbx,QWORD[((96+128))+rsp] - and r12,r10 - rorx r13,r10,41 - rorx r15,r10,18 - lea rcx,[r14*1+rcx] - lea rbx,[r12*1+rbx] - andn r12,r10,rax - xor r13,r15 - rorx r14,r10,14 - lea rbx,[r12*1+rbx] - xor r13,r14 - mov r15,rcx - rorx r12,rcx,39 - lea rbx,[r13*1+rbx] - xor r15,rdx - rorx r14,rcx,34 - rorx r13,rcx,28 - lea r9,[rbx*1+r9] - and rdi,r15 - xor r14,r12 - xor rdi,rdx - xor r14,r13 - lea 
rbx,[rdi*1+rbx] - mov r12,r10 - add rax,QWORD[((104+128))+rsp] - and r12,r9 - rorx r13,r9,41 - rorx rdi,r9,18 - lea rbx,[r14*1+rbx] - lea rax,[r12*1+rax] - andn r12,r9,r11 - xor r13,rdi - rorx r14,r9,14 - lea rax,[r12*1+rax] - xor r13,r14 - mov rdi,rbx - rorx r12,rbx,39 - lea rax,[r13*1+rax] - xor rdi,rcx - rorx r14,rbx,34 - rorx r13,rbx,28 - lea r8,[rax*1+r8] - and r15,rdi - xor r14,r12 - xor r15,rcx - xor r14,r13 - lea rax,[r15*1+rax] - mov r12,r9 - add r11,QWORD[rsp] - and r12,r8 - rorx r13,r8,41 - rorx r15,r8,18 - lea rax,[r14*1+rax] - lea r11,[r12*1+r11] - andn r12,r8,r10 - xor r13,r15 - rorx r14,r8,14 - lea r11,[r12*1+r11] - xor r13,r14 - mov r15,rax - rorx r12,rax,39 - lea r11,[r13*1+r11] - xor r15,rbx - rorx r14,rax,34 - rorx r13,rax,28 - lea rdx,[r11*1+rdx] - and rdi,r15 - xor r14,r12 - xor rdi,rbx - xor r14,r13 - lea r11,[rdi*1+r11] - mov r12,r8 - add r10,QWORD[8+rsp] - and r12,rdx - rorx r13,rdx,41 - rorx rdi,rdx,18 - lea r11,[r14*1+r11] - lea r10,[r12*1+r10] - andn r12,rdx,r9 - xor r13,rdi - rorx r14,rdx,14 - lea r10,[r12*1+r10] - xor r13,r14 - mov rdi,r11 - rorx r12,r11,39 - lea r10,[r13*1+r10] - xor rdi,rax - rorx r14,r11,34 - rorx r13,r11,28 - lea rcx,[r10*1+rcx] - and r15,rdi - xor r14,r12 - xor r15,rax - xor r14,r13 - lea r10,[r15*1+r10] - mov r12,rdx - add r9,QWORD[32+rsp] - and r12,rcx - rorx r13,rcx,41 - rorx r15,rcx,18 - lea r10,[r14*1+r10] - lea r9,[r12*1+r9] - andn r12,rcx,r8 - xor r13,r15 - rorx r14,rcx,14 - lea r9,[r12*1+r9] - xor r13,r14 - mov r15,r10 - rorx r12,r10,39 - lea r9,[r13*1+r9] - xor r15,r11 - rorx r14,r10,34 - rorx r13,r10,28 - lea rbx,[r9*1+rbx] - and rdi,r15 - xor r14,r12 - xor rdi,r11 - xor r14,r13 - lea r9,[rdi*1+r9] - mov r12,rcx - add r8,QWORD[40+rsp] - and r12,rbx - rorx r13,rbx,41 - rorx rdi,rbx,18 - lea r9,[r14*1+r9] - lea r8,[r12*1+r8] - andn r12,rbx,rdx - xor r13,rdi - rorx r14,rbx,14 - lea r8,[r12*1+r8] - xor r13,r14 - mov rdi,r9 - rorx r12,r9,39 - lea r8,[r13*1+r8] - xor rdi,r10 - rorx r14,r9,34 - rorx r13,r9,28 - lea rax,[r8*1+rax] - and r15,rdi - xor r14,r12 - xor r15,r10 - xor r14,r13 - lea r8,[r15*1+r8] - mov r12,rbx - add rdx,QWORD[64+rsp] - and r12,rax - rorx r13,rax,41 - rorx r15,rax,18 - lea r8,[r14*1+r8] - lea rdx,[r12*1+rdx] - andn r12,rax,rcx - xor r13,r15 - rorx r14,rax,14 - lea rdx,[r12*1+rdx] - xor r13,r14 - mov r15,r8 - rorx r12,r8,39 - lea rdx,[r13*1+rdx] - xor r15,r9 - rorx r14,r8,34 - rorx r13,r8,28 - lea r11,[rdx*1+r11] - and rdi,r15 - xor r14,r12 - xor rdi,r9 - xor r14,r13 - lea rdx,[rdi*1+rdx] - mov r12,rax - add rcx,QWORD[72+rsp] - and r12,r11 - rorx r13,r11,41 - rorx rdi,r11,18 - lea rdx,[r14*1+rdx] - lea rcx,[r12*1+rcx] - andn r12,r11,rbx - xor r13,rdi - rorx r14,r11,14 - lea rcx,[r12*1+rcx] - xor r13,r14 - mov rdi,rdx - rorx r12,rdx,39 - lea rcx,[r13*1+rcx] - xor rdi,r8 - rorx r14,rdx,34 - rorx r13,rdx,28 - lea r10,[rcx*1+r10] - and r15,rdi - xor r14,r12 - xor r15,r8 - xor r14,r13 - lea rcx,[r15*1+rcx] - mov r12,r11 - add rbx,QWORD[96+rsp] - and r12,r10 - rorx r13,r10,41 - rorx r15,r10,18 - lea rcx,[r14*1+rcx] - lea rbx,[r12*1+rbx] - andn r12,r10,rax - xor r13,r15 - rorx r14,r10,14 - lea rbx,[r12*1+rbx] - xor r13,r14 - mov r15,rcx - rorx r12,rcx,39 - lea rbx,[r13*1+rbx] - xor r15,rdx - rorx r14,rcx,34 - rorx r13,rcx,28 - lea r9,[rbx*1+r9] - and rdi,r15 - xor r14,r12 - xor rdi,rdx - xor r14,r13 - lea rbx,[rdi*1+rbx] - mov r12,r10 - add rax,QWORD[104+rsp] - and r12,r9 - rorx r13,r9,41 - rorx rdi,r9,18 - lea rbx,[r14*1+rbx] - lea rax,[r12*1+rax] - andn r12,r9,r11 - xor r13,rdi - rorx r14,r9,14 - lea rax,[r12*1+rax] - 
xor r13,r14 - mov rdi,rbx - rorx r12,rbx,39 - lea rax,[r13*1+rax] - xor rdi,rcx - rorx r14,rbx,34 - rorx r13,rbx,28 - lea r8,[rax*1+r8] - and r15,rdi - xor r14,r12 - xor r15,rcx - xor r14,r13 - lea rax,[r15*1+rax] - mov r12,r9 - mov rdi,QWORD[1280+rsp] - add rax,r14 - - lea rbp,[1152+rsp] - - add rax,QWORD[rdi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - - cmp rsi,QWORD[144+rbp] - je NEAR $L$done_avx2 - - xor r14,r14 - mov rdi,rbx - xor rdi,rcx - mov r12,r9 - jmp NEAR $L$ower_avx2 -ALIGN 16 -$L$ower_avx2: - add r11,QWORD[((0+16))+rbp] - and r12,r8 - rorx r13,r8,41 - rorx r15,r8,18 - lea rax,[r14*1+rax] - lea r11,[r12*1+r11] - andn r12,r8,r10 - xor r13,r15 - rorx r14,r8,14 - lea r11,[r12*1+r11] - xor r13,r14 - mov r15,rax - rorx r12,rax,39 - lea r11,[r13*1+r11] - xor r15,rbx - rorx r14,rax,34 - rorx r13,rax,28 - lea rdx,[r11*1+rdx] - and rdi,r15 - xor r14,r12 - xor rdi,rbx - xor r14,r13 - lea r11,[rdi*1+r11] - mov r12,r8 - add r10,QWORD[((8+16))+rbp] - and r12,rdx - rorx r13,rdx,41 - rorx rdi,rdx,18 - lea r11,[r14*1+r11] - lea r10,[r12*1+r10] - andn r12,rdx,r9 - xor r13,rdi - rorx r14,rdx,14 - lea r10,[r12*1+r10] - xor r13,r14 - mov rdi,r11 - rorx r12,r11,39 - lea r10,[r13*1+r10] - xor rdi,rax - rorx r14,r11,34 - rorx r13,r11,28 - lea rcx,[r10*1+rcx] - and r15,rdi - xor r14,r12 - xor r15,rax - xor r14,r13 - lea r10,[r15*1+r10] - mov r12,rdx - add r9,QWORD[((32+16))+rbp] - and r12,rcx - rorx r13,rcx,41 - rorx r15,rcx,18 - lea r10,[r14*1+r10] - lea r9,[r12*1+r9] - andn r12,rcx,r8 - xor r13,r15 - rorx r14,rcx,14 - lea r9,[r12*1+r9] - xor r13,r14 - mov r15,r10 - rorx r12,r10,39 - lea r9,[r13*1+r9] - xor r15,r11 - rorx r14,r10,34 - rorx r13,r10,28 - lea rbx,[r9*1+rbx] - and rdi,r15 - xor r14,r12 - xor rdi,r11 - xor r14,r13 - lea r9,[rdi*1+r9] - mov r12,rcx - add r8,QWORD[((40+16))+rbp] - and r12,rbx - rorx r13,rbx,41 - rorx rdi,rbx,18 - lea r9,[r14*1+r9] - lea r8,[r12*1+r8] - andn r12,rbx,rdx - xor r13,rdi - rorx r14,rbx,14 - lea r8,[r12*1+r8] - xor r13,r14 - mov rdi,r9 - rorx r12,r9,39 - lea r8,[r13*1+r8] - xor rdi,r10 - rorx r14,r9,34 - rorx r13,r9,28 - lea rax,[r8*1+rax] - and r15,rdi - xor r14,r12 - xor r15,r10 - xor r14,r13 - lea r8,[r15*1+r8] - mov r12,rbx - add rdx,QWORD[((64+16))+rbp] - and r12,rax - rorx r13,rax,41 - rorx r15,rax,18 - lea r8,[r14*1+r8] - lea rdx,[r12*1+rdx] - andn r12,rax,rcx - xor r13,r15 - rorx r14,rax,14 - lea rdx,[r12*1+rdx] - xor r13,r14 - mov r15,r8 - rorx r12,r8,39 - lea rdx,[r13*1+rdx] - xor r15,r9 - rorx r14,r8,34 - rorx r13,r8,28 - lea r11,[rdx*1+r11] - and rdi,r15 - xor r14,r12 - xor rdi,r9 - xor r14,r13 - lea rdx,[rdi*1+rdx] - mov r12,rax - add rcx,QWORD[((72+16))+rbp] - and r12,r11 - rorx r13,r11,41 - rorx rdi,r11,18 - lea rdx,[r14*1+rdx] - lea rcx,[r12*1+rcx] - andn r12,r11,rbx - xor r13,rdi - rorx r14,r11,14 - lea rcx,[r12*1+rcx] - xor r13,r14 - mov rdi,rdx - rorx r12,rdx,39 - lea rcx,[r13*1+rcx] - xor rdi,r8 - rorx r14,rdx,34 - rorx r13,rdx,28 - lea r10,[rcx*1+r10] - and r15,rdi - xor r14,r12 - xor r15,r8 - xor r14,r13 - lea rcx,[r15*1+rcx] - mov r12,r11 - add rbx,QWORD[((96+16))+rbp] - and r12,r10 - rorx r13,r10,41 - rorx r15,r10,18 - lea rcx,[r14*1+rcx] - lea rbx,[r12*1+rbx] - andn r12,r10,rax - xor r13,r15 - rorx r14,r10,14 - lea rbx,[r12*1+rbx] 
- xor r13,r14 - mov r15,rcx - rorx r12,rcx,39 - lea rbx,[r13*1+rbx] - xor r15,rdx - rorx r14,rcx,34 - rorx r13,rcx,28 - lea r9,[rbx*1+r9] - and rdi,r15 - xor r14,r12 - xor rdi,rdx - xor r14,r13 - lea rbx,[rdi*1+rbx] - mov r12,r10 - add rax,QWORD[((104+16))+rbp] - and r12,r9 - rorx r13,r9,41 - rorx rdi,r9,18 - lea rbx,[r14*1+rbx] - lea rax,[r12*1+rax] - andn r12,r9,r11 - xor r13,rdi - rorx r14,r9,14 - lea rax,[r12*1+rax] - xor r13,r14 - mov rdi,rbx - rorx r12,rbx,39 - lea rax,[r13*1+rax] - xor rdi,rcx - rorx r14,rbx,34 - rorx r13,rbx,28 - lea r8,[rax*1+r8] - and r15,rdi - xor r14,r12 - xor r15,rcx - xor r14,r13 - lea rax,[r15*1+rax] - mov r12,r9 - lea rbp,[((-128))+rbp] - cmp rbp,rsp - jae NEAR $L$ower_avx2 - - mov rdi,QWORD[1280+rsp] - add rax,r14 - - lea rsp,[1152+rsp] - - - - add rax,QWORD[rdi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - lea rsi,[256+rsi] - add r10,QWORD[48+rdi] - mov r12,rsi - add r11,QWORD[56+rdi] - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - cmove r12,rsp - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - - jbe NEAR $L$oop_avx2 - lea rbp,[rsp] - - - - -$L$done_avx2: - mov rsi,QWORD[152+rbp] - - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rbp] - movaps xmm7,XMMWORD[((128+48))+rbp] - movaps xmm8,XMMWORD[((128+64))+rbp] - movaps xmm9,XMMWORD[((128+80))+rbp] - movaps xmm10,XMMWORD[((128+96))+rbp] - movaps xmm11,XMMWORD[((128+112))+rbp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_avx2: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha512_block_data_order_avx2: -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - lea r10,[$L$avx2_shortcut] - cmp rbx,r10 - jb NEAR $L$not_in_avx2 - - and rax,-256*8 - add rax,1152 -$L$not_in_avx2: - mov rsi,rax - mov rax,QWORD[((128+24))+rax] - - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - lea r10,[$L$epilogue] - cmp rbx,r10 - jb NEAR $L$in_prologue - - lea rsi,[((128+32))+rsi] - lea rdi,[512+r8] - mov ecx,12 - DD 0xa548f3fc - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop 
r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret section .pdata rdata align=4 ALIGN 4 DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase DD $L$SEH_end_sha512_block_data_order wrt ..imagebase DD $L$SEH_info_sha512_block_data_order wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_avx2 wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha512_block_data_order: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase -$L$SEH_info_sha512_block_data_order_xop: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase -$L$SEH_info_sha512_block_data_order_avx: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase -$L$SEH_info_sha512_block_data_order_avx2: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h index 437ede74d7a..8ed2c462485 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h @@ -109,6 +109,9 @@ extern "C" { # ifndef OPENSSL_NO_DGRAM # define OPENSSL_NO_DGRAM # endif +# ifndef OPENSSL_NO_DH +# define OPENSSL_NO_DH +# endif # ifndef OPENSSL_NO_DSA # define OPENSSL_NO_DSA # endif @@ -241,6 +244,9 @@ extern "C" { # ifndef OPENSSL_NO_SM2 # define OPENSSL_NO_SM2 # endif +# ifndef OPENSSL_NO_SM3 +# define OPENSSL_NO_SM3 +# endif # ifndef OPENSSL_NO_SM4 # define OPENSSL_NO_SM4 # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h index 018225780b3..867ad08006a 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h +++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h @@ -109,6 +109,9 @@ extern "C" { # ifndef OPENSSL_NO_DGRAM # define OPENSSL_NO_DGRAM # endif +# ifndef OPENSSL_NO_DH +# define OPENSSL_NO_DH +# endif # ifndef OPENSSL_NO_DSA # define OPENSSL_NO_DSA # endif @@ -250,6 +253,9 @@ extern "C" { # ifndef OPENSSL_NO_SM2 # define OPENSSL_NO_SM2 # endif +# ifndef OPENSSL_NO_SM3 +# define OPENSSL_NO_SM3 +# endif # ifndef OPENSSL_NO_SM4 # define OPENSSL_NO_SM4 # endif diff --git a/CryptoPkg/Library/OpensslLib/OpensslLib.inf b/CryptoPkg/Library/OpensslLib/OpensslLib.inf index b5e436a0168..cdb30e81ab0 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLib.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLib.inf @@ -181,20 +181,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c 
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -427,8 +413,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -543,7 +527,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -551,7 +534,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c @@ -565,7 +547,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf index 673dba23621..6315e6edb32 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf @@ -196,20 +196,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -441,8 +427,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c 
$(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -557,7 +541,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -565,7 +548,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c @@ -579,7 +561,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c @@ -829,20 +810,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -1073,8 +1040,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -1189,7 +1154,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -1197,7 +1161,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - 
$(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c @@ -1211,7 +1174,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf index 35162b90fe8..9f09af4ee9f 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf @@ -182,20 +182,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -428,8 +414,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -544,7 +528,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -552,7 +535,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c @@ -566,7 +548,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf 
b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf index 55c63429048..b821fa8f8c4 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf @@ -186,20 +186,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -470,8 +456,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -586,7 +570,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -594,7 +577,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c @@ -610,7 +592,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf index 3e3efa13d79..106edab99e2 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf +++ b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf @@ -202,20 +202,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - 
$(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -489,8 +475,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -605,7 +589,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -613,7 +596,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c @@ -629,7 +611,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c @@ -888,20 +869,6 @@ $(OPENSSL_PATH)/crypto/conf/conf_mod.c $(OPENSSL_PATH)/crypto/conf/conf_sap.c $(OPENSSL_PATH)/crypto/conf/conf_ssl.c - $(OPENSSL_PATH)/crypto/dh/dh_ameth.c - $(OPENSSL_PATH)/crypto/dh/dh_asn1.c - $(OPENSSL_PATH)/crypto/dh/dh_backend.c - $(OPENSSL_PATH)/crypto/dh/dh_check.c - $(OPENSSL_PATH)/crypto/dh/dh_err.c - $(OPENSSL_PATH)/crypto/dh/dh_gen.c - $(OPENSSL_PATH)/crypto/dh/dh_group_params.c - $(OPENSSL_PATH)/crypto/dh/dh_kdf.c - $(OPENSSL_PATH)/crypto/dh/dh_key.c - $(OPENSSL_PATH)/crypto/dh/dh_lib.c - $(OPENSSL_PATH)/crypto/dh/dh_meth.c - $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c - $(OPENSSL_PATH)/crypto/dh/dh_prn.c - $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c $(OPENSSL_PATH)/crypto/dso/dso_dl.c $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c $(OPENSSL_PATH)/crypto/dso/dso_err.c @@ -1170,8 +1137,6 @@ $(OPENSSL_PATH)/crypto/sha/sha256.c $(OPENSSL_PATH)/crypto/sha/sha3.c $(OPENSSL_PATH)/crypto/sha/sha512.c - $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c - $(OPENSSL_PATH)/crypto/sm3/sm3.c $(OPENSSL_PATH)/crypto/stack/stack.c $(OPENSSL_PATH)/crypto/txt_db/txt_db.c $(OPENSSL_PATH)/crypto/ui/ui_err.c @@ -1286,7 +1251,6 @@ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c - $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c 
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c @@ -1294,7 +1258,6 @@ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c - $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c @@ -1310,7 +1273,6 @@ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c - $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf new file mode 100644 index 00000000000..07ccf0b6eb3 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf @@ -0,0 +1,1473 @@ +## @file +# This module provides OpenSSL Library implementation with ECC and TLS +# features along with performance optimized implementations of SHA1, +# SHA256, SHA512, AESNI, VPAES, and GHASH for IA32 and X64. +# +# This library should be used if a module needs ECC in TLS, or +# asymmetric cryptography services such as X509 certificate or PEM format +# data processing. This library increases the size overhead by up to ~115 KB +# compared to the OpensslLibAccel.inf library instance. +# +# Copyright (c) 2010 - 2020, Intel Corporation. All rights reserved.
+# (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+# SPDX-License-Identifier: BSD-2-Clause-Patent +# +## + +[Defines] + INF_VERSION = 0x00010005 + BASE_NAME = OpensslLibFullAccel + MODULE_UNI_FILE = OpensslLibFullAccel.uni + FILE_GUID = AC649FB2-ADCF-450A-9C61-ED3CAFF12864 + MODULE_TYPE = BASE + VERSION_STRING = 1.0 + LIBRARY_CLASS = OpensslLib + CONSTRUCTOR = OpensslLibConstructor + + DEFINE OPENSSL_PATH = openssl + DEFINE OPENSSL_GEN_PATH = OpensslGen + DEFINE OPENSSL_FLAGS = -DL_ENDIAN -DOPENSSL_SMALL_FOOTPRINT -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE + DEFINE OPENSSL_FLAGS_IA32 = -DAES_ASM -DGHASH_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM + DEFINE OPENSSL_FLAGS_X64 = -DAES_ASM -DBSAES_ASM -DGHASH_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM + +# +# VALID_ARCHITECTURES = IA32 X64 +# + +[Sources] + OpensslLibConstructor.c + $(OPENSSL_PATH)/e_os.h + $(OPENSSL_PATH)/ms/uplink.h + $(OPENSSL_PATH)/crypto/bn/bn_asm.c +# Autogenerated files list starts here +# Autogenerated files list ends here + buildinf.h + buildinf.c + OpensslStub/ossl_store.c + OpensslStub/rand_pool.c +# OpensslStub/SslNull.c +# OpensslStub/EcSm2Null.c + OpensslStub/uefiprov.c + OpensslStub/EncoderNull.c + OpensslStub/SslStatServNull.c + OpensslStub/SslExtServNull.c + OpensslStub/Pkcs12Null.c + OpensslStub/CipherNull.c + +[Sources.IA32] +# Autogenerated files list starts here + #$(OPENSSL_PATH)/crypto/aes/aes_cfb.c + #$(OPENSSL_PATH)/crypto/aes/aes_ecb.c + #$(OPENSSL_PATH)/crypto/aes/aes_ige.c + #$(OPENSSL_PATH)/crypto/aes/aes_misc.c + #$(OPENSSL_PATH)/crypto/aes/aes_ofb.c + #$(OPENSSL_PATH)/crypto/aes/aes_wrap.c + $(OPENSSL_PATH)/crypto/asn1/a_bitstr.c + $(OPENSSL_PATH)/crypto/asn1/a_d2i_fp.c + $(OPENSSL_PATH)/crypto/asn1/a_digest.c + $(OPENSSL_PATH)/crypto/asn1/a_dup.c + $(OPENSSL_PATH)/crypto/asn1/a_gentm.c + $(OPENSSL_PATH)/crypto/asn1/a_i2d_fp.c + $(OPENSSL_PATH)/crypto/asn1/a_int.c + $(OPENSSL_PATH)/crypto/asn1/a_mbstr.c + $(OPENSSL_PATH)/crypto/asn1/a_object.c + $(OPENSSL_PATH)/crypto/asn1/a_octet.c + $(OPENSSL_PATH)/crypto/asn1/a_print.c + $(OPENSSL_PATH)/crypto/asn1/a_sign.c + $(OPENSSL_PATH)/crypto/asn1/a_strex.c + $(OPENSSL_PATH)/crypto/asn1/a_strnid.c + $(OPENSSL_PATH)/crypto/asn1/a_time.c + $(OPENSSL_PATH)/crypto/asn1/a_type.c + $(OPENSSL_PATH)/crypto/asn1/a_utctm.c + $(OPENSSL_PATH)/crypto/asn1/a_utf8.c + $(OPENSSL_PATH)/crypto/asn1/a_verify.c + $(OPENSSL_PATH)/crypto/asn1/ameth_lib.c + $(OPENSSL_PATH)/crypto/asn1/asn1_err.c + $(OPENSSL_PATH)/crypto/asn1/asn1_gen.c + $(OPENSSL_PATH)/crypto/asn1/asn1_item_list.c + $(OPENSSL_PATH)/crypto/asn1/asn1_lib.c + $(OPENSSL_PATH)/crypto/asn1/asn1_parse.c + $(OPENSSL_PATH)/crypto/asn1/asn_mime.c + $(OPENSSL_PATH)/crypto/asn1/asn_moid.c + $(OPENSSL_PATH)/crypto/asn1/asn_mstbl.c + $(OPENSSL_PATH)/crypto/asn1/asn_pack.c + $(OPENSSL_PATH)/crypto/asn1/bio_asn1.c + $(OPENSSL_PATH)/crypto/asn1/bio_ndef.c + $(OPENSSL_PATH)/crypto/asn1/d2i_param.c + $(OPENSSL_PATH)/crypto/asn1/d2i_pr.c + $(OPENSSL_PATH)/crypto/asn1/d2i_pu.c + $(OPENSSL_PATH)/crypto/asn1/evp_asn1.c + $(OPENSSL_PATH)/crypto/asn1/f_int.c + $(OPENSSL_PATH)/crypto/asn1/f_string.c + $(OPENSSL_PATH)/crypto/asn1/i2d_evp.c + $(OPENSSL_PATH)/crypto/asn1/nsseq.c + $(OPENSSL_PATH)/crypto/asn1/p5_pbe.c + $(OPENSSL_PATH)/crypto/asn1/p5_pbev2.c + $(OPENSSL_PATH)/crypto/asn1/p5_scrypt.c + $(OPENSSL_PATH)/crypto/asn1/p8_pkey.c + $(OPENSSL_PATH)/crypto/asn1/t_bitst.c + $(OPENSSL_PATH)/crypto/asn1/t_pkey.c + $(OPENSSL_PATH)/crypto/asn1/t_spki.c + 
$(OPENSSL_PATH)/crypto/asn1/tasn_dec.c + $(OPENSSL_PATH)/crypto/asn1/tasn_enc.c + $(OPENSSL_PATH)/crypto/asn1/tasn_fre.c + $(OPENSSL_PATH)/crypto/asn1/tasn_new.c + $(OPENSSL_PATH)/crypto/asn1/tasn_prn.c + $(OPENSSL_PATH)/crypto/asn1/tasn_scn.c + $(OPENSSL_PATH)/crypto/asn1/tasn_typ.c + $(OPENSSL_PATH)/crypto/asn1/tasn_utl.c + $(OPENSSL_PATH)/crypto/asn1/x_algor.c + $(OPENSSL_PATH)/crypto/asn1/x_bignum.c + $(OPENSSL_PATH)/crypto/asn1/x_info.c + $(OPENSSL_PATH)/crypto/asn1/x_int64.c + $(OPENSSL_PATH)/crypto/asn1/x_long.c + $(OPENSSL_PATH)/crypto/asn1/x_pkey.c + $(OPENSSL_PATH)/crypto/asn1/x_sig.c + $(OPENSSL_PATH)/crypto/asn1/x_spki.c + $(OPENSSL_PATH)/crypto/asn1/x_val.c + $(OPENSSL_PATH)/crypto/async/arch/async_null.c + $(OPENSSL_PATH)/crypto/async/arch/async_posix.c + $(OPENSSL_PATH)/crypto/async/arch/async_win.c + $(OPENSSL_PATH)/crypto/async/async.c + $(OPENSSL_PATH)/crypto/async/async_err.c + $(OPENSSL_PATH)/crypto/async/async_wait.c + $(OPENSSL_PATH)/crypto/bio/bf_buff.c + $(OPENSSL_PATH)/crypto/bio/bf_lbuf.c + $(OPENSSL_PATH)/crypto/bio/bf_nbio.c + $(OPENSSL_PATH)/crypto/bio/bf_null.c + $(OPENSSL_PATH)/crypto/bio/bf_prefix.c + $(OPENSSL_PATH)/crypto/bio/bf_readbuff.c + $(OPENSSL_PATH)/crypto/bio/bio_addr.c + $(OPENSSL_PATH)/crypto/bio/bio_cb.c + $(OPENSSL_PATH)/crypto/bio/bio_dump.c + $(OPENSSL_PATH)/crypto/bio/bio_err.c + $(OPENSSL_PATH)/crypto/bio/bio_lib.c + $(OPENSSL_PATH)/crypto/bio/bio_meth.c + $(OPENSSL_PATH)/crypto/bio/bio_print.c + $(OPENSSL_PATH)/crypto/bio/bio_sock.c + $(OPENSSL_PATH)/crypto/bio/bio_sock2.c + $(OPENSSL_PATH)/crypto/bio/bss_acpt.c + $(OPENSSL_PATH)/crypto/bio/bss_bio.c + $(OPENSSL_PATH)/crypto/bio/bss_conn.c + $(OPENSSL_PATH)/crypto/bio/bss_core.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_fd.c + $(OPENSSL_PATH)/crypto/bio/bss_file.c + $(OPENSSL_PATH)/crypto/bio/bss_log.c + $(OPENSSL_PATH)/crypto/bio/bss_mem.c + $(OPENSSL_PATH)/crypto/bio/bss_null.c + $(OPENSSL_PATH)/crypto/bio/bss_sock.c + $(OPENSSL_PATH)/crypto/bio/ossl_core_bio.c + $(OPENSSL_PATH)/crypto/bn/bn_add.c + $(OPENSSL_PATH)/crypto/bn/bn_blind.c + $(OPENSSL_PATH)/crypto/bn/bn_const.c + $(OPENSSL_PATH)/crypto/bn/bn_conv.c + $(OPENSSL_PATH)/crypto/bn/bn_ctx.c + $(OPENSSL_PATH)/crypto/bn/bn_dh.c + $(OPENSSL_PATH)/crypto/bn/bn_div.c + $(OPENSSL_PATH)/crypto/bn/bn_err.c + $(OPENSSL_PATH)/crypto/bn/bn_exp.c + $(OPENSSL_PATH)/crypto/bn/bn_exp2.c + $(OPENSSL_PATH)/crypto/bn/bn_gcd.c + $(OPENSSL_PATH)/crypto/bn/bn_gf2m.c + $(OPENSSL_PATH)/crypto/bn/bn_intern.c + $(OPENSSL_PATH)/crypto/bn/bn_kron.c + $(OPENSSL_PATH)/crypto/bn/bn_lib.c + $(OPENSSL_PATH)/crypto/bn/bn_mod.c + $(OPENSSL_PATH)/crypto/bn/bn_mont.c + $(OPENSSL_PATH)/crypto/bn/bn_mpi.c + $(OPENSSL_PATH)/crypto/bn/bn_mul.c + $(OPENSSL_PATH)/crypto/bn/bn_nist.c + $(OPENSSL_PATH)/crypto/bn/bn_prime.c + $(OPENSSL_PATH)/crypto/bn/bn_print.c + $(OPENSSL_PATH)/crypto/bn/bn_rand.c + $(OPENSSL_PATH)/crypto/bn/bn_recp.c + $(OPENSSL_PATH)/crypto/bn/bn_rsa_fips186_4.c + $(OPENSSL_PATH)/crypto/bn/bn_shift.c + $(OPENSSL_PATH)/crypto/bn/bn_sqr.c + $(OPENSSL_PATH)/crypto/bn/bn_sqrt.c + $(OPENSSL_PATH)/crypto/bn/bn_srp.c + $(OPENSSL_PATH)/crypto/bn/bn_word.c + $(OPENSSL_PATH)/crypto/bn/bn_x931p.c + $(OPENSSL_PATH)/crypto/buffer/buf_err.c + $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/comp_err.c + $(OPENSSL_PATH)/crypto/comp/comp_lib.c + $(OPENSSL_PATH)/crypto/conf/conf_api.c + $(OPENSSL_PATH)/crypto/conf/conf_def.c + $(OPENSSL_PATH)/crypto/conf/conf_err.c + 
$(OPENSSL_PATH)/crypto/conf/conf_lib.c + $(OPENSSL_PATH)/crypto/conf/conf_mall.c + $(OPENSSL_PATH)/crypto/conf/conf_mod.c + $(OPENSSL_PATH)/crypto/conf/conf_sap.c + $(OPENSSL_PATH)/crypto/conf/conf_ssl.c + $(OPENSSL_PATH)/crypto/dso/dso_dl.c + $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c + $(OPENSSL_PATH)/crypto/dso/dso_err.c + $(OPENSSL_PATH)/crypto/dso/dso_lib.c + $(OPENSSL_PATH)/crypto/dso/dso_openssl.c + $(OPENSSL_PATH)/crypto/dso/dso_vms.c + $(OPENSSL_PATH)/crypto/dso/dso_win32.c + $(OPENSSL_PATH)/crypto/ec/curve448/arch_32/f_impl32.c + $(OPENSSL_PATH)/crypto/ec/curve448/arch_64/f_impl64.c + $(OPENSSL_PATH)/crypto/ec/curve448/curve448.c + $(OPENSSL_PATH)/crypto/ec/curve448/curve448_tables.c + $(OPENSSL_PATH)/crypto/ec/curve448/eddsa.c + $(OPENSSL_PATH)/crypto/ec/curve448/f_generic.c + $(OPENSSL_PATH)/crypto/ec/curve448/scalar.c + $(OPENSSL_PATH)/crypto/ec/curve25519.c + $(OPENSSL_PATH)/crypto/ec/ec2_oct.c + $(OPENSSL_PATH)/crypto/ec/ec2_smpl.c + $(OPENSSL_PATH)/crypto/ec/ec_ameth.c + $(OPENSSL_PATH)/crypto/ec/ec_asn1.c + $(OPENSSL_PATH)/crypto/ec/ec_backend.c + $(OPENSSL_PATH)/crypto/ec/ec_check.c + $(OPENSSL_PATH)/crypto/ec/ec_curve.c + $(OPENSSL_PATH)/crypto/ec/ec_cvt.c + $(OPENSSL_PATH)/crypto/ec/ec_deprecated.c + $(OPENSSL_PATH)/crypto/ec/ec_err.c + $(OPENSSL_PATH)/crypto/ec/ec_key.c + $(OPENSSL_PATH)/crypto/ec/ec_kmeth.c + $(OPENSSL_PATH)/crypto/ec/ec_lib.c + $(OPENSSL_PATH)/crypto/ec/ec_mult.c + $(OPENSSL_PATH)/crypto/ec/ec_oct.c + $(OPENSSL_PATH)/crypto/ec/ec_pmeth.c + $(OPENSSL_PATH)/crypto/ec/ec_print.c + $(OPENSSL_PATH)/crypto/ec/ecdh_kdf.c + $(OPENSSL_PATH)/crypto/ec/ecdh_ossl.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_ossl.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_sign.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_vrf.c + $(OPENSSL_PATH)/crypto/ec/eck_prn.c + $(OPENSSL_PATH)/crypto/ec/ecp_mont.c + $(OPENSSL_PATH)/crypto/ec/ecp_nist.c + $(OPENSSL_PATH)/crypto/ec/ecp_oct.c + $(OPENSSL_PATH)/crypto/ec/ecp_smpl.c + $(OPENSSL_PATH)/crypto/ec/ecx_backend.c + $(OPENSSL_PATH)/crypto/ec/ecx_key.c + $(OPENSSL_PATH)/crypto/ec/ecx_meth.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_err.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_lib.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_meth.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_pkey.c + $(OPENSSL_PATH)/crypto/err/err.c + $(OPENSSL_PATH)/crypto/err/err_all.c + $(OPENSSL_PATH)/crypto/err/err_all_legacy.c + $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/ess/ess_asn1.c + $(OPENSSL_PATH)/crypto/ess/ess_err.c + $(OPENSSL_PATH)/crypto/ess/ess_lib.c + $(OPENSSL_PATH)/crypto/evp/asymcipher.c + $(OPENSSL_PATH)/crypto/evp/bio_b64.c + $(OPENSSL_PATH)/crypto/evp/bio_enc.c + $(OPENSSL_PATH)/crypto/evp/bio_md.c + $(OPENSSL_PATH)/crypto/evp/bio_ok.c + #$(OPENSSL_PATH)/crypto/evp/c_allc.c + $(OPENSSL_PATH)/crypto/evp/c_alld.c + $(OPENSSL_PATH)/crypto/evp/cmeth_lib.c + $(OPENSSL_PATH)/crypto/evp/ctrl_params_translate.c + $(OPENSSL_PATH)/crypto/evp/dh_ctrl.c + $(OPENSSL_PATH)/crypto/evp/dh_support.c + $(OPENSSL_PATH)/crypto/evp/digest.c + $(OPENSSL_PATH)/crypto/evp/dsa_ctrl.c + #$(OPENSSL_PATH)/crypto/evp/e_aes.c + #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha1.c + #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha256.c + $(OPENSSL_PATH)/crypto/evp/e_aria.c + $(OPENSSL_PATH)/crypto/evp/e_bf.c + $(OPENSSL_PATH)/crypto/evp/e_cast.c + $(OPENSSL_PATH)/crypto/evp/e_chacha20_poly1305.c + $(OPENSSL_PATH)/crypto/evp/e_des.c + $(OPENSSL_PATH)/crypto/evp/e_des3.c + $(OPENSSL_PATH)/crypto/evp/e_idea.c + 
$(OPENSSL_PATH)/crypto/evp/e_null.c + $(OPENSSL_PATH)/crypto/evp/e_rc2.c + $(OPENSSL_PATH)/crypto/evp/e_rc4.c + $(OPENSSL_PATH)/crypto/evp/e_rc4_hmac_md5.c + $(OPENSSL_PATH)/crypto/evp/e_rc5.c + $(OPENSSL_PATH)/crypto/evp/e_sm4.c + $(OPENSSL_PATH)/crypto/evp/e_xcbc_d.c + $(OPENSSL_PATH)/crypto/evp/ec_ctrl.c + $(OPENSSL_PATH)/crypto/evp/ec_support.c + $(OPENSSL_PATH)/crypto/evp/encode.c + $(OPENSSL_PATH)/crypto/evp/evp_cnf.c + $(OPENSSL_PATH)/crypto/evp/evp_enc.c + $(OPENSSL_PATH)/crypto/evp/evp_err.c + $(OPENSSL_PATH)/crypto/evp/evp_fetch.c + $(OPENSSL_PATH)/crypto/evp/evp_key.c + $(OPENSSL_PATH)/crypto/evp/evp_lib.c + $(OPENSSL_PATH)/crypto/evp/evp_pbe.c + $(OPENSSL_PATH)/crypto/evp/evp_pkey.c + $(OPENSSL_PATH)/crypto/evp/evp_rand.c + $(OPENSSL_PATH)/crypto/evp/evp_utils.c + $(OPENSSL_PATH)/crypto/evp/exchange.c + $(OPENSSL_PATH)/crypto/evp/kdf_lib.c + $(OPENSSL_PATH)/crypto/evp/kdf_meth.c + $(OPENSSL_PATH)/crypto/evp/kem.c + $(OPENSSL_PATH)/crypto/evp/keymgmt_lib.c + $(OPENSSL_PATH)/crypto/evp/keymgmt_meth.c + $(OPENSSL_PATH)/crypto/evp/legacy_md5.c + $(OPENSSL_PATH)/crypto/evp/legacy_md5_sha1.c + $(OPENSSL_PATH)/crypto/evp/legacy_sha.c + $(OPENSSL_PATH)/crypto/evp/m_null.c + $(OPENSSL_PATH)/crypto/evp/m_sigver.c + $(OPENSSL_PATH)/crypto/evp/mac_lib.c + $(OPENSSL_PATH)/crypto/evp/mac_meth.c + $(OPENSSL_PATH)/crypto/evp/names.c + $(OPENSSL_PATH)/crypto/evp/p5_crpt.c + $(OPENSSL_PATH)/crypto/evp/p5_crpt2.c + $(OPENSSL_PATH)/crypto/evp/p_dec.c + $(OPENSSL_PATH)/crypto/evp/p_enc.c + $(OPENSSL_PATH)/crypto/evp/p_legacy.c + $(OPENSSL_PATH)/crypto/evp/p_lib.c + $(OPENSSL_PATH)/crypto/evp/p_open.c + $(OPENSSL_PATH)/crypto/evp/p_seal.c + $(OPENSSL_PATH)/crypto/evp/p_sign.c + $(OPENSSL_PATH)/crypto/evp/p_verify.c + $(OPENSSL_PATH)/crypto/evp/pbe_scrypt.c + $(OPENSSL_PATH)/crypto/evp/pmeth_check.c + $(OPENSSL_PATH)/crypto/evp/pmeth_gn.c + $(OPENSSL_PATH)/crypto/evp/pmeth_lib.c + $(OPENSSL_PATH)/crypto/evp/signature.c + $(OPENSSL_PATH)/crypto/ffc/ffc_backend.c + $(OPENSSL_PATH)/crypto/ffc/ffc_dh.c + $(OPENSSL_PATH)/crypto/ffc/ffc_key_generate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_key_validate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/http/http_client.c + $(OPENSSL_PATH)/crypto/http/http_err.c + $(OPENSSL_PATH)/crypto/http/http_lib.c + $(OPENSSL_PATH)/crypto/kdf/kdf_err.c + $(OPENSSL_PATH)/crypto/lhash/lh_stats.c + $(OPENSSL_PATH)/crypto/lhash/lhash.c + $(OPENSSL_PATH)/crypto/asn1_dsa.c + $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/context.c + $(OPENSSL_PATH)/crypto/core_algorithm.c + $(OPENSSL_PATH)/crypto/core_fetch.c + $(OPENSSL_PATH)/crypto/core_namemap.c + $(OPENSSL_PATH)/crypto/cpt_err.c + $(OPENSSL_PATH)/crypto/cpuid.c + $(OPENSSL_PATH)/crypto/cryptlib.c + $(OPENSSL_PATH)/crypto/ctype.c + $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/ebcdic.c + $(OPENSSL_PATH)/crypto/ex_data.c + $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/info.c + $(OPENSSL_PATH)/crypto/init.c + $(OPENSSL_PATH)/crypto/initthread.c + $(OPENSSL_PATH)/crypto/mem.c + $(OPENSSL_PATH)/crypto/mem_sec.c + $(OPENSSL_PATH)/crypto/o_dir.c + $(OPENSSL_PATH)/crypto/o_fopen.c + $(OPENSSL_PATH)/crypto/o_init.c + $(OPENSSL_PATH)/crypto/o_str.c + $(OPENSSL_PATH)/crypto/o_time.c + $(OPENSSL_PATH)/crypto/packet.c + $(OPENSSL_PATH)/crypto/param_build.c + $(OPENSSL_PATH)/crypto/param_build_set.c + 
$(OPENSSL_PATH)/crypto/params.c + $(OPENSSL_PATH)/crypto/params_dup.c + $(OPENSSL_PATH)/crypto/params_from_text.c + $(OPENSSL_PATH)/crypto/passphrase.c + $(OPENSSL_PATH)/crypto/provider.c + $(OPENSSL_PATH)/crypto/provider_child.c + $(OPENSSL_PATH)/crypto/provider_conf.c + $(OPENSSL_PATH)/crypto/provider_core.c + $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sparse_array.c + $(OPENSSL_PATH)/crypto/threads_lib.c + $(OPENSSL_PATH)/crypto/threads_none.c + $(OPENSSL_PATH)/crypto/threads_pthread.c + $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/trace.c + $(OPENSSL_PATH)/crypto/uid.c + $(OPENSSL_PATH)/crypto/md5/md5_dgst.c + $(OPENSSL_PATH)/crypto/md5/md5_one.c + $(OPENSSL_PATH)/crypto/md5/md5_sha1.c + $(OPENSSL_PATH)/crypto/modes/cbc128.c + $(OPENSSL_PATH)/crypto/modes/ccm128.c + $(OPENSSL_PATH)/crypto/modes/cfb128.c + $(OPENSSL_PATH)/crypto/modes/ctr128.c + $(OPENSSL_PATH)/crypto/modes/cts128.c + $(OPENSSL_PATH)/crypto/modes/gcm128.c + $(OPENSSL_PATH)/crypto/modes/ocb128.c + $(OPENSSL_PATH)/crypto/modes/ofb128.c + $(OPENSSL_PATH)/crypto/modes/siv128.c + $(OPENSSL_PATH)/crypto/modes/wrap128.c + $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/objects/o_names.c + $(OPENSSL_PATH)/crypto/objects/obj_dat.c + $(OPENSSL_PATH)/crypto/objects/obj_err.c + $(OPENSSL_PATH)/crypto/objects/obj_lib.c + $(OPENSSL_PATH)/crypto/objects/obj_xref.c + $(OPENSSL_PATH)/crypto/pem/pem_all.c + $(OPENSSL_PATH)/crypto/pem/pem_err.c + $(OPENSSL_PATH)/crypto/pem/pem_info.c + $(OPENSSL_PATH)/crypto/pem/pem_lib.c + $(OPENSSL_PATH)/crypto/pem/pem_oth.c + $(OPENSSL_PATH)/crypto/pem/pem_pk8.c + $(OPENSSL_PATH)/crypto/pem/pem_pkey.c + $(OPENSSL_PATH)/crypto/pem/pem_sign.c + $(OPENSSL_PATH)/crypto/pem/pem_x509.c + $(OPENSSL_PATH)/crypto/pem/pem_xaux.c + $(OPENSSL_PATH)/crypto/pem/pvkfmt.c + $(OPENSSL_PATH)/crypto/pkcs7/bio_pk7.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_asn1.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_attr.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_doit.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_lib.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_mime.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_smime.c + $(OPENSSL_PATH)/crypto/pkcs7/pkcs7err.c + $(OPENSSL_PATH)/crypto/property/defn_cache.c + $(OPENSSL_PATH)/crypto/property/property.c + $(OPENSSL_PATH)/crypto/property/property_err.c + $(OPENSSL_PATH)/crypto/property/property_parse.c + $(OPENSSL_PATH)/crypto/property/property_query.c + $(OPENSSL_PATH)/crypto/property/property_string.c + $(OPENSSL_PATH)/crypto/rand/prov_seed.c + $(OPENSSL_PATH)/crypto/rand/rand_deprecated.c + $(OPENSSL_PATH)/crypto/rand/rand_err.c + $(OPENSSL_PATH)/crypto/rand/rand_lib.c + $(OPENSSL_PATH)/crypto/rand/rand_meth.c + $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c + $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c + $(OPENSSL_PATH)/crypto/rsa/rsa_chk.c + $(OPENSSL_PATH)/crypto/rsa/rsa_crpt.c + $(OPENSSL_PATH)/crypto/rsa/rsa_err.c + $(OPENSSL_PATH)/crypto/rsa/rsa_gen.c + $(OPENSSL_PATH)/crypto/rsa/rsa_lib.c + $(OPENSSL_PATH)/crypto/rsa/rsa_meth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_mp.c + $(OPENSSL_PATH)/crypto/rsa/rsa_mp_names.c + $(OPENSSL_PATH)/crypto/rsa/rsa_none.c + $(OPENSSL_PATH)/crypto/rsa/rsa_oaep.c + $(OPENSSL_PATH)/crypto/rsa/rsa_ossl.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pk1.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pmeth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_prn.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pss.c + $(OPENSSL_PATH)/crypto/rsa/rsa_saos.c + $(OPENSSL_PATH)/crypto/rsa/rsa_schemes.c + 
$(OPENSSL_PATH)/crypto/rsa/rsa_sign.c + $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_check.c + $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_gen.c + $(OPENSSL_PATH)/crypto/rsa/rsa_x931.c + $(OPENSSL_PATH)/crypto/rsa/rsa_x931g.c + $(OPENSSL_PATH)/crypto/sha/keccak1600.c + $(OPENSSL_PATH)/crypto/sha/sha1_one.c + $(OPENSSL_PATH)/crypto/sha/sha1dgst.c + $(OPENSSL_PATH)/crypto/sha/sha256.c + $(OPENSSL_PATH)/crypto/sha/sha3.c + $(OPENSSL_PATH)/crypto/sha/sha512.c + $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/txt_db/txt_db.c + $(OPENSSL_PATH)/crypto/ui/ui_err.c + $(OPENSSL_PATH)/crypto/ui/ui_lib.c + $(OPENSSL_PATH)/crypto/ui/ui_null.c + $(OPENSSL_PATH)/crypto/ui/ui_openssl.c + $(OPENSSL_PATH)/crypto/ui/ui_util.c + $(OPENSSL_PATH)/crypto/x509/by_dir.c + $(OPENSSL_PATH)/crypto/x509/by_file.c + $(OPENSSL_PATH)/crypto/x509/by_store.c + $(OPENSSL_PATH)/crypto/x509/pcy_cache.c + $(OPENSSL_PATH)/crypto/x509/pcy_data.c + $(OPENSSL_PATH)/crypto/x509/pcy_lib.c + $(OPENSSL_PATH)/crypto/x509/pcy_map.c + $(OPENSSL_PATH)/crypto/x509/pcy_node.c + $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_crl.c + $(OPENSSL_PATH)/crypto/x509/t_req.c + $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_addr.c + $(OPENSSL_PATH)/crypto/x509/v3_admis.c + $(OPENSSL_PATH)/crypto/x509/v3_akeya.c + $(OPENSSL_PATH)/crypto/x509/v3_akid.c + $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_bcons.c + $(OPENSSL_PATH)/crypto/x509/v3_bitst.c + $(OPENSSL_PATH)/crypto/x509/v3_conf.c + $(OPENSSL_PATH)/crypto/x509/v3_cpols.c + $(OPENSSL_PATH)/crypto/x509/v3_crld.c + $(OPENSSL_PATH)/crypto/x509/v3_enum.c + $(OPENSSL_PATH)/crypto/x509/v3_extku.c + $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_info.c + $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_ist.c + $(OPENSSL_PATH)/crypto/x509/v3_lib.c + $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_pci.c + $(OPENSSL_PATH)/crypto/x509/v3_pcia.c + $(OPENSSL_PATH)/crypto/x509/v3_pcons.c + $(OPENSSL_PATH)/crypto/x509/v3_pku.c + $(OPENSSL_PATH)/crypto/x509/v3_pmaps.c + $(OPENSSL_PATH)/crypto/x509/v3_prn.c + $(OPENSSL_PATH)/crypto/x509/v3_purp.c + $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c + $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_utf8.c + $(OPENSSL_PATH)/crypto/x509/v3_utl.c + $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_att.c + $(OPENSSL_PATH)/crypto/x509/x509_cmp.c + $(OPENSSL_PATH)/crypto/x509/x509_d2.c + $(OPENSSL_PATH)/crypto/x509/x509_def.c + $(OPENSSL_PATH)/crypto/x509/x509_err.c + $(OPENSSL_PATH)/crypto/x509/x509_ext.c + $(OPENSSL_PATH)/crypto/x509/x509_lu.c + $(OPENSSL_PATH)/crypto/x509/x509_meth.c + $(OPENSSL_PATH)/crypto/x509/x509_obj.c + $(OPENSSL_PATH)/crypto/x509/x509_r2x.c + $(OPENSSL_PATH)/crypto/x509/x509_req.c + $(OPENSSL_PATH)/crypto/x509/x509_set.c + $(OPENSSL_PATH)/crypto/x509/x509_trust.c + $(OPENSSL_PATH)/crypto/x509/x509_txt.c + $(OPENSSL_PATH)/crypto/x509/x509_v3.c + $(OPENSSL_PATH)/crypto/x509/x509_vfy.c + $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509cset.c + $(OPENSSL_PATH)/crypto/x509/x509name.c + $(OPENSSL_PATH)/crypto/x509/x509rset.c + $(OPENSSL_PATH)/crypto/x509/x509spki.c + $(OPENSSL_PATH)/crypto/x509/x509type.c + $(OPENSSL_PATH)/crypto/x509/x_all.c + $(OPENSSL_PATH)/crypto/x509/x_attrib.c + $(OPENSSL_PATH)/crypto/x509/x_crl.c + 
$(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_name.c + $(OPENSSL_PATH)/crypto/x509/x_pubkey.c + $(OPENSSL_PATH)/crypto/x509/x_req.c + $(OPENSSL_PATH)/crypto/x509/x_x509.c + $(OPENSSL_PATH)/crypto/x509/x_x509a.c + $(OPENSSL_PATH)/providers/nullprov.c + $(OPENSSL_PATH)/providers/prov_running.c + $(OPENSSL_PATH)/providers/common/der/der_rsa_sig.c + $(OPENSSL_PATH)/providers/common/bio_prov.c + $(OPENSSL_PATH)/providers/common/capabilities.c + $(OPENSSL_PATH)/providers/common/digest_to_nid.c + $(OPENSSL_PATH)/providers/common/provider_seeding.c + $(OPENSSL_PATH)/providers/common/provider_util.c + $(OPENSSL_PATH)/providers/common/securitycheck.c + $(OPENSSL_PATH)/providers/common/securitycheck_default.c + $(OPENSSL_PATH)/providers/implementations/asymciphers/rsa_enc.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_wrp.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_fips.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_cts.c + $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_null.c + $(OPENSSL_PATH)/providers/implementations/digests/md5_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/md5_sha1_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pem2der.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c + $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c + $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c + $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2_fips.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pkcs12kdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/scrypt.c + $(OPENSSL_PATH)/providers/implementations/kdfs/sshkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c + 
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c + $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c + $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c + $(OPENSSL_PATH)/providers/implementations/rands/crngt.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_unix.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c + $(OPENSSL_PATH)/providers/implementations/signature/ecdsa_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c + $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/providers/common/der/der_ec_key.c + $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c + $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c + $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c + $(OPENSSL_PATH)/providers/common/provider_ctx.c + $(OPENSSL_PATH)/providers/common/provider_err.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_block.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c + $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c + $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c + $(OPENSSL_PATH)/ssl/bio_ssl.c + $(OPENSSL_PATH)/ssl/d1_lib.c + $(OPENSSL_PATH)/ssl/d1_msg.c + $(OPENSSL_PATH)/ssl/d1_srtp.c + $(OPENSSL_PATH)/ssl/methods.c + $(OPENSSL_PATH)/ssl/pqueue.c + $(OPENSSL_PATH)/ssl/s3_enc.c + $(OPENSSL_PATH)/ssl/s3_lib.c + $(OPENSSL_PATH)/ssl/s3_msg.c + $(OPENSSL_PATH)/ssl/ssl_asn1.c + $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_ciph.c + $(OPENSSL_PATH)/ssl/ssl_conf.c + $(OPENSSL_PATH)/ssl/ssl_err.c + $(OPENSSL_PATH)/ssl/ssl_err_legacy.c + $(OPENSSL_PATH)/ssl/ssl_init.c + $(OPENSSL_PATH)/ssl/ssl_lib.c + $(OPENSSL_PATH)/ssl/ssl_mcnf.c + $(OPENSSL_PATH)/ssl/ssl_rsa.c + $(OPENSSL_PATH)/ssl/ssl_rsa_legacy.c + $(OPENSSL_PATH)/ssl/ssl_sess.c + 
$(OPENSSL_PATH)/ssl/ssl_stat.c + $(OPENSSL_PATH)/ssl/ssl_txt.c + $(OPENSSL_PATH)/ssl/ssl_utst.c + $(OPENSSL_PATH)/ssl/t1_enc.c + $(OPENSSL_PATH)/ssl/t1_lib.c + $(OPENSSL_PATH)/ssl/t1_trce.c + $(OPENSSL_PATH)/ssl/tls13_enc.c + $(OPENSSL_PATH)/ssl/tls_depr.c + $(OPENSSL_PATH)/ssl/tls_srp.c + $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c + $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c + $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c + $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c + $(OPENSSL_PATH)/ssl/record/ssl3_record.c + $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/statem/extensions.c + $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c + $(OPENSSL_PATH)/ssl/statem/extensions_cust.c + $(OPENSSL_PATH)/ssl/statem/statem.c + $(OPENSSL_PATH)/ssl/statem/statem_clnt.c + $(OPENSSL_PATH)/ssl/statem/statem_dtls.c + $(OPENSSL_PATH)/ssl/statem/statem_lib.c + #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/aes-586.nasm | MSFT + #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/aesni-x86.nasm | MSFT + #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/vpaes-x86.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/x86cpuid.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/md5/md5-586.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/modes/ghash-x86.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha1-586.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha256-586.nasm | MSFT + $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha512-586.nasm | MSFT + #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/aes-586.S | GCC + #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/aesni-x86.S | GCC + #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/vpaes-x86.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/x86cpuid.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/md5/md5-586.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/modes/ghash-x86.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha1-586.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha256-586.S | GCC + $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha512-586.S | GCC +# Autogenerated files list ends here + +[Sources.X64] + X64/ApiHooks.c +# Autogenerated files list starts here + #$(OPENSSL_PATH)/crypto/aes/aes_cfb.c + #$(OPENSSL_PATH)/crypto/aes/aes_ecb.c + #$(OPENSSL_PATH)/crypto/aes/aes_ige.c + #$(OPENSSL_PATH)/crypto/aes/aes_misc.c + #$(OPENSSL_PATH)/crypto/aes/aes_ofb.c + #$(OPENSSL_PATH)/crypto/aes/aes_wrap.c + $(OPENSSL_PATH)/crypto/asn1/a_bitstr.c + $(OPENSSL_PATH)/crypto/asn1/a_d2i_fp.c + $(OPENSSL_PATH)/crypto/asn1/a_digest.c + $(OPENSSL_PATH)/crypto/asn1/a_dup.c + $(OPENSSL_PATH)/crypto/asn1/a_gentm.c + $(OPENSSL_PATH)/crypto/asn1/a_i2d_fp.c + $(OPENSSL_PATH)/crypto/asn1/a_int.c + $(OPENSSL_PATH)/crypto/asn1/a_mbstr.c + $(OPENSSL_PATH)/crypto/asn1/a_object.c + $(OPENSSL_PATH)/crypto/asn1/a_octet.c + $(OPENSSL_PATH)/crypto/asn1/a_print.c + $(OPENSSL_PATH)/crypto/asn1/a_sign.c + $(OPENSSL_PATH)/crypto/asn1/a_strex.c + $(OPENSSL_PATH)/crypto/asn1/a_strnid.c + $(OPENSSL_PATH)/crypto/asn1/a_time.c + $(OPENSSL_PATH)/crypto/asn1/a_type.c + $(OPENSSL_PATH)/crypto/asn1/a_utctm.c + $(OPENSSL_PATH)/crypto/asn1/a_utf8.c + $(OPENSSL_PATH)/crypto/asn1/a_verify.c + $(OPENSSL_PATH)/crypto/asn1/ameth_lib.c + $(OPENSSL_PATH)/crypto/asn1/asn1_err.c + $(OPENSSL_PATH)/crypto/asn1/asn1_gen.c + $(OPENSSL_PATH)/crypto/asn1/asn1_item_list.c + $(OPENSSL_PATH)/crypto/asn1/asn1_lib.c + $(OPENSSL_PATH)/crypto/asn1/asn1_parse.c + $(OPENSSL_PATH)/crypto/asn1/asn_mime.c + $(OPENSSL_PATH)/crypto/asn1/asn_moid.c + $(OPENSSL_PATH)/crypto/asn1/asn_mstbl.c + $(OPENSSL_PATH)/crypto/asn1/asn_pack.c + $(OPENSSL_PATH)/crypto/asn1/bio_asn1.c + 
$(OPENSSL_PATH)/crypto/asn1/bio_ndef.c + $(OPENSSL_PATH)/crypto/asn1/d2i_param.c + $(OPENSSL_PATH)/crypto/asn1/d2i_pr.c + $(OPENSSL_PATH)/crypto/asn1/d2i_pu.c + $(OPENSSL_PATH)/crypto/asn1/evp_asn1.c + $(OPENSSL_PATH)/crypto/asn1/f_int.c + $(OPENSSL_PATH)/crypto/asn1/f_string.c + $(OPENSSL_PATH)/crypto/asn1/i2d_evp.c + $(OPENSSL_PATH)/crypto/asn1/nsseq.c + $(OPENSSL_PATH)/crypto/asn1/p5_pbe.c + $(OPENSSL_PATH)/crypto/asn1/p5_pbev2.c + $(OPENSSL_PATH)/crypto/asn1/p5_scrypt.c + $(OPENSSL_PATH)/crypto/asn1/p8_pkey.c + $(OPENSSL_PATH)/crypto/asn1/t_bitst.c + $(OPENSSL_PATH)/crypto/asn1/t_pkey.c + $(OPENSSL_PATH)/crypto/asn1/t_spki.c + $(OPENSSL_PATH)/crypto/asn1/tasn_dec.c + $(OPENSSL_PATH)/crypto/asn1/tasn_enc.c + $(OPENSSL_PATH)/crypto/asn1/tasn_fre.c + $(OPENSSL_PATH)/crypto/asn1/tasn_new.c + $(OPENSSL_PATH)/crypto/asn1/tasn_prn.c + $(OPENSSL_PATH)/crypto/asn1/tasn_scn.c + $(OPENSSL_PATH)/crypto/asn1/tasn_typ.c + $(OPENSSL_PATH)/crypto/asn1/tasn_utl.c + $(OPENSSL_PATH)/crypto/asn1/x_algor.c + $(OPENSSL_PATH)/crypto/asn1/x_bignum.c + $(OPENSSL_PATH)/crypto/asn1/x_info.c + $(OPENSSL_PATH)/crypto/asn1/x_int64.c + $(OPENSSL_PATH)/crypto/asn1/x_long.c + $(OPENSSL_PATH)/crypto/asn1/x_pkey.c + $(OPENSSL_PATH)/crypto/asn1/x_sig.c + $(OPENSSL_PATH)/crypto/asn1/x_spki.c + $(OPENSSL_PATH)/crypto/asn1/x_val.c + $(OPENSSL_PATH)/crypto/async/arch/async_null.c + $(OPENSSL_PATH)/crypto/async/arch/async_posix.c + $(OPENSSL_PATH)/crypto/async/arch/async_win.c + $(OPENSSL_PATH)/crypto/async/async.c + $(OPENSSL_PATH)/crypto/async/async_err.c + $(OPENSSL_PATH)/crypto/async/async_wait.c + $(OPENSSL_PATH)/crypto/bio/bf_buff.c + $(OPENSSL_PATH)/crypto/bio/bf_lbuf.c + $(OPENSSL_PATH)/crypto/bio/bf_nbio.c + $(OPENSSL_PATH)/crypto/bio/bf_null.c + $(OPENSSL_PATH)/crypto/bio/bf_prefix.c + $(OPENSSL_PATH)/crypto/bio/bf_readbuff.c + $(OPENSSL_PATH)/crypto/bio/bio_addr.c + $(OPENSSL_PATH)/crypto/bio/bio_cb.c + $(OPENSSL_PATH)/crypto/bio/bio_dump.c + $(OPENSSL_PATH)/crypto/bio/bio_err.c + $(OPENSSL_PATH)/crypto/bio/bio_lib.c + $(OPENSSL_PATH)/crypto/bio/bio_meth.c + $(OPENSSL_PATH)/crypto/bio/bio_print.c + $(OPENSSL_PATH)/crypto/bio/bio_sock.c + $(OPENSSL_PATH)/crypto/bio/bio_sock2.c + $(OPENSSL_PATH)/crypto/bio/bss_acpt.c + $(OPENSSL_PATH)/crypto/bio/bss_bio.c + $(OPENSSL_PATH)/crypto/bio/bss_conn.c + $(OPENSSL_PATH)/crypto/bio/bss_core.c + $(OPENSSL_PATH)/crypto/bio/bss_dgram.c + $(OPENSSL_PATH)/crypto/bio/bss_fd.c + $(OPENSSL_PATH)/crypto/bio/bss_file.c + $(OPENSSL_PATH)/crypto/bio/bss_log.c + $(OPENSSL_PATH)/crypto/bio/bss_mem.c + $(OPENSSL_PATH)/crypto/bio/bss_null.c + $(OPENSSL_PATH)/crypto/bio/bss_sock.c + $(OPENSSL_PATH)/crypto/bio/ossl_core_bio.c + $(OPENSSL_PATH)/crypto/bn/bn_add.c + $(OPENSSL_PATH)/crypto/bn/bn_blind.c + $(OPENSSL_PATH)/crypto/bn/bn_const.c + $(OPENSSL_PATH)/crypto/bn/bn_conv.c + $(OPENSSL_PATH)/crypto/bn/bn_ctx.c + $(OPENSSL_PATH)/crypto/bn/bn_dh.c + $(OPENSSL_PATH)/crypto/bn/bn_div.c + $(OPENSSL_PATH)/crypto/bn/bn_err.c + $(OPENSSL_PATH)/crypto/bn/bn_exp.c + $(OPENSSL_PATH)/crypto/bn/bn_exp2.c + $(OPENSSL_PATH)/crypto/bn/bn_gcd.c + $(OPENSSL_PATH)/crypto/bn/bn_gf2m.c + $(OPENSSL_PATH)/crypto/bn/bn_intern.c + $(OPENSSL_PATH)/crypto/bn/bn_kron.c + $(OPENSSL_PATH)/crypto/bn/bn_lib.c + $(OPENSSL_PATH)/crypto/bn/bn_mod.c + $(OPENSSL_PATH)/crypto/bn/bn_mont.c + $(OPENSSL_PATH)/crypto/bn/bn_mpi.c + $(OPENSSL_PATH)/crypto/bn/bn_mul.c + $(OPENSSL_PATH)/crypto/bn/bn_nist.c + $(OPENSSL_PATH)/crypto/bn/bn_prime.c + $(OPENSSL_PATH)/crypto/bn/bn_print.c + $(OPENSSL_PATH)/crypto/bn/bn_rand.c + 
$(OPENSSL_PATH)/crypto/bn/bn_recp.c + $(OPENSSL_PATH)/crypto/bn/bn_rsa_fips186_4.c + $(OPENSSL_PATH)/crypto/bn/bn_shift.c + $(OPENSSL_PATH)/crypto/bn/bn_sqr.c + $(OPENSSL_PATH)/crypto/bn/bn_sqrt.c + $(OPENSSL_PATH)/crypto/bn/bn_srp.c + $(OPENSSL_PATH)/crypto/bn/bn_word.c + $(OPENSSL_PATH)/crypto/bn/bn_x931p.c + $(OPENSSL_PATH)/crypto/bn/rsaz_exp.c + $(OPENSSL_PATH)/crypto/bn/rsaz_exp_x2.c + $(OPENSSL_PATH)/crypto/buffer/buf_err.c + $(OPENSSL_PATH)/crypto/buffer/buffer.c + $(OPENSSL_PATH)/crypto/comp/c_zlib.c + $(OPENSSL_PATH)/crypto/comp/comp_err.c + $(OPENSSL_PATH)/crypto/comp/comp_lib.c + $(OPENSSL_PATH)/crypto/conf/conf_api.c + $(OPENSSL_PATH)/crypto/conf/conf_def.c + $(OPENSSL_PATH)/crypto/conf/conf_err.c + $(OPENSSL_PATH)/crypto/conf/conf_lib.c + $(OPENSSL_PATH)/crypto/conf/conf_mall.c + $(OPENSSL_PATH)/crypto/conf/conf_mod.c + $(OPENSSL_PATH)/crypto/conf/conf_sap.c + $(OPENSSL_PATH)/crypto/conf/conf_ssl.c + $(OPENSSL_PATH)/crypto/dso/dso_dl.c + $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c + $(OPENSSL_PATH)/crypto/dso/dso_err.c + $(OPENSSL_PATH)/crypto/dso/dso_lib.c + $(OPENSSL_PATH)/crypto/dso/dso_openssl.c + $(OPENSSL_PATH)/crypto/dso/dso_vms.c + $(OPENSSL_PATH)/crypto/dso/dso_win32.c + $(OPENSSL_PATH)/crypto/ec/curve448/arch_32/f_impl32.c + $(OPENSSL_PATH)/crypto/ec/curve448/arch_64/f_impl64.c + $(OPENSSL_PATH)/crypto/ec/curve448/curve448.c + $(OPENSSL_PATH)/crypto/ec/curve448/curve448_tables.c + $(OPENSSL_PATH)/crypto/ec/curve448/eddsa.c + $(OPENSSL_PATH)/crypto/ec/curve448/f_generic.c + $(OPENSSL_PATH)/crypto/ec/curve448/scalar.c + $(OPENSSL_PATH)/crypto/ec/curve25519.c + $(OPENSSL_PATH)/crypto/ec/ec2_oct.c + $(OPENSSL_PATH)/crypto/ec/ec2_smpl.c + $(OPENSSL_PATH)/crypto/ec/ec_ameth.c + $(OPENSSL_PATH)/crypto/ec/ec_asn1.c + $(OPENSSL_PATH)/crypto/ec/ec_backend.c + $(OPENSSL_PATH)/crypto/ec/ec_check.c + $(OPENSSL_PATH)/crypto/ec/ec_curve.c + $(OPENSSL_PATH)/crypto/ec/ec_cvt.c + $(OPENSSL_PATH)/crypto/ec/ec_deprecated.c + $(OPENSSL_PATH)/crypto/ec/ec_err.c + $(OPENSSL_PATH)/crypto/ec/ec_key.c + $(OPENSSL_PATH)/crypto/ec/ec_kmeth.c + $(OPENSSL_PATH)/crypto/ec/ec_lib.c + $(OPENSSL_PATH)/crypto/ec/ec_mult.c + $(OPENSSL_PATH)/crypto/ec/ec_oct.c + $(OPENSSL_PATH)/crypto/ec/ec_pmeth.c + $(OPENSSL_PATH)/crypto/ec/ec_print.c + $(OPENSSL_PATH)/crypto/ec/ecdh_kdf.c + $(OPENSSL_PATH)/crypto/ec/ecdh_ossl.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_ossl.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_sign.c + $(OPENSSL_PATH)/crypto/ec/ecdsa_vrf.c + $(OPENSSL_PATH)/crypto/ec/eck_prn.c + $(OPENSSL_PATH)/crypto/ec/ecp_mont.c + $(OPENSSL_PATH)/crypto/ec/ecp_nist.c + $(OPENSSL_PATH)/crypto/ec/ecp_oct.c + $(OPENSSL_PATH)/crypto/ec/ecp_smpl.c + $(OPENSSL_PATH)/crypto/ec/ecx_backend.c + $(OPENSSL_PATH)/crypto/ec/ecx_key.c + $(OPENSSL_PATH)/crypto/ec/ecx_meth.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_err.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_lib.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_meth.c + $(OPENSSL_PATH)/crypto/encode_decode/decoder_pkey.c + $(OPENSSL_PATH)/crypto/err/err.c + $(OPENSSL_PATH)/crypto/err/err_all.c + $(OPENSSL_PATH)/crypto/err/err_all_legacy.c + $(OPENSSL_PATH)/crypto/err/err_blocks.c + $(OPENSSL_PATH)/crypto/err/err_prn.c + $(OPENSSL_PATH)/crypto/ess/ess_asn1.c + $(OPENSSL_PATH)/crypto/ess/ess_err.c + $(OPENSSL_PATH)/crypto/ess/ess_lib.c + $(OPENSSL_PATH)/crypto/evp/asymcipher.c + $(OPENSSL_PATH)/crypto/evp/bio_b64.c + $(OPENSSL_PATH)/crypto/evp/bio_enc.c + $(OPENSSL_PATH)/crypto/evp/bio_md.c + $(OPENSSL_PATH)/crypto/evp/bio_ok.c + #$(OPENSSL_PATH)/crypto/evp/c_allc.c + 
$(OPENSSL_PATH)/crypto/evp/c_alld.c + $(OPENSSL_PATH)/crypto/evp/cmeth_lib.c + $(OPENSSL_PATH)/crypto/evp/ctrl_params_translate.c + $(OPENSSL_PATH)/crypto/evp/dh_ctrl.c + $(OPENSSL_PATH)/crypto/evp/dh_support.c + $(OPENSSL_PATH)/crypto/evp/digest.c + $(OPENSSL_PATH)/crypto/evp/dsa_ctrl.c + #$(OPENSSL_PATH)/crypto/evp/e_aes.c + #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha1.c +#$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha256.c + $(OPENSSL_PATH)/crypto/evp/e_aria.c + $(OPENSSL_PATH)/crypto/evp/e_bf.c + $(OPENSSL_PATH)/crypto/evp/e_cast.c + $(OPENSSL_PATH)/crypto/evp/e_chacha20_poly1305.c + $(OPENSSL_PATH)/crypto/evp/e_des.c + $(OPENSSL_PATH)/crypto/evp/e_des3.c + $(OPENSSL_PATH)/crypto/evp/e_idea.c + $(OPENSSL_PATH)/crypto/evp/e_null.c + $(OPENSSL_PATH)/crypto/evp/e_rc2.c + $(OPENSSL_PATH)/crypto/evp/e_rc4.c + $(OPENSSL_PATH)/crypto/evp/e_rc4_hmac_md5.c + $(OPENSSL_PATH)/crypto/evp/e_rc5.c + $(OPENSSL_PATH)/crypto/evp/e_sm4.c + $(OPENSSL_PATH)/crypto/evp/e_xcbc_d.c + $(OPENSSL_PATH)/crypto/evp/ec_ctrl.c + $(OPENSSL_PATH)/crypto/evp/ec_support.c + $(OPENSSL_PATH)/crypto/evp/encode.c + $(OPENSSL_PATH)/crypto/evp/evp_cnf.c + $(OPENSSL_PATH)/crypto/evp/evp_enc.c + $(OPENSSL_PATH)/crypto/evp/evp_err.c + $(OPENSSL_PATH)/crypto/evp/evp_fetch.c + $(OPENSSL_PATH)/crypto/evp/evp_key.c + $(OPENSSL_PATH)/crypto/evp/evp_lib.c + $(OPENSSL_PATH)/crypto/evp/evp_pbe.c + $(OPENSSL_PATH)/crypto/evp/evp_pkey.c + $(OPENSSL_PATH)/crypto/evp/evp_rand.c + $(OPENSSL_PATH)/crypto/evp/evp_utils.c + $(OPENSSL_PATH)/crypto/evp/exchange.c + $(OPENSSL_PATH)/crypto/evp/kdf_lib.c + $(OPENSSL_PATH)/crypto/evp/kdf_meth.c + $(OPENSSL_PATH)/crypto/evp/kem.c + $(OPENSSL_PATH)/crypto/evp/keymgmt_lib.c + $(OPENSSL_PATH)/crypto/evp/keymgmt_meth.c + $(OPENSSL_PATH)/crypto/evp/legacy_md5.c + $(OPENSSL_PATH)/crypto/evp/legacy_md5_sha1.c + $(OPENSSL_PATH)/crypto/evp/legacy_sha.c + $(OPENSSL_PATH)/crypto/evp/m_null.c + $(OPENSSL_PATH)/crypto/evp/m_sigver.c + $(OPENSSL_PATH)/crypto/evp/mac_lib.c + $(OPENSSL_PATH)/crypto/evp/mac_meth.c + $(OPENSSL_PATH)/crypto/evp/names.c + $(OPENSSL_PATH)/crypto/evp/p5_crpt.c + $(OPENSSL_PATH)/crypto/evp/p5_crpt2.c + $(OPENSSL_PATH)/crypto/evp/p_dec.c + $(OPENSSL_PATH)/crypto/evp/p_enc.c + $(OPENSSL_PATH)/crypto/evp/p_legacy.c + $(OPENSSL_PATH)/crypto/evp/p_lib.c + $(OPENSSL_PATH)/crypto/evp/p_open.c + $(OPENSSL_PATH)/crypto/evp/p_seal.c + $(OPENSSL_PATH)/crypto/evp/p_sign.c + $(OPENSSL_PATH)/crypto/evp/p_verify.c + $(OPENSSL_PATH)/crypto/evp/pbe_scrypt.c + $(OPENSSL_PATH)/crypto/evp/pmeth_check.c + $(OPENSSL_PATH)/crypto/evp/pmeth_gn.c + $(OPENSSL_PATH)/crypto/evp/pmeth_lib.c + $(OPENSSL_PATH)/crypto/evp/signature.c + $(OPENSSL_PATH)/crypto/ffc/ffc_backend.c + $(OPENSSL_PATH)/crypto/ffc/ffc_dh.c + $(OPENSSL_PATH)/crypto/ffc/ffc_key_generate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_key_validate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c + $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c + $(OPENSSL_PATH)/crypto/hmac/hmac.c + $(OPENSSL_PATH)/crypto/http/http_client.c + $(OPENSSL_PATH)/crypto/http/http_err.c + $(OPENSSL_PATH)/crypto/http/http_lib.c + $(OPENSSL_PATH)/crypto/kdf/kdf_err.c + $(OPENSSL_PATH)/crypto/lhash/lh_stats.c + $(OPENSSL_PATH)/crypto/lhash/lhash.c + $(OPENSSL_PATH)/crypto/asn1_dsa.c + $(OPENSSL_PATH)/crypto/bsearch.c + $(OPENSSL_PATH)/crypto/context.c + $(OPENSSL_PATH)/crypto/core_algorithm.c + $(OPENSSL_PATH)/crypto/core_fetch.c + $(OPENSSL_PATH)/crypto/core_namemap.c + $(OPENSSL_PATH)/crypto/cpt_err.c + $(OPENSSL_PATH)/crypto/cpuid.c + 
$(OPENSSL_PATH)/crypto/cryptlib.c + $(OPENSSL_PATH)/crypto/ctype.c + $(OPENSSL_PATH)/crypto/cversion.c + $(OPENSSL_PATH)/crypto/der_writer.c + $(OPENSSL_PATH)/crypto/ebcdic.c + $(OPENSSL_PATH)/crypto/ex_data.c + $(OPENSSL_PATH)/crypto/getenv.c + $(OPENSSL_PATH)/crypto/info.c + $(OPENSSL_PATH)/crypto/init.c + $(OPENSSL_PATH)/crypto/initthread.c + $(OPENSSL_PATH)/crypto/mem.c + $(OPENSSL_PATH)/crypto/mem_sec.c + $(OPENSSL_PATH)/crypto/o_dir.c + $(OPENSSL_PATH)/crypto/o_fopen.c + $(OPENSSL_PATH)/crypto/o_init.c + $(OPENSSL_PATH)/crypto/o_str.c + $(OPENSSL_PATH)/crypto/o_time.c + $(OPENSSL_PATH)/crypto/packet.c + $(OPENSSL_PATH)/crypto/param_build.c + $(OPENSSL_PATH)/crypto/param_build_set.c + $(OPENSSL_PATH)/crypto/params.c + $(OPENSSL_PATH)/crypto/params_dup.c + $(OPENSSL_PATH)/crypto/params_from_text.c + $(OPENSSL_PATH)/crypto/passphrase.c + $(OPENSSL_PATH)/crypto/provider.c + $(OPENSSL_PATH)/crypto/provider_child.c + $(OPENSSL_PATH)/crypto/provider_conf.c + $(OPENSSL_PATH)/crypto/provider_core.c + $(OPENSSL_PATH)/crypto/punycode.c + $(OPENSSL_PATH)/crypto/self_test_core.c + $(OPENSSL_PATH)/crypto/sparse_array.c + $(OPENSSL_PATH)/crypto/threads_lib.c + $(OPENSSL_PATH)/crypto/threads_none.c + $(OPENSSL_PATH)/crypto/threads_pthread.c + $(OPENSSL_PATH)/crypto/threads_win.c + $(OPENSSL_PATH)/crypto/trace.c + $(OPENSSL_PATH)/crypto/uid.c + $(OPENSSL_PATH)/crypto/md5/md5_dgst.c + $(OPENSSL_PATH)/crypto/md5/md5_one.c + $(OPENSSL_PATH)/crypto/md5/md5_sha1.c + $(OPENSSL_PATH)/crypto/modes/cbc128.c + $(OPENSSL_PATH)/crypto/modes/ccm128.c + $(OPENSSL_PATH)/crypto/modes/cfb128.c + $(OPENSSL_PATH)/crypto/modes/ctr128.c + $(OPENSSL_PATH)/crypto/modes/cts128.c + $(OPENSSL_PATH)/crypto/modes/gcm128.c + $(OPENSSL_PATH)/crypto/modes/ocb128.c + $(OPENSSL_PATH)/crypto/modes/ofb128.c + $(OPENSSL_PATH)/crypto/modes/siv128.c + $(OPENSSL_PATH)/crypto/modes/wrap128.c + $(OPENSSL_PATH)/crypto/modes/xts128.c + $(OPENSSL_PATH)/crypto/objects/o_names.c + $(OPENSSL_PATH)/crypto/objects/obj_dat.c + $(OPENSSL_PATH)/crypto/objects/obj_err.c + $(OPENSSL_PATH)/crypto/objects/obj_lib.c + $(OPENSSL_PATH)/crypto/objects/obj_xref.c + $(OPENSSL_PATH)/crypto/pem/pem_all.c + $(OPENSSL_PATH)/crypto/pem/pem_err.c + $(OPENSSL_PATH)/crypto/pem/pem_info.c + $(OPENSSL_PATH)/crypto/pem/pem_lib.c + $(OPENSSL_PATH)/crypto/pem/pem_oth.c + $(OPENSSL_PATH)/crypto/pem/pem_pk8.c + $(OPENSSL_PATH)/crypto/pem/pem_pkey.c + $(OPENSSL_PATH)/crypto/pem/pem_sign.c + $(OPENSSL_PATH)/crypto/pem/pem_x509.c + $(OPENSSL_PATH)/crypto/pem/pem_xaux.c + $(OPENSSL_PATH)/crypto/pem/pvkfmt.c + $(OPENSSL_PATH)/crypto/pkcs7/bio_pk7.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_asn1.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_attr.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_doit.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_lib.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_mime.c + $(OPENSSL_PATH)/crypto/pkcs7/pk7_smime.c + $(OPENSSL_PATH)/crypto/pkcs7/pkcs7err.c + $(OPENSSL_PATH)/crypto/property/defn_cache.c + $(OPENSSL_PATH)/crypto/property/property.c + $(OPENSSL_PATH)/crypto/property/property_err.c + $(OPENSSL_PATH)/crypto/property/property_parse.c + $(OPENSSL_PATH)/crypto/property/property_query.c + $(OPENSSL_PATH)/crypto/property/property_string.c + $(OPENSSL_PATH)/crypto/rand/prov_seed.c + $(OPENSSL_PATH)/crypto/rand/rand_deprecated.c + $(OPENSSL_PATH)/crypto/rand/rand_err.c + $(OPENSSL_PATH)/crypto/rand/rand_lib.c + $(OPENSSL_PATH)/crypto/rand/rand_meth.c + $(OPENSSL_PATH)/crypto/rand/rand_pool.c + $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c + 
$(OPENSSL_PATH)/crypto/rsa/rsa_backend.c + $(OPENSSL_PATH)/crypto/rsa/rsa_chk.c + $(OPENSSL_PATH)/crypto/rsa/rsa_crpt.c + $(OPENSSL_PATH)/crypto/rsa/rsa_err.c + $(OPENSSL_PATH)/crypto/rsa/rsa_gen.c + $(OPENSSL_PATH)/crypto/rsa/rsa_lib.c + $(OPENSSL_PATH)/crypto/rsa/rsa_meth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_mp.c + $(OPENSSL_PATH)/crypto/rsa/rsa_mp_names.c + $(OPENSSL_PATH)/crypto/rsa/rsa_none.c + $(OPENSSL_PATH)/crypto/rsa/rsa_oaep.c + $(OPENSSL_PATH)/crypto/rsa/rsa_ossl.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pk1.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pmeth.c + $(OPENSSL_PATH)/crypto/rsa/rsa_prn.c + $(OPENSSL_PATH)/crypto/rsa/rsa_pss.c + $(OPENSSL_PATH)/crypto/rsa/rsa_saos.c + $(OPENSSL_PATH)/crypto/rsa/rsa_schemes.c + $(OPENSSL_PATH)/crypto/rsa/rsa_sign.c + $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_check.c + $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_gen.c + $(OPENSSL_PATH)/crypto/rsa/rsa_x931.c + $(OPENSSL_PATH)/crypto/rsa/rsa_x931g.c + $(OPENSSL_PATH)/crypto/sha/sha1_one.c + $(OPENSSL_PATH)/crypto/sha/sha1dgst.c + $(OPENSSL_PATH)/crypto/sha/sha256.c + $(OPENSSL_PATH)/crypto/sha/sha3.c + $(OPENSSL_PATH)/crypto/sha/sha512.c + $(OPENSSL_PATH)/crypto/stack/stack.c + $(OPENSSL_PATH)/crypto/txt_db/txt_db.c + $(OPENSSL_PATH)/crypto/ui/ui_err.c + $(OPENSSL_PATH)/crypto/ui/ui_lib.c + $(OPENSSL_PATH)/crypto/ui/ui_null.c + $(OPENSSL_PATH)/crypto/ui/ui_openssl.c + $(OPENSSL_PATH)/crypto/ui/ui_util.c + $(OPENSSL_PATH)/crypto/x509/by_dir.c + $(OPENSSL_PATH)/crypto/x509/by_file.c + $(OPENSSL_PATH)/crypto/x509/by_store.c + $(OPENSSL_PATH)/crypto/x509/pcy_cache.c + $(OPENSSL_PATH)/crypto/x509/pcy_data.c + $(OPENSSL_PATH)/crypto/x509/pcy_lib.c + $(OPENSSL_PATH)/crypto/x509/pcy_map.c + $(OPENSSL_PATH)/crypto/x509/pcy_node.c + $(OPENSSL_PATH)/crypto/x509/pcy_tree.c + $(OPENSSL_PATH)/crypto/x509/t_crl.c + $(OPENSSL_PATH)/crypto/x509/t_req.c + $(OPENSSL_PATH)/crypto/x509/t_x509.c + $(OPENSSL_PATH)/crypto/x509/v3_addr.c + $(OPENSSL_PATH)/crypto/x509/v3_admis.c + $(OPENSSL_PATH)/crypto/x509/v3_akeya.c + $(OPENSSL_PATH)/crypto/x509/v3_akid.c + $(OPENSSL_PATH)/crypto/x509/v3_asid.c + $(OPENSSL_PATH)/crypto/x509/v3_bcons.c + $(OPENSSL_PATH)/crypto/x509/v3_bitst.c + $(OPENSSL_PATH)/crypto/x509/v3_conf.c + $(OPENSSL_PATH)/crypto/x509/v3_cpols.c + $(OPENSSL_PATH)/crypto/x509/v3_crld.c + $(OPENSSL_PATH)/crypto/x509/v3_enum.c + $(OPENSSL_PATH)/crypto/x509/v3_extku.c + $(OPENSSL_PATH)/crypto/x509/v3_genn.c + $(OPENSSL_PATH)/crypto/x509/v3_ia5.c + $(OPENSSL_PATH)/crypto/x509/v3_info.c + $(OPENSSL_PATH)/crypto/x509/v3_int.c + $(OPENSSL_PATH)/crypto/x509/v3_ist.c + $(OPENSSL_PATH)/crypto/x509/v3_lib.c + $(OPENSSL_PATH)/crypto/x509/v3_ncons.c + $(OPENSSL_PATH)/crypto/x509/v3_pci.c + $(OPENSSL_PATH)/crypto/x509/v3_pcia.c + $(OPENSSL_PATH)/crypto/x509/v3_pcons.c + $(OPENSSL_PATH)/crypto/x509/v3_pku.c + $(OPENSSL_PATH)/crypto/x509/v3_pmaps.c + $(OPENSSL_PATH)/crypto/x509/v3_prn.c + $(OPENSSL_PATH)/crypto/x509/v3_purp.c + $(OPENSSL_PATH)/crypto/x509/v3_san.c + $(OPENSSL_PATH)/crypto/x509/v3_skid.c + $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c + $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c + $(OPENSSL_PATH)/crypto/x509/v3_utf8.c + $(OPENSSL_PATH)/crypto/x509/v3_utl.c + $(OPENSSL_PATH)/crypto/x509/v3err.c + $(OPENSSL_PATH)/crypto/x509/x509_att.c + $(OPENSSL_PATH)/crypto/x509/x509_cmp.c + $(OPENSSL_PATH)/crypto/x509/x509_d2.c + $(OPENSSL_PATH)/crypto/x509/x509_def.c + $(OPENSSL_PATH)/crypto/x509/x509_err.c + $(OPENSSL_PATH)/crypto/x509/x509_ext.c + $(OPENSSL_PATH)/crypto/x509/x509_lu.c + $(OPENSSL_PATH)/crypto/x509/x509_meth.c + 
$(OPENSSL_PATH)/crypto/x509/x509_obj.c + $(OPENSSL_PATH)/crypto/x509/x509_r2x.c + $(OPENSSL_PATH)/crypto/x509/x509_req.c + $(OPENSSL_PATH)/crypto/x509/x509_set.c + $(OPENSSL_PATH)/crypto/x509/x509_trust.c + $(OPENSSL_PATH)/crypto/x509/x509_txt.c + $(OPENSSL_PATH)/crypto/x509/x509_v3.c + $(OPENSSL_PATH)/crypto/x509/x509_vfy.c + $(OPENSSL_PATH)/crypto/x509/x509_vpm.c + $(OPENSSL_PATH)/crypto/x509/x509cset.c + $(OPENSSL_PATH)/crypto/x509/x509name.c + $(OPENSSL_PATH)/crypto/x509/x509rset.c + $(OPENSSL_PATH)/crypto/x509/x509spki.c + $(OPENSSL_PATH)/crypto/x509/x509type.c + $(OPENSSL_PATH)/crypto/x509/x_all.c + $(OPENSSL_PATH)/crypto/x509/x_attrib.c + $(OPENSSL_PATH)/crypto/x509/x_crl.c + $(OPENSSL_PATH)/crypto/x509/x_exten.c + $(OPENSSL_PATH)/crypto/x509/x_name.c + $(OPENSSL_PATH)/crypto/x509/x_pubkey.c + $(OPENSSL_PATH)/crypto/x509/x_req.c + $(OPENSSL_PATH)/crypto/x509/x_x509.c + $(OPENSSL_PATH)/crypto/x509/x_x509a.c + $(OPENSSL_PATH)/providers/nullprov.c + $(OPENSSL_PATH)/providers/prov_running.c + $(OPENSSL_PATH)/providers/common/der/der_rsa_sig.c + $(OPENSSL_PATH)/providers/common/bio_prov.c + $(OPENSSL_PATH)/providers/common/capabilities.c + $(OPENSSL_PATH)/providers/common/digest_to_nid.c + $(OPENSSL_PATH)/providers/common/provider_seeding.c + $(OPENSSL_PATH)/providers/common/provider_util.c + $(OPENSSL_PATH)/providers/common/securitycheck.c + $(OPENSSL_PATH)/providers/common/securitycheck_default.c + $(OPENSSL_PATH)/providers/implementations/asymciphers/rsa_enc.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_hw.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_wrp.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_fips.c + #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_cts.c + $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_null.c + $(OPENSSL_PATH)/providers/implementations/digests/md5_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/md5_sha1_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c + $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pem2der.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c + $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c + $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c + 
$(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c + $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c + $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2_fips.c + $(OPENSSL_PATH)/providers/implementations/kdfs/pkcs12kdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/scrypt.c + $(OPENSSL_PATH)/providers/implementations/kdfs/sshkdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c + $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c + $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c + $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c + $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c + $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c + $(OPENSSL_PATH)/providers/implementations/rands/crngt.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c + $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c + $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c + $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_unix.c + $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c + $(OPENSSL_PATH)/providers/implementations/signature/ecdsa_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c + $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c + $(OPENSSL_PATH)/ssl/s3_cbc.c + $(OPENSSL_PATH)/providers/common/der/der_ec_key.c + $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c + $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c + $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c + $(OPENSSL_PATH)/providers/common/provider_ctx.c + $(OPENSSL_PATH)/providers/common/provider_err.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_block.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c + $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c + $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c + $(OPENSSL_PATH)/ssl/record/tls_pad.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c + $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c + 
$(OPENSSL_PATH)/ssl/bio_ssl.c + $(OPENSSL_PATH)/ssl/d1_lib.c + $(OPENSSL_PATH)/ssl/d1_msg.c + $(OPENSSL_PATH)/ssl/d1_srtp.c + $(OPENSSL_PATH)/ssl/methods.c + $(OPENSSL_PATH)/ssl/pqueue.c + $(OPENSSL_PATH)/ssl/s3_enc.c + $(OPENSSL_PATH)/ssl/s3_lib.c + $(OPENSSL_PATH)/ssl/s3_msg.c + $(OPENSSL_PATH)/ssl/ssl_asn1.c + $(OPENSSL_PATH)/ssl/ssl_cert.c + $(OPENSSL_PATH)/ssl/ssl_ciph.c + $(OPENSSL_PATH)/ssl/ssl_conf.c + $(OPENSSL_PATH)/ssl/ssl_err.c + $(OPENSSL_PATH)/ssl/ssl_err_legacy.c + $(OPENSSL_PATH)/ssl/ssl_init.c + $(OPENSSL_PATH)/ssl/ssl_lib.c + $(OPENSSL_PATH)/ssl/ssl_mcnf.c + $(OPENSSL_PATH)/ssl/ssl_rsa.c + $(OPENSSL_PATH)/ssl/ssl_rsa_legacy.c + $(OPENSSL_PATH)/ssl/ssl_sess.c + $(OPENSSL_PATH)/ssl/ssl_stat.c + $(OPENSSL_PATH)/ssl/ssl_txt.c + $(OPENSSL_PATH)/ssl/ssl_utst.c + $(OPENSSL_PATH)/ssl/t1_enc.c + $(OPENSSL_PATH)/ssl/t1_lib.c + $(OPENSSL_PATH)/ssl/t1_trce.c + $(OPENSSL_PATH)/ssl/tls13_enc.c + $(OPENSSL_PATH)/ssl/tls_depr.c + $(OPENSSL_PATH)/ssl/tls_srp.c + $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c + $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c + $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c + $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c + $(OPENSSL_PATH)/ssl/record/ssl3_record.c + $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c + $(OPENSSL_PATH)/ssl/statem/extensions.c + $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c + $(OPENSSL_PATH)/ssl/statem/extensions_cust.c + $(OPENSSL_PATH)/ssl/statem/statem.c + $(OPENSSL_PATH)/ssl/statem/statem_clnt.c + $(OPENSSL_PATH)/ssl/statem/statem_dtls.c + $(OPENSSL_PATH)/ssl/statem/statem_lib.c + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aes-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/bsaes-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/vpaes-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/x86_64cpuid.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/md5/md5-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/ghash-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha1-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha256-x86_64.nasm | MSFT + $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha512-x86_64.nasm | MSFT + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aes-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-mb-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-sha1-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-sha256-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/bsaes-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/vpaes-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/x86_64cpuid.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/md5/md5-x86_64.s | GCC + #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aesni-gcm-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/ghash-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/keccak1600-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha1-mb-x86_64.s | GCC + 
$(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha1-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha256-mb-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha256-x86_64.s | GCC + $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha512-x86_64.s | GCC +# Autogenerated files list ends here + +[Packages] + MdePkg/MdePkg.dec + CryptoPkg/CryptoPkg.dec + +[LibraryClasses] + BaseLib + DebugLib + RngLib + +[BuildOptions] + # + # Disables the following Visual Studio compiler warnings brought by openssl source, + # so we do not break the build with /WX option: + # C4090: 'function' : different 'const' qualifiers + # C4132: 'object' : const object should be initialized (tls13_enc.c) + # C4210: nonstandard extension used: function given file scope + # C4244: conversion from type1 to type2, possible loss of data + # C4245: conversion from type1 to type2, signed/unsigned mismatch + # C4267: conversion from size_t to type, possible loss of data + # C4306: 'identifier' : conversion from 'type1' to 'type2' of greater size + # C4310: cast truncates constant value + # C4389: 'operator' : signed/unsigned mismatch (xxxx) + # C4700: uninitialized local variable 'name' used. (conf_sap.c(71)) + # C4702: unreachable code + # C4706: assignment within conditional expression + # C4819: The file contains a character that cannot be represented in the current code page + # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101)) + # + MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133 + MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133 + + # + # Disable following Visual Studio 2015 compiler warnings brought by openssl source, + # so we do not break the build with /WX option: + # C4718: recursive call has no side effects, deleting + # + MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718 + MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718 + + INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /w + INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /w + + # + # Suppress the following build warnings in openssl so we don't break the build with -Werror + # -Werror=maybe-uninitialized: there exist some other paths for which the variable is not initialized. + # -Werror=format: Check calls to printf and scanf, etc., to make sure that the arguments supplied have + # types appropriate to the format string specified. + # -Werror=unused-but-set-variable: Warn whenever a local variable is assigned to, but otherwise unused (aside from its declaration). 
+ # + GCC:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) -Wno-error=maybe-uninitialized -Wno-error=unused-but-set-variable + GCC:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) -Wno-error=maybe-uninitialized -Wno-error=format -Wno-format -Wno-error=unused-but-set-variable -DNO_MSABI_VA_FUNCS + GCC:*_CLANGDWARF_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized -Wno-error=incompatible-pointer-types -Wno-error=pointer-sign -Wno-error=implicit-function-declaration -Wno-error=ignored-pragma-optimize + GCC:*_CLANG35_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized + GCC:*_CLANG38_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized + GCC:*_CLANGPDB_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized -Wno-error=incompatible-pointer-types -Wno-error=pointer-sign -Wno-error=implicit-function-declaration -Wno-error=ignored-pragma-optimize + # Revisit after switching to 3.0 branch + GCC:*_GCC5_*_CC_FLAGS = -Wno-unused-but-set-variable + + # suppress the following warnings in openssl so we don't break the build with warnings-as-errors: + # 1295: Deprecated declaration - give arg types + # 550: was set but never used + # 1293: assignment in condition + # 111: statement is unreachable (invariably "break;" after "return X;" in case statement) + # 68: integer conversion resulted in a change of sign ("if (Status == -1)") + # 177: was declared but never referenced + # 223: function declared implicitly + # 144: a value of type cannot be used to initialize an entity of type + # 513: a value of type cannot be assigned to an entity of type + # 188: enumerated type mixed with another type (i.e. passing an integer as an enum without a cast) + # 1296: Extended constant initialiser used + # 128: loop is not reachable - may be emitted inappropriately if code follows a conditional return + # from the function that evaluates to true at compile time + # 546: transfer of control bypasses initialization - may be emitted inappropriately if the uninitialized + # variable is never referenced after the jump + # 1: ignore "#1-D: last line of file ends without a newline" + # 3017: may be used before being set (NOTE: This was fixed in OpenSSL 1.1 HEAD with + # commit d9b8b89bec4480de3a10bdaf9425db371c19145b, and can be dropped then.) + XCODE:*_*_IA32_CC_FLAGS = -mmmx -msse -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) -w -std=c99 -Wno-error=uninitialized + XCODE:*_*_X64_CC_FLAGS = -mmmx -msse -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) -w -std=c99 -Wno-error=uninitialized diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c b/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c new file mode 100644 index 00000000000..c305594c202 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c @@ -0,0 +1,14 @@ +/* + * Copyright 2023 Microsoft. + * + * A null implementation to not include any openssl ciphers + * as they are not used in project mu. + * + */ + +void openssl_add_all_ciphers_int (void) +{ + return; +} + + diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c b/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c new file mode 100644 index 00000000000..d88786d6a9d --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c @@ -0,0 +1,73 @@ +/** @file + Null implementation of MD5 functions called by BaseCryptLib. + + Copyright (c) 2022, Intel Corporation. All rights reserved.
SPDX-License-Identifier: BSD-2-Clause-Patent + +**/ + +/* + * MD5 low level APIs are deprecated for public use, but still ok for + * internal use. + */ + +#include "openssl/include/internal/deprecated.h" + +#include <openssl/md5.h> +#include "crypto/evp.h" + #include "openssl/crypto/evp/legacy_meth.h" + +static int init(EVP_MD_CTX *ctx) +{ + return 1; +} + +static int update(EVP_MD_CTX *ctx, const void *data, size_t count) +{ + return 1; +} + +static int final(EVP_MD_CTX *ctx, unsigned char *md) +{ + return 1; +} + +IMPLEMENT_LEGACY_EVP_MD_METH(md5, MD5) + +static const EVP_MD md5_md = { + NID_md5, + NID_md5WithRSAEncryption, + MD5_DIGEST_LENGTH, + 0, + EVP_ORIG_GLOBAL, + LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL, MD5_CBLOCK) +}; + +const EVP_MD *EVP_md5(void) +{ + return NULL; +} + +//taken from md5_sha1.h +static const EVP_MD md5_sha1_md = { + NID_md5_sha1, + NID_md5_sha1, + MD5_DIGEST_LENGTH, + 0, + EVP_ORIG_GLOBAL, + LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL, MD5_CBLOCK), +}; + +const EVP_MD *EVP_md5_sha1(void) +{ + return NULL; +} + +// Used for s3_cbc.c +void MD5_Transform (MD5_CTX *c, const unsigned char *b) { + return; +} + +int MD5_Init(MD5_CTX *c) { + return 1; +} diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c b/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c new file mode 100644 index 00000000000..e24638b0681 --- /dev/null +++ b/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c @@ -0,0 +1,47 @@ +/* + * Copyright 2017-2021 The OpenSSL Project Authors. All Rights Reserved. + * Copyright 2017 Ribose Inc. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +// Copied from sm3_legacy.c + +#include "crypto/evp.h" + #include "openssl/crypto/evp/legacy_meth.h" + #include "internal/sm3.h" + +static int init(EVP_MD_CTX *ctx) +{ + return 1; +} + +static int update(EVP_MD_CTX *ctx, const void *data, size_t count) +{ + return 1; +} + +static int final(EVP_MD_CTX *ctx, unsigned char *md) +{ + return 1; +} + +IMPLEMENT_LEGACY_EVP_MD_METH_LC(sm3_int, ossl_sm3) + +static const EVP_MD sm3_md = { + NID_sm3, + NID_sm3WithRSAEncryption, + SM3_DIGEST_LENGTH, + 0, + EVP_ORIG_GLOBAL, + LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL, + SM3_CBLOCK), +}; + +const EVP_MD *EVP_sm3(void) +{ + return NULL; +} diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c b/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c index 40ab7e937c6..09ec2c942a4 100644 --- a/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c +++ b/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c @@ -113,17 +113,21 @@ static const OSSL_ALGORITHM deflt_digests[] = { { PROV_NAMES_SHA2_512, "provider=default", ossl_sha512_functions }, #ifndef OPENSSL_NO_SM3 + // MU_CHANGE START { PROV_NAMES_SM3, "provider=default", ossl_sm3_functions }, + // MU_CHANGE END #endif /* OPENSSL_NO_SM3 */ #ifndef OPENSSL_NO_MD5 + // MU_CHANGE START { PROV_NAMES_MD5, "provider=default", ossl_md5_functions }, + // MU_CHANGE END #endif /* OPENSSL_NO_MD5 */ { PROV_NAMES_NULL, "provider=default", ossl_nullmd_functions }, { NULL, NULL, NULL } }; - +// MU_CHANGE START static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { ALG(PROV_NAMES_NULL, ossl_null_functions), ALG(PROV_NAMES_AES_256_ECB, ossl_aes256ecb_functions), @@ -144,6 +148,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { { { NULL,
NULL, NULL }, NULL } }; static OSSL_ALGORITHM exported_ciphers[OSSL_NELEM(deflt_ciphers)]; +// MU_CHANGE END static const OSSL_ALGORITHM deflt_macs[] = { { PROV_NAMES_HMAC, "provider=default", ossl_hmac_functions }, @@ -161,10 +166,14 @@ static const OSSL_ALGORITHM deflt_kdfs[] = { static const OSSL_ALGORITHM deflt_keyexch[] = { #ifndef OPENSSL_NO_DH + // MU_CHANGE start - disable DH { PROV_NAMES_DH, "provider=default", ossl_dh_keyexch_functions }, + // MU_CHANGE end - disable DH #endif #ifndef OPENSSL_NO_EC + // MU_CHANGE start - disable DH { PROV_NAMES_ECDH, "provider=default", ossl_ecdh_keyexch_functions }, + // MU_CHANGE end - disable DH #endif { PROV_NAMES_TLS1_PRF, "provider=default", ossl_kdf_tls1_prf_keyexch_functions }, { PROV_NAMES_HKDF, "provider=default", ossl_kdf_hkdf_keyexch_functions }, @@ -193,10 +202,12 @@ static const OSSL_ALGORITHM deflt_asym_cipher[] = { static const OSSL_ALGORITHM deflt_keymgmt[] = { #ifndef OPENSSL_NO_DH + // MU_CHANGE start - disable DH { PROV_NAMES_DH, "provider=default", ossl_dh_keymgmt_functions, PROV_DESCS_DH }, { PROV_NAMES_DHX, "provider=default", ossl_dhx_keymgmt_functions, PROV_DESCS_DHX }, + // MU_CHANGE end - disable DH #endif { PROV_NAMES_RSA, "provider=default", ossl_rsa_keymgmt_functions, @@ -230,7 +241,10 @@ static const OSSL_ALGORITHM *deflt_query(void *provctx, int operation_id, case OSSL_OP_DIGEST: return deflt_digests; case OSSL_OP_CIPHER: + // MU_CHANGE START return exported_ciphers; + //return NULL; + // MU_CHANGE END case OSSL_OP_MAC: return deflt_macs; case OSSL_OP_KDF: @@ -322,7 +336,9 @@ int ossl_uefi_provider_init(const OSSL_CORE_HANDLE *handle, ossl_prov_ctx_set0_core_bio_method(*provctx, corebiometh); *out = deflt_dispatch_table; + // MU_CHANGE START ossl_prov_cache_exported_algorithms(deflt_ciphers, exported_ciphers); + // MU_CHANGE END return 1; } diff --git a/CryptoPkg/Library/OpensslLib/configure.py b/CryptoPkg/Library/OpensslLib/configure.py index 4243ca4c257..4d792fc9c8c 100755 --- a/CryptoPkg/Library/OpensslLib/configure.py +++ b/CryptoPkg/Library/OpensslLib/configure.py @@ -35,6 +35,7 @@ def openssl_configure(openssldir, target, ec = True): 'no-deprecated', 'no-des', 'no-dgram', + 'no-dh', 'no-dsa', 'no-dso', 'no-dtls', @@ -73,6 +74,7 @@ def openssl_configure(openssldir, target, ec = True): 'no-siphash', 'no-siv', 'no-sm2', + 'no-sm3', 'no-sm4', 'no-sock', 'no-srp',