diff --git a/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf
index 39395b71b84..34a17d84bdd 100644
--- a/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf
+++ b/CryptoPkg/Library/BaseCryptLib/BaseCryptLib.inf
@@ -34,7 +34,7 @@
Hash/CryptSha1.c
Hash/CryptSha256.c
Hash/CryptSha512.c
- Hash/CryptSm3.c
+ Hash/CryptSm3Null.c ## Temp change
Hash/CryptSha3.c
Hash/CryptXkcp.c
Hash/CryptCShake256.c
@@ -42,7 +42,7 @@
Hash/CryptDispatchApDxe.c
Hmac/CryptHmac.c
Kdf/CryptHkdf.c
- Cipher/CryptAes.c
+ Cipher/CryptAesNull.c ## Temp change
Cipher/CryptAeadAesGcm.c
Pk/CryptRsaBasic.c
Pk/CryptRsaExt.c
@@ -52,7 +52,7 @@
Pk/CryptPkcs7VerifyCommon.c
Pk/CryptPkcs7VerifyBase.c
Pk/CryptPkcs7VerifyEku.c
- Pk/CryptDh.c
+ Pk/CryptDhNull.c
Pk/CryptX509.c
Pk/CryptAuthenticode.c
Pk/CryptTs.c
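
Note on the swapped-in sources: the hunks above (and the matching PEI/SMM hunks below) only change which file is built; the Null sources themselves are not part of this diff. Going by the Null-source convention already used in these libraries (for example Hash/CryptMd5Null.c, visible in the PEI and SMM file lists below), a Null variant keeps the public BaseCryptLib prototypes and refuses every call. The following is a minimal illustrative sketch of that pattern for SM3, assuming the standard Sm3* prototypes from <Library/BaseCryptLib.h>; it is not the actual contents of Hash/CryptSm3Null.c:

  /** Illustrative Null-pattern stub for the SM3 hash API (sketch only). **/
  #include <Library/BaseCryptLib.h>
  #include <Library/DebugLib.h>

  UINTN
  EFIAPI
  Sm3GetContextSize (
    VOID
    )
  {
    ASSERT (FALSE);   // SM3 is intentionally unavailable in this build
    return 0;
  }

  BOOLEAN
  EFIAPI
  Sm3Init (
    OUT VOID  *Sm3Context
    )
  {
    ASSERT (FALSE);
    return FALSE;
  }

  BOOLEAN
  EFIAPI
  Sm3HashAll (
    IN CONST VOID  *Data,
    IN UINTN       DataSize,
    OUT UINT8      *HashValue
    )
  {
    ASSERT (FALSE);
    return FALSE;     // callers see SM3 as unsupported; HashValue is untouched
  }

  // Sm3Duplicate (), Sm3Update () and Sm3Final () would follow the same
  // ASSERT (FALSE) / return FALSE pattern.
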
diff --git a/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf
index 7ae3c55de3a..840706acc80 100644
--- a/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf
+++ b/CryptoPkg/Library/BaseCryptLib/PeiCryptLib.inf
@@ -38,7 +38,7 @@
Hash/CryptMd5Null.c ## MS_CHANGE_162948 MSChange - Remove support for deprecated crypto.
Hash/CryptSha1.c
Hash/CryptSha256.c
- Hash/CryptSm3.c
+ Hash/CryptSm3Null.c
Hash/CryptSha512.c
Hash/CryptSha3.c
Hash/CryptXkcp.c
diff --git a/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf b/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf
index f84290db371..fb16f52bca6 100644
--- a/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf
+++ b/CryptoPkg/Library/BaseCryptLib/SmmCryptLib.inf
@@ -36,7 +36,7 @@
Hash/CryptMd5Null.c ## MS_CHANGE_162948 - MSChange - Remove support for deprecated crypto.
Hash/CryptSha1.c
Hash/CryptSha256.c
- Hash/CryptSm3.c
+ Hash/CryptSm3Null.c
Hash/CryptSha512.c
Hash/CryptSha3.c
Hash/CryptXkcp.c
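
With all three library instances (DXE, PEI, SMM) now resolving the SM3 entry points to the Null source, any consumer that still calls into SM3 gets a FALSE return instead of a digest. The sketch below shows the caller-side guard that implies; the wrapper name ComputeSm3Digest and the EFI_UNSUPPORTED mapping are hypothetical and not taken from this diff:

  #include <Uefi.h>
  #include <Library/BaseCryptLib.h>

  //
  // Hypothetical helper: hash a buffer with SM3, surfacing the Null-library
  // case as "unsupported" instead of trusting an unwritten digest buffer.
  //
  EFI_STATUS
  ComputeSm3Digest (
    IN  CONST VOID  *Data,
    IN  UINTN       DataSize,
    OUT UINT8       *Digest      // 32-byte SM3-256 digest buffer
    )
  {
    //
    // With Hash/CryptSm3Null.c linked in, Sm3HashAll () fails unconditionally
    // and leaves Digest untouched, so the return status must be checked.
    //
    if (!Sm3HashAll (Data, DataSize, Digest)) {
      return EFI_UNSUPPORTED;
    }

    return EFI_SUCCESS;
  }
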
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S
index 9cfe5a46603..f7bae3560ec 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha1-586.S
@@ -27,11 +27,6 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
- andl $268435456,%edx
- andl $1073741824,%eax
- orl %edx,%eax
- cmpl $1342177280,%eax
- je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -2799,1181 +2794,6 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
-.type _sha1_block_data_order_avx,@function
-.align 16
-_sha1_block_data_order_avx:
- #ifdef __CET__
-
-.byte 243,15,30,251
- #endif
-
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- call .L008pic_point
-.L008pic_point:
- popl %ebp
- leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
-.Lavx_shortcut:
- vzeroall
- vmovdqa (%ebp),%xmm7
- vmovdqa 16(%ebp),%xmm0
- vmovdqa 32(%ebp),%xmm1
- vmovdqa 48(%ebp),%xmm2
- vmovdqa 64(%ebp),%xmm6
- movl 20(%esp),%edi
- movl 24(%esp),%ebp
- movl 28(%esp),%edx
- movl %esp,%esi
- subl $208,%esp
- andl $-64,%esp
- vmovdqa %xmm0,112(%esp)
- vmovdqa %xmm1,128(%esp)
- vmovdqa %xmm2,144(%esp)
- shll $6,%edx
- vmovdqa %xmm7,160(%esp)
- addl %ebp,%edx
- vmovdqa %xmm6,176(%esp)
- addl $64,%ebp
- movl %edi,192(%esp)
- movl %ebp,196(%esp)
- movl %edx,200(%esp)
- movl %esi,204(%esp)
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- movl 12(%edi),%edx
- movl 16(%edi),%edi
- movl %ebx,%esi
- vmovdqu -64(%ebp),%xmm0
- vmovdqu -48(%ebp),%xmm1
- vmovdqu -32(%ebp),%xmm2
- vmovdqu -16(%ebp),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vmovdqa %xmm7,96(%esp)
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm7,%xmm0,%xmm4
- vpaddd %xmm7,%xmm1,%xmm5
- vpaddd %xmm7,%xmm2,%xmm6
- vmovdqa %xmm4,(%esp)
- movl %ecx,%ebp
- vmovdqa %xmm5,16(%esp)
- xorl %edx,%ebp
- vmovdqa %xmm6,32(%esp)
- andl %ebp,%esi
- jmp .L009loop
-.align 16
-.L009loop:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%ebp
- addl (%esp),%edi
- vpaddd %xmm3,%xmm7,%xmm7
- vmovdqa %xmm0,64(%esp)
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%edi
- vpxor %xmm2,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vmovdqa %xmm7,48(%esp)
- movl %edi,%esi
- addl 4(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- addl %ebp,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm6
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm0
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%ebp
- addl 8(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm0,%xmm7
- vpor %xmm6,%xmm4,%xmm4
- addl %esi,%ecx
- andl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- vpslld $2,%xmm0,%xmm0
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vpxor %xmm7,%xmm4,%xmm4
- movl %ecx,%esi
- addl 12(%esp),%ebx
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpxor %xmm0,%xmm4,%xmm4
- addl %ebp,%ebx
- andl %edx,%esi
- vmovdqa 96(%esp),%xmm0
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%ebp
- addl 16(%esp),%eax
- vpaddd %xmm4,%xmm0,%xmm0
- vmovdqa %xmm1,80(%esp)
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vmovdqa %xmm0,(%esp)
- movl %eax,%esi
- addl 20(%esp),%edi
- vpxor %xmm7,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %ebp,%edi
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm7
- xorl %ecx,%ebx
- addl %eax,%edi
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm1
- vpaddd %xmm5,%xmm5,%xmm5
- movl %edi,%ebp
- addl 24(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm0
- vpor %xmm7,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpxor %xmm0,%xmm5,%xmm5
- movl %edx,%esi
- addl 28(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpxor %xmm1,%xmm5,%xmm5
- addl %ebp,%ecx
- andl %edi,%esi
- vmovdqa 112(%esp),%xmm1
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%ebp
- addl 32(%esp),%ebx
- vpaddd %xmm5,%xmm1,%xmm1
- vmovdqa %xmm2,96(%esp)
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm0
- addl %esi,%ebx
- andl %edx,%ebp
- vpxor %xmm2,%xmm6,%xmm6
- xorl %edi,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%ecx,%ecx
- xorl %edi,%ebp
- vmovdqa %xmm1,16(%esp)
- movl %ebx,%esi
- addl 36(%esp),%eax
- vpxor %xmm0,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm0
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm2
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%ebp
- addl 40(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm1
- vpor %xmm0,%xmm6,%xmm6
- addl %esi,%edi
- andl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- vmovdqa 64(%esp),%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%ebp
- vpxor %xmm1,%xmm6,%xmm6
- movl %edi,%esi
- addl 44(%esp),%edx
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vpxor %xmm2,%xmm6,%xmm6
- addl %ebp,%edx
- andl %eax,%esi
- vmovdqa 112(%esp),%xmm2
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%ebp
- addl 48(%esp),%ecx
- vpaddd %xmm6,%xmm2,%xmm2
- vmovdqa %xmm3,64(%esp)
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm1
- addl %esi,%ecx
- andl %edi,%ebp
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%edi
- addl %edx,%ecx
- vpxor %xmm5,%xmm1,%xmm1
- shrdl $7,%edx,%edx
- xorl %eax,%ebp
- vmovdqa %xmm2,32(%esp)
- movl %ecx,%esi
- addl 52(%esp),%ebx
- vpxor %xmm1,%xmm7,%xmm7
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm1
- xorl %edi,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- vpslldq $12,%xmm7,%xmm3
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%ebp
- addl 56(%esp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm2
- vpor %xmm1,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- vmovdqa 80(%esp),%xmm1
- shrdl $7,%ebx,%ebx
- xorl %edx,%ebp
- vpxor %xmm2,%xmm7,%xmm7
- movl %eax,%esi
- addl 60(%esp),%edi
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpxor %xmm3,%xmm7,%xmm7
- addl %ebp,%edi
- andl %ebx,%esi
- vmovdqa 112(%esp),%xmm3
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %edi,%ebp
- addl (%esp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,80(%esp)
- xorl %ebx,%eax
- shldl $5,%edi,%edi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- addl %esi,%edx
- andl %eax,%ebp
- vpxor %xmm2,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %edi,%edx
- shrdl $7,%edi,%edi
- xorl %ebx,%ebp
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- movl %edx,%esi
- addl 4(%esp),%ecx
- xorl %eax,%edi
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %ebp,%ecx
- andl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%ebp
- addl 8(%esp),%ebx
- vpor %xmm2,%xmm0,%xmm0
- xorl %edi,%edx
- shldl $5,%ecx,%ecx
- vmovdqa 96(%esp),%xmm2
- addl %esi,%ebx
- andl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 12(%esp),%eax
- xorl %edi,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,96(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm3,%xmm1,%xmm1
- addl 20(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm3,%xmm1,%xmm1
- addl 28(%esp),%ebx
- xorl %edi,%ebp
- vmovdqa 64(%esp),%xmm3
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,64(%esp)
- addl %esi,%eax
- xorl %edx,%ebp
- vmovdqa 128(%esp),%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm4,%xmm2,%xmm2
- addl 36(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpslld $2,%xmm2,%xmm2
- addl 40(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpor %xmm4,%xmm2,%xmm2
- addl 44(%esp),%ecx
- xorl %eax,%ebp
- vmovdqa 80(%esp),%xmm4
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,80(%esp)
- addl %esi,%ebx
- xorl %edi,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%edx
- xorl %ebx,%ebp
- vmovdqa 96(%esp),%xmm5
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm6
- vpxor %xmm0,%xmm4,%xmm4
- addl (%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- vmovdqa %xmm0,96(%esp)
- addl %esi,%ecx
- xorl %eax,%ebp
- vmovdqa %xmm7,%xmm0
- vpaddd %xmm3,%xmm7,%xmm7
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpxor %xmm6,%xmm4,%xmm4
- addl 4(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm6
- vmovdqa %xmm7,48(%esp)
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm6,%xmm4,%xmm4
- addl 12(%esp),%edi
- xorl %ecx,%ebp
- vmovdqa 64(%esp),%xmm6
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpxor %xmm6,%xmm5,%xmm5
- vmovdqa %xmm1,64(%esp)
- addl %esi,%edx
- xorl %ebx,%ebp
- vmovdqa %xmm0,%xmm1
- vpaddd %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpxor %xmm7,%xmm5,%xmm5
- addl 20(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm7
- vmovdqa %xmm0,(%esp)
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm7,%xmm5,%xmm5
- addl 28(%esp),%eax
- vmovdqa 80(%esp),%xmm7
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%esp),%edi
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- vmovdqa %xmm2,80(%esp)
- movl %eax,%ebp
- xorl %ecx,%esi
- vmovdqa %xmm1,%xmm2
- vpaddd %xmm5,%xmm1,%xmm1
- shldl $5,%eax,%eax
- addl %esi,%edi
- vpxor %xmm0,%xmm6,%xmm6
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 36(%esp),%edx
- vpsrld $30,%xmm6,%xmm0
- vmovdqa %xmm1,16(%esp)
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- addl 40(%esp),%ecx
- andl %eax,%esi
- vpor %xmm0,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vmovdqa 96(%esp),%xmm0
- movl %edx,%ebp
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 44(%esp),%ebx
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm1
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%esp),%eax
- andl %edx,%esi
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- vmovdqa %xmm3,96(%esp)
- movl %ebx,%ebp
- xorl %edx,%esi
- vmovdqa 144(%esp),%xmm3
- vpaddd %xmm6,%xmm2,%xmm2
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%esp),%edi
- vpsrld $30,%xmm7,%xmm1
- vmovdqa %xmm2,32(%esp)
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 56(%esp),%edx
- andl %ebx,%esi
- vpor %xmm1,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vmovdqa 64(%esp),%xmm1
- movl %edi,%ebp
- xorl %ebx,%esi
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 60(%esp),%ecx
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm2
- vpxor %xmm4,%xmm0,%xmm0
- addl (%esp),%ebx
- andl %edi,%esi
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- vmovdqa %xmm4,64(%esp)
- movl %ecx,%ebp
- xorl %edi,%esi
- vmovdqa %xmm3,%xmm4
- vpaddd %xmm7,%xmm3,%xmm3
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm2,%xmm0,%xmm0
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 4(%esp),%eax
- vpsrld $30,%xmm0,%xmm2
- vmovdqa %xmm3,48(%esp)
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%esp),%edi
- andl %ecx,%esi
- vpor %xmm2,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vmovdqa 80(%esp),%xmm2
- movl %eax,%ebp
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ebx,%ebp
- xorl %ecx,%ebx
- addl %eax,%edi
- addl 12(%esp),%edx
- andl %ebx,%ebp
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %edi,%esi
- xorl %ebx,%ebp
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %edi,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm3
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%esp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- vpxor %xmm2,%xmm1,%xmm1
- vmovdqa %xmm5,80(%esp)
- movl %edx,%ebp
- xorl %eax,%esi
- vmovdqa %xmm4,%xmm5
- vpaddd %xmm0,%xmm4,%xmm4
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm3,%xmm1,%xmm1
- xorl %edi,%ebp
- xorl %eax,%edi
- addl %edx,%ecx
- addl 20(%esp),%ebx
- vpsrld $30,%xmm1,%xmm3
- vmovdqa %xmm4,(%esp)
- andl %edi,%ebp
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %edi,%ebp
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edx,%esi
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 24(%esp),%eax
- andl %edx,%esi
- vpor %xmm3,%xmm1,%xmm1
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- vmovdqa 96(%esp),%xmm3
- movl %ebx,%ebp
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%ebp
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%esp),%edi
- andl %ecx,%ebp
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%ebp
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%edi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%esp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- vmovdqa %xmm6,96(%esp)
- movl %edi,%ebp
- xorl %ebx,%esi
- vmovdqa %xmm5,%xmm6
- vpaddd %xmm1,%xmm5,%xmm5
- shldl $5,%edi,%edi
- addl %esi,%edx
- vpxor %xmm4,%xmm2,%xmm2
- xorl %eax,%ebp
- xorl %ebx,%eax
- addl %edi,%edx
- addl 36(%esp),%ecx
- vpsrld $30,%xmm2,%xmm4
- vmovdqa %xmm5,16(%esp)
- andl %eax,%ebp
- xorl %ebx,%eax
- shrdl $7,%edi,%edi
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %edi,%esi
- xorl %eax,%edi
- addl %edx,%ecx
- addl 40(%esp),%ebx
- andl %edi,%esi
- vpor %xmm4,%xmm2,%xmm2
- xorl %eax,%edi
- shrdl $7,%edx,%edx
- vmovdqa 64(%esp),%xmm4
- movl %ecx,%ebp
- xorl %edi,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%ebp
- xorl %edi,%edx
- addl %ecx,%ebx
- addl 44(%esp),%eax
- andl %edx,%ebp
- xorl %edi,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%ebp
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm5
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- vmovdqa %xmm7,64(%esp)
- addl %esi,%edi
- xorl %ecx,%ebp
- vmovdqa %xmm6,%xmm7
- vpaddd %xmm2,%xmm6,%xmm6
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- vpxor %xmm5,%xmm3,%xmm3
- addl 52(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- vpsrld $30,%xmm3,%xmm5
- vmovdqa %xmm6,32(%esp)
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vpor %xmm5,%xmm3,%xmm3
- addl 60(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl (%esp),%eax
- vpaddd %xmm3,%xmm7,%xmm7
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm7,48(%esp)
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 8(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 12(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- movl 196(%esp),%ebp
- cmpl 200(%esp),%ebp
- je .L010done
- vmovdqa 160(%esp),%xmm7
- vmovdqa 176(%esp),%xmm6
- vmovdqu (%ebp),%xmm0
- vmovdqu 16(%ebp),%xmm1
- vmovdqu 32(%ebp),%xmm2
- vmovdqu 48(%ebp),%xmm3
- addl $64,%ebp
- vpshufb %xmm6,%xmm0,%xmm0
- movl %ebp,196(%esp)
- vmovdqa %xmm7,96(%esp)
- addl 16(%esp),%ebx
- xorl %edi,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- vpaddd %xmm7,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,(%esp)
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%ebp
- shldl $5,%edx,%edx
- vpaddd %xmm7,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- vmovdqa %xmm5,16(%esp)
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %edi,%ebp
- shldl $5,%edi,%edi
- vpaddd %xmm7,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- vmovdqa %xmm6,32(%esp)
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,%ebx
- movl %ecx,8(%ebp)
- xorl %edx,%ebx
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- movl %esi,%ebp
- andl %ebx,%esi
- movl %ebp,%ebx
- jmp .L009loop
-.align 16
-.L010done:
- addl 16(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%esp),%edi
- xorl %ecx,%esi
- movl %eax,%ebp
- shldl $5,%eax,%eax
- addl %esi,%edi
- xorl %ecx,%ebp
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 28(%esp),%edx
- xorl %ebx,%ebp
- movl %edi,%esi
- shldl $5,%edi,%edi
- addl %ebp,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 32(%esp),%ecx
- xorl %eax,%esi
- movl %edx,%ebp
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%ebp
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 36(%esp),%ebx
- xorl %edi,%ebp
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %ebp,%ebx
- xorl %edi,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%esp),%eax
- xorl %edx,%esi
- movl %ebx,%ebp
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%ebp
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%esp),%edi
- xorl %ecx,%ebp
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %ebp,%edi
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%edi
- addl 48(%esp),%edx
- xorl %ebx,%esi
- movl %edi,%ebp
- shldl $5,%edi,%edi
- addl %esi,%edx
- xorl %ebx,%ebp
- shrdl $7,%eax,%eax
- addl %edi,%edx
- addl 52(%esp),%ecx
- xorl %eax,%ebp
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %ebp,%ecx
- xorl %eax,%esi
- shrdl $7,%edi,%edi
- addl %edx,%ecx
- addl 56(%esp),%ebx
- xorl %edi,%esi
- movl %ecx,%ebp
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edi,%ebp
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%esp),%eax
- xorl %edx,%ebp
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %ebp,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroall
- movl 192(%esp),%ebp
- addl (%ebp),%eax
- movl 204(%esp),%esp
- addl 4(%ebp),%esi
- addl 8(%ebp),%ecx
- movl %eax,(%ebp)
- addl 12(%ebp),%edx
- movl %esi,4(%ebp)
- addl 16(%ebp),%edi
- movl %ecx,8(%ebp)
- movl %edx,12(%ebp)
- movl %edi,16(%ebp)
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S
index 9253ab18d0d..5f515ac8835 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-GCC/crypto/sha/sha256-586.S
@@ -44,13 +44,12 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
- je .L005AVX
testl $512,%ebx
- jnz .L006SSSE3
+ jnz .L005SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L007unrolled
+ jae .L006unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -122,7 +121,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00800_15:
+.L00700_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -160,11 +159,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00800_15
+ jne .L00700_15
movl 156(%esp),%ecx
- jmp .L00916_63
+ jmp .L00816_63
.align 16
-.L00916_63:
+.L00816_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -219,7 +218,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00916_63
+ jne .L00816_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -263,7 +262,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L007unrolled:
+.L006unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -280,9 +279,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L010grand_loop
+ jmp .L009grand_loop
.align 16
-.L010grand_loop:
+.L009grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3162,7 +3161,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L010grand_loop
+ jb .L009grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -3181,9 +3180,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L011loop_shaext
+ jmp .L010loop_shaext
.align 16
-.L011loop_shaext:
+.L010loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -3353,7 +3352,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L011loop_shaext
+ jnz .L010loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -3368,7 +3367,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L006SSSE3:
+.L005SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3387,9 +3386,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L012grand_ssse3
+ jmp .L011grand_ssse3
.align 16
-.L012grand_ssse3:
+.L011grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -3412,9 +3411,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L013ssse3_00_47
+ jmp .L012ssse3_00_47
.align 16
-.L013ssse3_00_47:
+.L012ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -4057,7 +4056,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L013ssse3_00_47
+ jne .L012ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -4571,2218 +4570,13 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L012grand_ssse3
+ jb .L011grand_ssse3
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.align 32
-.L005AVX:
- andl $264,%edx
- cmpl $264,%edx
- je .L014AVX_BMI
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp .L015grand_avx
-.align 32
-.L015grand_avx:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp .L016avx_00_47
-.align 16
-.L016avx_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm0,%xmm0
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm0,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm1,%xmm1
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm1,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- vpaddd %xmm4,%xmm2,%xmm2
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm2,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrld $3,%xmm4,%xmm7
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- vpslld $14,%xmm4,%xmm5
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpsrld $11,%xmm6,%xmm6
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpxor %xmm5,%xmm4,%xmm4
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- vpslld $11,%xmm5,%xmm5
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- vpsrld $10,%xmm7,%xmm6
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- vpaddd %xmm4,%xmm3,%xmm3
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- vpxor %xmm5,%xmm6,%xmm6
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- vpsrlq $19,%xmm7,%xmm7
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- vpshufd $132,%xmm6,%xmm7
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- vpsrldq $8,%xmm7,%xmm7
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- vpshufd $80,%xmm3,%xmm7
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- vpsrld $10,%xmm7,%xmm6
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- vpsrlq $17,%xmm7,%xmm5
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- vpxor %xmm5,%xmm6,%xmm6
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- vpsrlq $19,%xmm7,%xmm7
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- vpshufd $232,%xmm6,%xmm7
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- vpslldq $8,%xmm7,%xmm7
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne .L016avx_00_47
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 36(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 44(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 52(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 60(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 20(%esp),%esi
- xorl %ecx,%edx
- movl 24(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,16(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 4(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 12(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 16(%esp),%esi
- xorl %ecx,%edx
- movl 20(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,12(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl (%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,28(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 68(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 8(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 12(%esp),%esi
- xorl %ecx,%edx
- movl 16(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,8(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 28(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,24(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 4(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 8(%esp),%esi
- xorl %ecx,%edx
- movl 12(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,4(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 24(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,20(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 76(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl (%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 4(%esp),%esi
- xorl %ecx,%edx
- movl 8(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 20(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,16(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 28(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl (%esp),%esi
- xorl %ecx,%edx
- movl 4(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,28(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 16(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,12(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 84(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 24(%esp),%edx
- addl %ecx,%eax
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 28(%esp),%esi
- xorl %ecx,%edx
- movl (%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,24(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %edi,%edx
- movl 12(%esp),%edi
- movl %eax,%esi
- shrdl $9,%ecx,%ecx
- movl %eax,8(%esp)
- xorl %eax,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %eax,%ebx
- xorl %esi,%ecx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- shrdl $2,%ecx,%ecx
- addl %edx,%ebx
- addl 20(%esp),%edx
- addl %ecx,%ebx
- movl %edx,%ecx
- shrdl $14,%edx,%edx
- movl 24(%esp),%esi
- xorl %ecx,%edx
- movl 28(%esp),%edi
- xorl %edi,%esi
- shrdl $5,%edx,%edx
- andl %ecx,%esi
- movl %ecx,20(%esp)
- xorl %ecx,%edx
- xorl %esi,%edi
- shrdl $6,%edx,%edx
- movl %ebx,%ecx
- addl %edi,%edx
- movl 8(%esp),%edi
- movl %ebx,%esi
- shrdl $9,%ecx,%ecx
- movl %ebx,4(%esp)
- xorl %ebx,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- shrdl $11,%ecx,%ecx
- andl %ebx,%eax
- xorl %esi,%ecx
- addl 92(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %edx,%eax
- addl 16(%esp),%edx
- addl %ecx,%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb .L015grand_avx
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
-.align 32
-.L014AVX_BMI:
- leal -96(%esp),%esp
- vzeroall
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,4(%esp)
- xorl %ecx,%ebx
- movl %ecx,8(%esp)
- movl %edi,12(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%edi
- movl 24(%esi),%ecx
- movl 28(%esi),%esi
- movl %edi,20(%esp)
- movl 100(%esp),%edi
- movl %ecx,24(%esp)
- movl %esi,28(%esp)
- vmovdqa 256(%ebp),%xmm7
- jmp .L017grand_avx_bmi
-.align 32
-.L017grand_avx_bmi:
- vmovdqu (%edi),%xmm0
- vmovdqu 16(%edi),%xmm1
- vmovdqu 32(%edi),%xmm2
- vmovdqu 48(%edi),%xmm3
- addl $64,%edi
- vpshufb %xmm7,%xmm0,%xmm0
- movl %edi,100(%esp)
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd (%ebp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 16(%ebp),%xmm1,%xmm5
- vpaddd 32(%ebp),%xmm2,%xmm6
- vpaddd 48(%ebp),%xmm3,%xmm7
- vmovdqa %xmm4,32(%esp)
- vmovdqa %xmm5,48(%esp)
- vmovdqa %xmm6,64(%esp)
- vmovdqa %xmm7,80(%esp)
- jmp .L018avx_bmi_00_47
-.align 16
-.L018avx_bmi_00_47:
- addl $64,%ebp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm0,%xmm0
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- vpshufd $250,%xmm3,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- vpaddd %xmm4,%xmm0,%xmm0
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm0,%xmm0
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm0,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm0,%xmm0
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- vpaddd (%ebp),%xmm0,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,32(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm1,%xmm1
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- vpshufd $250,%xmm0,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- vpaddd %xmm4,%xmm1,%xmm1
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm1,%xmm1
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm1,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm1,%xmm1
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- vpaddd 16(%ebp),%xmm1,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,48(%esp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- vpalignr $4,%xmm0,%xmm1,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- vpaddd %xmm7,%xmm2,%xmm2
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- vpshufd $250,%xmm1,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- vpaddd %xmm4,%xmm2,%xmm2
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm2,%xmm2
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm2,%xmm7
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm2,%xmm2
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- vpaddd 32(%ebp),%xmm2,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,64(%esp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- vpalignr $4,%xmm1,%xmm2,%xmm7
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- vpsrld $7,%xmm4,%xmm6
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- vpaddd %xmm7,%xmm3,%xmm3
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrld $3,%xmm4,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpslld $14,%xmm4,%xmm5
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpxor %xmm6,%xmm7,%xmm4
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- vpshufd $250,%xmm2,%xmm7
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- vpsrld $11,%xmm6,%xmm6
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpslld $11,%xmm5,%xmm5
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- vpxor %xmm6,%xmm4,%xmm4
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpsrld $10,%xmm7,%xmm6
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpxor %xmm5,%xmm4,%xmm4
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpsrlq $17,%xmm7,%xmm5
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- vpaddd %xmm4,%xmm3,%xmm3
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- vpxor %xmm5,%xmm6,%xmm6
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpsrlq $19,%xmm7,%xmm7
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- vpshufd $132,%xmm6,%xmm7
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- vpsrldq $8,%xmm7,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- vpaddd %xmm7,%xmm3,%xmm3
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- vpshufd $80,%xmm3,%xmm7
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- vpsrld $10,%xmm7,%xmm6
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- vpsrlq $17,%xmm7,%xmm5
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- vpxor %xmm5,%xmm6,%xmm6
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- vpsrlq $19,%xmm7,%xmm7
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- vpxor %xmm7,%xmm6,%xmm6
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- vpshufd $232,%xmm6,%xmm7
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- vpslldq $8,%xmm7,%xmm7
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- vpaddd %xmm7,%xmm3,%xmm3
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- vpaddd 48(%ebp),%xmm3,%xmm6
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- vmovdqa %xmm6,80(%esp)
- cmpl $66051,64(%ebp)
- jne .L018avx_bmi_00_47
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 32(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 36(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 40(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 44(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 48(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 52(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 56(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 60(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,16(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 24(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 20(%esp),%edx
- movl %eax,(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 4(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 28(%esp),%edx
- andl %eax,%ebx
- addl 64(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 12(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,12(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 20(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 16(%esp),%edx
- movl %ebx,28(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl (%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 24(%esp),%edx
- andl %ebx,%eax
- addl 68(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 8(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,8(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 16(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 12(%esp),%edx
- movl %eax,24(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 28(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 20(%esp),%edx
- andl %eax,%ebx
- addl 72(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 4(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,4(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 12(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 8(%esp),%edx
- movl %ebx,20(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 24(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 16(%esp),%edx
- andl %ebx,%eax
- addl 76(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl (%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 8(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 4(%esp),%edx
- movl %eax,16(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 20(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 12(%esp),%edx
- andl %eax,%ebx
- addl 80(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 28(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,28(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 4(%esp),%edx,%esi
- xorl %edi,%ecx
- andl (%esp),%edx
- movl %ebx,12(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 16(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl 8(%esp),%edx
- andl %ebx,%eax
- addl 84(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 24(%esp),%edx
- leal (%eax,%ecx,1),%eax
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,24(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl (%esp),%edx,%esi
- xorl %edi,%ecx
- andl 28(%esp),%edx
- movl %eax,8(%esp)
- orl %esi,%edx
- rorxl $2,%eax,%edi
- rorxl $13,%eax,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%eax,%ecx
- xorl %edi,%esi
- movl 12(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%eax
- addl 4(%esp),%edx
- andl %eax,%ebx
- addl 88(%esp),%edx
- xorl %edi,%ebx
- addl %edx,%ecx
- addl 20(%esp),%edx
- leal (%ebx,%ecx,1),%ebx
- rorxl $6,%edx,%ecx
- rorxl $11,%edx,%esi
- movl %edx,20(%esp)
- rorxl $25,%edx,%edi
- xorl %esi,%ecx
- andnl 28(%esp),%edx,%esi
- xorl %edi,%ecx
- andl 24(%esp),%edx
- movl %ebx,4(%esp)
- orl %esi,%edx
- rorxl $2,%ebx,%edi
- rorxl $13,%ebx,%esi
- leal (%edx,%ecx,1),%edx
- rorxl $22,%ebx,%ecx
- xorl %edi,%esi
- movl 8(%esp),%edi
- xorl %esi,%ecx
- xorl %edi,%ebx
- addl (%esp),%edx
- andl %ebx,%eax
- addl 92(%esp),%edx
- xorl %edi,%eax
- addl %edx,%ecx
- addl 16(%esp),%edx
- leal (%eax,%ecx,1),%eax
- movl 96(%esp),%esi
- xorl %edi,%ebx
- movl 12(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl %ebx,4(%esp)
- xorl %edi,%ebx
- movl %edi,8(%esp)
- movl %ecx,12(%esp)
- movl 20(%esp),%edi
- movl 24(%esp),%ecx
- addl 16(%esi),%edx
- addl 20(%esi),%edi
- addl 24(%esi),%ecx
- movl %edx,16(%esi)
- movl %edi,20(%esi)
- movl %edi,20(%esp)
- movl 28(%esp),%edi
- movl %ecx,24(%esi)
- addl 28(%esi),%edi
- movl %ecx,24(%esp)
- movl %edi,28(%esi)
- movl %edi,28(%esp)
- movl 100(%esp),%edi
- vmovdqa 64(%ebp),%xmm7
- subl $192,%ebp
- cmpl 104(%esp),%edi
- jb .L017grand_avx_bmi
- movl 108(%esp),%esp
- vzeroall
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.comm OPENSSL_ia32cap_P,16,4
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm
index 0d644acce05..ab7e9f25eb3 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha1-586.nasm
@@ -29,11 +29,6 @@ L$000pic_point:
jz NEAR L$001x86
test ecx,536870912
jnz NEAR L$shaext_shortcut
- and edx,268435456
- and eax,1073741824
- or eax,edx
- cmp eax,1342177280
- je NEAR L$avx_shortcut
jmp NEAR L$ssse3_shortcut
align 16
L$001x86:
@@ -2786,1174 +2781,6 @@ L$007done:
pop ebx
pop ebp
ret
-align 16
-__sha1_block_data_order_avx:
- push ebp
- push ebx
- push esi
- push edi
- call L$008pic_point
-L$008pic_point:
- pop ebp
- lea ebp,[(L$K_XX_XX-L$008pic_point)+ebp]
-L$avx_shortcut:
- vzeroall
- vmovdqa xmm7,[ebp]
- vmovdqa xmm0,[16+ebp]
- vmovdqa xmm1,[32+ebp]
- vmovdqa xmm2,[48+ebp]
- vmovdqa xmm6,[64+ebp]
- mov edi,DWORD [20+esp]
- mov ebp,DWORD [24+esp]
- mov edx,DWORD [28+esp]
- mov esi,esp
- sub esp,208
- and esp,-64
- vmovdqa [112+esp],xmm0
- vmovdqa [128+esp],xmm1
- vmovdqa [144+esp],xmm2
- shl edx,6
- vmovdqa [160+esp],xmm7
- add edx,ebp
- vmovdqa [176+esp],xmm6
- add ebp,64
- mov DWORD [192+esp],edi
- mov DWORD [196+esp],ebp
- mov DWORD [200+esp],edx
- mov DWORD [204+esp],esi
- mov eax,DWORD [edi]
- mov ebx,DWORD [4+edi]
- mov ecx,DWORD [8+edi]
- mov edx,DWORD [12+edi]
- mov edi,DWORD [16+edi]
- mov esi,ebx
- vmovdqu xmm0,[ebp-64]
- vmovdqu xmm1,[ebp-48]
- vmovdqu xmm2,[ebp-32]
- vmovdqu xmm3,[ebp-16]
- vpshufb xmm0,xmm0,xmm6
- vpshufb xmm1,xmm1,xmm6
- vpshufb xmm2,xmm2,xmm6
- vmovdqa [96+esp],xmm7
- vpshufb xmm3,xmm3,xmm6
- vpaddd xmm4,xmm0,xmm7
- vpaddd xmm5,xmm1,xmm7
- vpaddd xmm6,xmm2,xmm7
- vmovdqa [esp],xmm4
- mov ebp,ecx
- vmovdqa [16+esp],xmm5
- xor ebp,edx
- vmovdqa [32+esp],xmm6
- and esi,ebp
- jmp NEAR L$009loop
-align 16
-L$009loop:
- shrd ebx,ebx,2
- xor esi,edx
- vpalignr xmm4,xmm1,xmm0,8
- mov ebp,eax
- add edi,DWORD [esp]
- vpaddd xmm7,xmm7,xmm3
- vmovdqa [64+esp],xmm0
- xor ebx,ecx
- shld eax,eax,5
- vpsrldq xmm6,xmm3,4
- add edi,esi
- and ebp,ebx
- vpxor xmm4,xmm4,xmm0
- xor ebx,ecx
- add edi,eax
- vpxor xmm6,xmm6,xmm2
- shrd eax,eax,7
- xor ebp,ecx
- vmovdqa [48+esp],xmm7
- mov esi,edi
- add edx,DWORD [4+esp]
- vpxor xmm4,xmm4,xmm6
- xor eax,ebx
- shld edi,edi,5
- add edx,ebp
- and esi,eax
- vpsrld xmm6,xmm4,31
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor esi,ebx
- vpslldq xmm0,xmm4,12
- vpaddd xmm4,xmm4,xmm4
- mov ebp,edx
- add ecx,DWORD [8+esp]
- xor edi,eax
- shld edx,edx,5
- vpsrld xmm7,xmm0,30
- vpor xmm4,xmm4,xmm6
- add ecx,esi
- and ebp,edi
- xor edi,eax
- add ecx,edx
- vpslld xmm0,xmm0,2
- shrd edx,edx,7
- xor ebp,eax
- vpxor xmm4,xmm4,xmm7
- mov esi,ecx
- add ebx,DWORD [12+esp]
- xor edx,edi
- shld ecx,ecx,5
- vpxor xmm4,xmm4,xmm0
- add ebx,ebp
- and esi,edx
- vmovdqa xmm0,[96+esp]
- xor edx,edi
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,edi
- vpalignr xmm5,xmm2,xmm1,8
- mov ebp,ebx
- add eax,DWORD [16+esp]
- vpaddd xmm0,xmm0,xmm4
- vmovdqa [80+esp],xmm1
- xor ecx,edx
- shld ebx,ebx,5
- vpsrldq xmm7,xmm4,4
- add eax,esi
- and ebp,ecx
- vpxor xmm5,xmm5,xmm1
- xor ecx,edx
- add eax,ebx
- vpxor xmm7,xmm7,xmm3
- shrd ebx,ebx,7
- xor ebp,edx
- vmovdqa [esp],xmm0
- mov esi,eax
- add edi,DWORD [20+esp]
- vpxor xmm5,xmm5,xmm7
- xor ebx,ecx
- shld eax,eax,5
- add edi,ebp
- and esi,ebx
- vpsrld xmm7,xmm5,31
- xor ebx,ecx
- add edi,eax
- shrd eax,eax,7
- xor esi,ecx
- vpslldq xmm1,xmm5,12
- vpaddd xmm5,xmm5,xmm5
- mov ebp,edi
- add edx,DWORD [24+esp]
- xor eax,ebx
- shld edi,edi,5
- vpsrld xmm0,xmm1,30
- vpor xmm5,xmm5,xmm7
- add edx,esi
- and ebp,eax
- xor eax,ebx
- add edx,edi
- vpslld xmm1,xmm1,2
- shrd edi,edi,7
- xor ebp,ebx
- vpxor xmm5,xmm5,xmm0
- mov esi,edx
- add ecx,DWORD [28+esp]
- xor edi,eax
- shld edx,edx,5
- vpxor xmm5,xmm5,xmm1
- add ecx,ebp
- and esi,edi
- vmovdqa xmm1,[112+esp]
- xor edi,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- vpalignr xmm6,xmm3,xmm2,8
- mov ebp,ecx
- add ebx,DWORD [32+esp]
- vpaddd xmm1,xmm1,xmm5
- vmovdqa [96+esp],xmm2
- xor edx,edi
- shld ecx,ecx,5
- vpsrldq xmm0,xmm5,4
- add ebx,esi
- and ebp,edx
- vpxor xmm6,xmm6,xmm2
- xor edx,edi
- add ebx,ecx
- vpxor xmm0,xmm0,xmm4
- shrd ecx,ecx,7
- xor ebp,edi
- vmovdqa [16+esp],xmm1
- mov esi,ebx
- add eax,DWORD [36+esp]
- vpxor xmm6,xmm6,xmm0
- xor ecx,edx
- shld ebx,ebx,5
- add eax,ebp
- and esi,ecx
- vpsrld xmm0,xmm6,31
- xor ecx,edx
- add eax,ebx
- shrd ebx,ebx,7
- xor esi,edx
- vpslldq xmm2,xmm6,12
- vpaddd xmm6,xmm6,xmm6
- mov ebp,eax
- add edi,DWORD [40+esp]
- xor ebx,ecx
- shld eax,eax,5
- vpsrld xmm1,xmm2,30
- vpor xmm6,xmm6,xmm0
- add edi,esi
- and ebp,ebx
- xor ebx,ecx
- add edi,eax
- vpslld xmm2,xmm2,2
- vmovdqa xmm0,[64+esp]
- shrd eax,eax,7
- xor ebp,ecx
- vpxor xmm6,xmm6,xmm1
- mov esi,edi
- add edx,DWORD [44+esp]
- xor eax,ebx
- shld edi,edi,5
- vpxor xmm6,xmm6,xmm2
- add edx,ebp
- and esi,eax
- vmovdqa xmm2,[112+esp]
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor esi,ebx
- vpalignr xmm7,xmm4,xmm3,8
- mov ebp,edx
- add ecx,DWORD [48+esp]
- vpaddd xmm2,xmm2,xmm6
- vmovdqa [64+esp],xmm3
- xor edi,eax
- shld edx,edx,5
- vpsrldq xmm1,xmm6,4
- add ecx,esi
- and ebp,edi
- vpxor xmm7,xmm7,xmm3
- xor edi,eax
- add ecx,edx
- vpxor xmm1,xmm1,xmm5
- shrd edx,edx,7
- xor ebp,eax
- vmovdqa [32+esp],xmm2
- mov esi,ecx
- add ebx,DWORD [52+esp]
- vpxor xmm7,xmm7,xmm1
- xor edx,edi
- shld ecx,ecx,5
- add ebx,ebp
- and esi,edx
- vpsrld xmm1,xmm7,31
- xor edx,edi
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,edi
- vpslldq xmm3,xmm7,12
- vpaddd xmm7,xmm7,xmm7
- mov ebp,ebx
- add eax,DWORD [56+esp]
- xor ecx,edx
- shld ebx,ebx,5
- vpsrld xmm2,xmm3,30
- vpor xmm7,xmm7,xmm1
- add eax,esi
- and ebp,ecx
- xor ecx,edx
- add eax,ebx
- vpslld xmm3,xmm3,2
- vmovdqa xmm1,[80+esp]
- shrd ebx,ebx,7
- xor ebp,edx
- vpxor xmm7,xmm7,xmm2
- mov esi,eax
- add edi,DWORD [60+esp]
- xor ebx,ecx
- shld eax,eax,5
- vpxor xmm7,xmm7,xmm3
- add edi,ebp
- and esi,ebx
- vmovdqa xmm3,[112+esp]
- xor ebx,ecx
- add edi,eax
- vpalignr xmm2,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- shrd eax,eax,7
- xor esi,ecx
- mov ebp,edi
- add edx,DWORD [esp]
- vpxor xmm0,xmm0,xmm1
- vmovdqa [80+esp],xmm4
- xor eax,ebx
- shld edi,edi,5
- vmovdqa xmm4,xmm3
- vpaddd xmm3,xmm3,xmm7
- add edx,esi
- and ebp,eax
- vpxor xmm0,xmm0,xmm2
- xor eax,ebx
- add edx,edi
- shrd edi,edi,7
- xor ebp,ebx
- vpsrld xmm2,xmm0,30
- vmovdqa [48+esp],xmm3
- mov esi,edx
- add ecx,DWORD [4+esp]
- xor edi,eax
- shld edx,edx,5
- vpslld xmm0,xmm0,2
- add ecx,ebp
- and esi,edi
- xor edi,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- mov ebp,ecx
- add ebx,DWORD [8+esp]
- vpor xmm0,xmm0,xmm2
- xor edx,edi
- shld ecx,ecx,5
- vmovdqa xmm2,[96+esp]
- add ebx,esi
- and ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD [12+esp]
- xor ebp,edi
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpalignr xmm3,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add edi,DWORD [16+esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- vpxor xmm1,xmm1,xmm2
- vmovdqa [96+esp],xmm5
- add edi,esi
- xor ebp,ecx
- vmovdqa xmm5,xmm4
- vpaddd xmm4,xmm4,xmm0
- shrd ebx,ebx,7
- add edi,eax
- vpxor xmm1,xmm1,xmm3
- add edx,DWORD [20+esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- vpsrld xmm3,xmm1,30
- vmovdqa [esp],xmm4
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpslld xmm1,xmm1,2
- add ecx,DWORD [24+esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vpor xmm1,xmm1,xmm3
- add ebx,DWORD [28+esp]
- xor ebp,edi
- vmovdqa xmm3,[64+esp]
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- vpalignr xmm4,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add eax,DWORD [32+esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- vpxor xmm2,xmm2,xmm3
- vmovdqa [64+esp],xmm6
- add eax,esi
- xor ebp,edx
- vmovdqa xmm6,[128+esp]
- vpaddd xmm5,xmm5,xmm1
- shrd ecx,ecx,7
- add eax,ebx
- vpxor xmm2,xmm2,xmm4
- add edi,DWORD [36+esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- vpsrld xmm4,xmm2,30
- vmovdqa [16+esp],xmm5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpslld xmm2,xmm2,2
- add edx,DWORD [40+esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- vpor xmm2,xmm2,xmm4
- add ecx,DWORD [44+esp]
- xor ebp,eax
- vmovdqa xmm4,[80+esp]
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- vpalignr xmm5,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebx,DWORD [48+esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- vpxor xmm3,xmm3,xmm4
- vmovdqa [80+esp],xmm7
- add ebx,esi
- xor ebp,edi
- vmovdqa xmm7,xmm6
- vpaddd xmm6,xmm6,xmm2
- shrd edx,edx,7
- add ebx,ecx
- vpxor xmm3,xmm3,xmm5
- add eax,DWORD [52+esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- vpsrld xmm5,xmm3,30
- vmovdqa [32+esp],xmm6
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpslld xmm3,xmm3,2
- add edi,DWORD [56+esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpor xmm3,xmm3,xmm5
- add edx,DWORD [60+esp]
- xor ebp,ebx
- vmovdqa xmm5,[96+esp]
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpalignr xmm6,xmm3,xmm2,8
- vpxor xmm4,xmm4,xmm0
- add ecx,DWORD [esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- vpxor xmm4,xmm4,xmm5
- vmovdqa [96+esp],xmm0
- add ecx,esi
- xor ebp,eax
- vmovdqa xmm0,xmm7
- vpaddd xmm7,xmm7,xmm3
- shrd edi,edi,7
- add ecx,edx
- vpxor xmm4,xmm4,xmm6
- add ebx,DWORD [4+esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- vpsrld xmm6,xmm4,30
- vmovdqa [48+esp],xmm7
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- vpslld xmm4,xmm4,2
- add eax,DWORD [8+esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpor xmm4,xmm4,xmm6
- add edi,DWORD [12+esp]
- xor ebp,ecx
- vmovdqa xmm6,[64+esp]
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- vpalignr xmm7,xmm4,xmm3,8
- vpxor xmm5,xmm5,xmm1
- add edx,DWORD [16+esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- vpxor xmm5,xmm5,xmm6
- vmovdqa [64+esp],xmm1
- add edx,esi
- xor ebp,ebx
- vmovdqa xmm1,xmm0
- vpaddd xmm0,xmm0,xmm4
- shrd eax,eax,7
- add edx,edi
- vpxor xmm5,xmm5,xmm7
- add ecx,DWORD [20+esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- vpsrld xmm7,xmm5,30
- vmovdqa [esp],xmm0
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- vpslld xmm5,xmm5,2
- add ebx,DWORD [24+esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- vpor xmm5,xmm5,xmm7
- add eax,DWORD [28+esp]
- vmovdqa xmm7,[80+esp]
- shrd ecx,ecx,7
- mov esi,ebx
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- vpalignr xmm0,xmm5,xmm4,8
- vpxor xmm6,xmm6,xmm2
- add edi,DWORD [32+esp]
- and esi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- vpxor xmm6,xmm6,xmm7
- vmovdqa [80+esp],xmm2
- mov ebp,eax
- xor esi,ecx
- vmovdqa xmm2,xmm1
- vpaddd xmm1,xmm1,xmm5
- shld eax,eax,5
- add edi,esi
- vpxor xmm6,xmm6,xmm0
- xor ebp,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD [36+esp]
- vpsrld xmm0,xmm6,30
- vmovdqa [16+esp],xmm1
- and ebp,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,edi
- vpslld xmm6,xmm6,2
- xor ebp,ebx
- shld edi,edi,5
- add edx,ebp
- xor esi,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD [40+esp]
- and esi,eax
- vpor xmm6,xmm6,xmm0
- xor eax,ebx
- shrd edi,edi,7
- vmovdqa xmm0,[96+esp]
- mov ebp,edx
- xor esi,eax
- shld edx,edx,5
- add ecx,esi
- xor ebp,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD [44+esp]
- and ebp,edi
- xor edi,eax
- shrd edx,edx,7
- mov esi,ecx
- xor ebp,edi
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edx
- xor edx,edi
- add ebx,ecx
- vpalignr xmm1,xmm6,xmm5,8
- vpxor xmm7,xmm7,xmm3
- add eax,DWORD [48+esp]
- and esi,edx
- xor edx,edi
- shrd ecx,ecx,7
- vpxor xmm7,xmm7,xmm0
- vmovdqa [96+esp],xmm3
- mov ebp,ebx
- xor esi,edx
- vmovdqa xmm3,[144+esp]
- vpaddd xmm2,xmm2,xmm6
- shld ebx,ebx,5
- add eax,esi
- vpxor xmm7,xmm7,xmm1
- xor ebp,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD [52+esp]
- vpsrld xmm1,xmm7,30
- vmovdqa [32+esp],xmm2
- and ebp,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- vpslld xmm7,xmm7,2
- xor ebp,ecx
- shld eax,eax,5
- add edi,ebp
- xor esi,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD [56+esp]
- and esi,ebx
- vpor xmm7,xmm7,xmm1
- xor ebx,ecx
- shrd eax,eax,7
- vmovdqa xmm1,[64+esp]
- mov ebp,edi
- xor esi,ebx
- shld edi,edi,5
- add edx,esi
- xor ebp,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD [60+esp]
- and ebp,eax
- xor eax,ebx
- shrd edi,edi,7
- mov esi,edx
- xor ebp,eax
- shld edx,edx,5
- add ecx,ebp
- xor esi,edi
- xor edi,eax
- add ecx,edx
- vpalignr xmm2,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- add ebx,DWORD [esp]
- and esi,edi
- xor edi,eax
- shrd edx,edx,7
- vpxor xmm0,xmm0,xmm1
- vmovdqa [64+esp],xmm4
- mov ebp,ecx
- xor esi,edi
- vmovdqa xmm4,xmm3
- vpaddd xmm3,xmm3,xmm7
- shld ecx,ecx,5
- add ebx,esi
- vpxor xmm0,xmm0,xmm2
- xor ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD [4+esp]
- vpsrld xmm2,xmm0,30
- vmovdqa [48+esp],xmm3
- and ebp,edx
- xor edx,edi
- shrd ecx,ecx,7
- mov esi,ebx
- vpslld xmm0,xmm0,2
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD [8+esp]
- and esi,ecx
- vpor xmm0,xmm0,xmm2
- xor ecx,edx
- shrd ebx,ebx,7
- vmovdqa xmm2,[80+esp]
- mov ebp,eax
- xor esi,ecx
- shld eax,eax,5
- add edi,esi
- xor ebp,ebx
- xor ebx,ecx
- add edi,eax
- add edx,DWORD [12+esp]
- and ebp,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,edi
- xor ebp,ebx
- shld edi,edi,5
- add edx,ebp
- xor esi,eax
- xor eax,ebx
- add edx,edi
- vpalignr xmm3,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ecx,DWORD [16+esp]
- and esi,eax
- xor eax,ebx
- shrd edi,edi,7
- vpxor xmm1,xmm1,xmm2
- vmovdqa [80+esp],xmm5
- mov ebp,edx
- xor esi,eax
- vmovdqa xmm5,xmm4
- vpaddd xmm4,xmm4,xmm0
- shld edx,edx,5
- add ecx,esi
- vpxor xmm1,xmm1,xmm3
- xor ebp,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD [20+esp]
- vpsrld xmm3,xmm1,30
- vmovdqa [esp],xmm4
- and ebp,edi
- xor edi,eax
- shrd edx,edx,7
- mov esi,ecx
- vpslld xmm1,xmm1,2
- xor ebp,edi
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD [24+esp]
- and esi,edx
- vpor xmm1,xmm1,xmm3
- xor edx,edi
- shrd ecx,ecx,7
- vmovdqa xmm3,[96+esp]
- mov ebp,ebx
- xor esi,edx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,ecx
- xor ecx,edx
- add eax,ebx
- add edi,DWORD [28+esp]
- and ebp,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- xor ebp,ecx
- shld eax,eax,5
- add edi,ebp
- xor esi,ebx
- xor ebx,ecx
- add edi,eax
- vpalignr xmm4,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add edx,DWORD [32+esp]
- and esi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- vpxor xmm2,xmm2,xmm3
- vmovdqa [96+esp],xmm6
- mov ebp,edi
- xor esi,ebx
- vmovdqa xmm6,xmm5
- vpaddd xmm5,xmm5,xmm1
- shld edi,edi,5
- add edx,esi
- vpxor xmm2,xmm2,xmm4
- xor ebp,eax
- xor eax,ebx
- add edx,edi
- add ecx,DWORD [36+esp]
- vpsrld xmm4,xmm2,30
- vmovdqa [16+esp],xmm5
- and ebp,eax
- xor eax,ebx
- shrd edi,edi,7
- mov esi,edx
- vpslld xmm2,xmm2,2
- xor ebp,eax
- shld edx,edx,5
- add ecx,ebp
- xor esi,edi
- xor edi,eax
- add ecx,edx
- add ebx,DWORD [40+esp]
- and esi,edi
- vpor xmm2,xmm2,xmm4
- xor edi,eax
- shrd edx,edx,7
- vmovdqa xmm4,[64+esp]
- mov ebp,ecx
- xor esi,edi
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edx
- xor edx,edi
- add ebx,ecx
- add eax,DWORD [44+esp]
- and ebp,edx
- xor edx,edi
- shrd ecx,ecx,7
- mov esi,ebx
- xor ebp,edx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- add eax,ebx
- vpalignr xmm5,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add edi,DWORD [48+esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- vpxor xmm3,xmm3,xmm4
- vmovdqa [64+esp],xmm7
- add edi,esi
- xor ebp,ecx
- vmovdqa xmm7,xmm6
- vpaddd xmm6,xmm6,xmm2
- shrd ebx,ebx,7
- add edi,eax
- vpxor xmm3,xmm3,xmm5
- add edx,DWORD [52+esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- vpsrld xmm5,xmm3,30
- vmovdqa [32+esp],xmm6
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- vpslld xmm3,xmm3,2
- add ecx,DWORD [56+esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vpor xmm3,xmm3,xmm5
- add ebx,DWORD [60+esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [esp]
- vpaddd xmm7,xmm7,xmm3
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- vmovdqa [48+esp],xmm7
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD [4+esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD [8+esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD [12+esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- mov ebp,DWORD [196+esp]
- cmp ebp,DWORD [200+esp]
- je NEAR L$010done
- vmovdqa xmm7,[160+esp]
- vmovdqa xmm6,[176+esp]
- vmovdqu xmm0,[ebp]
- vmovdqu xmm1,[16+ebp]
- vmovdqu xmm2,[32+ebp]
- vmovdqu xmm3,[48+ebp]
- add ebp,64
- vpshufb xmm0,xmm0,xmm6
- mov DWORD [196+esp],ebp
- vmovdqa [96+esp],xmm7
- add ebx,DWORD [16+esp]
- xor esi,edi
- vpshufb xmm1,xmm1,xmm6
- mov ebp,ecx
- shld ecx,ecx,5
- vpaddd xmm4,xmm0,xmm7
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- vmovdqa [esp],xmm4
- add eax,DWORD [20+esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD [24+esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD [28+esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD [32+esp]
- xor esi,eax
- vpshufb xmm2,xmm2,xmm6
- mov ebp,edx
- shld edx,edx,5
- vpaddd xmm5,xmm1,xmm7
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- vmovdqa [16+esp],xmm5
- add ebx,DWORD [36+esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [40+esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD [44+esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD [48+esp]
- xor esi,ebx
- vpshufb xmm3,xmm3,xmm6
- mov ebp,edi
- shld edi,edi,5
- vpaddd xmm6,xmm2,xmm7
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- vmovdqa [32+esp],xmm6
- add ecx,DWORD [52+esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD [56+esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [60+esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- shrd ecx,ecx,7
- add eax,ebx
- mov ebp,DWORD [192+esp]
- add eax,DWORD [ebp]
- add esi,DWORD [4+ebp]
- add ecx,DWORD [8+ebp]
- mov DWORD [ebp],eax
- add edx,DWORD [12+ebp]
- mov DWORD [4+ebp],esi
- add edi,DWORD [16+ebp]
- mov ebx,ecx
- mov DWORD [8+ebp],ecx
- xor ebx,edx
- mov DWORD [12+ebp],edx
- mov DWORD [16+ebp],edi
- mov ebp,esi
- and esi,ebx
- mov ebx,ebp
- jmp NEAR L$009loop
-align 16
-L$010done:
- add ebx,DWORD [16+esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [20+esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD [24+esp]
- xor esi,ecx
- mov ebp,eax
- shld eax,eax,5
- add edi,esi
- xor ebp,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD [28+esp]
- xor ebp,ebx
- mov esi,edi
- shld edi,edi,5
- add edx,ebp
- xor esi,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD [32+esp]
- xor esi,eax
- mov ebp,edx
- shld edx,edx,5
- add ecx,esi
- xor ebp,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD [36+esp]
- xor ebp,edi
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,ebp
- xor esi,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [40+esp]
- xor esi,edx
- mov ebp,ebx
- shld ebx,ebx,5
- add eax,esi
- xor ebp,edx
- shrd ecx,ecx,7
- add eax,ebx
- add edi,DWORD [44+esp]
- xor ebp,ecx
- mov esi,eax
- shld eax,eax,5
- add edi,ebp
- xor esi,ecx
- shrd ebx,ebx,7
- add edi,eax
- add edx,DWORD [48+esp]
- xor esi,ebx
- mov ebp,edi
- shld edi,edi,5
- add edx,esi
- xor ebp,ebx
- shrd eax,eax,7
- add edx,edi
- add ecx,DWORD [52+esp]
- xor ebp,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,ebp
- xor esi,eax
- shrd edi,edi,7
- add ecx,edx
- add ebx,DWORD [56+esp]
- xor esi,edi
- mov ebp,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor ebp,edi
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD [60+esp]
- xor ebp,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,ebp
- shrd ecx,ecx,7
- add eax,ebx
- vzeroall
- mov ebp,DWORD [192+esp]
- add eax,DWORD [ebp]
- mov esp,DWORD [204+esp]
- add esi,DWORD [4+ebp]
- add ecx,DWORD [8+ebp]
- mov DWORD [ebp],eax
- add edx,DWORD [12+ebp]
- mov DWORD [4+ebp],esi
- add edi,DWORD [16+ebp]
- mov DWORD [8+ebp],ecx
- mov DWORD [12+ebp],edx
- mov DWORD [16+ebp],edi
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
align 64
L$K_XX_XX:
dd 1518500249,1518500249,1518500249,1518500249
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm
index 7d8398c7d37..922c31604a2 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/IA32-MSFT/crypto/sha/sha256-586.nasm
@@ -46,13 +46,12 @@ L$000pic_point:
or ecx,ebx
and ecx,1342177280
cmp ecx,1342177280
- je NEAR L$005AVX
test ebx,512
- jnz NEAR L$006SSSE3
+ jnz NEAR L$005SSSE3
L$003no_xmm:
sub eax,edi
cmp eax,256
- jae NEAR L$007unrolled
+ jae NEAR L$006unrolled
jmp NEAR L$002loop
align 16
L$002loop:
@@ -124,7 +123,7 @@ L$002loop:
mov DWORD [28+esp],ecx
mov DWORD [32+esp],edi
align 16
-L$00800_15:
+L$00700_15:
mov ecx,edx
mov esi,DWORD [24+esp]
ror ecx,14
@@ -162,11 +161,11 @@ L$00800_15:
add ebp,4
add eax,ebx
cmp esi,3248222580
- jne NEAR L$00800_15
+ jne NEAR L$00700_15
mov ecx,DWORD [156+esp]
- jmp NEAR L$00916_63
+ jmp NEAR L$00816_63
align 16
-L$00916_63:
+L$00816_63:
mov ebx,ecx
mov esi,DWORD [104+esp]
ror ecx,11
@@ -221,7 +220,7 @@ L$00916_63:
add ebp,4
add eax,ebx
cmp esi,3329325298
- jne NEAR L$00916_63
+ jne NEAR L$00816_63
mov esi,DWORD [356+esp]
mov ebx,DWORD [8+esp]
mov ecx,DWORD [16+esp]
@@ -265,7 +264,7 @@ db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
db 62,0
align 16
-L$007unrolled:
+L$006unrolled:
lea esp,[esp-96]
mov eax,DWORD [esi]
mov ebp,DWORD [4+esi]
@@ -282,9 +281,9 @@ L$007unrolled:
mov DWORD [20+esp],ebx
mov DWORD [24+esp],ecx
mov DWORD [28+esp],esi
- jmp NEAR L$010grand_loop
+ jmp NEAR L$009grand_loop
align 16
-L$010grand_loop:
+L$009grand_loop:
mov ebx,DWORD [edi]
mov ecx,DWORD [4+edi]
bswap ebx
@@ -3164,7 +3163,7 @@ L$010grand_loop:
mov DWORD [24+esp],ebx
mov DWORD [28+esp],ecx
cmp edi,DWORD [104+esp]
- jb NEAR L$010grand_loop
+ jb NEAR L$009grand_loop
mov esp,DWORD [108+esp]
pop edi
pop esi
@@ -3183,9 +3182,9 @@ L$004shaext:
pshufd xmm2,xmm2,27
db 102,15,58,15,202,8
punpcklqdq xmm2,xmm0
- jmp NEAR L$011loop_shaext
+ jmp NEAR L$010loop_shaext
align 16
-L$011loop_shaext:
+L$010loop_shaext:
movdqu xmm3,[edi]
movdqu xmm4,[16+edi]
movdqu xmm5,[32+edi]
@@ -3355,7 +3354,7 @@ db 15,56,203,209
db 15,56,203,202
paddd xmm2,[16+esp]
paddd xmm1,[esp]
- jnz NEAR L$011loop_shaext
+ jnz NEAR L$010loop_shaext
pshufd xmm2,xmm2,177
pshufd xmm7,xmm1,27
pshufd xmm1,xmm1,177
@@ -3370,7 +3369,7 @@ db 102,15,58,15,215,8
pop ebp
ret
align 32
-L$006SSSE3:
+L$005SSSE3:
lea esp,[esp-96]
mov eax,DWORD [esi]
mov ebx,DWORD [4+esi]
@@ -3389,9 +3388,9 @@ L$006SSSE3:
mov DWORD [24+esp],ecx
mov DWORD [28+esp],esi
movdqa xmm7,[256+ebp]
- jmp NEAR L$012grand_ssse3
+ jmp NEAR L$011grand_ssse3
align 16
-L$012grand_ssse3:
+L$011grand_ssse3:
movdqu xmm0,[edi]
movdqu xmm1,[16+edi]
movdqu xmm2,[32+edi]
@@ -3414,9 +3413,9 @@ db 102,15,56,0,223
paddd xmm7,xmm3
movdqa [64+esp],xmm6
movdqa [80+esp],xmm7
- jmp NEAR L$013ssse3_00_47
+ jmp NEAR L$012ssse3_00_47
align 16
-L$013ssse3_00_47:
+L$012ssse3_00_47:
add ebp,64
mov ecx,edx
movdqa xmm4,xmm1
@@ -4059,7 +4058,7 @@ db 102,15,58,15,249,4
add eax,ecx
movdqa [80+esp],xmm6
cmp DWORD [64+ebp],66051
- jne NEAR L$013ssse3_00_47
+ jne NEAR L$012ssse3_00_47
mov ecx,edx
ror edx,14
mov esi,DWORD [20+esp]
@@ -4573,2217 +4572,12 @@ db 102,15,58,15,249,4
movdqa xmm7,[64+ebp]
sub ebp,192
cmp edi,DWORD [104+esp]
- jb NEAR L$012grand_ssse3
+ jb NEAR L$011grand_ssse3
mov esp,DWORD [108+esp]
pop edi
pop esi
pop ebx
pop ebp
ret
-align 32
-L$005AVX:
- and edx,264
- cmp edx,264
- je NEAR L$014AVX_BMI
- lea esp,[esp-96]
- vzeroall
- mov eax,DWORD [esi]
- mov ebx,DWORD [4+esi]
- mov ecx,DWORD [8+esi]
- mov edi,DWORD [12+esi]
- mov DWORD [4+esp],ebx
- xor ebx,ecx
- mov DWORD [8+esp],ecx
- mov DWORD [12+esp],edi
- mov edx,DWORD [16+esi]
- mov edi,DWORD [20+esi]
- mov ecx,DWORD [24+esi]
- mov esi,DWORD [28+esi]
- mov DWORD [20+esp],edi
- mov edi,DWORD [100+esp]
- mov DWORD [24+esp],ecx
- mov DWORD [28+esp],esi
- vmovdqa xmm7,[256+ebp]
- jmp NEAR L$015grand_avx
-align 32
-L$015grand_avx:
- vmovdqu xmm0,[edi]
- vmovdqu xmm1,[16+edi]
- vmovdqu xmm2,[32+edi]
- vmovdqu xmm3,[48+edi]
- add edi,64
- vpshufb xmm0,xmm0,xmm7
- mov DWORD [100+esp],edi
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,[ebp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,[16+ebp]
- vpaddd xmm6,xmm2,[32+ebp]
- vpaddd xmm7,xmm3,[48+ebp]
- vmovdqa [32+esp],xmm4
- vmovdqa [48+esp],xmm5
- vmovdqa [64+esp],xmm6
- vmovdqa [80+esp],xmm7
- jmp NEAR L$016avx_00_47
-align 16
-L$016avx_00_47:
- add ebp,64
- vpalignr xmm4,xmm1,xmm0,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [20+esp]
- vpalignr xmm7,xmm3,xmm2,4
- xor edx,ecx
- mov edi,DWORD [24+esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [16+esp],ecx
- vpaddd xmm0,xmm0,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [4+esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [28+esp]
- vpshufd xmm7,xmm3,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD [32+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD [12+esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [16+esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD [20+esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [12+esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [esp]
- vpaddd xmm0,xmm0,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [28+esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [24+esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD [36+esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD [8+esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [12+esp]
- vpaddd xmm0,xmm0,xmm7
- xor edx,ecx
- mov edi,DWORD [16+esp]
- xor esi,edi
- vpshufd xmm7,xmm0,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [8+esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [28+esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [24+esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [20+esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD [40+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD [4+esp]
- add ebx,ecx
- vpaddd xmm0,xmm0,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [8+esp]
- vpaddd xmm6,xmm0,[ebp]
- xor edx,ecx
- mov edi,DWORD [12+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [4+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [24+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [20+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [16+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [44+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [esp]
- add eax,ecx
- vmovdqa [32+esp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [4+esp]
- vpalignr xmm7,xmm0,xmm3,4
- xor edx,ecx
- mov edi,DWORD [8+esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [esp],ecx
- vpaddd xmm1,xmm1,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [20+esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [16+esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [12+esp]
- vpshufd xmm7,xmm0,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD [48+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD [28+esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD [4+esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [28+esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [16+esp]
- vpaddd xmm1,xmm1,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [12+esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [8+esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD [52+esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD [24+esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [28+esp]
- vpaddd xmm1,xmm1,xmm7
- xor edx,ecx
- mov edi,DWORD [esp]
- xor esi,edi
- vpshufd xmm7,xmm1,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [24+esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [12+esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [8+esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [4+esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD [56+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD [20+esp]
- add ebx,ecx
- vpaddd xmm1,xmm1,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [24+esp]
- vpaddd xmm6,xmm1,[16+ebp]
- xor edx,ecx
- mov edi,DWORD [28+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [20+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [8+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [4+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [60+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [16+esp]
- add eax,ecx
- vmovdqa [48+esp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [20+esp]
- vpalignr xmm7,xmm1,xmm0,4
- xor edx,ecx
- mov edi,DWORD [24+esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [16+esp],ecx
- vpaddd xmm2,xmm2,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [4+esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [28+esp]
- vpshufd xmm7,xmm1,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD [64+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD [12+esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [16+esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD [20+esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [12+esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [esp]
- vpaddd xmm2,xmm2,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [28+esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [24+esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD [68+esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD [8+esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [12+esp]
- vpaddd xmm2,xmm2,xmm7
- xor edx,ecx
- mov edi,DWORD [16+esp]
- xor esi,edi
- vpshufd xmm7,xmm2,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [8+esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [28+esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [24+esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [20+esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD [72+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD [4+esp]
- add ebx,ecx
- vpaddd xmm2,xmm2,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [8+esp]
- vpaddd xmm6,xmm2,[32+ebp]
- xor edx,ecx
- mov edi,DWORD [12+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [4+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [24+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [20+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [16+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [76+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [esp]
- add eax,ecx
- vmovdqa [64+esp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [4+esp]
- vpalignr xmm7,xmm2,xmm1,4
- xor edx,ecx
- mov edi,DWORD [8+esp]
- xor esi,edi
- vpsrld xmm6,xmm4,7
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [esp],ecx
- vpaddd xmm3,xmm3,xmm7
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrld xmm7,xmm4,3
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [20+esp]
- vpslld xmm5,xmm4,14
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [16+esp],eax
- vpxor xmm4,xmm7,xmm6
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [12+esp]
- vpshufd xmm7,xmm2,250
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpsrld xmm6,xmm6,11
- add edx,DWORD [80+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpxor xmm4,xmm4,xmm5
- add ebx,edx
- add edx,DWORD [28+esp]
- add ebx,ecx
- vpslld xmm5,xmm5,11
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [esp]
- vpxor xmm4,xmm4,xmm6
- xor edx,ecx
- mov edi,DWORD [4+esp]
- xor esi,edi
- vpsrld xmm6,xmm7,10
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [28+esp],ecx
- vpxor xmm4,xmm4,xmm5
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [16+esp]
- vpaddd xmm3,xmm3,xmm4
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [12+esp],ebx
- vpxor xmm6,xmm6,xmm5
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [8+esp]
- vpsrlq xmm7,xmm7,19
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- add edx,DWORD [84+esp]
- xor eax,edi
- shrd ecx,ecx,2
- vpshufd xmm7,xmm6,132
- add eax,edx
- add edx,DWORD [24+esp]
- add eax,ecx
- vpsrldq xmm7,xmm7,8
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [28+esp]
- vpaddd xmm3,xmm3,xmm7
- xor edx,ecx
- mov edi,DWORD [esp]
- xor esi,edi
- vpshufd xmm7,xmm3,80
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [24+esp],ecx
- vpsrld xmm6,xmm7,10
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- vpsrlq xmm5,xmm7,17
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [12+esp]
- vpxor xmm6,xmm6,xmm5
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [8+esp],eax
- vpsrlq xmm7,xmm7,19
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [4+esp]
- vpxor xmm6,xmm6,xmm7
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- vpshufd xmm7,xmm6,232
- add edx,DWORD [88+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- vpslldq xmm7,xmm7,8
- add ebx,edx
- add edx,DWORD [20+esp]
- add ebx,ecx
- vpaddd xmm3,xmm3,xmm7
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [24+esp]
- vpaddd xmm6,xmm3,[48+ebp]
- xor edx,ecx
- mov edi,DWORD [28+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [20+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [8+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [4+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [92+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [16+esp]
- add eax,ecx
- vmovdqa [80+esp],xmm6
- cmp DWORD [64+ebp],66051
- jne NEAR L$016avx_00_47
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [20+esp]
- xor edx,ecx
- mov edi,DWORD [24+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [16+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [4+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [28+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [32+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [12+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [16+esp]
- xor edx,ecx
- mov edi,DWORD [20+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [12+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [28+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [24+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [36+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [8+esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [12+esp]
- xor edx,ecx
- mov edi,DWORD [16+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [8+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [28+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [24+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [20+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [40+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [4+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [8+esp]
- xor edx,ecx
- mov edi,DWORD [12+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [4+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [24+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [20+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [16+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [44+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [4+esp]
- xor edx,ecx
- mov edi,DWORD [8+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [20+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [16+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [12+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [48+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [28+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [esp]
- xor edx,ecx
- mov edi,DWORD [4+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [28+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [16+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [12+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [8+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [52+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [24+esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [28+esp]
- xor edx,ecx
- mov edi,DWORD [esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [24+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [12+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [8+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [4+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [56+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [20+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [24+esp]
- xor edx,ecx
- mov edi,DWORD [28+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [20+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [8+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [4+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [60+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [16+esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [20+esp]
- xor edx,ecx
- mov edi,DWORD [24+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [16+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [4+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [28+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [64+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [12+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [16+esp]
- xor edx,ecx
- mov edi,DWORD [20+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [12+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [28+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [24+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [68+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [8+esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [12+esp]
- xor edx,ecx
- mov edi,DWORD [16+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [8+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [28+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [24+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [20+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [72+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [4+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [8+esp]
- xor edx,ecx
- mov edi,DWORD [12+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [4+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [24+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [20+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [16+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [76+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [4+esp]
- xor edx,ecx
- mov edi,DWORD [8+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [20+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [16+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [12+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [80+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [28+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [esp]
- xor edx,ecx
- mov edi,DWORD [4+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [28+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [16+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [12+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [8+esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [84+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [24+esp]
- add eax,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [28+esp]
- xor edx,ecx
- mov edi,DWORD [esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [24+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,eax
- add edx,edi
- mov edi,DWORD [12+esp]
- mov esi,eax
- shrd ecx,ecx,9
- mov DWORD [8+esp],eax
- xor ecx,eax
- xor eax,edi
- add edx,DWORD [4+esp]
- shrd ecx,ecx,11
- and ebx,eax
- xor ecx,esi
- add edx,DWORD [88+esp]
- xor ebx,edi
- shrd ecx,ecx,2
- add ebx,edx
- add edx,DWORD [20+esp]
- add ebx,ecx
- mov ecx,edx
- shrd edx,edx,14
- mov esi,DWORD [24+esp]
- xor edx,ecx
- mov edi,DWORD [28+esp]
- xor esi,edi
- shrd edx,edx,5
- and esi,ecx
- mov DWORD [20+esp],ecx
- xor edx,ecx
- xor edi,esi
- shrd edx,edx,6
- mov ecx,ebx
- add edx,edi
- mov edi,DWORD [8+esp]
- mov esi,ebx
- shrd ecx,ecx,9
- mov DWORD [4+esp],ebx
- xor ecx,ebx
- xor ebx,edi
- add edx,DWORD [esp]
- shrd ecx,ecx,11
- and eax,ebx
- xor ecx,esi
- add edx,DWORD [92+esp]
- xor eax,edi
- shrd ecx,ecx,2
- add eax,edx
- add edx,DWORD [16+esp]
- add eax,ecx
- mov esi,DWORD [96+esp]
- xor ebx,edi
- mov ecx,DWORD [12+esp]
- add eax,DWORD [esi]
- add ebx,DWORD [4+esi]
- add edi,DWORD [8+esi]
- add ecx,DWORD [12+esi]
- mov DWORD [esi],eax
- mov DWORD [4+esi],ebx
- mov DWORD [8+esi],edi
- mov DWORD [12+esi],ecx
- mov DWORD [4+esp],ebx
- xor ebx,edi
- mov DWORD [8+esp],edi
- mov DWORD [12+esp],ecx
- mov edi,DWORD [20+esp]
- mov ecx,DWORD [24+esp]
- add edx,DWORD [16+esi]
- add edi,DWORD [20+esi]
- add ecx,DWORD [24+esi]
- mov DWORD [16+esi],edx
- mov DWORD [20+esi],edi
- mov DWORD [20+esp],edi
- mov edi,DWORD [28+esp]
- mov DWORD [24+esi],ecx
- add edi,DWORD [28+esi]
- mov DWORD [24+esp],ecx
- mov DWORD [28+esi],edi
- mov DWORD [28+esp],edi
- mov edi,DWORD [100+esp]
- vmovdqa xmm7,[64+ebp]
- sub ebp,192
- cmp edi,DWORD [104+esp]
- jb NEAR L$015grand_avx
- mov esp,DWORD [108+esp]
- vzeroall
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-align 32
-L$014AVX_BMI:
- lea esp,[esp-96]
- vzeroall
- mov eax,DWORD [esi]
- mov ebx,DWORD [4+esi]
- mov ecx,DWORD [8+esi]
- mov edi,DWORD [12+esi]
- mov DWORD [4+esp],ebx
- xor ebx,ecx
- mov DWORD [8+esp],ecx
- mov DWORD [12+esp],edi
- mov edx,DWORD [16+esi]
- mov edi,DWORD [20+esi]
- mov ecx,DWORD [24+esi]
- mov esi,DWORD [28+esi]
- mov DWORD [20+esp],edi
- mov edi,DWORD [100+esp]
- mov DWORD [24+esp],ecx
- mov DWORD [28+esp],esi
- vmovdqa xmm7,[256+ebp]
- jmp NEAR L$017grand_avx_bmi
-align 32
-L$017grand_avx_bmi:
- vmovdqu xmm0,[edi]
- vmovdqu xmm1,[16+edi]
- vmovdqu xmm2,[32+edi]
- vmovdqu xmm3,[48+edi]
- add edi,64
- vpshufb xmm0,xmm0,xmm7
- mov DWORD [100+esp],edi
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,[ebp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,[16+ebp]
- vpaddd xmm6,xmm2,[32+ebp]
- vpaddd xmm7,xmm3,[48+ebp]
- vmovdqa [32+esp],xmm4
- vmovdqa [48+esp],xmm5
- vmovdqa [64+esp],xmm6
- vmovdqa [80+esp],xmm7
- jmp NEAR L$018avx_bmi_00_47
-align 16
-L$018avx_bmi_00_47:
- add ebp,64
- vpalignr xmm4,xmm1,xmm0,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [16+esp],edx
- vpalignr xmm7,xmm3,xmm2,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [24+esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD [20+esp]
- mov DWORD [esp],eax
- vpaddd xmm0,xmm0,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD [4+esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD [28+esp]
- and ebx,eax
- add edx,DWORD [32+esp]
- vpshufd xmm7,xmm3,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [12+esp]
- vpsrld xmm6,xmm6,11
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD [12+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD [20+esp]
- xor ecx,edi
- and edx,DWORD [16+esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD [28+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD [esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD [24+esp]
- and eax,ebx
- add edx,DWORD [36+esp]
- vpaddd xmm0,xmm0,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD [8+esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD [8+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD [16+esp]
- xor ecx,edi
- and edx,DWORD [12+esp]
- vpshufd xmm7,xmm6,132
- mov DWORD [24+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm0,xmm0,xmm7
- mov edi,DWORD [28+esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm0,80
- add edx,DWORD [20+esp]
- and ebx,eax
- add edx,DWORD [40+esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [4+esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD [4+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD [12+esp]
- xor ecx,edi
- and edx,DWORD [8+esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD [20+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD [24+esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm0,xmm0,xmm7
- add edx,DWORD [16+esp]
- and eax,ebx
- add edx,DWORD [44+esp]
- vpaddd xmm6,xmm0,[ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [esp]
- lea eax,[ecx*1+eax]
- vmovdqa [32+esp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [esp],edx
- vpalignr xmm7,xmm0,xmm3,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [8+esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD [4+esp]
- mov DWORD [16+esp],eax
- vpaddd xmm1,xmm1,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD [20+esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD [12+esp]
- and ebx,eax
- add edx,DWORD [48+esp]
- vpshufd xmm7,xmm0,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [28+esp]
- vpsrld xmm6,xmm6,11
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD [28+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD [4+esp]
- xor ecx,edi
- and edx,DWORD [esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD [12+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD [16+esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD [8+esp]
- and eax,ebx
- add edx,DWORD [52+esp]
- vpaddd xmm1,xmm1,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD [24+esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD [24+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD [esp]
- xor ecx,edi
- and edx,DWORD [28+esp]
- vpshufd xmm7,xmm6,132
- mov DWORD [8+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm1,xmm1,xmm7
- mov edi,DWORD [12+esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm1,80
- add edx,DWORD [4+esp]
- and ebx,eax
- add edx,DWORD [56+esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [20+esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD [20+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD [28+esp]
- xor ecx,edi
- and edx,DWORD [24+esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD [4+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD [8+esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm1,xmm1,xmm7
- add edx,DWORD [esp]
- and eax,ebx
- add edx,DWORD [60+esp]
- vpaddd xmm6,xmm1,[16+ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [16+esp]
- lea eax,[ecx*1+eax]
- vmovdqa [48+esp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [16+esp],edx
- vpalignr xmm7,xmm1,xmm0,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [24+esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD [20+esp]
- mov DWORD [esp],eax
- vpaddd xmm2,xmm2,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD [4+esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD [28+esp]
- and ebx,eax
- add edx,DWORD [64+esp]
- vpshufd xmm7,xmm1,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [12+esp]
- vpsrld xmm6,xmm6,11
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD [12+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD [20+esp]
- xor ecx,edi
- and edx,DWORD [16+esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD [28+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD [esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD [24+esp]
- and eax,ebx
- add edx,DWORD [68+esp]
- vpaddd xmm2,xmm2,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD [8+esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD [8+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD [16+esp]
- xor ecx,edi
- and edx,DWORD [12+esp]
- vpshufd xmm7,xmm6,132
- mov DWORD [24+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm2,xmm2,xmm7
- mov edi,DWORD [28+esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm2,80
- add edx,DWORD [20+esp]
- and ebx,eax
- add edx,DWORD [72+esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [4+esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD [4+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD [12+esp]
- xor ecx,edi
- and edx,DWORD [8+esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD [20+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD [24+esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm2,xmm2,xmm7
- add edx,DWORD [16+esp]
- and eax,ebx
- add edx,DWORD [76+esp]
- vpaddd xmm6,xmm2,[32+ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [esp]
- lea eax,[ecx*1+eax]
- vmovdqa [64+esp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [esp],edx
- vpalignr xmm7,xmm2,xmm1,4
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [8+esp]
- vpsrld xmm6,xmm4,7
- xor ecx,edi
- and edx,DWORD [4+esp]
- mov DWORD [16+esp],eax
- vpaddd xmm3,xmm3,xmm7
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrld xmm7,xmm4,3
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpslld xmm5,xmm4,14
- mov edi,DWORD [20+esp]
- xor ecx,esi
- xor eax,edi
- vpxor xmm4,xmm7,xmm6
- add edx,DWORD [12+esp]
- and ebx,eax
- add edx,DWORD [80+esp]
- vpshufd xmm7,xmm2,250
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [28+esp]
- vpsrld xmm6,xmm6,11
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm4,xmm4,xmm5
- mov DWORD [28+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpslld xmm5,xmm5,11
- andn esi,edx,DWORD [4+esp]
- xor ecx,edi
- and edx,DWORD [esp]
- vpxor xmm4,xmm4,xmm6
- mov DWORD [12+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpsrld xmm6,xmm7,10
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpxor xmm4,xmm4,xmm5
- mov edi,DWORD [16+esp]
- xor ecx,esi
- xor ebx,edi
- vpsrlq xmm5,xmm7,17
- add edx,DWORD [8+esp]
- and eax,ebx
- add edx,DWORD [84+esp]
- vpaddd xmm3,xmm3,xmm4
- xor eax,edi
- add ecx,edx
- add edx,DWORD [24+esp]
- vpxor xmm6,xmm6,xmm5
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpsrlq xmm7,xmm7,19
- mov DWORD [24+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpxor xmm6,xmm6,xmm7
- andn esi,edx,DWORD [esp]
- xor ecx,edi
- and edx,DWORD [28+esp]
- vpshufd xmm7,xmm6,132
- mov DWORD [8+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- vpsrldq xmm7,xmm7,8
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- vpaddd xmm3,xmm3,xmm7
- mov edi,DWORD [12+esp]
- xor ecx,esi
- xor eax,edi
- vpshufd xmm7,xmm3,80
- add edx,DWORD [4+esp]
- and ebx,eax
- add edx,DWORD [88+esp]
- vpsrld xmm6,xmm7,10
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [20+esp]
- vpsrlq xmm5,xmm7,17
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- vpxor xmm6,xmm6,xmm5
- mov DWORD [20+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- vpsrlq xmm7,xmm7,19
- andn esi,edx,DWORD [28+esp]
- xor ecx,edi
- and edx,DWORD [24+esp]
- vpxor xmm6,xmm6,xmm7
- mov DWORD [4+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- vpshufd xmm7,xmm6,232
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- vpslldq xmm7,xmm7,8
- mov edi,DWORD [8+esp]
- xor ecx,esi
- xor ebx,edi
- vpaddd xmm3,xmm3,xmm7
- add edx,DWORD [esp]
- and eax,ebx
- add edx,DWORD [92+esp]
- vpaddd xmm6,xmm3,[48+ebp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [16+esp]
- lea eax,[ecx*1+eax]
- vmovdqa [80+esp],xmm6
- cmp DWORD [64+ebp],66051
- jne NEAR L$018avx_bmi_00_47
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [16+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [24+esp]
- xor ecx,edi
- and edx,DWORD [20+esp]
- mov DWORD [esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [4+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [28+esp]
- and ebx,eax
- add edx,DWORD [32+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [12+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [12+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [20+esp]
- xor ecx,edi
- and edx,DWORD [16+esp]
- mov DWORD [28+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [24+esp]
- and eax,ebx
- add edx,DWORD [36+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [8+esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [8+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [16+esp]
- xor ecx,edi
- and edx,DWORD [12+esp]
- mov DWORD [24+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [28+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [20+esp]
- and ebx,eax
- add edx,DWORD [40+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [4+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [4+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [12+esp]
- xor ecx,edi
- and edx,DWORD [8+esp]
- mov DWORD [20+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [24+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [16+esp]
- and eax,ebx
- add edx,DWORD [44+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [8+esp]
- xor ecx,edi
- and edx,DWORD [4+esp]
- mov DWORD [16+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [20+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [12+esp]
- and ebx,eax
- add edx,DWORD [48+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [28+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [28+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [4+esp]
- xor ecx,edi
- and edx,DWORD [esp]
- mov DWORD [12+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [16+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [8+esp]
- and eax,ebx
- add edx,DWORD [52+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [24+esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [24+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [esp]
- xor ecx,edi
- and edx,DWORD [28+esp]
- mov DWORD [8+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [12+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [4+esp]
- and ebx,eax
- add edx,DWORD [56+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [20+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [20+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [28+esp]
- xor ecx,edi
- and edx,DWORD [24+esp]
- mov DWORD [4+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [8+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [esp]
- and eax,ebx
- add edx,DWORD [60+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [16+esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [16+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [24+esp]
- xor ecx,edi
- and edx,DWORD [20+esp]
- mov DWORD [esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [4+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [28+esp]
- and ebx,eax
- add edx,DWORD [64+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [12+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [12+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [20+esp]
- xor ecx,edi
- and edx,DWORD [16+esp]
- mov DWORD [28+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [24+esp]
- and eax,ebx
- add edx,DWORD [68+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [8+esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [8+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [16+esp]
- xor ecx,edi
- and edx,DWORD [12+esp]
- mov DWORD [24+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [28+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [20+esp]
- and ebx,eax
- add edx,DWORD [72+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [4+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [4+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [12+esp]
- xor ecx,edi
- and edx,DWORD [8+esp]
- mov DWORD [20+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [24+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [16+esp]
- and eax,ebx
- add edx,DWORD [76+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [8+esp]
- xor ecx,edi
- and edx,DWORD [4+esp]
- mov DWORD [16+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [20+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [12+esp]
- and ebx,eax
- add edx,DWORD [80+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [28+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [28+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [4+esp]
- xor ecx,edi
- and edx,DWORD [esp]
- mov DWORD [12+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [16+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [8+esp]
- and eax,ebx
- add edx,DWORD [84+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [24+esp]
- lea eax,[ecx*1+eax]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [24+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [esp]
- xor ecx,edi
- and edx,DWORD [28+esp]
- mov DWORD [8+esp],eax
- or edx,esi
- rorx edi,eax,2
- rorx esi,eax,13
- lea edx,[ecx*1+edx]
- rorx ecx,eax,22
- xor esi,edi
- mov edi,DWORD [12+esp]
- xor ecx,esi
- xor eax,edi
- add edx,DWORD [4+esp]
- and ebx,eax
- add edx,DWORD [88+esp]
- xor ebx,edi
- add ecx,edx
- add edx,DWORD [20+esp]
- lea ebx,[ecx*1+ebx]
- rorx ecx,edx,6
- rorx esi,edx,11
- mov DWORD [20+esp],edx
- rorx edi,edx,25
- xor ecx,esi
- andn esi,edx,DWORD [28+esp]
- xor ecx,edi
- and edx,DWORD [24+esp]
- mov DWORD [4+esp],ebx
- or edx,esi
- rorx edi,ebx,2
- rorx esi,ebx,13
- lea edx,[ecx*1+edx]
- rorx ecx,ebx,22
- xor esi,edi
- mov edi,DWORD [8+esp]
- xor ecx,esi
- xor ebx,edi
- add edx,DWORD [esp]
- and eax,ebx
- add edx,DWORD [92+esp]
- xor eax,edi
- add ecx,edx
- add edx,DWORD [16+esp]
- lea eax,[ecx*1+eax]
- mov esi,DWORD [96+esp]
- xor ebx,edi
- mov ecx,DWORD [12+esp]
- add eax,DWORD [esi]
- add ebx,DWORD [4+esi]
- add edi,DWORD [8+esi]
- add ecx,DWORD [12+esi]
- mov DWORD [esi],eax
- mov DWORD [4+esi],ebx
- mov DWORD [8+esi],edi
- mov DWORD [12+esi],ecx
- mov DWORD [4+esp],ebx
- xor ebx,edi
- mov DWORD [8+esp],edi
- mov DWORD [12+esp],ecx
- mov edi,DWORD [20+esp]
- mov ecx,DWORD [24+esp]
- add edx,DWORD [16+esi]
- add edi,DWORD [20+esi]
- add ecx,DWORD [24+esi]
- mov DWORD [16+esi],edx
- mov DWORD [20+esi],edi
- mov DWORD [20+esp],edi
- mov edi,DWORD [28+esp]
- mov DWORD [24+esi],ecx
- add edi,DWORD [28+esi]
- mov DWORD [24+esp],ecx
- mov DWORD [28+esi],edi
- mov DWORD [28+esp],edi
- mov edi,DWORD [100+esp]
- vmovdqa xmm7,[64+ebp]
- sub ebp,192
- cmp edi,DWORD [104+esp]
- jb NEAR L$017grand_avx_bmi
- mov esp,DWORD [108+esp]
- vzeroall
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
segment .bss
common _OPENSSL_ia32cap_P 16
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
index 7d2428b971f..0b98b38c3f6 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aes-x86_64.s
@@ -2656,7 +2656,7 @@ AES_cbc_encrypt:
.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
index 6e9fe9d7514..fd0c91fa45c 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-mb-x86_64.s
@@ -7,14 +7,6 @@
.align 32
aesni_multi_cbc_encrypt:
.cfi_startproc
- cmpl $2,%edx
- jb .Lenc_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_enc_shortcut
- jmp .Lenc_non_avx
-.align 16
-.Lenc_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -298,14 +290,6 @@ aesni_multi_cbc_encrypt:
.align 32
aesni_multi_cbc_decrypt:
.cfi_startproc
- cmpl $2,%edx
- jb .Ldec_non_avx
- movl OPENSSL_ia32cap_P+4(%rip),%ecx
- testl $268435456,%ecx
- jnz _avx_cbc_dec_shortcut
- jmp .Ldec_non_avx
-.align 16
-.Ldec_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -573,1020 +557,7 @@ aesni_multi_cbc_decrypt:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
-.type aesni_multi_cbc_encrypt_avx,@function
-.align 32
-aesni_multi_cbc_encrypt_avx:
-.cfi_startproc
-_avx_cbc_enc_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
-
-
-
-
-
-
-
-
- subq $192,%rsp
- andq $-128,%rsp
- movq %rax,16(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
-
-.Lenc8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-.Lenc8x_loop_grande:
-
- xorl %edx,%edx
-
- movl -144(%rdi),%ecx
-
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
-
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
-
- movl -104(%rdi),%ecx
-
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
-
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
-
- movl -64(%rdi),%ecx
-
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
-
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
-
- movl -24(%rdi),%ecx
-
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
-
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
-
- movl 16(%rdi),%ecx
-
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
-
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
-
- movl 56(%rdi),%ecx
-
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
-
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
-
- movl 96(%rdi),%ecx
-
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
-
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
-
- movl 136(%rdi),%ecx
-
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
-
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- testl %edx,%edx
- jz .Lenc8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
-
- vpxor (%r8),%xmm15,%xmm10
- leaq 128(%rsp),%rbp
- vpxor (%r9),%xmm15,%xmm11
- vpxor (%r10),%xmm15,%xmm12
- vpxor (%r11),%xmm15,%xmm13
- vpxor %xmm10,%xmm2,%xmm2
- vpxor (%r12),%xmm15,%xmm10
- vpxor %xmm11,%xmm3,%xmm3
- vpxor (%r13),%xmm15,%xmm11
- vpxor %xmm12,%xmm4,%xmm4
- vpxor (%r14),%xmm15,%xmm12
- vpxor %xmm13,%xmm5,%xmm5
- vpxor (%r15),%xmm15,%xmm13
- vpxor %xmm10,%xmm6,%xmm6
- movl $1,%ecx
- vpxor %xmm11,%xmm7,%xmm7
- vpxor %xmm12,%xmm8,%xmm8
- vpxor %xmm13,%xmm9,%xmm9
- jmp .Loop_enc8x
-
-.align 32
-.Loop_enc8x:
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r8),%xmm15,%xmm10
- movq %rbx,64+0(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,0(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r9),%xmm15,%xmm11
- movq %rbx,64+8(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,16(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r10),%xmm15,%xmm12
- movq %rbx,64+16(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,32(%rbp)
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r11),%xmm15,%xmm13
- movq %rbx,64+24(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,48(%rbp)
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r12),%xmm15,%xmm10
- movq %rbx,64+32(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r13),%xmm15,%xmm11
- movq %rbx,64+40(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesenc %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesenc %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesenc %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesenc %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesenc %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vpxor 16(%r14),%xmm15,%xmm12
- movq %rbx,64+48(%rsp)
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesenc %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesenc %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesenc %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesenc %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesenc %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesenc %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesenc %xmm0,%xmm8,%xmm8
- vpxor 16(%r15),%xmm15,%xmm13
- movq %rbx,64+56(%rsp)
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb .Lenc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je .Lenc8x_tail
-
- vaesenc %xmm1,%xmm2,%xmm2
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vaesenc %xmm1,%xmm7,%xmm7
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesenc %xmm0,%xmm2,%xmm2
- vaesenc %xmm0,%xmm3,%xmm3
- vaesenc %xmm0,%xmm4,%xmm4
- vaesenc %xmm0,%xmm5,%xmm5
- vaesenc %xmm0,%xmm6,%xmm6
- vaesenc %xmm0,%xmm7,%xmm7
- vaesenc %xmm0,%xmm8,%xmm8
- vaesenc %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-.Lenc8x_tail:
- vaesenc %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesenc %xmm1,%xmm3,%xmm3
- vaesenc %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenc %xmm1,%xmm5,%xmm5
- vaesenc %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesenc %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesenc %xmm1,%xmm8,%xmm8
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesenclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesenclast %xmm0,%xmm3,%xmm3
- vaesenclast %xmm0,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesenclast %xmm0,%xmm5,%xmm5
- vaesenclast %xmm0,%xmm6,%xmm6
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesenclast %xmm0,%xmm7,%xmm7
- vaesenclast %xmm0,%xmm8,%xmm8
- vmovdqa %xmm14,48(%rsp)
- vaesenclast %xmm0,%xmm9,%xmm9
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vpxor 0(%rbp),%xmm2,%xmm2
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vpxor 16(%rbp),%xmm3,%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vpxor 32(%rbp),%xmm4,%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vpxor 48(%rbp),%xmm5,%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vpxor %xmm10,%xmm6,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vpxor %xmm11,%xmm7,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vpxor %xmm12,%xmm8,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vpxor %xmm13,%xmm9,%xmm9
-
- decl %edx
- jnz .Loop_enc8x
-
- movq 16(%rsp),%rax
-.cfi_def_cfa %rax,8
-
-
-
-
-
-.Lenc8x_done:
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lenc8x_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
-
-.type aesni_multi_cbc_decrypt_avx,@function
-.align 32
-aesni_multi_cbc_decrypt_avx:
-.cfi_startproc
-_avx_cbc_dec_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
-
-
-
-
-
-
-
-
-
- subq $256,%rsp
- andq $-256,%rsp
- subq $192,%rsp
- movq %rax,16(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
-
-.Ldec8x_body:
- vzeroupper
- vmovdqu (%rsi),%xmm15
- leaq 120(%rsi),%rsi
- leaq 160(%rdi),%rdi
- shrl $1,%edx
-
-.Ldec8x_loop_grande:
-
- xorl %edx,%edx
-
- movl -144(%rdi),%ecx
-
- movq -160(%rdi),%r8
- cmpl %edx,%ecx
-
- movq -152(%rdi),%rbx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -136(%rdi),%xmm2
- movl %ecx,32(%rsp)
- cmovleq %rsp,%r8
- subq %r8,%rbx
- movq %rbx,64(%rsp)
- vmovdqu %xmm2,192(%rsp)
-
- movl -104(%rdi),%ecx
-
- movq -120(%rdi),%r9
- cmpl %edx,%ecx
-
- movq -112(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -96(%rdi),%xmm3
- movl %ecx,36(%rsp)
- cmovleq %rsp,%r9
- subq %r9,%rbp
- movq %rbp,72(%rsp)
- vmovdqu %xmm3,208(%rsp)
-
- movl -64(%rdi),%ecx
-
- movq -80(%rdi),%r10
- cmpl %edx,%ecx
-
- movq -72(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -56(%rdi),%xmm4
- movl %ecx,40(%rsp)
- cmovleq %rsp,%r10
- subq %r10,%rbp
- movq %rbp,80(%rsp)
- vmovdqu %xmm4,224(%rsp)
-
- movl -24(%rdi),%ecx
-
- movq -40(%rdi),%r11
- cmpl %edx,%ecx
-
- movq -32(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu -16(%rdi),%xmm5
- movl %ecx,44(%rsp)
- cmovleq %rsp,%r11
- subq %r11,%rbp
- movq %rbp,88(%rsp)
- vmovdqu %xmm5,240(%rsp)
-
- movl 16(%rdi),%ecx
-
- movq 0(%rdi),%r12
- cmpl %edx,%ecx
-
- movq 8(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 24(%rdi),%xmm6
- movl %ecx,48(%rsp)
- cmovleq %rsp,%r12
- subq %r12,%rbp
- movq %rbp,96(%rsp)
- vmovdqu %xmm6,256(%rsp)
-
- movl 56(%rdi),%ecx
-
- movq 40(%rdi),%r13
- cmpl %edx,%ecx
-
- movq 48(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 64(%rdi),%xmm7
- movl %ecx,52(%rsp)
- cmovleq %rsp,%r13
- subq %r13,%rbp
- movq %rbp,104(%rsp)
- vmovdqu %xmm7,272(%rsp)
-
- movl 96(%rdi),%ecx
-
- movq 80(%rdi),%r14
- cmpl %edx,%ecx
-
- movq 88(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 104(%rdi),%xmm8
- movl %ecx,56(%rsp)
- cmovleq %rsp,%r14
- subq %r14,%rbp
- movq %rbp,112(%rsp)
- vmovdqu %xmm8,288(%rsp)
-
- movl 136(%rdi),%ecx
-
- movq 120(%rdi),%r15
- cmpl %edx,%ecx
-
- movq 128(%rdi),%rbp
- cmovgl %ecx,%edx
- testl %ecx,%ecx
-
- vmovdqu 144(%rdi),%xmm9
- movl %ecx,60(%rsp)
- cmovleq %rsp,%r15
- subq %r15,%rbp
- movq %rbp,120(%rsp)
- vmovdqu %xmm9,304(%rsp)
- testl %edx,%edx
- jz .Ldec8x_done
-
- vmovups 16-120(%rsi),%xmm1
- vmovups 32-120(%rsi),%xmm0
- movl 240-120(%rsi),%eax
- leaq 192+128(%rsp),%rbp
-
- vmovdqu (%r8),%xmm2
- vmovdqu (%r9),%xmm3
- vmovdqu (%r10),%xmm4
- vmovdqu (%r11),%xmm5
- vmovdqu (%r12),%xmm6
- vmovdqu (%r13),%xmm7
- vmovdqu (%r14),%xmm8
- vmovdqu (%r15),%xmm9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm6,64(%rbp)
- vpxor %xmm15,%xmm6,%xmm6
- vmovdqu %xmm7,80(%rbp)
- vpxor %xmm15,%xmm7,%xmm7
- vmovdqu %xmm8,96(%rbp)
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu %xmm9,112(%rbp)
- vpxor %xmm15,%xmm9,%xmm9
- xorq $0x80,%rbp
- movl $1,%ecx
- jmp .Loop_dec8x
-
-.align 32
-.Loop_dec8x:
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+0(%rsp),%ecx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r8)
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r8,%rbx,1),%rbx
- cmovgeq %rsp,%r8
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r8,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r8),%xmm10
- movq %rbx,64+0(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -72(%rsi),%xmm1
- leaq 16(%r8,%rbx,1),%r8
- vmovdqu %xmm10,128(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+4(%rsp),%ecx
- movq 64+8(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r9)
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r9,%rbx,1),%rbx
- cmovgeq %rsp,%r9
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r9,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r9),%xmm11
- movq %rbx,64+8(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -56(%rsi),%xmm0
- leaq 16(%r9,%rbx,1),%r9
- vmovdqu %xmm11,144(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+8(%rsp),%ecx
- movq 64+16(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r10)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r8)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r10,%rbx,1),%rbx
- cmovgeq %rsp,%r10
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r10,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r10),%xmm12
- movq %rbx,64+16(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -40(%rsi),%xmm1
- leaq 16(%r10,%rbx,1),%r10
- vmovdqu %xmm12,160(%rsp)
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+12(%rsp),%ecx
- movq 64+24(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r11)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r9)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r11,%rbx,1),%rbx
- cmovgeq %rsp,%r11
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r11,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r11),%xmm13
- movq %rbx,64+24(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups -24(%rsi),%xmm0
- leaq 16(%r11,%rbx,1),%r11
- vmovdqu %xmm13,176(%rsp)
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+16(%rsp),%ecx
- movq 64+32(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r12)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r10)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r12,%rbx,1),%rbx
- cmovgeq %rsp,%r12
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r12,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r12),%xmm10
- movq %rbx,64+32(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups -8(%rsi),%xmm1
- leaq 16(%r12,%rbx,1),%r12
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+20(%rsp),%ecx
- movq 64+40(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r13)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r11)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%rbx,%r13,1),%rbx
- cmovgeq %rsp,%r13
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r13,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r13),%xmm11
- movq %rbx,64+40(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 8(%rsi),%xmm0
- leaq 16(%r13,%rbx,1),%r13
- vaesdec %xmm1,%xmm2,%xmm2
- cmpl 32+24(%rsp),%ecx
- movq 64+48(%rsp),%rbx
- vaesdec %xmm1,%xmm3,%xmm3
- prefetcht0 31(%r14)
- vaesdec %xmm1,%xmm4,%xmm4
- prefetcht0 15(%r12)
- vaesdec %xmm1,%xmm5,%xmm5
- leaq (%r14,%rbx,1),%rbx
- cmovgeq %rsp,%r14
- vaesdec %xmm1,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm1,%xmm7,%xmm7
- subq %r14,%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vmovdqu 16(%r14),%xmm12
- movq %rbx,64+48(%rsp)
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 24(%rsi),%xmm1
- leaq 16(%r14,%rbx,1),%r14
- vaesdec %xmm0,%xmm2,%xmm2
- cmpl 32+28(%rsp),%ecx
- movq 64+56(%rsp),%rbx
- vaesdec %xmm0,%xmm3,%xmm3
- prefetcht0 31(%r15)
- vaesdec %xmm0,%xmm4,%xmm4
- prefetcht0 15(%r13)
- vaesdec %xmm0,%xmm5,%xmm5
- leaq (%r15,%rbx,1),%rbx
- cmovgeq %rsp,%r15
- vaesdec %xmm0,%xmm6,%xmm6
- cmovgq %rsp,%rbx
- vaesdec %xmm0,%xmm7,%xmm7
- subq %r15,%rbx
- vaesdec %xmm0,%xmm8,%xmm8
- vmovdqu 16(%r15),%xmm13
- movq %rbx,64+56(%rsp)
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 40(%rsi),%xmm0
- leaq 16(%r15,%rbx,1),%r15
- vmovdqu 32(%rsp),%xmm14
- prefetcht0 15(%r14)
- prefetcht0 15(%r15)
- cmpl $11,%eax
- jb .Ldec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 176-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 192-120(%rsi),%xmm0
- je .Ldec8x_tail
-
- vaesdec %xmm1,%xmm2,%xmm2
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vaesdec %xmm1,%xmm7,%xmm7
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 208-120(%rsi),%xmm1
-
- vaesdec %xmm0,%xmm2,%xmm2
- vaesdec %xmm0,%xmm3,%xmm3
- vaesdec %xmm0,%xmm4,%xmm4
- vaesdec %xmm0,%xmm5,%xmm5
- vaesdec %xmm0,%xmm6,%xmm6
- vaesdec %xmm0,%xmm7,%xmm7
- vaesdec %xmm0,%xmm8,%xmm8
- vaesdec %xmm0,%xmm9,%xmm9
- vmovups 224-120(%rsi),%xmm0
-
-.Ldec8x_tail:
- vaesdec %xmm1,%xmm2,%xmm2
- vpxor %xmm15,%xmm15,%xmm15
- vaesdec %xmm1,%xmm3,%xmm3
- vaesdec %xmm1,%xmm4,%xmm4
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdec %xmm1,%xmm5,%xmm5
- vaesdec %xmm1,%xmm6,%xmm6
- vpaddd %xmm14,%xmm15,%xmm15
- vmovdqu 48(%rsp),%xmm14
- vaesdec %xmm1,%xmm7,%xmm7
- movq 64(%rsp),%rbx
- vaesdec %xmm1,%xmm8,%xmm8
- vaesdec %xmm1,%xmm9,%xmm9
- vmovups 16-120(%rsi),%xmm1
-
- vaesdeclast %xmm0,%xmm2,%xmm2
- vmovdqa %xmm15,32(%rsp)
- vpxor %xmm15,%xmm15,%xmm15
- vaesdeclast %xmm0,%xmm3,%xmm3
- vpxor 0(%rbp),%xmm2,%xmm2
- vaesdeclast %xmm0,%xmm4,%xmm4
- vpxor 16(%rbp),%xmm3,%xmm3
- vpcmpgtd %xmm15,%xmm14,%xmm15
- vaesdeclast %xmm0,%xmm5,%xmm5
- vpxor 32(%rbp),%xmm4,%xmm4
- vaesdeclast %xmm0,%xmm6,%xmm6
- vpxor 48(%rbp),%xmm5,%xmm5
- vpaddd %xmm15,%xmm14,%xmm14
- vmovdqu -120(%rsi),%xmm15
- vaesdeclast %xmm0,%xmm7,%xmm7
- vpxor 64(%rbp),%xmm6,%xmm6
- vaesdeclast %xmm0,%xmm8,%xmm8
- vpxor 80(%rbp),%xmm7,%xmm7
- vmovdqa %xmm14,48(%rsp)
- vaesdeclast %xmm0,%xmm9,%xmm9
- vpxor 96(%rbp),%xmm8,%xmm8
- vmovups 32-120(%rsi),%xmm0
-
- vmovups %xmm2,-16(%r8)
- subq %rbx,%r8
- vmovdqu 128+0(%rsp),%xmm2
- vpxor 112(%rbp),%xmm9,%xmm9
- vmovups %xmm3,-16(%r9)
- subq 72(%rsp),%r9
- vmovdqu %xmm2,0(%rbp)
- vpxor %xmm15,%xmm2,%xmm2
- vmovdqu 128+16(%rsp),%xmm3
- vmovups %xmm4,-16(%r10)
- subq 80(%rsp),%r10
- vmovdqu %xmm3,16(%rbp)
- vpxor %xmm15,%xmm3,%xmm3
- vmovdqu 128+32(%rsp),%xmm4
- vmovups %xmm5,-16(%r11)
- subq 88(%rsp),%r11
- vmovdqu %xmm4,32(%rbp)
- vpxor %xmm15,%xmm4,%xmm4
- vmovdqu 128+48(%rsp),%xmm5
- vmovups %xmm6,-16(%r12)
- subq 96(%rsp),%r12
- vmovdqu %xmm5,48(%rbp)
- vpxor %xmm15,%xmm5,%xmm5
- vmovdqu %xmm10,64(%rbp)
- vpxor %xmm10,%xmm15,%xmm6
- vmovups %xmm7,-16(%r13)
- subq 104(%rsp),%r13
- vmovdqu %xmm11,80(%rbp)
- vpxor %xmm11,%xmm15,%xmm7
- vmovups %xmm8,-16(%r14)
- subq 112(%rsp),%r14
- vmovdqu %xmm12,96(%rbp)
- vpxor %xmm12,%xmm15,%xmm8
- vmovups %xmm9,-16(%r15)
- subq 120(%rsp),%r15
- vmovdqu %xmm13,112(%rbp)
- vpxor %xmm13,%xmm15,%xmm9
-
- xorq $128,%rbp
- decl %edx
- jnz .Loop_dec8x
-
- movq 16(%rsp),%rax
-.cfi_def_cfa %rax,8
-
-
-
-
-
-.Ldec8x_done:
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Ldec8x_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
index 68af8c69a68..303a9518821 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha1-x86_64.s
@@ -11,11 +11,6 @@ aesni_cbc_sha1_enc:
movq OPENSSL_ia32cap_P+4(%rip),%r11
btq $61,%r11
jc aesni_cbc_sha1_enc_shaext
- andl $268435456,%r11d
- andl $1073741824,%r10d
- orl %r11d,%r10d
- cmpl $1342177280,%r10d
- je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
.cfi_endproc
@@ -1397,1327 +1392,6 @@ aesni_cbc_sha1_enc_ssse3:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
-.type aesni_cbc_sha1_enc_avx,@function
-.align 32
-aesni_cbc_sha1_enc_avx:
-.cfi_startproc
- movq 8(%rsp),%r10
-
-
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- leaq -104(%rsp),%rsp
-.cfi_adjust_cfa_offset 104
-
-
- vzeroall
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- leaq 112(%rcx),%r15
- vmovdqu (%r8),%xmm12
- movq %r8,88(%rsp)
- shlq $6,%r14
- subq %r12,%r13
- movl 240-112(%r15),%r8d
- addq %r10,%r14
-
- leaq K_XX_XX(%rip),%r11
- movl 0(%r9),%eax
- movl 4(%r9),%ebx
- movl 8(%r9),%ecx
- movl 12(%r9),%edx
- movl %ebx,%esi
- movl 16(%r9),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r11),%xmm6
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r10
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm10,%xmm0,%xmm4
- vpaddd %xmm10,%xmm1,%xmm5
- vpaddd %xmm10,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- jmp .Loop_avx
-.align 32
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- vmovdqu 0(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm9
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpor %xmm8,%xmm4,%xmm4
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- vpxor %xmm9,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm10,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm9
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpor %xmm8,%xmm5,%xmm5
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm9,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa 16(%r11),%xmm10
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpaddd %xmm5,%xmm10,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm9
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpor %xmm8,%xmm6,%xmm6
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm9,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm10,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm9
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpor %xmm8,%xmm7,%xmm7
- vpsrld $30,%xmm9,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- cmpl $11,%r8d
- jb .Lvaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast6
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast6:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm9,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm10,%xmm9
- addl %esi,%edx
- vmovdqu 16(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,0(%r12,%r13,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm10,%xmm9
- vmovdqa 32(%r11),%xmm10
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- vpaddd %xmm3,%xmm10,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm10,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast7
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast7:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- vmovdqu 32(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,16(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm10,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm10,%xmm9
- vmovdqa 48(%r11),%xmm10
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm10,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm10,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- cmpl $11,%r8d
- jb .Lvaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast8
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast8:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm10,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vmovdqu 48(%r12),%xmm13
- vpxor %xmm15,%xmm13,%xmm13
- vmovups %xmm12,32(%r13,%r12,1)
- vpxor %xmm13,%xmm12,%xmm12
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -80(%r15),%xmm15
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -64(%r15),%xmm14
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -48(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm10,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups -32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm10,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups -16(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 0(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r14,%r10
- je .Ldone_avx
- vmovdqa 64(%r11),%xmm9
- vmovdqa 0(%r11),%xmm10
- vmovdqu 0(%r10),%xmm0
- vmovdqu 16(%r10),%xmm1
- vmovdqu 32(%r10),%xmm2
- vmovdqu 48(%r10),%xmm3
- vpshufb %xmm9,%xmm0,%xmm0
- addq $64,%r10
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm9,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm10,%xmm0,%xmm8
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm8,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm9,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm10,%xmm1,%xmm8
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm8,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm9,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm10,%xmm2,%xmm8
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm8,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast9
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast9:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- leaq 64(%r12),%r12
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- addl 12(%r9),%edx
- movl %eax,0(%r9)
- addl 16(%r9),%ebp
- movl %esi,4(%r9)
- movl %esi,%ebx
- movl %ecx,8(%r9)
- movl %ecx,%edi
- movl %edx,12(%r9)
- xorl %edx,%edi
- movl %ebp,16(%r9)
- andl %edi,%esi
- jmp .Loop_avx
-
-.Ldone_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 16(%r15),%xmm15
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 32(%r15),%xmm14
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 48(%r15),%xmm15
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- cmpl $11,%r8d
- jb .Lvaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 64(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 80(%r15),%xmm15
- je .Lvaesenclast10
- vaesenc %xmm15,%xmm12,%xmm12
- vmovups 96(%r15),%xmm14
- vaesenc %xmm14,%xmm12,%xmm12
- vmovups 112(%r15),%xmm15
-.Lvaesenclast10:
- vaesenclast %xmm15,%xmm12,%xmm12
- vmovups -112(%r15),%xmm15
- vmovups 16-112(%r15),%xmm14
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vmovups %xmm12,48(%r13,%r12,1)
- movq 88(%rsp),%r8
-
- addl 0(%r9),%eax
- addl 4(%r9),%esi
- addl 8(%r9),%ecx
- movl %eax,0(%r9)
- addl 12(%r9),%edx
- movl %esi,4(%r9)
- addl 16(%r9),%ebp
- movl %ecx,8(%r9)
- movl %edx,12(%r9)
- movl %ebp,16(%r9)
- vmovups %xmm12,(%r8)
- vzeroall
- leaq 104(%rsp),%rsi
-.cfi_def_cfa %rsi,56
- movq 0(%rsi),%r15
-.cfi_restore %r15
- movq 8(%rsi),%r14
-.cfi_restore %r14
- movq 16(%rsi),%r13
-.cfi_restore %r13
- movq 24(%rsi),%r12
-.cfi_restore %r12
- movq 32(%rsi),%rbp
-.cfi_restore %rbp
- movq 40(%rsi),%rbx
-.cfi_restore %rbx
- leaq 48(%rsi),%rsp
-.cfi_def_cfa %rsp,8
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2809,17 +1483,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm3,%xmm5
.byte 15,56,201,243
cmpl $11,%r11d
- jb .Laesenclast11
+ jb .Laesenclast6
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast11
+ je .Laesenclast6
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast11:
+.Laesenclast6:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -2875,17 +1549,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm4,%xmm6
.byte 15,56,201,220
cmpl $11,%r11d
- jb .Laesenclast12
+ jb .Laesenclast7
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast12
+ je .Laesenclast7
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast12:
+.Laesenclast7:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm9
@@ -2941,17 +1615,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm5,%xmm3
.byte 15,56,201,229
cmpl $11,%r11d
- jb .Laesenclast13
+ jb .Laesenclast8
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast13
+ je .Laesenclast8
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast13:
+.Laesenclast8:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -3005,17 +1679,17 @@ aesni_cbc_sha1_enc_shaext:
movups 48(%rcx),%xmm1
.byte 102,15,56,220,208
cmpl $11,%r11d
- jb .Laesenclast14
+ jb .Laesenclast9
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast14
+ je .Laesenclast9
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast14:
+.Laesenclast9:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
decq %rdx
@@ -3033,7 +1707,7 @@ aesni_cbc_sha1_enc_shaext:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s
index 0e022a30c0d..f1256ca0eca 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-sha256-x86_64.s
@@ -6,25 +6,6 @@
.align 16
aesni_cbc_sha256_enc:
.cfi_startproc
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl $1,%eax
- cmpq $0,%rdi
- je .Lprobe
- movl 0(%r11),%eax
- movq 4(%r11),%r10
- btq $61,%r10
- jc aesni_cbc_sha256_enc_shaext
- movq %r10,%r11
- shrq $32,%r11
-
- testl $2048,%r10d
- jnz aesni_cbc_sha256_enc_xop
- andl $296,%r11d
- cmpl $296,%r11d
- je aesni_cbc_sha256_enc_avx2
- andl $268435456,%r10d
- jnz aesni_cbc_sha256_enc_avx
- ud2
xorl %eax,%eax
cmpq $0,%rdi
je .Lprobe
@@ -76,4364 +57,7 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
-.type aesni_cbc_sha256_enc_xop,@function
-.align 64
-aesni_cbc_sha256_enc_xop:
-.cfi_startproc
-.Lxop_shortcut:
- movq 8(%rsp),%r10
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %rax,120(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
-.Lprologue_xop:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Lloop_xop
-.align 16
-.Lloop_xop:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm2,%xmm3,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm0,%xmm0
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,251,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm3,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm0,%xmm0
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,248,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm0,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm0,%xmm0
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm1,%xmm1
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,248,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm0,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm1,%xmm1
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,249,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm1,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm1,%xmm1
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- rorl $14,%r13d
- movl %r14d,%eax
- vpalignr $4,%xmm0,%xmm1,%xmm7
- movl %r9d,%r12d
- xorl %r8d,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %r10d,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %eax,%r14d
- vpaddd %xmm7,%xmm2,%xmm2
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %r10d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
-.byte 143,232,120,194,249,13
- xorl %eax,%r14d
- addl %r13d,%r11d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %ebx,%esi
- addl %r11d,%edx
- vpsrld $10,%xmm1,%xmm6
- rorl $2,%r14d
- addl %esi,%r11d
- vpaddd %xmm4,%xmm2,%xmm2
- movl %edx,%r13d
- addl %r11d,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%r11d
- vpxor %xmm6,%xmm7,%xmm7
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
-.byte 143,232,120,194,250,13
- xorl %r11d,%r14d
- addl %r13d,%r10d
- vpsrld $10,%xmm2,%xmm6
- xorl %eax,%r15d
- addl %r10d,%ecx
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%r10d
- vpxor %xmm6,%xmm7,%xmm7
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- vpxor %xmm5,%xmm7,%xmm7
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- vpaddd %xmm7,%xmm2,%xmm2
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- rorl $14,%r13d
- movl %r14d,%r8d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- movl %ebx,%r12d
- xorl %eax,%r13d
-.byte 143,232,120,194,236,14
- rorl $9,%r14d
- xorl %ecx,%r12d
- vpsrld $3,%xmm4,%xmm4
- rorl $5,%r13d
- xorl %r8d,%r14d
- vpaddd %xmm7,%xmm3,%xmm3
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
-.byte 143,232,120,194,245,11
- rorl $11,%r14d
- xorl %ecx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
-.byte 143,232,120,194,250,13
- xorl %r8d,%r14d
- addl %r13d,%edx
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r9d,%esi
- addl %edx,%r11d
- vpsrld $10,%xmm2,%xmm6
- rorl $2,%r14d
- addl %esi,%edx
- vpaddd %xmm4,%xmm3,%xmm3
- movl %r11d,%r13d
- addl %edx,%r14d
-.byte 143,232,120,194,239,2
- rorl $14,%r13d
- movl %r14d,%edx
- vpxor %xmm6,%xmm7,%xmm7
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrldq $8,%xmm7,%xmm7
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
-.byte 143,232,120,194,251,13
- xorl %edx,%r14d
- addl %r13d,%ecx
- vpsrld $10,%xmm3,%xmm6
- xorl %r8d,%r15d
- addl %ecx,%r10d
-.byte 143,232,120,194,239,2
- rorl $2,%r14d
- addl %r15d,%ecx
- vpxor %xmm6,%xmm7,%xmm7
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- vpxor %xmm5,%xmm7,%xmm7
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- vpslldq $8,%xmm7,%xmm7
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- vpaddd %xmm7,%xmm3,%xmm3
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne .Lxop_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- rorl $14,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- rorl $9,%r14d
- xorl %r10d,%r12d
- rorl $5,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- rorl $11,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- rorl $6,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- rorl $2,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- rorl $14,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- rorl $9,%r14d
- xorl %r9d,%r12d
- rorl $5,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- rorl $11,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- rorl $6,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- rorl $2,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- rorl $14,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- rorl $9,%r14d
- xorl %r8d,%r12d
- rorl $5,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- rorl $11,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- rorl $6,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- rorl $2,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- rorl $14,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- rorl $9,%r14d
- xorl %edx,%r12d
- rorl $5,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- rorl $11,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- rorl $6,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- rorl $2,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- rorl $14,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- rorl $9,%r14d
- xorl %ecx,%r12d
- rorl $5,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- rorl $11,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- rorl $6,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- rorl $2,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- rorl $14,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- rorl $9,%r14d
- xorl %ebx,%r12d
- rorl $5,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- rorl $11,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- rorl $6,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- rorl $2,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- rorl $14,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- rorl $9,%r14d
- xorl %eax,%r12d
- rorl $5,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- rorl $11,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- rorl $6,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- rorl $2,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- rorl $14,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- rorl $9,%r14d
- xorl %r11d,%r12d
- rorl $5,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- rorl $11,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- rorl $6,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- rorl $2,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jb .Lloop_xop
-
- movq 64+32(%rsp),%r8
- movq 120(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_xop:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
-.type aesni_cbc_sha256_enc_avx,@function
-.align 64
-aesni_cbc_sha256_enc_avx:
-.cfi_startproc
-.Lavx_shortcut:
- movq 8(%rsp),%r10
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $128,%rsp
- andq $-64,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %rax,120(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
-.Lprologue_avx:
- vzeroall
-
- movq %rdi,%r12
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r13
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- subq $9,%r14
-
- movl 0(%r15),%eax
- movl 4(%r15),%ebx
- movl 8(%r15),%ecx
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
-
- vmovdqa 0(%r13,%r14,8),%xmm14
- vmovdqa 16(%r13,%r14,8),%xmm13
- vmovdqa 32(%r13,%r14,8),%xmm12
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi,%r12,1),%xmm0
- vmovdqu 16(%rsi,%r12,1),%xmm1
- vmovdqu 32(%rsi,%r12,1),%xmm2
- vmovdqu 48(%rsi,%r12,1),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%esi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%esi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- subq $-32*4,%rbp
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm3,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpaddd %xmm6,%xmm0,%xmm0
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm0,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 0(%rbp),%xmm0,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm0,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpaddd %xmm6,%xmm1,%xmm1
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm1,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 32(%rbp),%xmm1,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- vpshufd $250,%xmm1,%xmm7
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpaddd %xmm6,%xmm2,%xmm2
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- vpshufd $80,%xmm2,%xmm7
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpsrlq $17,%xmm7,%xmm7
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpslldq $8,%xmm6,%xmm6
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- vpaddd 64(%rbp),%xmm2,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- vpshufd $250,%xmm2,%xmm7
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- vpslld $11,%xmm5,%xmm5
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- vpxor %xmm5,%xmm4,%xmm4
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- vpshufd $132,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpsrldq $8,%xmm6,%xmm6
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpaddd %xmm6,%xmm3,%xmm3
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- vpshufd $80,%xmm3,%xmm7
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- vpsrld $10,%xmm7,%xmm6
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpsrlq $17,%xmm7,%xmm7
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpsrlq $2,%xmm7,%xmm7
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- vpshufd $232,%xmm6,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpslldq $8,%xmm6,%xmm6
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- vpaddd 96(%rbp),%xmm3,%xmm6
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- movq 64+0(%rsp),%r12
- vpand %xmm14,%xmm11,%xmm11
- movq 64+8(%rsp),%r15
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r12,1)
- leaq 16(%r12),%r12
- cmpb $0,131(%rbp)
- jne .Lavx_00_47
- vmovdqu (%r12),%xmm9
- movq %r12,64+0(%rsp)
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- xorl %r8d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%r12d
- xorl %ebx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r11d
- andl %r15d,%esi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%esi
- addl %r11d,%edx
- shrdl $2,%r14d,%r14d
- addl %esi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- xorl %edx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%esi
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r12d
- xorl %eax,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r10d
- andl %esi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- addl %r10d,%ecx
- shrdl $2,%r14d,%r14d
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- xorl %ecx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%r12d
- xorl %r11d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%r9d
- andl %r15d,%esi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%esi
- addl %r9d,%ebx
- shrdl $2,%r14d,%r14d
- addl %esi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- xorl %ebx,%r13d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%esi
- shrdl $11,%r14d,%r14d
- xorl %edx,%r12d
- xorl %r10d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%r8d
- andl %esi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- addl %r8d,%eax
- shrdl $2,%r14d,%r14d
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- xorl %eax,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%r12d
- xorl %r9d,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%edx
- andl %r15d,%esi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%esi
- addl %edx,%r11d
- shrdl $2,%r14d,%r14d
- addl %esi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- xorl %r11d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%esi
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r12d
- xorl %r8d,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%ecx
- andl %esi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- addl %ecx,%r10d
- shrdl $2,%r14d,%r14d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- xorl %r10d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- shrdl $11,%r14d,%r14d
- xorl %eax,%r12d
- xorl %edx,%r15d
- shrdl $6,%r13d,%r13d
- addl %r12d,%ebx
- andl %r15d,%esi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%esi
- addl %ebx,%r9d
- shrdl $2,%r14d,%r14d
- addl %esi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- xorl %r9d,%r13d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%esi
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r12d
- xorl %ecx,%esi
- shrdl $6,%r13d,%r13d
- addl %r12d,%eax
- andl %esi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- addl %eax,%r8d
- shrdl $2,%r14d,%r14d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%r12
- movq 64+8(%rsp),%r13
- movq 64+40(%rsp),%r15
- movq 64+48(%rsp),%rsi
-
- vpand %xmm14,%xmm11,%xmm11
- movl %r14d,%eax
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r12),%r12
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r12
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
- jb .Lloop_avx
-
- movq 64+32(%rsp),%r8
- movq 120(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
-.type aesni_cbc_sha256_enc_avx2,@function
-.align 64
-aesni_cbc_sha256_enc_avx2:
-.cfi_startproc
-.Lavx2_shortcut:
- movq 8(%rsp),%r10
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $576,%rsp
- andq $-1024,%rsp
- addq $448,%rsp
-
- shlq $6,%rdx
- subq %rdi,%rsi
- subq %rdi,%r10
- addq %rdi,%rdx
-
-
-
- movq %rdx,64+16(%rsp)
-
- movq %r8,64+32(%rsp)
- movq %r9,64+40(%rsp)
- movq %r10,64+48(%rsp)
- movq %rax,120(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
-.Lprologue_avx2:
- vzeroall
-
- movq %rdi,%r13
- vpinsrq $1,%rsi,%xmm15,%xmm15
- leaq 128(%rcx),%rdi
- leaq K256+544(%rip),%r12
- movl 240-128(%rdi),%r14d
- movq %r9,%r15
- movq %r10,%rsi
- vmovdqu (%r8),%xmm8
- leaq -9(%r14),%r14
-
- vmovdqa 0(%r12,%r14,8),%xmm14
- vmovdqa 16(%r12,%r14,8),%xmm13
- vmovdqa 32(%r12,%r14,8),%xmm12
-
- subq $-64,%r13
- movl 0(%r15),%eax
- leaq (%rsi,%r13,1),%r12
- movl 4(%r15),%ebx
- cmpq %rdx,%r13
- movl 8(%r15),%ecx
- cmoveq %rsp,%r12
- movl 12(%r15),%edx
- movl 16(%r15),%r8d
- movl 20(%r15),%r9d
- movl 24(%r15),%r10d
- movl 28(%r15),%r11d
- vmovdqu 0-128(%rdi),%xmm10
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi,%r13,1),%xmm0
- vmovdqu -64+16(%rsi,%r13,1),%xmm1
- vmovdqu -64+32(%rsi,%r13,1),%xmm2
- vmovdqu -64+48(%rsi,%r13,1),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- leaq -64(%r13),%r13
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
-
- movq 120(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- leaq -64(%rsp),%rsp
-
-
-
- movq %rsi,-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- movl %ebx,%esi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%esi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- leaq -64(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
-
- pushq 64-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm0,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm0,%ymm0
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 0(%rbp),%ymm0,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm1,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm1,%ymm1
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 32(%rbp),%ymm1,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
-
- pushq 64-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpshufd $80,%ymm2,%ymm7
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpsrlq $2,%ymm7,%ymm7
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- vpaddd %ymm6,%ymm2,%ymm2
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- vpaddd 64(%rbp),%ymm2,%ymm6
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufd $132,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpsrldq $8,%ymm6,%ymm6
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpshufd $80,%ymm3,%ymm7
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- vpsrld $10,%ymm7,%ymm6
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- vpsrlq $17,%ymm7,%ymm7
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpsrlq $2,%ymm7,%ymm7
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- vpxor %ymm7,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- vpshufd $232,%ymm6,%ymm6
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- vpslldq $8,%ymm6,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- vpaddd %ymm6,%ymm3,%ymm3
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- vpaddd 96(%rbp),%ymm3,%ymm6
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne .Lavx2_00_47
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vpextrq $1,%xmm15,%r12
- vmovq %xmm15,%r13
- movq 552(%rsp),%r15
- addl %r14d,%eax
- leaq 448(%rsp),%rbp
-
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- vmovdqu %xmm8,(%r12,%r13,1)
- leaq 16(%r13),%r13
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- addl 28(%r15),%r11d
-
- movl %eax,0(%r15)
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- cmpq 80(%rbp),%r13
- je .Ldone_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%esi
- movl %r9d,%r12d
- xorl %ecx,%esi
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- vmovdqu (%r13),%xmm9
- vpinsrq $0,%r13,%xmm15,%xmm15
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vpxor %xmm10,%xmm9,%xmm9
- vmovdqu 16-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vpxor %xmm8,%xmm9,%xmm9
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 32-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 48-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 80-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 96-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 112-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 128-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ebx,%esi
- xorl %r13d,%r14d
- leal (%r11,%rsi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%esi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %esi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%esi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%esi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %esi,%r15d
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 144-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%esi
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r11d,%esi
- xorl %r13d,%r14d
- leal (%r9,%rsi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%esi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %esi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%esi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%esi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 176-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%esi
- vpand %xmm12,%xmm11,%xmm8
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 192-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r9d,%esi
- xorl %r13d,%r14d
- leal (%rdx,%rsi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%esi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%esi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%esi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %esi,%r15d
- vaesenclast %xmm10,%xmm9,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 208-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%esi
- vpand %xmm13,%xmm11,%xmm11
- vaesenc %xmm10,%xmm9,%xmm9
- vmovdqu 224-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %edx,%esi
- xorl %r13d,%r14d
- leal (%rbx,%rsi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%esi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %esi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%esi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%esi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %esi,%r15d
- vpor %xmm11,%xmm8,%xmm8
- vaesenclast %xmm10,%xmm9,%xmm11
- vmovdqu 0-128(%rdi),%xmm10
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovq %xmm15,%r13
- vpextrq $1,%xmm15,%r15
- vpand %xmm14,%xmm11,%xmm11
- vpor %xmm11,%xmm8,%xmm8
- leaq -64(%rbp),%rbp
- vmovdqu %xmm8,(%r15,%r13,1)
- leaq 16(%r13),%r13
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 552(%rsp),%r15
- leaq 64(%r13),%r13
- movq 560(%rsp),%rsi
- addl %r14d,%eax
- leaq 448(%rsp),%rsp
-
- addl 0(%r15),%eax
- addl 4(%r15),%ebx
- addl 8(%r15),%ecx
- addl 12(%r15),%edx
- addl 16(%r15),%r8d
- addl 20(%r15),%r9d
- addl 24(%r15),%r10d
- leaq (%rsi,%r13,1),%r12
- addl 28(%r15),%r11d
-
- cmpq 64+16(%rsp),%r13
-
- movl %eax,0(%r15)
- cmoveq %rsp,%r12
- movl %ebx,4(%r15)
- movl %ecx,8(%r15)
- movl %edx,12(%r15)
- movl %r8d,16(%r15)
- movl %r9d,20(%r15)
- movl %r10d,24(%r15)
- movl %r11d,28(%r15)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-
-.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08
-
-.Ldone_avx2:
- movq 64+32(%rbp),%r8
- movq 64+56(%rbp),%rsi
-.cfi_def_cfa %rsi,8
- vmovdqu %xmm8,(%r8)
- vzeroall
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
-.type aesni_cbc_sha256_enc_shaext,@function
-.align 32
-aesni_cbc_sha256_enc_shaext:
-.cfi_startproc
- movq 8(%rsp),%r10
- leaq K256+128(%rip),%rax
- movdqu (%r9),%xmm1
- movdqu 16(%r9),%xmm2
- movdqa 512-128(%rax),%xmm3
-
- movl 240(%rcx),%r11d
- subq %rdi,%rsi
- movups (%rcx),%xmm15
- movups (%r8),%xmm6
- movups 16(%rcx),%xmm4
- leaq 112(%rcx),%rcx
-
- pshufd $0x1b,%xmm1,%xmm0
- pshufd $0xb1,%xmm1,%xmm1
- pshufd $0x1b,%xmm2,%xmm2
- movdqa %xmm3,%xmm7
-.byte 102,15,58,15,202,8
- punpcklqdq %xmm0,%xmm2
-
- jmp .Loop_shaext
-
-.align 16
-.Loop_shaext:
- movdqu (%r10),%xmm10
- movdqu 16(%r10),%xmm11
- movdqu 32(%r10),%xmm12
-.byte 102,68,15,56,0,211
- movdqu 48(%r10),%xmm13
-
- movdqa 0-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 102,68,15,56,0,219
- movdqa %xmm2,%xmm9
- movdqa %xmm1,%xmm8
- movups 0(%rdi),%xmm14
- xorps %xmm15,%xmm14
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 32-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 102,68,15,56,0,227
- leaq 64(%r10),%r10
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 64-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 102,68,15,56,0,235
-.byte 69,15,56,204,211
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 96-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
-.byte 15,56,203,202
- movdqa 128-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- cmpl $11,%r11d
- jb .Laesenclast1
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast1
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast1:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 16(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,0(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 160-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 192-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 224-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 256-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb .Laesenclast2
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast2
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast2:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,202
- movups 32(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,16(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movdqa 288-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 320-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
-.byte 69,15,56,204,211
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm13,%xmm3
-.byte 102,65,15,58,15,220,4
- paddd %xmm3,%xmm10
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 352-128(%rax),%xmm0
- paddd %xmm13,%xmm0
-.byte 69,15,56,205,213
-.byte 69,15,56,204,220
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm10,%xmm3
-.byte 102,65,15,58,15,221,4
- paddd %xmm3,%xmm11
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 384-128(%rax),%xmm0
- paddd %xmm10,%xmm0
-.byte 69,15,56,205,218
-.byte 69,15,56,204,229
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm11,%xmm3
-.byte 102,65,15,58,15,218,4
- paddd %xmm3,%xmm12
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
- movdqa 416-128(%rax),%xmm0
- paddd %xmm11,%xmm0
-.byte 69,15,56,205,227
-.byte 69,15,56,204,234
- cmpl $11,%r11d
- jb .Laesenclast3
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast3
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast3:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movdqa %xmm12,%xmm3
-.byte 102,65,15,58,15,219,4
- paddd %xmm3,%xmm13
- movups 48(%rdi),%xmm14
- xorps %xmm15,%xmm14
- movups %xmm6,32(%rsi,%rdi,1)
- xorps %xmm14,%xmm6
- movups -80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups -64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 448-128(%rax),%xmm0
- paddd %xmm12,%xmm0
-.byte 69,15,56,205,236
- movdqa %xmm7,%xmm3
- movups -48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movups -32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,202
-
- movdqa 480-128(%rax),%xmm0
- paddd %xmm13,%xmm0
- movups -16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- movups 0(%rcx),%xmm4
- aesenc %xmm5,%xmm6
-.byte 15,56,203,209
- pshufd $0x0e,%xmm0,%xmm0
- movups 16(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.byte 15,56,203,202
-
- movups 32(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 48(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- cmpl $11,%r11d
- jb .Laesenclast4
- movups 64(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 80(%rcx),%xmm5
- aesenc %xmm4,%xmm6
- je .Laesenclast4
- movups 96(%rcx),%xmm4
- aesenc %xmm5,%xmm6
- movups 112(%rcx),%xmm5
- aesenc %xmm4,%xmm6
-.Laesenclast4:
- aesenclast %xmm5,%xmm6
- movups 16-112(%rcx),%xmm4
- nop
-
- paddd %xmm9,%xmm2
- paddd %xmm8,%xmm1
-
- decq %rdx
- movups %xmm6,48(%rsi,%rdi,1)
- leaq 64(%rdi),%rdi
- jnz .Loop_shaext
-
- pshufd $0xb1,%xmm2,%xmm2
- pshufd $0x1b,%xmm1,%xmm3
- pshufd $0xb1,%xmm1,%xmm1
- punpckhqdq %xmm2,%xmm1
-.byte 102,15,58,15,211,8
-
- movups %xmm6,(%r8)
- movdqu %xmm1,(%r9)
- movdqu %xmm2,16(%r9)
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
index aa7585f179a..d637b0d12fe 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/aesni-x86_64.s
@@ -4483,7 +4483,7 @@ __aesni_set_encrypt_key:
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
index 5abda703024..57f86f616e9 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/bsaes-x86_64.s
@@ -2595,7 +2595,7 @@ _bsaes_const:
.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
.align 64
.size _bsaes_const,.-_bsaes_const
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
index 4bd2e683b9f..4ee6ed9dc9a 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/aes/vpaes-x86_64.s
@@ -856,7 +856,7 @@ _vpaes_consts:
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 64
.size _vpaes_consts,.-_vpaes_consts
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
index 7644d07da74..214f397a33a 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx2.s
@@ -1,1748 +1,29 @@
.text
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,@function
+rsaz_avx2_eligible:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
.globl rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
.type rsaz_1024_sqr_avx2,@function
-.align 64
rsaz_1024_sqr_avx2:
-.cfi_startproc
- leaq (%rsp),%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- vzeroupper
- movq %rax,%rbp
-.cfi_def_cfa_register %rbp
- movq %rdx,%r13
- subq $832,%rsp
- movq %r13,%r15
- subq $-128,%rdi
- subq $-128,%rsi
- subq $-128,%r13
-
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- vpxor %ymm9,%ymm9,%ymm9
- jz .Lsqr_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%r13),%ymm0
- andq $-2048,%rsp
- vmovdqu 32-128(%r13),%ymm1
- vmovdqu 64-128(%r13),%ymm2
- vmovdqu 96-128(%r13),%ymm3
- vmovdqu 128-128(%r13),%ymm4
- vmovdqu 160-128(%r13),%ymm5
- vmovdqu 192-128(%r13),%ymm6
- vmovdqu 224-128(%r13),%ymm7
- vmovdqu 256-128(%r13),%ymm8
- leaq 832+128(%rsp),%r13
- vmovdqu %ymm0,0-128(%r13)
- vmovdqu %ymm1,32-128(%r13)
- vmovdqu %ymm2,64-128(%r13)
- vmovdqu %ymm3,96-128(%r13)
- vmovdqu %ymm4,128-128(%r13)
- vmovdqu %ymm5,160-128(%r13)
- vmovdqu %ymm6,192-128(%r13)
- vmovdqu %ymm7,224-128(%r13)
- vmovdqu %ymm8,256-128(%r13)
- vmovdqu %ymm9,288-128(%r13)
-
-.Lsqr_1024_no_n_copy:
- andq $-1024,%rsp
-
- vmovdqu 32-128(%rsi),%ymm1
- vmovdqu 64-128(%rsi),%ymm2
- vmovdqu 96-128(%rsi),%ymm3
- vmovdqu 128-128(%rsi),%ymm4
- vmovdqu 160-128(%rsi),%ymm5
- vmovdqu 192-128(%rsi),%ymm6
- vmovdqu 224-128(%rsi),%ymm7
- vmovdqu 256-128(%rsi),%ymm8
-
- leaq 192(%rsp),%rbx
- vmovdqu .Land_mask(%rip),%ymm15
- jmp .LOOP_GRANDE_SQR_1024
-
-.align 32
-.LOOP_GRANDE_SQR_1024:
- leaq 576+128(%rsp),%r9
- leaq 448(%rsp),%r12
-
-
-
-
- vpaddq %ymm1,%ymm1,%ymm1
- vpbroadcastq 0-128(%rsi),%ymm10
- vpaddq %ymm2,%ymm2,%ymm2
- vmovdqa %ymm1,0-128(%r9)
- vpaddq %ymm3,%ymm3,%ymm3
- vmovdqa %ymm2,32-128(%r9)
- vpaddq %ymm4,%ymm4,%ymm4
- vmovdqa %ymm3,64-128(%r9)
- vpaddq %ymm5,%ymm5,%ymm5
- vmovdqa %ymm4,96-128(%r9)
- vpaddq %ymm6,%ymm6,%ymm6
- vmovdqa %ymm5,128-128(%r9)
- vpaddq %ymm7,%ymm7,%ymm7
- vmovdqa %ymm6,160-128(%r9)
- vpaddq %ymm8,%ymm8,%ymm8
- vmovdqa %ymm7,192-128(%r9)
- vpxor %ymm9,%ymm9,%ymm9
- vmovdqa %ymm8,224-128(%r9)
-
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpbroadcastq 32-128(%rsi),%ymm11
- vmovdqu %ymm9,288-192(%rbx)
- vpmuludq %ymm10,%ymm1,%ymm1
- vmovdqu %ymm9,320-448(%r12)
- vpmuludq %ymm10,%ymm2,%ymm2
- vmovdqu %ymm9,352-448(%r12)
- vpmuludq %ymm10,%ymm3,%ymm3
- vmovdqu %ymm9,384-448(%r12)
- vpmuludq %ymm10,%ymm4,%ymm4
- vmovdqu %ymm9,416-448(%r12)
- vpmuludq %ymm10,%ymm5,%ymm5
- vmovdqu %ymm9,448-448(%r12)
- vpmuludq %ymm10,%ymm6,%ymm6
- vmovdqu %ymm9,480-448(%r12)
- vpmuludq %ymm10,%ymm7,%ymm7
- vmovdqu %ymm9,512-448(%r12)
- vpmuludq %ymm10,%ymm8,%ymm8
- vpbroadcastq 64-128(%rsi),%ymm10
- vmovdqu %ymm9,544-448(%r12)
-
- movq %rsi,%r15
- movl $4,%r14d
- jmp .Lsqr_entry_1024
-.align 32
-.LOOP_SQR_1024:
- vpbroadcastq 32-128(%r15),%ymm11
- vpmuludq 0-128(%rsi),%ymm10,%ymm0
- vpaddq 0-192(%rbx),%ymm0,%ymm0
- vpmuludq 0-128(%r9),%ymm10,%ymm1
- vpaddq 32-192(%rbx),%ymm1,%ymm1
- vpmuludq 32-128(%r9),%ymm10,%ymm2
- vpaddq 64-192(%rbx),%ymm2,%ymm2
- vpmuludq 64-128(%r9),%ymm10,%ymm3
- vpaddq 96-192(%rbx),%ymm3,%ymm3
- vpmuludq 96-128(%r9),%ymm10,%ymm4
- vpaddq 128-192(%rbx),%ymm4,%ymm4
- vpmuludq 128-128(%r9),%ymm10,%ymm5
- vpaddq 160-192(%rbx),%ymm5,%ymm5
- vpmuludq 160-128(%r9),%ymm10,%ymm6
- vpaddq 192-192(%rbx),%ymm6,%ymm6
- vpmuludq 192-128(%r9),%ymm10,%ymm7
- vpaddq 224-192(%rbx),%ymm7,%ymm7
- vpmuludq 224-128(%r9),%ymm10,%ymm8
- vpbroadcastq 64-128(%r15),%ymm10
- vpaddq 256-192(%rbx),%ymm8,%ymm8
-.Lsqr_entry_1024:
- vmovdqu %ymm0,0-192(%rbx)
- vmovdqu %ymm1,32-192(%rbx)
-
- vpmuludq 32-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 32-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 64-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 96-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 128-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 160-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 192-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 224-128(%r9),%ymm11,%ymm0
- vpbroadcastq 96-128(%r15),%ymm11
- vpaddq 288-192(%rbx),%ymm0,%ymm0
-
- vmovdqu %ymm2,64-192(%rbx)
- vmovdqu %ymm3,96-192(%rbx)
-
- vpmuludq 64-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 64-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 96-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq 128-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 160-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 224-128(%r9),%ymm10,%ymm1
- vpbroadcastq 128-128(%r15),%ymm10
- vpaddq 320-448(%r12),%ymm1,%ymm1
-
- vmovdqu %ymm4,128-192(%rbx)
- vmovdqu %ymm5,160-192(%rbx)
-
- vpmuludq 96-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 96-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq 128-128(%r9),%ymm11,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm0,%ymm0
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq 224-128(%r9),%ymm11,%ymm2
- vpbroadcastq 160-128(%r15),%ymm11
- vpaddq 352-448(%r12),%ymm2,%ymm2
-
- vmovdqu %ymm6,192-192(%rbx)
- vmovdqu %ymm7,224-192(%rbx)
-
- vpmuludq 128-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq 128-128(%r9),%ymm10,%ymm14
- vpaddq %ymm14,%ymm0,%ymm0
- vpmuludq 160-128(%r9),%ymm10,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 192-128(%r9),%ymm10,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 224-128(%r9),%ymm10,%ymm3
- vpbroadcastq 192-128(%r15),%ymm10
- vpaddq 384-448(%r12),%ymm3,%ymm3
-
- vmovdqu %ymm8,256-192(%rbx)
- vmovdqu %ymm0,288-192(%rbx)
- leaq 8(%rbx),%rbx
-
- vpmuludq 160-128(%rsi),%ymm11,%ymm13
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 160-128(%r9),%ymm11,%ymm12
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 192-128(%r9),%ymm11,%ymm14
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq 224-128(%r9),%ymm11,%ymm4
- vpbroadcastq 224-128(%r15),%ymm11
- vpaddq 416-448(%r12),%ymm4,%ymm4
-
- vmovdqu %ymm1,320-448(%r12)
- vmovdqu %ymm2,352-448(%r12)
-
- vpmuludq 192-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 192-128(%r9),%ymm10,%ymm14
- vpbroadcastq 256-128(%r15),%ymm0
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq 224-128(%r9),%ymm10,%ymm5
- vpbroadcastq 0+8-128(%r15),%ymm10
- vpaddq 448-448(%r12),%ymm5,%ymm5
-
- vmovdqu %ymm3,384-448(%r12)
- vmovdqu %ymm4,416-448(%r12)
- leaq 8(%r15),%r15
-
- vpmuludq 224-128(%rsi),%ymm11,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 224-128(%r9),%ymm11,%ymm6
- vpaddq 480-448(%r12),%ymm6,%ymm6
-
- vpmuludq 256-128(%rsi),%ymm0,%ymm7
- vmovdqu %ymm5,448-448(%r12)
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vmovdqu %ymm6,480-448(%r12)
- vmovdqu %ymm7,512-448(%r12)
- leaq 8(%r12),%r12
-
- decl %r14d
- jnz .LOOP_SQR_1024
-
- vmovdqu 256(%rsp),%ymm8
- vmovdqu 288(%rsp),%ymm1
- vmovdqu 320(%rsp),%ymm2
- leaq 192(%rsp),%rbx
-
- vpsrlq $29,%ymm8,%ymm14
- vpand %ymm15,%ymm8,%ymm8
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
-
- vpermq $0x93,%ymm14,%ymm14
- vpxor %ymm9,%ymm9,%ymm9
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm8,%ymm8
- vpblendd $3,%ymm11,%ymm9,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,288-192(%rbx)
- vmovdqu %ymm2,320-192(%rbx)
-
- movq (%rsp),%rax
- movq 8(%rsp),%r10
- movq 16(%rsp),%r11
- movq 24(%rsp),%r12
- vmovdqu 32(%rsp),%ymm1
- vmovdqu 64-192(%rbx),%ymm2
- vmovdqu 96-192(%rbx),%ymm3
- vmovdqu 128-192(%rbx),%ymm4
- vmovdqu 160-192(%rbx),%ymm5
- vmovdqu 192-192(%rbx),%ymm6
- vmovdqu 224-192(%rbx),%ymm7
-
- movq %rax,%r9
- imull %ecx,%eax
- andl $0x1fffffff,%eax
- vmovd %eax,%xmm12
-
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpbroadcastq %xmm12,%ymm12
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- shrq $29,%r9
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- addq %r9,%r10
- addq %rax,%r11
- imulq 24-128(%r13),%rdx
- addq %rdx,%r12
-
- movq %r10,%rax
- imull %ecx,%eax
- andl $0x1fffffff,%eax
-
- movl $9,%r14d
- jmp .LOOP_REDUCE_1024
-
-.align 32
-.LOOP_REDUCE_1024:
- vmovd %eax,%xmm13
- vpbroadcastq %xmm13,%ymm13
-
- vpmuludq 32-128(%r13),%ymm12,%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm10,%ymm1,%ymm1
- addq %rax,%r10
- vpmuludq 64-128(%r13),%ymm12,%ymm14
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm14,%ymm2,%ymm2
- vpmuludq 96-128(%r13),%ymm12,%ymm11
-.byte 0x67
- addq %rax,%r11
-.byte 0x67
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- shrq $29,%r10
- vpaddq %ymm11,%ymm3,%ymm3
- vpmuludq 128-128(%r13),%ymm12,%ymm10
- addq %rax,%r12
- addq %r10,%r11
- vpaddq %ymm10,%ymm4,%ymm4
- vpmuludq 160-128(%r13),%ymm12,%ymm14
- movq %r11,%rax
- imull %ecx,%eax
- vpaddq %ymm14,%ymm5,%ymm5
- vpmuludq 192-128(%r13),%ymm12,%ymm11
- andl $0x1fffffff,%eax
- vpaddq %ymm11,%ymm6,%ymm6
- vpmuludq 224-128(%r13),%ymm12,%ymm10
- vpaddq %ymm10,%ymm7,%ymm7
- vpmuludq 256-128(%r13),%ymm12,%ymm14
- vmovd %eax,%xmm12
-
- vpaddq %ymm14,%ymm8,%ymm8
-
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 32-8-128(%r13),%ymm13,%ymm11
- vmovdqu 96-8-128(%r13),%ymm14
- movq %rax,%rdx
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm1,%ymm1
- vpmuludq 64-8-128(%r13),%ymm13,%ymm10
- vmovdqu 128-8-128(%r13),%ymm11
- addq %rax,%r11
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
- vpaddq %ymm10,%ymm2,%ymm2
- addq %r12,%rax
- shrq $29,%r11
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 160-8-128(%r13),%ymm10
- addq %r11,%rax
- vpaddq %ymm14,%ymm3,%ymm3
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 192-8-128(%r13),%ymm14
-.byte 0x67
- movq %rax,%r12
- imull %ecx,%eax
- vpaddq %ymm11,%ymm4,%ymm4
- vpmuludq %ymm13,%ymm10,%ymm10
-.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
- andl $0x1fffffff,%eax
- vpaddq %ymm10,%ymm5,%ymm5
- vpmuludq %ymm13,%ymm14,%ymm14
- vmovdqu 256-8-128(%r13),%ymm10
- vpaddq %ymm14,%ymm6,%ymm6
- vpmuludq %ymm13,%ymm11,%ymm11
- vmovdqu 288-8-128(%r13),%ymm9
- vmovd %eax,%xmm0
- imulq -128(%r13),%rax
- vpaddq %ymm11,%ymm7,%ymm7
- vpmuludq %ymm13,%ymm10,%ymm10
- vmovdqu 32-16-128(%r13),%ymm14
- vpbroadcastq %xmm0,%ymm0
- vpaddq %ymm10,%ymm8,%ymm8
- vpmuludq %ymm13,%ymm9,%ymm9
- vmovdqu 64-16-128(%r13),%ymm11
- addq %rax,%r12
-
- vmovdqu 32-24-128(%r13),%ymm13
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 96-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm1,%ymm1
- vpmuludq %ymm0,%ymm13,%ymm13
- vpmuludq %ymm12,%ymm11,%ymm11
-.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
- vpaddq %ymm1,%ymm13,%ymm13
- vpaddq %ymm11,%ymm2,%ymm2
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 160-16-128(%r13),%ymm11
-.byte 0x67
- vmovq %xmm13,%rax
- vmovdqu %ymm13,(%rsp)
- vpaddq %ymm10,%ymm3,%ymm3
- vpmuludq %ymm12,%ymm14,%ymm14
- vmovdqu 192-16-128(%r13),%ymm10
- vpaddq %ymm14,%ymm4,%ymm4
- vpmuludq %ymm12,%ymm11,%ymm11
- vmovdqu 224-16-128(%r13),%ymm14
- vpaddq %ymm11,%ymm5,%ymm5
- vpmuludq %ymm12,%ymm10,%ymm10
- vmovdqu 256-16-128(%r13),%ymm11
- vpaddq %ymm10,%ymm6,%ymm6
- vpmuludq %ymm12,%ymm14,%ymm14
- shrq $29,%r12
- vmovdqu 288-16-128(%r13),%ymm10
- addq %r12,%rax
- vpaddq %ymm14,%ymm7,%ymm7
- vpmuludq %ymm12,%ymm11,%ymm11
-
- movq %rax,%r9
- imull %ecx,%eax
- vpaddq %ymm11,%ymm8,%ymm8
- vpmuludq %ymm12,%ymm10,%ymm10
- andl $0x1fffffff,%eax
- vmovd %eax,%xmm12
- vmovdqu 96-24-128(%r13),%ymm11
-.byte 0x67
- vpaddq %ymm10,%ymm9,%ymm9
- vpbroadcastq %xmm12,%ymm12
-
- vpmuludq 64-24-128(%r13),%ymm0,%ymm14
- vmovdqu 128-24-128(%r13),%ymm10
- movq %rax,%rdx
- imulq -128(%r13),%rax
- movq 8(%rsp),%r10
- vpaddq %ymm14,%ymm2,%ymm1
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 160-24-128(%r13),%ymm14
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%r13),%rax
-.byte 0x67
- shrq $29,%r9
- movq 16(%rsp),%r11
- vpaddq %ymm11,%ymm3,%ymm2
- vpmuludq %ymm0,%ymm10,%ymm10
- vmovdqu 192-24-128(%r13),%ymm11
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%r13),%rax
- vpaddq %ymm10,%ymm4,%ymm3
- vpmuludq %ymm0,%ymm14,%ymm14
- vmovdqu 224-24-128(%r13),%ymm10
- imulq 24-128(%r13),%rdx
- addq %rax,%r11
- leaq (%r9,%r10,1),%rax
- vpaddq %ymm14,%ymm5,%ymm4
- vpmuludq %ymm0,%ymm11,%ymm11
- vmovdqu 256-24-128(%r13),%ymm14
- movq %rax,%r10
- imull %ecx,%eax
- vpmuludq %ymm0,%ymm10,%ymm10
- vpaddq %ymm11,%ymm6,%ymm5
- vmovdqu 288-24-128(%r13),%ymm11
- andl $0x1fffffff,%eax
- vpaddq %ymm10,%ymm7,%ymm6
- vpmuludq %ymm0,%ymm14,%ymm14
- addq 24(%rsp),%rdx
- vpaddq %ymm14,%ymm8,%ymm7
- vpmuludq %ymm0,%ymm11,%ymm11
- vpaddq %ymm11,%ymm9,%ymm8
- vmovq %r12,%xmm9
- movq %rdx,%r12
-
- decl %r14d
- jnz .LOOP_REDUCE_1024
- leaq 448(%rsp),%r12
- vpaddq %ymm9,%ymm13,%ymm0
- vpxor %ymm9,%ymm9,%ymm9
-
- vpaddq 288-192(%rbx),%ymm0,%ymm0
- vpaddq 320-448(%r12),%ymm1,%ymm1
- vpaddq 352-448(%r12),%ymm2,%ymm2
- vpaddq 384-448(%r12),%ymm3,%ymm3
- vpaddq 416-448(%r12),%ymm4,%ymm4
- vpaddq 448-448(%r12),%ymm5,%ymm5
- vpaddq 480-448(%r12),%ymm6,%ymm6
- vpaddq 512-448(%r12),%ymm7,%ymm7
- vpaddq 544-448(%r12),%ymm8,%ymm8
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm13,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm14
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm11
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm12,%ymm12
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm13,%ymm13
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm0,%ymm0
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm1,%ymm1
- vmovdqu %ymm0,0-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm2,%ymm2
- vmovdqu %ymm1,32-128(%rdi)
- vpblendd $3,%ymm13,%ymm9,%ymm13
- vpaddq %ymm12,%ymm3,%ymm3
- vmovdqu %ymm2,64-128(%rdi)
- vpaddq %ymm13,%ymm4,%ymm4
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vpaddq %ymm13,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm14
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm11
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm12
- vpermq $0x93,%ymm14,%ymm14
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm13
- vpermq $0x93,%ymm11,%ymm11
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm13,%ymm13
-
- vpblendd $3,%ymm9,%ymm14,%ymm10
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm14,%ymm11,%ymm14
- vpaddq %ymm10,%ymm4,%ymm4
- vpblendd $3,%ymm11,%ymm12,%ymm11
- vpaddq %ymm14,%ymm5,%ymm5
- vmovdqu %ymm4,128-128(%rdi)
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm11,%ymm6,%ymm6
- vmovdqu %ymm5,160-128(%rdi)
- vpblendd $3,%ymm13,%ymm0,%ymm13
- vpaddq %ymm12,%ymm7,%ymm7
- vmovdqu %ymm6,192-128(%rdi)
- vpaddq %ymm13,%ymm8,%ymm8
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
-
- movq %rdi,%rsi
- decl %r8d
- jne .LOOP_GRANDE_SQR_1024
-
- vzeroall
- movq %rbp,%rax
-.cfi_def_cfa_register %rax
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lsqr_1024_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
-.globl rsaz_1024_mul_avx2
-.type rsaz_1024_mul_avx2,@function
-.align 64
rsaz_1024_mul_avx2:
-.cfi_startproc
- leaq (%rsp),%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- movq %rax,%rbp
-.cfi_def_cfa_register %rbp
- vzeroall
- movq %rdx,%r13
- subq $64,%rsp
-
-
-
-
-
-
-.byte 0x67,0x67
- movq %rsi,%r15
- andq $4095,%r15
- addq $320,%r15
- shrq $12,%r15
- movq %rsi,%r15
- cmovnzq %r13,%rsi
- cmovnzq %r15,%r13
-
- movq %rcx,%r15
- subq $-128,%rsi
- subq $-128,%rcx
- subq $-128,%rdi
-
- andq $4095,%r15
- addq $320,%r15
-.byte 0x67,0x67
- shrq $12,%r15
- jz .Lmul_1024_no_n_copy
-
-
-
-
-
- subq $320,%rsp
- vmovdqu 0-128(%rcx),%ymm0
- andq $-512,%rsp
- vmovdqu 32-128(%rcx),%ymm1
- vmovdqu 64-128(%rcx),%ymm2
- vmovdqu 96-128(%rcx),%ymm3
- vmovdqu 128-128(%rcx),%ymm4
- vmovdqu 160-128(%rcx),%ymm5
- vmovdqu 192-128(%rcx),%ymm6
- vmovdqu 224-128(%rcx),%ymm7
- vmovdqu 256-128(%rcx),%ymm8
- leaq 64+128(%rsp),%rcx
- vmovdqu %ymm0,0-128(%rcx)
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm1,32-128(%rcx)
- vpxor %ymm1,%ymm1,%ymm1
- vmovdqu %ymm2,64-128(%rcx)
- vpxor %ymm2,%ymm2,%ymm2
- vmovdqu %ymm3,96-128(%rcx)
- vpxor %ymm3,%ymm3,%ymm3
- vmovdqu %ymm4,128-128(%rcx)
- vpxor %ymm4,%ymm4,%ymm4
- vmovdqu %ymm5,160-128(%rcx)
- vpxor %ymm5,%ymm5,%ymm5
- vmovdqu %ymm6,192-128(%rcx)
- vpxor %ymm6,%ymm6,%ymm6
- vmovdqu %ymm7,224-128(%rcx)
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqu %ymm8,256-128(%rcx)
- vmovdqa %ymm0,%ymm8
- vmovdqu %ymm9,288-128(%rcx)
-.Lmul_1024_no_n_copy:
- andq $-64,%rsp
-
- movq (%r13),%rbx
- vpbroadcastq (%r13),%ymm10
- vmovdqu %ymm0,(%rsp)
- xorq %r9,%r9
-.byte 0x67
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
-
- vmovdqu .Land_mask(%rip),%ymm15
- movl $9,%r14d
- vmovdqu %ymm9,288-128(%rdi)
- jmp .Loop_mul_1024
-
-.align 32
-.Loop_mul_1024:
- vpsrlq $29,%ymm3,%ymm9
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r9,%rax
- movq %rbx,%r10
- imulq 8-128(%rsi),%r10
- addq 8(%rsp),%r10
-
- movq %rax,%r9
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- movq %rbx,%r11
- imulq 16-128(%rsi),%r11
- addq 16(%rsp),%r11
-
- movq %rbx,%r12
- imulq 24-128(%rsi),%r12
- addq 24(%rsp),%r12
- vpmuludq 32-128(%rsi),%ymm10,%ymm0
- vmovd %eax,%xmm11
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq 64-128(%rsi),%ymm10,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq 96-128(%rsi),%ymm10,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq 128-128(%rsi),%ymm10,%ymm0
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq 160-128(%rsi),%ymm10,%ymm12
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq 192-128(%rsi),%ymm10,%ymm13
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq 224-128(%rsi),%ymm10,%ymm0
- vpermq $0x93,%ymm9,%ymm9
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq 256-128(%rsi),%ymm10,%ymm12
- vpbroadcastq 8(%r13),%ymm10
- vpaddq %ymm12,%ymm8,%ymm8
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r9
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r10
- movq %rdx,%rax
- imulq 16-128(%rcx),%rax
- addq %rax,%r11
- shrq $29,%r9
- imulq 24-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r9,%r10
-
- vpmuludq 32-128(%rcx),%ymm11,%ymm13
- vmovq %xmm10,%rbx
- vpaddq %ymm13,%ymm1,%ymm1
- vpmuludq 64-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm2,%ymm2
- vpmuludq 96-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm3,%ymm3
- vpmuludq 128-128(%rcx),%ymm11,%ymm13
- vpaddq %ymm13,%ymm4,%ymm4
- vpmuludq 160-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm0,%ymm5,%ymm5
- vpmuludq 192-128(%rcx),%ymm11,%ymm12
- vpaddq %ymm12,%ymm6,%ymm6
- vpmuludq 224-128(%rcx),%ymm11,%ymm13
- vpblendd $3,%ymm14,%ymm9,%ymm12
- vpaddq %ymm13,%ymm7,%ymm7
- vpmuludq 256-128(%rcx),%ymm11,%ymm0
- vpaddq %ymm12,%ymm3,%ymm3
- vpaddq %ymm0,%ymm8,%ymm8
-
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rsi),%ymm12
- movq %rbx,%rax
- imulq 8-128(%rsi),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rsi),%ymm13
-
- movq %r10,%rax
- vpblendd $0xfc,%ymm14,%ymm9,%ymm9
- imull %r8d,%eax
- vpaddq %ymm9,%ymm4,%ymm4
- andl $0x1fffffff,%eax
-
- imulq 16-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovd %eax,%xmm11
- vmovdqu -8+96-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -8+128-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+160-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+192-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -8+224-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -8+256-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -8+288-128(%rsi),%ymm9
- vpaddq %ymm12,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm13,%ymm13
- vpaddq %ymm13,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm9,%ymm9
- vpbroadcastq 16(%r13),%ymm10
-
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r10
- vmovdqu -8+32-128(%rcx),%ymm0
- movq %rdx,%rax
- imulq 8-128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -8+64-128(%rcx),%ymm12
- shrq $29,%r10
- imulq 16-128(%rcx),%rdx
- addq %rdx,%r12
- addq %r10,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -8+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -8+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -8+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -8+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rsi),%ymm0
- movq %rbx,%rax
- imulq -128(%rsi),%rax
- addq %r11,%rax
-
- vmovdqu -16+64-128(%rsi),%ymm12
- movq %rax,%r11
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- imulq 8-128(%rsi),%rbx
- addq %rbx,%r12
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -16+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -16+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -16+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -16+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -16+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 24(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
-
- vmovdqu -16+32-128(%rcx),%ymm0
- movq %rax,%rdx
- imulq -128(%rcx),%rax
- addq %rax,%r11
- vmovdqu -16+64-128(%rcx),%ymm12
- imulq 8-128(%rcx),%rdx
- addq %rdx,%r12
- shrq $29,%r11
-
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -16+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+128-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -16+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -16+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -16+288-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+32-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+64-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm9,%ymm9
-
- addq %r11,%r12
- imulq -128(%rsi),%rbx
- addq %rbx,%r12
-
- movq %r12,%rax
- imull %r8d,%eax
- andl $0x1fffffff,%eax
-
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovd %eax,%xmm11
- vmovdqu -24+96-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm1,%ymm1
- vpmuludq %ymm10,%ymm12,%ymm12
- vpbroadcastq %xmm11,%ymm11
- vmovdqu -24+128-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm2,%ymm2
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+160-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm3,%ymm3
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+192-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm4,%ymm4
- vpmuludq %ymm10,%ymm12,%ymm12
- vmovdqu -24+224-128(%rsi),%ymm0
- vpaddq %ymm12,%ymm5,%ymm5
- vpmuludq %ymm10,%ymm13,%ymm13
- vmovdqu -24+256-128(%rsi),%ymm12
- vpaddq %ymm13,%ymm6,%ymm6
- vpmuludq %ymm10,%ymm0,%ymm0
- vmovdqu -24+288-128(%rsi),%ymm13
- vpaddq %ymm0,%ymm7,%ymm7
- vpmuludq %ymm10,%ymm12,%ymm12
- vpaddq %ymm12,%ymm8,%ymm8
- vpmuludq %ymm10,%ymm13,%ymm13
- vpbroadcastq 32(%r13),%ymm10
- vpaddq %ymm13,%ymm9,%ymm9
- addq $32,%r13
-
- vmovdqu -24+32-128(%rcx),%ymm0
- imulq -128(%rcx),%rax
- addq %rax,%r12
- shrq $29,%r12
-
- vmovdqu -24+64-128(%rcx),%ymm12
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovq %xmm10,%rbx
- vmovdqu -24+96-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm1,%ymm0
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu %ymm0,(%rsp)
- vpaddq %ymm12,%ymm2,%ymm1
- vmovdqu -24+128-128(%rcx),%ymm0
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+160-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm3,%ymm2
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+192-128(%rcx),%ymm13
- vpaddq %ymm0,%ymm4,%ymm3
- vpmuludq %ymm11,%ymm12,%ymm12
- vmovdqu -24+224-128(%rcx),%ymm0
- vpaddq %ymm12,%ymm5,%ymm4
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovdqu -24+256-128(%rcx),%ymm12
- vpaddq %ymm13,%ymm6,%ymm5
- vpmuludq %ymm11,%ymm0,%ymm0
- vmovdqu -24+288-128(%rcx),%ymm13
- movq %r12,%r9
- vpaddq %ymm0,%ymm7,%ymm6
- vpmuludq %ymm11,%ymm12,%ymm12
- addq (%rsp),%r9
- vpaddq %ymm12,%ymm8,%ymm7
- vpmuludq %ymm11,%ymm13,%ymm13
- vmovq %r12,%xmm12
- vpaddq %ymm13,%ymm9,%ymm8
-
- decl %r14d
- jnz .Loop_mul_1024
- vpaddq (%rsp),%ymm12,%ymm0
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm10,%ymm10
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpermq $0x93,%ymm11,%ymm11
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vpsrlq $29,%ymm0,%ymm12
- vpand %ymm15,%ymm0,%ymm0
- vpsrlq $29,%ymm1,%ymm13
- vpand %ymm15,%ymm1,%ymm1
- vpsrlq $29,%ymm2,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm2,%ymm2
- vpsrlq $29,%ymm3,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm3,%ymm3
- vpermq $0x93,%ymm10,%ymm10
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm11,%ymm11
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm0,%ymm0
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm1,%ymm1
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm2,%ymm2
- vpblendd $3,%ymm11,%ymm14,%ymm11
- vpaddq %ymm10,%ymm3,%ymm3
- vpaddq %ymm11,%ymm4,%ymm4
-
- vmovdqu %ymm0,0-128(%rdi)
- vmovdqu %ymm1,32-128(%rdi)
- vmovdqu %ymm2,64-128(%rdi)
- vmovdqu %ymm3,96-128(%rdi)
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vpsrlq $29,%ymm4,%ymm12
- vpand %ymm15,%ymm4,%ymm4
- vpsrlq $29,%ymm5,%ymm13
- vpand %ymm15,%ymm5,%ymm5
- vpsrlq $29,%ymm6,%ymm10
- vpermq $0x93,%ymm12,%ymm12
- vpand %ymm15,%ymm6,%ymm6
- vpsrlq $29,%ymm7,%ymm11
- vpermq $0x93,%ymm13,%ymm13
- vpand %ymm15,%ymm7,%ymm7
- vpsrlq $29,%ymm8,%ymm0
- vpermq $0x93,%ymm10,%ymm10
- vpand %ymm15,%ymm8,%ymm8
- vpermq $0x93,%ymm11,%ymm11
-
- vpblendd $3,%ymm14,%ymm12,%ymm9
- vpermq $0x93,%ymm0,%ymm0
- vpblendd $3,%ymm12,%ymm13,%ymm12
- vpaddq %ymm9,%ymm4,%ymm4
- vpblendd $3,%ymm13,%ymm10,%ymm13
- vpaddq %ymm12,%ymm5,%ymm5
- vpblendd $3,%ymm10,%ymm11,%ymm10
- vpaddq %ymm13,%ymm6,%ymm6
- vpblendd $3,%ymm11,%ymm0,%ymm11
- vpaddq %ymm10,%ymm7,%ymm7
- vpaddq %ymm11,%ymm8,%ymm8
-
- vmovdqu %ymm4,128-128(%rdi)
- vmovdqu %ymm5,160-128(%rdi)
- vmovdqu %ymm6,192-128(%rdi)
- vmovdqu %ymm7,224-128(%rdi)
- vmovdqu %ymm8,256-128(%rdi)
- vzeroupper
-
- movq %rbp,%rax
-.cfi_def_cfa_register %rax
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lmul_1024_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
-.globl rsaz_1024_red2norm_avx2
-.type rsaz_1024_red2norm_avx2,@function
-.align 32
-rsaz_1024_red2norm_avx2:
-.cfi_startproc
- subq $-128,%rsi
- xorq %rax,%rax
- movq -128(%rsi),%r8
- movq -120(%rsi),%r9
- movq -112(%rsi),%r10
- shlq $0,%r8
- shlq $29,%r9
- movq %r10,%r11
- shlq $58,%r10
- shrq $6,%r11
- addq %r8,%rax
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,0(%rdi)
- movq %r11,%rax
- movq -104(%rsi),%r8
- movq -96(%rsi),%r9
- shlq $23,%r8
- movq %r9,%r10
- shlq $52,%r9
- shrq $12,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,8(%rdi)
- movq %r10,%rax
- movq -88(%rsi),%r11
- movq -80(%rsi),%r8
- shlq $17,%r11
- movq %r8,%r9
- shlq $46,%r8
- shrq $18,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,16(%rdi)
- movq %r9,%rax
- movq -72(%rsi),%r10
- movq -64(%rsi),%r11
- shlq $11,%r10
- movq %r11,%r8
- shlq $40,%r11
- shrq $24,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,24(%rdi)
- movq %r8,%rax
- movq -56(%rsi),%r9
- movq -48(%rsi),%r10
- movq -40(%rsi),%r11
- shlq $5,%r9
- shlq $34,%r10
- movq %r11,%r8
- shlq $63,%r11
- shrq $1,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,32(%rdi)
- movq %r8,%rax
- movq -32(%rsi),%r9
- movq -24(%rsi),%r10
- shlq $28,%r9
- movq %r10,%r11
- shlq $57,%r10
- shrq $7,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,40(%rdi)
- movq %r11,%rax
- movq -16(%rsi),%r8
- movq -8(%rsi),%r9
- shlq $22,%r8
- movq %r9,%r10
- shlq $51,%r9
- shrq $13,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,48(%rdi)
- movq %r10,%rax
- movq 0(%rsi),%r11
- movq 8(%rsi),%r8
- shlq $16,%r11
- movq %r8,%r9
- shlq $45,%r8
- shrq $19,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,56(%rdi)
- movq %r9,%rax
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- shlq $10,%r10
- movq %r11,%r8
- shlq $39,%r11
- shrq $25,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,64(%rdi)
- movq %r8,%rax
- movq 32(%rsi),%r9
- movq 40(%rsi),%r10
- movq 48(%rsi),%r11
- shlq $4,%r9
- shlq $33,%r10
- movq %r11,%r8
- shlq $62,%r11
- shrq $2,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,72(%rdi)
- movq %r8,%rax
- movq 56(%rsi),%r9
- movq 64(%rsi),%r10
- shlq $27,%r9
- movq %r10,%r11
- shlq $56,%r10
- shrq $8,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,80(%rdi)
- movq %r11,%rax
- movq 72(%rsi),%r8
- movq 80(%rsi),%r9
- shlq $21,%r8
- movq %r9,%r10
- shlq $50,%r9
- shrq $14,%r10
- addq %r8,%rax
- addq %r9,%rax
- adcq $0,%r10
- movq %rax,88(%rdi)
- movq %r10,%rax
- movq 88(%rsi),%r11
- movq 96(%rsi),%r8
- shlq $15,%r11
- movq %r8,%r9
- shlq $44,%r8
- shrq $20,%r9
- addq %r11,%rax
- addq %r8,%rax
- adcq $0,%r9
- movq %rax,96(%rdi)
- movq %r9,%rax
- movq 104(%rsi),%r10
- movq 112(%rsi),%r11
- shlq $9,%r10
- movq %r11,%r8
- shlq $38,%r11
- shrq $26,%r8
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,104(%rdi)
- movq %r8,%rax
- movq 120(%rsi),%r9
- movq 128(%rsi),%r10
- movq 136(%rsi),%r11
- shlq $3,%r9
- shlq $32,%r10
- movq %r11,%r8
- shlq $61,%r11
- shrq $3,%r8
- addq %r9,%rax
- addq %r10,%rax
- addq %r11,%rax
- adcq $0,%r8
- movq %rax,112(%rdi)
- movq %r8,%rax
- movq 144(%rsi),%r9
- movq 152(%rsi),%r10
- shlq $26,%r9
- movq %r10,%r11
- shlq $55,%r10
- shrq $9,%r11
- addq %r9,%rax
- addq %r10,%rax
- adcq $0,%r11
- movq %rax,120(%rdi)
- movq %r11,%rax
- .byte 0xf3,0xc3
-.cfi_endproc
-.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
-
-.globl rsaz_1024_norm2red_avx2
-.type rsaz_1024_norm2red_avx2,@function
-.align 32
rsaz_1024_norm2red_avx2:
-.cfi_startproc
- subq $-128,%rdi
- movq (%rsi),%r8
- movl $0x1fffffff,%eax
- movq 8(%rsi),%r9
- movq %r8,%r11
- shrq $0,%r11
- andq %rax,%r11
- movq %r11,-128(%rdi)
- movq %r8,%r10
- shrq $29,%r10
- andq %rax,%r10
- movq %r10,-120(%rdi)
- shrdq $58,%r9,%r8
- andq %rax,%r8
- movq %r8,-112(%rdi)
- movq 16(%rsi),%r10
- movq %r9,%r8
- shrq $23,%r8
- andq %rax,%r8
- movq %r8,-104(%rdi)
- shrdq $52,%r10,%r9
- andq %rax,%r9
- movq %r9,-96(%rdi)
- movq 24(%rsi),%r11
- movq %r10,%r9
- shrq $17,%r9
- andq %rax,%r9
- movq %r9,-88(%rdi)
- shrdq $46,%r11,%r10
- andq %rax,%r10
- movq %r10,-80(%rdi)
- movq 32(%rsi),%r8
- movq %r11,%r10
- shrq $11,%r10
- andq %rax,%r10
- movq %r10,-72(%rdi)
- shrdq $40,%r8,%r11
- andq %rax,%r11
- movq %r11,-64(%rdi)
- movq 40(%rsi),%r9
- movq %r8,%r11
- shrq $5,%r11
- andq %rax,%r11
- movq %r11,-56(%rdi)
- movq %r8,%r10
- shrq $34,%r10
- andq %rax,%r10
- movq %r10,-48(%rdi)
- shrdq $63,%r9,%r8
- andq %rax,%r8
- movq %r8,-40(%rdi)
- movq 48(%rsi),%r10
- movq %r9,%r8
- shrq $28,%r8
- andq %rax,%r8
- movq %r8,-32(%rdi)
- shrdq $57,%r10,%r9
- andq %rax,%r9
- movq %r9,-24(%rdi)
- movq 56(%rsi),%r11
- movq %r10,%r9
- shrq $22,%r9
- andq %rax,%r9
- movq %r9,-16(%rdi)
- shrdq $51,%r11,%r10
- andq %rax,%r10
- movq %r10,-8(%rdi)
- movq 64(%rsi),%r8
- movq %r11,%r10
- shrq $16,%r10
- andq %rax,%r10
- movq %r10,0(%rdi)
- shrdq $45,%r8,%r11
- andq %rax,%r11
- movq %r11,8(%rdi)
- movq 72(%rsi),%r9
- movq %r8,%r11
- shrq $10,%r11
- andq %rax,%r11
- movq %r11,16(%rdi)
- shrdq $39,%r9,%r8
- andq %rax,%r8
- movq %r8,24(%rdi)
- movq 80(%rsi),%r10
- movq %r9,%r8
- shrq $4,%r8
- andq %rax,%r8
- movq %r8,32(%rdi)
- movq %r9,%r11
- shrq $33,%r11
- andq %rax,%r11
- movq %r11,40(%rdi)
- shrdq $62,%r10,%r9
- andq %rax,%r9
- movq %r9,48(%rdi)
- movq 88(%rsi),%r11
- movq %r10,%r9
- shrq $27,%r9
- andq %rax,%r9
- movq %r9,56(%rdi)
- shrdq $56,%r11,%r10
- andq %rax,%r10
- movq %r10,64(%rdi)
- movq 96(%rsi),%r8
- movq %r11,%r10
- shrq $21,%r10
- andq %rax,%r10
- movq %r10,72(%rdi)
- shrdq $50,%r8,%r11
- andq %rax,%r11
- movq %r11,80(%rdi)
- movq 104(%rsi),%r9
- movq %r8,%r11
- shrq $15,%r11
- andq %rax,%r11
- movq %r11,88(%rdi)
- shrdq $44,%r9,%r8
- andq %rax,%r8
- movq %r8,96(%rdi)
- movq 112(%rsi),%r10
- movq %r9,%r8
- shrq $9,%r8
- andq %rax,%r8
- movq %r8,104(%rdi)
- shrdq $38,%r10,%r9
- andq %rax,%r9
- movq %r9,112(%rdi)
- movq 120(%rsi),%r11
- movq %r10,%r9
- shrq $3,%r9
- andq %rax,%r9
- movq %r9,120(%rdi)
- movq %r10,%r8
- shrq $32,%r8
- andq %rax,%r8
- movq %r8,128(%rdi)
- shrdq $61,%r11,%r10
- andq %rax,%r10
- movq %r10,136(%rdi)
- xorq %r8,%r8
- movq %r11,%r10
- shrq $26,%r10
- andq %rax,%r10
- movq %r10,144(%rdi)
- shrdq $55,%r8,%r11
- andq %rax,%r11
- movq %r11,152(%rdi)
- movq %r8,160(%rdi)
- movq %r8,168(%rdi)
- movq %r8,176(%rdi)
- movq %r8,184(%rdi)
- .byte 0xf3,0xc3
-.cfi_endproc
-.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
-.globl rsaz_1024_scatter5_avx2
-.type rsaz_1024_scatter5_avx2,@function
-.align 32
+rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
-.cfi_startproc
- vzeroupper
- vmovdqu .Lscatter_permd(%rip),%ymm5
- shll $4,%edx
- leaq (%rdi,%rdx,1),%rdi
- movl $9,%eax
- jmp .Loop_scatter_1024
-
-.align 32
-.Loop_scatter_1024:
- vmovdqu (%rsi),%ymm0
- leaq 32(%rsi),%rsi
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,(%rdi)
- leaq 512(%rdi),%rdi
- decl %eax
- jnz .Loop_scatter_1024
-
- vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
-
-.globl rsaz_1024_gather5_avx2
-.type rsaz_1024_gather5_avx2,@function
-.align 32
rsaz_1024_gather5_avx2:
-.cfi_startproc
- vzeroupper
- movq %rsp,%r11
-.cfi_def_cfa_register %r11
- leaq -256(%rsp),%rsp
- andq $-32,%rsp
- leaq .Linc(%rip),%r10
- leaq -128(%rsp),%rax
-
- vmovd %edx,%xmm4
- vmovdqa (%r10),%ymm0
- vmovdqa 32(%r10),%ymm1
- vmovdqa 64(%r10),%ymm5
- vpbroadcastd %xmm4,%ymm4
-
- vpaddd %ymm5,%ymm0,%ymm2
- vpcmpeqd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm3
- vpcmpeqd %ymm4,%ymm1,%ymm1
- vmovdqa %ymm0,0+128(%rax)
- vpaddd %ymm5,%ymm2,%ymm0
- vpcmpeqd %ymm4,%ymm2,%ymm2
- vmovdqa %ymm1,32+128(%rax)
- vpaddd %ymm5,%ymm3,%ymm1
- vpcmpeqd %ymm4,%ymm3,%ymm3
- vmovdqa %ymm2,64+128(%rax)
- vpaddd %ymm5,%ymm0,%ymm2
- vpcmpeqd %ymm4,%ymm0,%ymm0
- vmovdqa %ymm3,96+128(%rax)
- vpaddd %ymm5,%ymm1,%ymm3
- vpcmpeqd %ymm4,%ymm1,%ymm1
- vmovdqa %ymm0,128+128(%rax)
- vpaddd %ymm5,%ymm2,%ymm8
- vpcmpeqd %ymm4,%ymm2,%ymm2
- vmovdqa %ymm1,160+128(%rax)
- vpaddd %ymm5,%ymm3,%ymm9
- vpcmpeqd %ymm4,%ymm3,%ymm3
- vmovdqa %ymm2,192+128(%rax)
- vpaddd %ymm5,%ymm8,%ymm10
- vpcmpeqd %ymm4,%ymm8,%ymm8
- vmovdqa %ymm3,224+128(%rax)
- vpaddd %ymm5,%ymm9,%ymm11
- vpcmpeqd %ymm4,%ymm9,%ymm9
- vpaddd %ymm5,%ymm10,%ymm12
- vpcmpeqd %ymm4,%ymm10,%ymm10
- vpaddd %ymm5,%ymm11,%ymm13
- vpcmpeqd %ymm4,%ymm11,%ymm11
- vpaddd %ymm5,%ymm12,%ymm14
- vpcmpeqd %ymm4,%ymm12,%ymm12
- vpaddd %ymm5,%ymm13,%ymm15
- vpcmpeqd %ymm4,%ymm13,%ymm13
- vpcmpeqd %ymm4,%ymm14,%ymm14
- vpcmpeqd %ymm4,%ymm15,%ymm15
-
- vmovdqa -32(%r10),%ymm7
- leaq 128(%rsi),%rsi
- movl $9,%edx
-
-.Loop_gather_1024:
- vmovdqa 0-128(%rsi),%ymm0
- vmovdqa 32-128(%rsi),%ymm1
- vmovdqa 64-128(%rsi),%ymm2
- vmovdqa 96-128(%rsi),%ymm3
- vpand 0+128(%rax),%ymm0,%ymm0
- vpand 32+128(%rax),%ymm1,%ymm1
- vpand 64+128(%rax),%ymm2,%ymm2
- vpor %ymm0,%ymm1,%ymm4
- vpand 96+128(%rax),%ymm3,%ymm3
- vmovdqa 128-128(%rsi),%ymm0
- vmovdqa 160-128(%rsi),%ymm1
- vpor %ymm2,%ymm3,%ymm5
- vmovdqa 192-128(%rsi),%ymm2
- vmovdqa 224-128(%rsi),%ymm3
- vpand 128+128(%rax),%ymm0,%ymm0
- vpand 160+128(%rax),%ymm1,%ymm1
- vpand 192+128(%rax),%ymm2,%ymm2
- vpor %ymm0,%ymm4,%ymm4
- vpand 224+128(%rax),%ymm3,%ymm3
- vpand 256-128(%rsi),%ymm8,%ymm0
- vpor %ymm1,%ymm5,%ymm5
- vpand 288-128(%rsi),%ymm9,%ymm1
- vpor %ymm2,%ymm4,%ymm4
- vpand 320-128(%rsi),%ymm10,%ymm2
- vpor %ymm3,%ymm5,%ymm5
- vpand 352-128(%rsi),%ymm11,%ymm3
- vpor %ymm0,%ymm4,%ymm4
- vpand 384-128(%rsi),%ymm12,%ymm0
- vpor %ymm1,%ymm5,%ymm5
- vpand 416-128(%rsi),%ymm13,%ymm1
- vpor %ymm2,%ymm4,%ymm4
- vpand 448-128(%rsi),%ymm14,%ymm2
- vpor %ymm3,%ymm5,%ymm5
- vpand 480-128(%rsi),%ymm15,%ymm3
- leaq 512(%rsi),%rsi
- vpor %ymm0,%ymm4,%ymm4
- vpor %ymm1,%ymm5,%ymm5
- vpor %ymm2,%ymm4,%ymm4
- vpor %ymm3,%ymm5,%ymm5
-
- vpor %ymm5,%ymm4,%ymm4
- vextracti128 $1,%ymm4,%xmm5
- vpor %xmm4,%xmm5,%xmm5
- vpermd %ymm5,%ymm7,%ymm5
- vmovdqu %ymm5,(%rdi)
- leaq 32(%rdi),%rdi
- decl %edx
- jnz .Loop_gather_1024
-
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,(%rdi)
- vzeroupper
- leaq (%r11),%rsp
-.cfi_def_cfa_register %rsp
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
-.cfi_endproc
-.LSEH_end_rsaz_1024_gather5:
-.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
-
-.globl rsaz_avx2_eligible
-.type rsaz_avx2_eligible,@function
-.align 32
-rsaz_avx2_eligible:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- movl $524544,%ecx
- movl $0,%edx
- andl %eax,%ecx
- cmpl $524544,%ecx
- cmovel %edx,%eax
- andl $32,%eax
- shrl $5,%eax
- .byte 0xf3,0xc3
-.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
-.align 64
-.Land_mask:
-.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
-.Lscatter_permd:
-.long 0,2,4,6,7,7,7,7
-.Lgather_permd:
-.long 0,7,1,7,2,7,3,7
-.Linc:
-.long 0,0,0,0, 1,1,1,1
-.long 2,2,2,2, 3,3,3,3
-.long 4,4,4,4, 4,4,4,4
-.align 64
- .section ".note.gnu.property", "a"
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s
index 341cd06cd89..106ae6bbb92 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-avx512.s
@@ -1,884 +1,23 @@
+.text
.globl ossl_rsaz_avx512ifma_eligible
.type ossl_rsaz_avx512ifma_eligible,@function
-.align 32
ossl_rsaz_avx512ifma_eligible:
- movl OPENSSL_ia32cap_P+8(%rip),%ecx
xorl %eax,%eax
- andl $2149777408,%ecx
- cmpl $2149777408,%ecx
- cmovel %ecx,%eax
.byte 0xf3,0xc3
.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
-.text
.globl ossl_rsaz_amm52x20_x1_256
+.globl ossl_rsaz_amm52x20_x2_256
+.globl ossl_extract_multiplier_2x20_win5
.type ossl_rsaz_amm52x20_x1_256,@function
-.align 32
ossl_rsaz_amm52x20_x1_256:
-.cfi_startproc
-.byte 243,15,30,250
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-.Lrsaz_amm52x20_x1_256_body:
-
-
- vpxord %ymm0,%ymm0,%ymm0
- vmovdqa64 %ymm0,%ymm1
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm0,%ymm17
- vmovdqa64 %ymm0,%ymm18
- vmovdqa64 %ymm0,%ymm19
-
- xorl %r9d,%r9d
-
- movq %rdx,%r11
- movq $0xfffffffffffff,%rax
-
-
- movl $5,%ebx
-
-.align 32
-.Lloop5:
- movq 0(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 0(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- movq %r12,%r10
- adcq $0,%r10
-
- movq %r8,%r13
- imulq %r9,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 0(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- adcq %r12,%r10
-
- shrq $52,%r9
- salq $12,%r10
- orq %r10,%r9
-
- vpmadd52luq 0(%rsi),%ymm3,%ymm1
- vpmadd52luq 32(%rsi),%ymm3,%ymm16
- vpmadd52luq 64(%rsi),%ymm3,%ymm17
- vpmadd52luq 96(%rsi),%ymm3,%ymm18
- vpmadd52luq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52luq 0(%rcx),%ymm4,%ymm1
- vpmadd52luq 32(%rcx),%ymm4,%ymm16
- vpmadd52luq 64(%rcx),%ymm4,%ymm17
- vpmadd52luq 96(%rcx),%ymm4,%ymm18
- vpmadd52luq 128(%rcx),%ymm4,%ymm19
-
-
- valignq $1,%ymm1,%ymm16,%ymm1
- valignq $1,%ymm16,%ymm17,%ymm16
- valignq $1,%ymm17,%ymm18,%ymm17
- valignq $1,%ymm18,%ymm19,%ymm18
- valignq $1,%ymm19,%ymm0,%ymm19
-
- vmovq %xmm1,%r13
- addq %r13,%r9
-
- vpmadd52huq 0(%rsi),%ymm3,%ymm1
- vpmadd52huq 32(%rsi),%ymm3,%ymm16
- vpmadd52huq 64(%rsi),%ymm3,%ymm17
- vpmadd52huq 96(%rsi),%ymm3,%ymm18
- vpmadd52huq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52huq 0(%rcx),%ymm4,%ymm1
- vpmadd52huq 32(%rcx),%ymm4,%ymm16
- vpmadd52huq 64(%rcx),%ymm4,%ymm17
- vpmadd52huq 96(%rcx),%ymm4,%ymm18
- vpmadd52huq 128(%rcx),%ymm4,%ymm19
- movq 8(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 0(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- movq %r12,%r10
- adcq $0,%r10
-
- movq %r8,%r13
- imulq %r9,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 0(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- adcq %r12,%r10
-
- shrq $52,%r9
- salq $12,%r10
- orq %r10,%r9
-
- vpmadd52luq 0(%rsi),%ymm3,%ymm1
- vpmadd52luq 32(%rsi),%ymm3,%ymm16
- vpmadd52luq 64(%rsi),%ymm3,%ymm17
- vpmadd52luq 96(%rsi),%ymm3,%ymm18
- vpmadd52luq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52luq 0(%rcx),%ymm4,%ymm1
- vpmadd52luq 32(%rcx),%ymm4,%ymm16
- vpmadd52luq 64(%rcx),%ymm4,%ymm17
- vpmadd52luq 96(%rcx),%ymm4,%ymm18
- vpmadd52luq 128(%rcx),%ymm4,%ymm19
-
-
- valignq $1,%ymm1,%ymm16,%ymm1
- valignq $1,%ymm16,%ymm17,%ymm16
- valignq $1,%ymm17,%ymm18,%ymm17
- valignq $1,%ymm18,%ymm19,%ymm18
- valignq $1,%ymm19,%ymm0,%ymm19
-
- vmovq %xmm1,%r13
- addq %r13,%r9
-
- vpmadd52huq 0(%rsi),%ymm3,%ymm1
- vpmadd52huq 32(%rsi),%ymm3,%ymm16
- vpmadd52huq 64(%rsi),%ymm3,%ymm17
- vpmadd52huq 96(%rsi),%ymm3,%ymm18
- vpmadd52huq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52huq 0(%rcx),%ymm4,%ymm1
- vpmadd52huq 32(%rcx),%ymm4,%ymm16
- vpmadd52huq 64(%rcx),%ymm4,%ymm17
- vpmadd52huq 96(%rcx),%ymm4,%ymm18
- vpmadd52huq 128(%rcx),%ymm4,%ymm19
- movq 16(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 0(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- movq %r12,%r10
- adcq $0,%r10
-
- movq %r8,%r13
- imulq %r9,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 0(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- adcq %r12,%r10
-
- shrq $52,%r9
- salq $12,%r10
- orq %r10,%r9
-
- vpmadd52luq 0(%rsi),%ymm3,%ymm1
- vpmadd52luq 32(%rsi),%ymm3,%ymm16
- vpmadd52luq 64(%rsi),%ymm3,%ymm17
- vpmadd52luq 96(%rsi),%ymm3,%ymm18
- vpmadd52luq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52luq 0(%rcx),%ymm4,%ymm1
- vpmadd52luq 32(%rcx),%ymm4,%ymm16
- vpmadd52luq 64(%rcx),%ymm4,%ymm17
- vpmadd52luq 96(%rcx),%ymm4,%ymm18
- vpmadd52luq 128(%rcx),%ymm4,%ymm19
-
-
- valignq $1,%ymm1,%ymm16,%ymm1
- valignq $1,%ymm16,%ymm17,%ymm16
- valignq $1,%ymm17,%ymm18,%ymm17
- valignq $1,%ymm18,%ymm19,%ymm18
- valignq $1,%ymm19,%ymm0,%ymm19
-
- vmovq %xmm1,%r13
- addq %r13,%r9
-
- vpmadd52huq 0(%rsi),%ymm3,%ymm1
- vpmadd52huq 32(%rsi),%ymm3,%ymm16
- vpmadd52huq 64(%rsi),%ymm3,%ymm17
- vpmadd52huq 96(%rsi),%ymm3,%ymm18
- vpmadd52huq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52huq 0(%rcx),%ymm4,%ymm1
- vpmadd52huq 32(%rcx),%ymm4,%ymm16
- vpmadd52huq 64(%rcx),%ymm4,%ymm17
- vpmadd52huq 96(%rcx),%ymm4,%ymm18
- vpmadd52huq 128(%rcx),%ymm4,%ymm19
- movq 24(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 0(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- movq %r12,%r10
- adcq $0,%r10
-
- movq %r8,%r13
- imulq %r9,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 0(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- adcq %r12,%r10
-
- shrq $52,%r9
- salq $12,%r10
- orq %r10,%r9
-
- vpmadd52luq 0(%rsi),%ymm3,%ymm1
- vpmadd52luq 32(%rsi),%ymm3,%ymm16
- vpmadd52luq 64(%rsi),%ymm3,%ymm17
- vpmadd52luq 96(%rsi),%ymm3,%ymm18
- vpmadd52luq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52luq 0(%rcx),%ymm4,%ymm1
- vpmadd52luq 32(%rcx),%ymm4,%ymm16
- vpmadd52luq 64(%rcx),%ymm4,%ymm17
- vpmadd52luq 96(%rcx),%ymm4,%ymm18
- vpmadd52luq 128(%rcx),%ymm4,%ymm19
-
-
- valignq $1,%ymm1,%ymm16,%ymm1
- valignq $1,%ymm16,%ymm17,%ymm16
- valignq $1,%ymm17,%ymm18,%ymm17
- valignq $1,%ymm18,%ymm19,%ymm18
- valignq $1,%ymm19,%ymm0,%ymm19
-
- vmovq %xmm1,%r13
- addq %r13,%r9
-
- vpmadd52huq 0(%rsi),%ymm3,%ymm1
- vpmadd52huq 32(%rsi),%ymm3,%ymm16
- vpmadd52huq 64(%rsi),%ymm3,%ymm17
- vpmadd52huq 96(%rsi),%ymm3,%ymm18
- vpmadd52huq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52huq 0(%rcx),%ymm4,%ymm1
- vpmadd52huq 32(%rcx),%ymm4,%ymm16
- vpmadd52huq 64(%rcx),%ymm4,%ymm17
- vpmadd52huq 96(%rcx),%ymm4,%ymm18
- vpmadd52huq 128(%rcx),%ymm4,%ymm19
- leaq 32(%r11),%r11
- decl %ebx
- jne .Lloop5
-
- vmovdqa64 .Lmask52x4(%rip),%ymm4
-
- vpbroadcastq %r9,%ymm3
- vpblendd $3,%ymm3,%ymm1,%ymm1
-
-
-
- vpsrlq $52,%ymm1,%ymm24
- vpsrlq $52,%ymm16,%ymm25
- vpsrlq $52,%ymm17,%ymm26
- vpsrlq $52,%ymm18,%ymm27
- vpsrlq $52,%ymm19,%ymm28
-
-
- valignq $3,%ymm27,%ymm28,%ymm28
- valignq $3,%ymm26,%ymm27,%ymm27
- valignq $3,%ymm25,%ymm26,%ymm26
- valignq $3,%ymm24,%ymm25,%ymm25
- valignq $3,%ymm0,%ymm24,%ymm24
-
-
- vpandq %ymm4,%ymm1,%ymm1
- vpandq %ymm4,%ymm16,%ymm16
- vpandq %ymm4,%ymm17,%ymm17
- vpandq %ymm4,%ymm18,%ymm18
- vpandq %ymm4,%ymm19,%ymm19
-
-
- vpaddq %ymm24,%ymm1,%ymm1
- vpaddq %ymm25,%ymm16,%ymm16
- vpaddq %ymm26,%ymm17,%ymm17
- vpaddq %ymm27,%ymm18,%ymm18
- vpaddq %ymm28,%ymm19,%ymm19
-
-
-
- vpcmpuq $1,%ymm1,%ymm4,%k1
- vpcmpuq $1,%ymm16,%ymm4,%k2
- vpcmpuq $1,%ymm17,%ymm4,%k3
- vpcmpuq $1,%ymm18,%ymm4,%k4
- vpcmpuq $1,%ymm19,%ymm4,%k5
- kmovb %k1,%r14d
- kmovb %k2,%r13d
- kmovb %k3,%r12d
- kmovb %k4,%r11d
- kmovb %k5,%r10d
-
-
- vpcmpuq $0,%ymm1,%ymm4,%k1
- vpcmpuq $0,%ymm16,%ymm4,%k2
- vpcmpuq $0,%ymm17,%ymm4,%k3
- vpcmpuq $0,%ymm18,%ymm4,%k4
- vpcmpuq $0,%ymm19,%ymm4,%k5
- kmovb %k1,%r9d
- kmovb %k2,%r8d
- kmovb %k3,%ebx
- kmovb %k4,%ecx
- kmovb %k5,%edx
-
-
-
- shlb $4,%r13b
- orb %r13b,%r14b
- shlb $4,%r11b
- orb %r11b,%r12b
-
- addb %r14b,%r14b
- adcb %r12b,%r12b
- adcb %r10b,%r10b
-
- shlb $4,%r8b
- orb %r8b,%r9b
- shlb $4,%cl
- orb %cl,%bl
-
- addb %r9b,%r14b
- adcb %bl,%r12b
- adcb %dl,%r10b
-
- xorb %r9b,%r14b
- xorb %bl,%r12b
- xorb %dl,%r10b
-
- kmovb %r14d,%k1
- shrb $4,%r14b
- kmovb %r14d,%k2
- kmovb %r12d,%k3
- shrb $4,%r12b
- kmovb %r12d,%k4
- kmovb %r10d,%k5
-
-
- vpsubq %ymm4,%ymm1,%ymm1{%k1}
- vpsubq %ymm4,%ymm16,%ymm16{%k2}
- vpsubq %ymm4,%ymm17,%ymm17{%k3}
- vpsubq %ymm4,%ymm18,%ymm18{%k4}
- vpsubq %ymm4,%ymm19,%ymm19{%k5}
-
- vpandq %ymm4,%ymm1,%ymm1
- vpandq %ymm4,%ymm16,%ymm16
- vpandq %ymm4,%ymm17,%ymm17
- vpandq %ymm4,%ymm18,%ymm18
- vpandq %ymm4,%ymm19,%ymm19
-
- vmovdqu64 %ymm1,(%rdi)
- vmovdqu64 %ymm16,32(%rdi)
- vmovdqu64 %ymm17,64(%rdi)
- vmovdqu64 %ymm18,96(%rdi)
- vmovdqu64 %ymm19,128(%rdi)
-
- vzeroupper
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
- movq 40(%rsp),%rbx
-.cfi_restore %rbx
- leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lrsaz_amm52x20_x1_256_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
-.data
-.align 32
-.Lmask52x4:
-.quad 0xfffffffffffff
-.quad 0xfffffffffffff
-.quad 0xfffffffffffff
-.quad 0xfffffffffffff
-.text
-
-.globl ossl_rsaz_amm52x20_x2_256
-.type ossl_rsaz_amm52x20_x2_256,@function
-.align 32
ossl_rsaz_amm52x20_x2_256:
-.cfi_startproc
-.byte 243,15,30,250
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-.Lrsaz_amm52x20_x2_256_body:
-
-
- vpxord %ymm0,%ymm0,%ymm0
- vmovdqa64 %ymm0,%ymm1
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm0,%ymm17
- vmovdqa64 %ymm0,%ymm18
- vmovdqa64 %ymm0,%ymm19
- vmovdqa64 %ymm0,%ymm2
- vmovdqa64 %ymm0,%ymm20
- vmovdqa64 %ymm0,%ymm21
- vmovdqa64 %ymm0,%ymm22
- vmovdqa64 %ymm0,%ymm23
-
- xorl %r9d,%r9d
- xorl %r15d,%r15d
-
- movq %rdx,%r11
- movq $0xfffffffffffff,%rax
-
- movl $20,%ebx
-
-.align 32
-.Lloop20:
- movq 0(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 0(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- movq %r12,%r10
- adcq $0,%r10
-
- movq (%r8),%r13
- imulq %r9,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 0(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r9
- adcq %r12,%r10
-
- shrq $52,%r9
- salq $12,%r10
- orq %r10,%r9
-
- vpmadd52luq 0(%rsi),%ymm3,%ymm1
- vpmadd52luq 32(%rsi),%ymm3,%ymm16
- vpmadd52luq 64(%rsi),%ymm3,%ymm17
- vpmadd52luq 96(%rsi),%ymm3,%ymm18
- vpmadd52luq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52luq 0(%rcx),%ymm4,%ymm1
- vpmadd52luq 32(%rcx),%ymm4,%ymm16
- vpmadd52luq 64(%rcx),%ymm4,%ymm17
- vpmadd52luq 96(%rcx),%ymm4,%ymm18
- vpmadd52luq 128(%rcx),%ymm4,%ymm19
-
-
- valignq $1,%ymm1,%ymm16,%ymm1
- valignq $1,%ymm16,%ymm17,%ymm16
- valignq $1,%ymm17,%ymm18,%ymm17
- valignq $1,%ymm18,%ymm19,%ymm18
- valignq $1,%ymm19,%ymm0,%ymm19
-
- vmovq %xmm1,%r13
- addq %r13,%r9
-
- vpmadd52huq 0(%rsi),%ymm3,%ymm1
- vpmadd52huq 32(%rsi),%ymm3,%ymm16
- vpmadd52huq 64(%rsi),%ymm3,%ymm17
- vpmadd52huq 96(%rsi),%ymm3,%ymm18
- vpmadd52huq 128(%rsi),%ymm3,%ymm19
-
- vpmadd52huq 0(%rcx),%ymm4,%ymm1
- vpmadd52huq 32(%rcx),%ymm4,%ymm16
- vpmadd52huq 64(%rcx),%ymm4,%ymm17
- vpmadd52huq 96(%rcx),%ymm4,%ymm18
- vpmadd52huq 128(%rcx),%ymm4,%ymm19
- movq 160(%r11),%r13
-
- vpbroadcastq %r13,%ymm3
- movq 160(%rsi),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r15
- movq %r12,%r10
- adcq $0,%r10
-
- movq 8(%r8),%r13
- imulq %r15,%r13
- andq %rax,%r13
-
- vpbroadcastq %r13,%ymm4
- movq 160(%rcx),%rdx
- mulxq %r13,%r13,%r12
- addq %r13,%r15
- adcq %r12,%r10
-
- shrq $52,%r15
- salq $12,%r10
- orq %r10,%r15
-
- vpmadd52luq 160(%rsi),%ymm3,%ymm2
- vpmadd52luq 192(%rsi),%ymm3,%ymm20
- vpmadd52luq 224(%rsi),%ymm3,%ymm21
- vpmadd52luq 256(%rsi),%ymm3,%ymm22
- vpmadd52luq 288(%rsi),%ymm3,%ymm23
-
- vpmadd52luq 160(%rcx),%ymm4,%ymm2
- vpmadd52luq 192(%rcx),%ymm4,%ymm20
- vpmadd52luq 224(%rcx),%ymm4,%ymm21
- vpmadd52luq 256(%rcx),%ymm4,%ymm22
- vpmadd52luq 288(%rcx),%ymm4,%ymm23
-
-
- valignq $1,%ymm2,%ymm20,%ymm2
- valignq $1,%ymm20,%ymm21,%ymm20
- valignq $1,%ymm21,%ymm22,%ymm21
- valignq $1,%ymm22,%ymm23,%ymm22
- valignq $1,%ymm23,%ymm0,%ymm23
-
- vmovq %xmm2,%r13
- addq %r13,%r15
-
- vpmadd52huq 160(%rsi),%ymm3,%ymm2
- vpmadd52huq 192(%rsi),%ymm3,%ymm20
- vpmadd52huq 224(%rsi),%ymm3,%ymm21
- vpmadd52huq 256(%rsi),%ymm3,%ymm22
- vpmadd52huq 288(%rsi),%ymm3,%ymm23
-
- vpmadd52huq 160(%rcx),%ymm4,%ymm2
- vpmadd52huq 192(%rcx),%ymm4,%ymm20
- vpmadd52huq 224(%rcx),%ymm4,%ymm21
- vpmadd52huq 256(%rcx),%ymm4,%ymm22
- vpmadd52huq 288(%rcx),%ymm4,%ymm23
- leaq 8(%r11),%r11
- decl %ebx
- jne .Lloop20
-
- vmovdqa64 .Lmask52x4(%rip),%ymm4
-
- vpbroadcastq %r9,%ymm3
- vpblendd $3,%ymm3,%ymm1,%ymm1
-
-
-
- vpsrlq $52,%ymm1,%ymm24
- vpsrlq $52,%ymm16,%ymm25
- vpsrlq $52,%ymm17,%ymm26
- vpsrlq $52,%ymm18,%ymm27
- vpsrlq $52,%ymm19,%ymm28
-
-
- valignq $3,%ymm27,%ymm28,%ymm28
- valignq $3,%ymm26,%ymm27,%ymm27
- valignq $3,%ymm25,%ymm26,%ymm26
- valignq $3,%ymm24,%ymm25,%ymm25
- valignq $3,%ymm0,%ymm24,%ymm24
-
-
- vpandq %ymm4,%ymm1,%ymm1
- vpandq %ymm4,%ymm16,%ymm16
- vpandq %ymm4,%ymm17,%ymm17
- vpandq %ymm4,%ymm18,%ymm18
- vpandq %ymm4,%ymm19,%ymm19
-
-
- vpaddq %ymm24,%ymm1,%ymm1
- vpaddq %ymm25,%ymm16,%ymm16
- vpaddq %ymm26,%ymm17,%ymm17
- vpaddq %ymm27,%ymm18,%ymm18
- vpaddq %ymm28,%ymm19,%ymm19
-
-
-
- vpcmpuq $1,%ymm1,%ymm4,%k1
- vpcmpuq $1,%ymm16,%ymm4,%k2
- vpcmpuq $1,%ymm17,%ymm4,%k3
- vpcmpuq $1,%ymm18,%ymm4,%k4
- vpcmpuq $1,%ymm19,%ymm4,%k5
- kmovb %k1,%r14d
- kmovb %k2,%r13d
- kmovb %k3,%r12d
- kmovb %k4,%r11d
- kmovb %k5,%r10d
-
-
- vpcmpuq $0,%ymm1,%ymm4,%k1
- vpcmpuq $0,%ymm16,%ymm4,%k2
- vpcmpuq $0,%ymm17,%ymm4,%k3
- vpcmpuq $0,%ymm18,%ymm4,%k4
- vpcmpuq $0,%ymm19,%ymm4,%k5
- kmovb %k1,%r9d
- kmovb %k2,%r8d
- kmovb %k3,%ebx
- kmovb %k4,%ecx
- kmovb %k5,%edx
-
-
-
- shlb $4,%r13b
- orb %r13b,%r14b
- shlb $4,%r11b
- orb %r11b,%r12b
-
- addb %r14b,%r14b
- adcb %r12b,%r12b
- adcb %r10b,%r10b
-
- shlb $4,%r8b
- orb %r8b,%r9b
- shlb $4,%cl
- orb %cl,%bl
-
- addb %r9b,%r14b
- adcb %bl,%r12b
- adcb %dl,%r10b
-
- xorb %r9b,%r14b
- xorb %bl,%r12b
- xorb %dl,%r10b
-
- kmovb %r14d,%k1
- shrb $4,%r14b
- kmovb %r14d,%k2
- kmovb %r12d,%k3
- shrb $4,%r12b
- kmovb %r12d,%k4
- kmovb %r10d,%k5
-
-
- vpsubq %ymm4,%ymm1,%ymm1{%k1}
- vpsubq %ymm4,%ymm16,%ymm16{%k2}
- vpsubq %ymm4,%ymm17,%ymm17{%k3}
- vpsubq %ymm4,%ymm18,%ymm18{%k4}
- vpsubq %ymm4,%ymm19,%ymm19{%k5}
-
- vpandq %ymm4,%ymm1,%ymm1
- vpandq %ymm4,%ymm16,%ymm16
- vpandq %ymm4,%ymm17,%ymm17
- vpandq %ymm4,%ymm18,%ymm18
- vpandq %ymm4,%ymm19,%ymm19
-
- vpbroadcastq %r15,%ymm3
- vpblendd $3,%ymm3,%ymm2,%ymm2
-
-
-
- vpsrlq $52,%ymm2,%ymm24
- vpsrlq $52,%ymm20,%ymm25
- vpsrlq $52,%ymm21,%ymm26
- vpsrlq $52,%ymm22,%ymm27
- vpsrlq $52,%ymm23,%ymm28
-
-
- valignq $3,%ymm27,%ymm28,%ymm28
- valignq $3,%ymm26,%ymm27,%ymm27
- valignq $3,%ymm25,%ymm26,%ymm26
- valignq $3,%ymm24,%ymm25,%ymm25
- valignq $3,%ymm0,%ymm24,%ymm24
-
-
- vpandq %ymm4,%ymm2,%ymm2
- vpandq %ymm4,%ymm20,%ymm20
- vpandq %ymm4,%ymm21,%ymm21
- vpandq %ymm4,%ymm22,%ymm22
- vpandq %ymm4,%ymm23,%ymm23
-
-
- vpaddq %ymm24,%ymm2,%ymm2
- vpaddq %ymm25,%ymm20,%ymm20
- vpaddq %ymm26,%ymm21,%ymm21
- vpaddq %ymm27,%ymm22,%ymm22
- vpaddq %ymm28,%ymm23,%ymm23
-
-
-
- vpcmpuq $1,%ymm2,%ymm4,%k1
- vpcmpuq $1,%ymm20,%ymm4,%k2
- vpcmpuq $1,%ymm21,%ymm4,%k3
- vpcmpuq $1,%ymm22,%ymm4,%k4
- vpcmpuq $1,%ymm23,%ymm4,%k5
- kmovb %k1,%r14d
- kmovb %k2,%r13d
- kmovb %k3,%r12d
- kmovb %k4,%r11d
- kmovb %k5,%r10d
-
-
- vpcmpuq $0,%ymm2,%ymm4,%k1
- vpcmpuq $0,%ymm20,%ymm4,%k2
- vpcmpuq $0,%ymm21,%ymm4,%k3
- vpcmpuq $0,%ymm22,%ymm4,%k4
- vpcmpuq $0,%ymm23,%ymm4,%k5
- kmovb %k1,%r9d
- kmovb %k2,%r8d
- kmovb %k3,%ebx
- kmovb %k4,%ecx
- kmovb %k5,%edx
-
-
-
- shlb $4,%r13b
- orb %r13b,%r14b
- shlb $4,%r11b
- orb %r11b,%r12b
-
- addb %r14b,%r14b
- adcb %r12b,%r12b
- adcb %r10b,%r10b
-
- shlb $4,%r8b
- orb %r8b,%r9b
- shlb $4,%cl
- orb %cl,%bl
-
- addb %r9b,%r14b
- adcb %bl,%r12b
- adcb %dl,%r10b
-
- xorb %r9b,%r14b
- xorb %bl,%r12b
- xorb %dl,%r10b
-
- kmovb %r14d,%k1
- shrb $4,%r14b
- kmovb %r14d,%k2
- kmovb %r12d,%k3
- shrb $4,%r12b
- kmovb %r12d,%k4
- kmovb %r10d,%k5
-
-
- vpsubq %ymm4,%ymm2,%ymm2{%k1}
- vpsubq %ymm4,%ymm20,%ymm20{%k2}
- vpsubq %ymm4,%ymm21,%ymm21{%k3}
- vpsubq %ymm4,%ymm22,%ymm22{%k4}
- vpsubq %ymm4,%ymm23,%ymm23{%k5}
-
- vpandq %ymm4,%ymm2,%ymm2
- vpandq %ymm4,%ymm20,%ymm20
- vpandq %ymm4,%ymm21,%ymm21
- vpandq %ymm4,%ymm22,%ymm22
- vpandq %ymm4,%ymm23,%ymm23
-
- vmovdqu64 %ymm1,(%rdi)
- vmovdqu64 %ymm16,32(%rdi)
- vmovdqu64 %ymm17,64(%rdi)
- vmovdqu64 %ymm18,96(%rdi)
- vmovdqu64 %ymm19,128(%rdi)
-
- vmovdqu64 %ymm2,160(%rdi)
- vmovdqu64 %ymm20,192(%rdi)
- vmovdqu64 %ymm21,224(%rdi)
- vmovdqu64 %ymm22,256(%rdi)
- vmovdqu64 %ymm23,288(%rdi)
-
- vzeroupper
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
- movq 40(%rsp),%rbx
-.cfi_restore %rbx
- leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lrsaz_amm52x20_x2_256_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256
-.text
-
-.align 32
-.globl ossl_extract_multiplier_2x20_win5
-.type ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
-.cfi_startproc
-.byte 243,15,30,250
- leaq (%rcx,%rcx,4),%rax
- salq $5,%rax
- addq %rax,%rsi
-
- vmovdqa64 .Lones(%rip),%ymm23
- vpbroadcastq %rdx,%ymm22
- leaq 10240(%rsi),%rax
-
- vpxor %xmm4,%xmm4,%xmm4
- vmovdqa64 %ymm4,%ymm3
- vmovdqa64 %ymm4,%ymm2
- vmovdqa64 %ymm4,%ymm1
- vmovdqa64 %ymm4,%ymm0
- vmovdqa64 %ymm4,%ymm21
-
-.align 32
-.Lloop:
- vpcmpq $0,%ymm21,%ymm22,%k1
- addq $320,%rsi
- vpaddq %ymm23,%ymm21,%ymm21
- vmovdqu64 -320(%rsi),%ymm16
- vmovdqu64 -288(%rsi),%ymm17
- vmovdqu64 -256(%rsi),%ymm18
- vmovdqu64 -224(%rsi),%ymm19
- vmovdqu64 -192(%rsi),%ymm20
- vpblendmq %ymm16,%ymm0,%ymm0{%k1}
- vpblendmq %ymm17,%ymm1,%ymm1{%k1}
- vpblendmq %ymm18,%ymm2,%ymm2{%k1}
- vpblendmq %ymm19,%ymm3,%ymm3{%k1}
- vpblendmq %ymm20,%ymm4,%ymm4{%k1}
- cmpq %rsi,%rax
- jne .Lloop
-
- vmovdqu64 %ymm0,(%rdi)
- vmovdqu64 %ymm1,32(%rdi)
- vmovdqu64 %ymm2,64(%rdi)
- vmovdqu64 %ymm3,96(%rdi)
- vmovdqu64 %ymm4,128(%rdi)
-
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
-.cfi_endproc
-.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
-.data
-.align 32
-.Lones:
-.quad 1,1,1,1
- .section ".note.gnu.property", "a"
+.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s
index ea1ae389119..d68613212f1 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/rsaz-x86_64.s
@@ -33,10 +33,6 @@ rsaz_512_sqr:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Loop_sqrx
jmp .Loop_sqr
.align 32
@@ -407,282 +403,6 @@ rsaz_512_sqr:
decl %r8d
jnz .Loop_sqr
- jmp .Lsqr_tail
-
-.align 32
-.Loop_sqrx:
- movl %r8d,128+8(%rsp)
-.byte 102,72,15,110,199
-
- mulxq %rax,%r8,%r9
- movq %rax,%rbx
-
- mulxq 16(%rsi),%rcx,%r10
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rcx,%r9
-
-.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcxq %rax,%r10
-
-.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
- adcxq %rcx,%r11
-
- mulxq 48(%rsi),%rcx,%r14
- adcxq %rax,%r12
- adcxq %rcx,%r13
-
- mulxq 56(%rsi),%rax,%r15
- adcxq %rax,%r14
- adcxq %rbp,%r15
-
- mulxq %rdx,%rax,%rdi
- movq %rbx,%rdx
- xorq %rcx,%rcx
- adoxq %r8,%r8
- adcxq %rdi,%r8
- adoxq %rbp,%rcx
- adcxq %rbp,%rcx
-
- movq %rax,(%rsp)
- movq %r8,8(%rsp)
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
- mulxq 24(%rsi),%rdi,%r8
- adoxq %rdi,%r11
-.byte 0x66
- adcxq %r8,%r12
-
- mulxq 32(%rsi),%rax,%rbx
- adoxq %rax,%r12
- adcxq %rbx,%r13
-
- mulxq 40(%rsi),%rdi,%r8
- adoxq %rdi,%r13
- adcxq %r8,%r14
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
- adoxq %rdi,%r15
- adcxq %rbp,%r8
- mulxq %rdx,%rax,%rdi
- adoxq %rbp,%r8
-.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
-
- xorq %rbx,%rbx
- adoxq %r9,%r9
-
- adcxq %rcx,%rax
- adoxq %r10,%r10
- adcxq %rax,%r9
- adoxq %rbp,%rbx
- adcxq %rdi,%r10
- adcxq %rbp,%rbx
-
- movq %r9,16(%rsp)
-.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
-
-
- mulxq 24(%rsi),%rdi,%r9
- adoxq %rdi,%r12
- adcxq %r9,%r13
-
- mulxq 32(%rsi),%rax,%rcx
- adoxq %rax,%r13
- adcxq %rcx,%r14
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
- adoxq %rdi,%r14
- adcxq %r9,%r15
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
- adoxq %rax,%r15
- adcxq %rcx,%r8
-
- mulxq 56(%rsi),%rdi,%r9
- adoxq %rdi,%r8
- adcxq %rbp,%r9
- mulxq %rdx,%rax,%rdi
- adoxq %rbp,%r9
- movq 24(%rsi),%rdx
-
- xorq %rcx,%rcx
- adoxq %r11,%r11
-
- adcxq %rbx,%rax
- adoxq %r12,%r12
- adcxq %rax,%r11
- adoxq %rbp,%rcx
- adcxq %rdi,%r12
- adcxq %rbp,%rcx
-
- movq %r11,32(%rsp)
- movq %r12,40(%rsp)
-
-
- mulxq 32(%rsi),%rax,%rbx
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
- mulxq 40(%rsi),%rdi,%r10
- adoxq %rdi,%r15
- adcxq %r10,%r8
-
- mulxq 48(%rsi),%rax,%rbx
- adoxq %rax,%r8
- adcxq %rbx,%r9
-
- mulxq 56(%rsi),%rdi,%r10
- adoxq %rdi,%r9
- adcxq %rbp,%r10
- mulxq %rdx,%rax,%rdi
- adoxq %rbp,%r10
- movq 32(%rsi),%rdx
-
- xorq %rbx,%rbx
- adoxq %r13,%r13
-
- adcxq %rcx,%rax
- adoxq %r14,%r14
- adcxq %rax,%r13
- adoxq %rbp,%rbx
- adcxq %rdi,%r14
- adcxq %rbp,%rbx
-
- movq %r13,48(%rsp)
- movq %r14,56(%rsp)
-
-
- mulxq 40(%rsi),%rdi,%r11
- adoxq %rdi,%r8
- adcxq %r11,%r9
-
- mulxq 48(%rsi),%rax,%rcx
- adoxq %rax,%r9
- adcxq %rcx,%r10
-
- mulxq 56(%rsi),%rdi,%r11
- adoxq %rdi,%r10
- adcxq %rbp,%r11
- mulxq %rdx,%rax,%rdi
- movq 40(%rsi),%rdx
- adoxq %rbp,%r11
-
- xorq %rcx,%rcx
- adoxq %r15,%r15
-
- adcxq %rbx,%rax
- adoxq %r8,%r8
- adcxq %rax,%r15
- adoxq %rbp,%rcx
- adcxq %rdi,%r8
- adcxq %rbp,%rcx
-
- movq %r15,64(%rsp)
- movq %r8,72(%rsp)
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %rbp,%r12
- mulxq %rdx,%rax,%rdi
- adoxq %rbp,%r12
- movq 48(%rsi),%rdx
-
- xorq %rbx,%rbx
- adoxq %r9,%r9
-
- adcxq %rcx,%rax
- adoxq %r10,%r10
- adcxq %rax,%r9
- adcxq %rdi,%r10
- adoxq %rbp,%rbx
- adcxq %rbp,%rbx
-
- movq %r9,80(%rsp)
- movq %r10,88(%rsp)
-
-
-.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
- adoxq %rax,%r12
- adoxq %rbp,%r13
-
- mulxq %rdx,%rax,%rdi
- xorq %rcx,%rcx
- movq 56(%rsi),%rdx
- adoxq %r11,%r11
-
- adcxq %rbx,%rax
- adoxq %r12,%r12
- adcxq %rax,%r11
- adoxq %rbp,%rcx
- adcxq %rdi,%r12
- adcxq %rbp,%rcx
-
-.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
-.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
-
-
- mulxq %rdx,%rax,%rdx
- xorq %rbx,%rbx
- adoxq %r13,%r13
-
- adcxq %rcx,%rax
- adoxq %rbp,%rbx
- adcxq %r13,%rax
- adcxq %rdx,%rbx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- movq %rax,112(%rsp)
- movq %rbx,120(%rsp)
-
- call __rsaz_512_reducex
-
- addq 64(%rsp),%r8
- adcq 72(%rsp),%r9
- adcq 80(%rsp),%r10
- adcq 88(%rsp),%r11
- adcq 96(%rsp),%r12
- adcq 104(%rsp),%r13
- adcq 112(%rsp),%r14
- adcq 120(%rsp),%r15
- sbbq %rcx,%rcx
-
- call __rsaz_512_subtract
-
- movq %r8,%rdx
- movq %r9,%rax
- movl 128+8(%rsp),%r8d
- movq %rdi,%rsi
-
- decl %r8d
- jnz .Loop_sqrx
-
-.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
@@ -734,10 +454,6 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -755,29 +471,6 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_tail
-
-.align 32
-.Lmulx:
- movq %rdx,%rbp
- movq (%rdx),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -891,10 +584,6 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
@@ -1075,142 +764,6 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_gather_tail
-
-.align 32
-.Lmulx_gather:
-.byte 102,76,15,126,194
-
- movq %r8,128(%rsp)
- movq %rdi,128+8(%rsp)
- movq %rcx,128+16(%rsp)
-
- mulxq (%rsi),%rbx,%r8
- movq %rbx,(%rsp)
- xorl %edi,%edi
-
- mulxq 8(%rsi),%rax,%r9
-
- mulxq 16(%rsi),%rbx,%r10
- adcxq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcxq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- adcxq %rbx,%r13
- adcxq %rax,%r14
-.byte 0x67
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- movq $-7,%rcx
- jmp .Loop_mulx_gather
-
-.align 32
-.Loop_mulx_gather:
- movdqa 0(%rbp),%xmm8
- movdqa 16(%rbp),%xmm9
- movdqa 32(%rbp),%xmm10
- movdqa 48(%rbp),%xmm11
- pand %xmm0,%xmm8
- movdqa 64(%rbp),%xmm12
- pand %xmm1,%xmm9
- movdqa 80(%rbp),%xmm13
- pand %xmm2,%xmm10
- movdqa 96(%rbp),%xmm14
- pand %xmm3,%xmm11
- movdqa 112(%rbp),%xmm15
- leaq 128(%rbp),%rbp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
-
- por %xmm9,%xmm8
- pshufd $0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
-.byte 102,76,15,126,194
-
-.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
-.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
-.byte 0x67
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq %rbx,64(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx_gather
-
- movq %r8,64(%rsp)
- movq %r9,64+8(%rsp)
- movq %r10,64+16(%rsp)
- movq %r11,64+24(%rsp)
- movq %r12,64+32(%rsp)
- movq %r13,64+40(%rsp)
- movq %r14,64+48(%rsp)
- movq %r15,64+56(%rsp)
-
- movq 128(%rsp),%rdx
- movq 128+8(%rsp),%rdi
- movq 128+16(%rsp),%rbp
-
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1278,10 +831,6 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp)
movq %rdi,%rbp
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -1298,29 +847,6 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_scatter_tail
-
-.align 32
-.Lmulx_scatter:
- movq (%rdi),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1390,7 +916,6 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -1411,16 +936,7 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
- andl $0x80100,%eax
- cmpl $0x80100,%eax
- je .Lby_one_callx
call __rsaz_512_reduce
- jmp .Lby_one_tail
-.align 32
-.Lby_one_callx:
- movq 128(%rsp),%rdx
- call __rsaz_512_reducex
-.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1535,64 +1051,6 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
-.type __rsaz_512_reducex,@function
-.align 32
-__rsaz_512_reducex:
-.cfi_startproc
-
- imulq %r8,%rdx
- xorq %rsi,%rsi
- movl $8,%ecx
- jmp .Lreduction_loopx
-
-.align 32
-.Lreduction_loopx:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 128+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
- decl %ecx
- jne .Lreduction_loopx
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
@@ -1796,128 +1254,6 @@ __rsaz_512_mul:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
-.type __rsaz_512_mulx,@function
-.align 32
-__rsaz_512_mulx:
-.cfi_startproc
- mulxq (%rsi),%rbx,%r8
- movq $-6,%rcx
-
- mulxq 8(%rsi),%rax,%r9
- movq %rbx,8(%rsp)
-
- mulxq 16(%rsi),%rbx,%r10
- adcq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- movq 8(%rbp),%rdx
- adcq %rbx,%r13
- adcq %rax,%r14
- adcq $0,%r15
-
- xorq %rdi,%rdi
- jmp .Loop_mulx
-
-.align 32
-.Loop_mulx:
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rsi),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq 64(%rbp,%rcx,8),%rdx
- movq %rbx,8+64-8(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx
-
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
-.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
-.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- movq %rbx,8+64-8(%rsp)
- movq %r8,8+64(%rsp)
- movq %r9,8+64+8(%rsp)
- movq %r10,8+64+16(%rsp)
- movq %r11,8+64+24(%rsp)
- movq %r12,8+64+32(%rsp)
- movq %r13,8+64+40(%rsp)
- movq %r14,8+64+48(%rsp)
- movq %r15,8+64+56(%rsp)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
@@ -2013,7 +1349,7 @@ rsaz_512_gather4:
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s
index 4f259df94bc..3c1e47c6ddd 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-gf2m.s
@@ -309,7 +309,7 @@ bn_GF2m_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s
index f412eee41c9..ba7bb44ca38 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont.s
@@ -14,7 +14,6 @@ bn_mul_mont:
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx
jne .Lmul4x_enter
testl $7,%r9d
@@ -263,9 +262,6 @@ bn_mul4x_mont:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
- andl $0x80100,%r11d
- cmpl $0x80100,%r11d
- je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -691,7 +687,6 @@ bn_mul4x_mont:
.size bn_mul4x_mont,.-bn_mul4x_mont
-
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
@@ -773,25 +768,6 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- andl $0x80100,%eax
- cmpl $0x80100,%eax
- jne .Lsqr8x_nox
-
- call bn_sqrx8x_internal
-
-
-
-
- leaq (%r8,%rcx,1),%rbx
- movq %rcx,%r9
- movq %rcx,%rdx
-.byte 102,72,15,126,207
- sarq $3+2,%rcx
- jmp .Lsqr8x_sub
-
-.align 32
-.Lsqr8x_nox:
call bn_sqr8x_internal
@@ -879,365 +855,9 @@ bn_sqr8x_mont:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
-.type bn_mulx4x_mont,@function
-.align 32
-bn_mulx4x_mont:
-.cfi_startproc
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
-.Lmulx4x_enter:
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
-.Lmulx4x_prologue:
-
- shll $3,%r9d
- xorq %r10,%r10
- subq %r9,%r10
- movq (%r8),%r8
- leaq -72(%rsp,%r10,1),%rbp
- andq $-128,%rbp
- movq %rsp,%r11
- subq %rbp,%r11
- andq $-4096,%r11
- leaq (%r11,%rbp,1),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lmulx4x_page_walk
- jmp .Lmulx4x_page_walk_done
-
-.align 16
-.Lmulx4x_page_walk:
- leaq -4096(%rsp),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lmulx4x_page_walk
-.Lmulx4x_page_walk_done:
-
- leaq (%rdx,%r9,1),%r10
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r9,0(%rsp)
- shrq $5,%r9
- movq %r10,16(%rsp)
- subq $1,%r9
- movq %r8,24(%rsp)
- movq %rdi,32(%rsp)
- movq %rax,40(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
- movq %r9,48(%rsp)
- jmp .Lmulx4x_body
-
-.align 32
-.Lmulx4x_body:
- leaq 8(%rdx),%rdi
- movq (%rdx),%rdx
- leaq 64+32(%rsp),%rbx
- movq %rdx,%r9
-
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r14
- addq %rax,%r11
- movq %rdi,8(%rsp)
- mulxq 16(%rsi),%r12,%r13
- adcq %r14,%r12
- adcq $0,%r13
-
- movq %r8,%rdi
- imulq 24(%rsp),%r8
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- leaq 32(%rsi),%rsi
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%rdi
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
- movq 48(%rsp),%rdi
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r12,-16(%rbx)
-
- jmp .Lmulx4x_1st
-
-.align 32
-.Lmulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_1st
-
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- addq %r15,%r14
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
- jmp .Lmulx4x_outer
-
-.align 32
-.Lmulx4x_outer:
- movq (%rdi),%rdx
- leaq 8(%rdi),%rdi
- subq %rax,%rsi
- movq %r15,(%rbx)
- leaq 64+32(%rsp),%rbx
- subq %rax,%rcx
-
- mulxq 0(%rsi),%r8,%r11
- xorl %ebp,%ebp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- adoxq -16(%rbx),%r12
- adcxq %rbp,%r13
- adoxq %rbp,%r13
-
- movq %rdi,8(%rsp)
- movq %r8,%r15
- imulq 24(%rsp),%r8
- xorl %ebp,%ebp
-
- mulxq 24(%rsi),%rax,%r14
- movq %r8,%rdx
- adcxq %rax,%r13
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- adoxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 16(%rcx),%rax,%r12
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- leaq 32(%rcx),%rcx
- adcxq %rax,%r12
- adoxq %rbp,%r15
- movq 48(%rsp),%rdi
- movq %r12,-16(%rbx)
-
- jmp .Lmulx4x_inner
-
-.align 32
-.Lmulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-32(%rbx)
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_inner
-
- movq 0(%rsp),%rax
- movq 8(%rsp),%rdi
- adcq %rbp,%r15
- subq 0(%rbx),%rbp
- adcq %r15,%r14
- sbbq %r15,%r15
- movq %r14,-8(%rbx)
-
- cmpq 16(%rsp),%rdi
- jne .Lmulx4x_outer
-
- leaq 64(%rsp),%rbx
- subq %rax,%rcx
- negq %r15
- movq %rax,%rdx
- shrq $3+2,%rax
- movq 32(%rsp),%rdi
- jmp .Lmulx4x_sub
-
-.align 32
-.Lmulx4x_sub:
- movq 0(%rbx),%r11
- movq 8(%rbx),%r12
- movq 16(%rbx),%r13
- movq 24(%rbx),%r14
- leaq 32(%rbx),%rbx
- sbbq 0(%rcx),%r11
- sbbq 8(%rcx),%r12
- sbbq 16(%rcx),%r13
- sbbq 24(%rcx),%r14
- leaq 32(%rcx),%rcx
- movq %r11,0(%rdi)
- movq %r12,8(%rdi)
- movq %r13,16(%rdi)
- movq %r14,24(%rdi)
- leaq 32(%rdi),%rdi
- decq %rax
- jnz .Lmulx4x_sub
-
- sbbq $0,%r15
- leaq 64(%rsp),%rbx
- subq %rdx,%rdi
-
-.byte 102,73,15,110,207
- pxor %xmm0,%xmm0
- pshufd $0,%xmm1,%xmm1
- movq 40(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- jmp .Lmulx4x_cond_copy
-
-.align 32
-.Lmulx4x_cond_copy:
- movdqa 0(%rbx),%xmm2
- movdqa 16(%rbx),%xmm3
- leaq 32(%rbx),%rbx
- movdqu 0(%rdi),%xmm4
- movdqu 16(%rdi),%xmm5
- leaq 32(%rdi),%rdi
- movdqa %xmm0,-32(%rbx)
- movdqa %xmm0,-16(%rbx)
- pcmpeqd %xmm1,%xmm0
- pand %xmm1,%xmm2
- pand %xmm1,%xmm3
- pand %xmm0,%xmm4
- pand %xmm0,%xmm5
- pxor %xmm0,%xmm0
- por %xmm2,%xmm4
- por %xmm3,%xmm5
- movdqu %xmm4,-32(%rdi)
- movdqu %xmm5,-16(%rdi)
- subq $32,%rdx
- jnz .Lmulx4x_cond_copy
-
- movq %rdx,(%rbx)
-
- movq $1,%rax
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lmulx4x_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bn_mulx4x_mont,.-bn_mulx4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s
index d0025f94e2d..4614a037ae6 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/bn/x86_64-mont5.s
@@ -12,7 +12,6 @@ bn_mul_mont_gather5:
.cfi_def_cfa_register %rax
testl $7,%r9d
jnz .Lmul_enter
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
jmp .Lmul4x_enter
.align 16
@@ -449,9 +448,6 @@ bn_mul4x_mont_gather5:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -1081,10 +1077,6 @@ bn_power5:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
- movl OPENSSL_ia32cap_P+8(%rip),%r11d
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je .Lpowerx5_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -2048,1348 +2040,6 @@ __bn_post4x_internal:
.byte 0xf3,0xc3
.cfi_endproc
.size __bn_post4x_internal,.-__bn_post4x_internal
-.type bn_mulx4x_mont_gather5,@function
-.align 32
-bn_mulx4x_mont_gather5:
-.cfi_startproc
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
-.Lmulx4x_enter:
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
-.Lmulx4x_prologue:
-
- shll $3,%r9d
- leaq (%r9,%r9,2),%r10
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
-
-
-
- leaq -320(%rsp,%r9,2),%r11
- movq %rsp,%rbp
- subq %rdi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb .Lmulx4xsp_alt
- subq %r11,%rbp
- leaq -320(%rbp,%r9,2),%rbp
- jmp .Lmulx4xsp_done
-
-.Lmulx4xsp_alt:
- leaq 4096-320(,%r9,2),%r10
- leaq -320(%rbp,%r9,2),%rbp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rbp
-.Lmulx4xsp_done:
- andq $-64,%rbp
- movq %rsp,%r11
- subq %rbp,%r11
- andq $-4096,%r11
- leaq (%r11,%rbp,1),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lmulx4x_page_walk
- jmp .Lmulx4x_page_walk_done
-
-.Lmulx4x_page_walk:
- leaq -4096(%rsp),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lmulx4x_page_walk
-.Lmulx4x_page_walk_done:
-
-
-
-
-
-
-
-
-
-
-
-
-
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
-.Lmulx4x_body:
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq $1,%rax
-
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lmulx4x_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
-
-.type mulx4x_internal,@function
-.align 32
-mulx4x_internal:
-.cfi_startproc
- movq %r9,8(%rsp)
- movq %r9,%r10
- negq %r9
- shlq $5,%r9
- negq %r10
- leaq 128(%rdx,%r9,1),%r13
- shrq $5+5,%r9
- movd 8(%rax),%xmm5
- subq $1,%r9
- leaq .Linc(%rip),%rax
- movq %r13,16+8(%rsp)
- movq %r9,24+8(%rsp)
- movq %rdi,56+8(%rsp)
- movdqa 0(%rax),%xmm0
- movdqa 16(%rax),%xmm1
- leaq 88-112(%rsp,%r10,1),%r10
- leaq 128(%rdx),%rdi
-
- pshufd $0,%xmm5,%xmm5
- movdqa %xmm1,%xmm4
-.byte 0x67
- movdqa %xmm1,%xmm2
-.byte 0x67
- paddd %xmm0,%xmm1
- pcmpeqd %xmm5,%xmm0
- movdqa %xmm4,%xmm3
- paddd %xmm1,%xmm2
- pcmpeqd %xmm5,%xmm1
- movdqa %xmm0,112(%r10)
- movdqa %xmm4,%xmm0
-
- paddd %xmm2,%xmm3
- pcmpeqd %xmm5,%xmm2
- movdqa %xmm1,128(%r10)
- movdqa %xmm4,%xmm1
-
- paddd %xmm3,%xmm0
- pcmpeqd %xmm5,%xmm3
- movdqa %xmm2,144(%r10)
- movdqa %xmm4,%xmm2
-
- paddd %xmm0,%xmm1
- pcmpeqd %xmm5,%xmm0
- movdqa %xmm3,160(%r10)
- movdqa %xmm4,%xmm3
- paddd %xmm1,%xmm2
- pcmpeqd %xmm5,%xmm1
- movdqa %xmm0,176(%r10)
- movdqa %xmm4,%xmm0
-
- paddd %xmm2,%xmm3
- pcmpeqd %xmm5,%xmm2
- movdqa %xmm1,192(%r10)
- movdqa %xmm4,%xmm1
-
- paddd %xmm3,%xmm0
- pcmpeqd %xmm5,%xmm3
- movdqa %xmm2,208(%r10)
- movdqa %xmm4,%xmm2
-
- paddd %xmm0,%xmm1
- pcmpeqd %xmm5,%xmm0
- movdqa %xmm3,224(%r10)
- movdqa %xmm4,%xmm3
- paddd %xmm1,%xmm2
- pcmpeqd %xmm5,%xmm1
- movdqa %xmm0,240(%r10)
- movdqa %xmm4,%xmm0
-
- paddd %xmm2,%xmm3
- pcmpeqd %xmm5,%xmm2
- movdqa %xmm1,256(%r10)
- movdqa %xmm4,%xmm1
-
- paddd %xmm3,%xmm0
- pcmpeqd %xmm5,%xmm3
- movdqa %xmm2,272(%r10)
- movdqa %xmm4,%xmm2
-
- paddd %xmm0,%xmm1
- pcmpeqd %xmm5,%xmm0
- movdqa %xmm3,288(%r10)
- movdqa %xmm4,%xmm3
-.byte 0x67
- paddd %xmm1,%xmm2
- pcmpeqd %xmm5,%xmm1
- movdqa %xmm0,304(%r10)
-
- paddd %xmm2,%xmm3
- pcmpeqd %xmm5,%xmm2
- movdqa %xmm1,320(%r10)
-
- pcmpeqd %xmm5,%xmm3
- movdqa %xmm2,336(%r10)
-
- pand 64(%rdi),%xmm0
- pand 80(%rdi),%xmm1
- pand 96(%rdi),%xmm2
- movdqa %xmm3,352(%r10)
- pand 112(%rdi),%xmm3
- por %xmm2,%xmm0
- por %xmm3,%xmm1
- movdqa -128(%rdi),%xmm4
- movdqa -112(%rdi),%xmm5
- movdqa -96(%rdi),%xmm2
- pand 112(%r10),%xmm4
- movdqa -80(%rdi),%xmm3
- pand 128(%r10),%xmm5
- por %xmm4,%xmm0
- pand 144(%r10),%xmm2
- por %xmm5,%xmm1
- pand 160(%r10),%xmm3
- por %xmm2,%xmm0
- por %xmm3,%xmm1
- movdqa -64(%rdi),%xmm4
- movdqa -48(%rdi),%xmm5
- movdqa -32(%rdi),%xmm2
- pand 176(%r10),%xmm4
- movdqa -16(%rdi),%xmm3
- pand 192(%r10),%xmm5
- por %xmm4,%xmm0
- pand 208(%r10),%xmm2
- por %xmm5,%xmm1
- pand 224(%r10),%xmm3
- por %xmm2,%xmm0
- por %xmm3,%xmm1
- movdqa 0(%rdi),%xmm4
- movdqa 16(%rdi),%xmm5
- movdqa 32(%rdi),%xmm2
- pand 240(%r10),%xmm4
- movdqa 48(%rdi),%xmm3
- pand 256(%r10),%xmm5
- por %xmm4,%xmm0
- pand 272(%r10),%xmm2
- por %xmm5,%xmm1
- pand 288(%r10),%xmm3
- por %xmm2,%xmm0
- por %xmm3,%xmm1
- pxor %xmm1,%xmm0
- pshufd $0x4e,%xmm0,%xmm1
- por %xmm1,%xmm0
- leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
- leaq 64+32+8(%rsp),%rbx
-
- movq %rdx,%r9
- mulxq 0(%rsi),%r8,%rax
- mulxq 8(%rsi),%r11,%r12
- addq %rax,%r11
- mulxq 16(%rsi),%rax,%r13
- adcq %rax,%r12
- adcq $0,%r13
- mulxq 24(%rsi),%rax,%r14
-
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
- xorq %rbp,%rbp
- movq %r8,%rdx
-
- movq %rdi,8+8(%rsp)
-
- leaq 32(%rsi),%rsi
- adcxq %rax,%r13
- adcxq %rbp,%r14
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 16(%rcx),%rax,%r12
- movq 24+8(%rsp),%rdi
- movq %r10,-32(%rbx)
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r11,-24(%rbx)
- adcxq %rax,%r12
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r12,-16(%rbx)
- jmp .Lmulx4x_1st
-
-.align 32
-.Lmulx4x_1st:
- adcxq %rbp,%r15
- mulxq 0(%rsi),%r10,%rax
- adcxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
-.byte 0x67,0x67
- movq %r8,%rdx
- adcxq %rax,%r13
- adcxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- movq %r11,-32(%rbx)
- adoxq %r15,%r13
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- leaq 32(%rcx),%rcx
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_1st
-
- movq 8(%rsp),%rax
- adcq %rbp,%r15
- leaq (%rsi,%rax,1),%rsi
- addq %r15,%r14
- movq 8+8(%rsp),%rdi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
- jmp .Lmulx4x_outer
-
-.align 32
-.Lmulx4x_outer:
- leaq 16-256(%rbx),%r10
- pxor %xmm4,%xmm4
-.byte 0x67,0x67
- pxor %xmm5,%xmm5
- movdqa -128(%rdi),%xmm0
- movdqa -112(%rdi),%xmm1
- movdqa -96(%rdi),%xmm2
- pand 256(%r10),%xmm0
- movdqa -80(%rdi),%xmm3
- pand 272(%r10),%xmm1
- por %xmm0,%xmm4
- pand 288(%r10),%xmm2
- por %xmm1,%xmm5
- pand 304(%r10),%xmm3
- por %xmm2,%xmm4
- por %xmm3,%xmm5
- movdqa -64(%rdi),%xmm0
- movdqa -48(%rdi),%xmm1
- movdqa -32(%rdi),%xmm2
- pand 320(%r10),%xmm0
- movdqa -16(%rdi),%xmm3
- pand 336(%r10),%xmm1
- por %xmm0,%xmm4
- pand 352(%r10),%xmm2
- por %xmm1,%xmm5
- pand 368(%r10),%xmm3
- por %xmm2,%xmm4
- por %xmm3,%xmm5
- movdqa 0(%rdi),%xmm0
- movdqa 16(%rdi),%xmm1
- movdqa 32(%rdi),%xmm2
- pand 384(%r10),%xmm0
- movdqa 48(%rdi),%xmm3
- pand 400(%r10),%xmm1
- por %xmm0,%xmm4
- pand 416(%r10),%xmm2
- por %xmm1,%xmm5
- pand 432(%r10),%xmm3
- por %xmm2,%xmm4
- por %xmm3,%xmm5
- movdqa 64(%rdi),%xmm0
- movdqa 80(%rdi),%xmm1
- movdqa 96(%rdi),%xmm2
- pand 448(%r10),%xmm0
- movdqa 112(%rdi),%xmm3
- pand 464(%r10),%xmm1
- por %xmm0,%xmm4
- pand 480(%r10),%xmm2
- por %xmm1,%xmm5
- pand 496(%r10),%xmm3
- por %xmm2,%xmm4
- por %xmm3,%xmm5
- por %xmm5,%xmm4
- pshufd $0x4e,%xmm4,%xmm0
- por %xmm4,%xmm0
- leaq 256(%rdi),%rdi
-.byte 102,72,15,126,194
-
- movq %rbp,(%rbx)
- leaq 32(%rbx,%rax,1),%rbx
- mulxq 0(%rsi),%r8,%r11
- xorq %rbp,%rbp
- movq %rdx,%r9
- mulxq 8(%rsi),%r14,%r12
- adoxq -32(%rbx),%r8
- adcxq %r14,%r11
- mulxq 16(%rsi),%r15,%r13
- adoxq -24(%rbx),%r11
- adcxq %r15,%r12
- mulxq 24(%rsi),%rdx,%r14
- adoxq -16(%rbx),%r12
- adcxq %rdx,%r13
- leaq (%rcx,%rax,1),%rcx
- leaq 32(%rsi),%rsi
- adoxq -8(%rbx),%r13
- adcxq %rbp,%r14
- adoxq %rbp,%r14
-
- movq %r8,%r15
- imulq 32+8(%rsp),%r8
-
- movq %r8,%rdx
- xorq %rbp,%rbp
- movq %rdi,8+8(%rsp)
-
- mulxq 0(%rcx),%rax,%r10
- adcxq %rax,%r15
- adoxq %r11,%r10
- mulxq 8(%rcx),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
- mulxq 16(%rcx),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- movq 24+8(%rsp),%rdi
- movq %r10,-32(%rbx)
- adcxq %rax,%r12
- movq %r11,-24(%rbx)
- adoxq %rbp,%r15
- movq %r12,-16(%rbx)
- leaq 32(%rcx),%rcx
- jmp .Lmulx4x_inner
-
-.align 32
-.Lmulx4x_inner:
- mulxq 0(%rsi),%r10,%rax
- adcxq %rbp,%r15
- adoxq %r14,%r10
- mulxq 8(%rsi),%r11,%r14
- adcxq 0(%rbx),%r10
- adoxq %rax,%r11
- mulxq 16(%rsi),%r12,%rax
- adcxq 8(%rbx),%r11
- adoxq %r14,%r12
- mulxq 24(%rsi),%r13,%r14
- movq %r8,%rdx
- adcxq 16(%rbx),%r12
- adoxq %rax,%r13
- adcxq 24(%rbx),%r13
- adoxq %rbp,%r14
- leaq 32(%rsi),%rsi
- leaq 32(%rbx),%rbx
- adcxq %rbp,%r14
-
- adoxq %r15,%r10
- mulxq 0(%rcx),%rax,%r15
- adcxq %rax,%r10
- adoxq %r15,%r11
- mulxq 8(%rcx),%rax,%r15
- adcxq %rax,%r11
- adoxq %r15,%r12
- mulxq 16(%rcx),%rax,%r15
- movq %r10,-40(%rbx)
- adcxq %rax,%r12
- adoxq %r15,%r13
- movq %r11,-32(%rbx)
- mulxq 24(%rcx),%rax,%r15
- movq %r9,%rdx
- leaq 32(%rcx),%rcx
- movq %r12,-24(%rbx)
- adcxq %rax,%r13
- adoxq %rbp,%r15
- movq %r13,-16(%rbx)
-
- decq %rdi
- jnz .Lmulx4x_inner
-
- movq 0+8(%rsp),%rax
- adcq %rbp,%r15
- subq 0(%rbx),%rdi
- movq 8+8(%rsp),%rdi
- movq 16+8(%rsp),%r10
- adcq %r15,%r14
- leaq (%rsi,%rax,1),%rsi
- adcq %rbp,%rbp
- movq %r14,-8(%rbx)
-
- cmpq %r10,%rdi
- jb .Lmulx4x_outer
-
- movq -8(%rcx),%r10
- movq %rbp,%r8
- movq (%rcx,%rax,1),%r12
- leaq (%rcx,%rax,1),%rbp
- movq %rax,%rcx
- leaq (%rbx,%rax,1),%rdi
- xorl %eax,%eax
- xorq %r15,%r15
- subq %r14,%r10
- adcq %r15,%r15
- orq %r15,%r8
- sarq $3+2,%rcx
- subq %r8,%rax
- movq 56+8(%rsp),%rdx
- decq %r12
- movq 8(%rbp),%r13
- xorq %r8,%r8
- movq 16(%rbp),%r14
- movq 24(%rbp),%r15
- jmp .Lsqrx4x_sub_entry
-.cfi_endproc
-.size mulx4x_internal,.-mulx4x_internal
-.type bn_powerx5,@function
-.align 32
-bn_powerx5:
-.cfi_startproc
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
-.Lpowerx5_enter:
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
-.Lpowerx5_prologue:
-
- shll $3,%r9d
- leaq (%r9,%r9,2),%r10
- negq %r9
- movq (%r8),%r8
-
-
-
-
-
-
-
-
- leaq -320(%rsp,%r9,2),%r11
- movq %rsp,%rbp
- subq %rdi,%r11
- andq $4095,%r11
- cmpq %r11,%r10
- jb .Lpwrx_sp_alt
- subq %r11,%rbp
- leaq -320(%rbp,%r9,2),%rbp
- jmp .Lpwrx_sp_done
-
-.align 32
-.Lpwrx_sp_alt:
- leaq 4096-320(,%r9,2),%r10
- leaq -320(%rbp,%r9,2),%rbp
- subq %r10,%r11
- movq $0,%r10
- cmovcq %r10,%r11
- subq %r11,%rbp
-.Lpwrx_sp_done:
- andq $-64,%rbp
- movq %rsp,%r11
- subq %rbp,%r11
- andq $-4096,%r11
- leaq (%r11,%rbp,1),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lpwrx_page_walk
- jmp .Lpwrx_page_walk_done
-
-.Lpwrx_page_walk:
- leaq -4096(%rsp),%rsp
- movq (%rsp),%r10
- cmpq %rbp,%rsp
- ja .Lpwrx_page_walk
-.Lpwrx_page_walk_done:
-
- movq %r9,%r10
- negq %r9
-
-
-
-
-
-
-
-
-
-
-
-
- pxor %xmm0,%xmm0
-.byte 102,72,15,110,207
-.byte 102,72,15,110,209
-.byte 102,73,15,110,218
-.byte 102,72,15,110,226
- movq %r8,32(%rsp)
- movq %rax,40(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
-.Lpowerx5_body:
-
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
-
- movq %r10,%r9
- movq %rsi,%rdi
-.byte 102,72,15,126,209
-.byte 102,72,15,126,226
- movq 40(%rsp),%rax
-
- call mulx4x_internal
-
- movq 40(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq $1,%rax
-
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lpowerx5_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bn_powerx5,.-bn_powerx5
-
-.globl bn_sqrx8x_internal
-.hidden bn_sqrx8x_internal
-.type bn_sqrx8x_internal,@function
-.align 32
-bn_sqrx8x_internal:
-__bn_sqrx8x_internal:
-.cfi_startproc
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- leaq 48+8(%rsp),%rdi
- leaq (%rsi,%r9,1),%rbp
- movq %r9,0+8(%rsp)
- movq %rbp,8+8(%rsp)
- jmp .Lsqr8x_zero_start
-
-.align 32
-.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-.Lsqrx8x_zero:
-.byte 0x3e
- movdqa %xmm0,0(%rdi)
- movdqa %xmm0,16(%rdi)
- movdqa %xmm0,32(%rdi)
- movdqa %xmm0,48(%rdi)
-.Lsqr8x_zero_start:
- movdqa %xmm0,64(%rdi)
- movdqa %xmm0,80(%rdi)
- movdqa %xmm0,96(%rdi)
- movdqa %xmm0,112(%rdi)
- leaq 128(%rdi),%rdi
- subq $64,%r9
- jnz .Lsqrx8x_zero
-
- movq 0(%rsi),%rdx
-
- xorq %r10,%r10
- xorq %r11,%r11
- xorq %r12,%r12
- xorq %r13,%r13
- xorq %r14,%r14
- xorq %r15,%r15
- leaq 48+8(%rsp),%rdi
- xorq %rbp,%rbp
- jmp .Lsqrx8x_outer_loop
-
-.align 32
-.Lsqrx8x_outer_loop:
- mulxq 8(%rsi),%r8,%rax
- adcxq %r9,%r8
- adoxq %rax,%r10
- mulxq 16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
- adcxq %r11,%r10
- adoxq %rax,%r12
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
- adcxq %r12,%r11
- adoxq %rax,%r13
- mulxq 40(%rsi),%r12,%rax
- adcxq %r13,%r12
- adoxq %rax,%r14
- mulxq 48(%rsi),%r13,%rax
- adcxq %r14,%r13
- adoxq %r15,%rax
- mulxq 56(%rsi),%r14,%r15
- movq 8(%rsi),%rdx
- adcxq %rax,%r14
- adoxq %rbp,%r15
- adcq 64(%rdi),%r15
- movq %r8,8(%rdi)
- movq %r9,16(%rdi)
- sbbq %rcx,%rcx
- xorq %rbp,%rbp
-
-
- mulxq 16(%rsi),%r8,%rbx
- mulxq 24(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 32(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %rbx,%r11
-.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
- adcxq %r13,%r11
- adoxq %r14,%r12
-.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
- movq 16(%rsi),%rdx
- adcxq %rax,%r12
- adoxq %rbx,%r13
- adcxq %r15,%r13
- adoxq %rbp,%r14
- adcxq %rbp,%r14
-
- movq %r8,24(%rdi)
- movq %r9,32(%rdi)
-
- mulxq 24(%rsi),%r8,%rbx
- mulxq 32(%rsi),%r9,%rax
- adcxq %r10,%r8
- adoxq %rbx,%r9
- mulxq 40(%rsi),%r10,%rbx
- adcxq %r11,%r9
- adoxq %rax,%r10
-.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
- adcxq %r12,%r10
- adoxq %r13,%r11
-.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
-.byte 0x3e
- movq 24(%rsi),%rdx
- adcxq %rbx,%r11
- adoxq %rax,%r12
- adcxq %r14,%r12
- movq %r8,40(%rdi)
- movq %r9,48(%rdi)
- mulxq 32(%rsi),%r8,%rax
- adoxq %rbp,%r13
- adcxq %rbp,%r13
-
- mulxq 40(%rsi),%r9,%rbx
- adcxq %r10,%r8
- adoxq %rax,%r9
- mulxq 48(%rsi),%r10,%rax
- adcxq %r11,%r9
- adoxq %r12,%r10
- mulxq 56(%rsi),%r11,%r12
- movq 32(%rsi),%rdx
- movq 40(%rsi),%r14
- adcxq %rbx,%r10
- adoxq %rax,%r11
- movq 48(%rsi),%r15
- adcxq %r13,%r11
- adoxq %rbp,%r12
- adcxq %rbp,%r12
-
- movq %r8,56(%rdi)
- movq %r9,64(%rdi)
-
- mulxq %r14,%r9,%rax
- movq 56(%rsi),%r8
- adcxq %r10,%r9
- mulxq %r15,%r10,%rbx
- adoxq %rax,%r10
- adcxq %r11,%r10
- mulxq %r8,%r11,%rax
- movq %r14,%rdx
- adoxq %rbx,%r11
- adcxq %r12,%r11
-
- adcxq %rbp,%rax
-
- mulxq %r15,%r14,%rbx
- mulxq %r8,%r12,%r13
- movq %r15,%rdx
- leaq 64(%rsi),%rsi
- adcxq %r14,%r11
- adoxq %rbx,%r12
- adcxq %rax,%r12
- adoxq %rbp,%r13
-
-.byte 0x67,0x67
- mulxq %r8,%r8,%r14
- adcxq %r8,%r13
- adcxq %rbp,%r14
-
- cmpq 8+8(%rsp),%rsi
- je .Lsqrx8x_outer_break
-
- negq %rcx
- movq $-8,%rcx
- movq %rbp,%r15
- movq 64(%rdi),%r8
- adcxq 72(%rdi),%r9
- adcxq 80(%rdi),%r10
- adcxq 88(%rdi),%r11
- adcq 96(%rdi),%r12
- adcq 104(%rdi),%r13
- adcq 112(%rdi),%r14
- adcq 120(%rdi),%r15
- leaq (%rsi),%rbp
- leaq 128(%rdi),%rdi
- sbbq %rax,%rax
-
- movq -64(%rsi),%rdx
- movq %rax,16+8(%rsp)
- movq %rdi,24+8(%rsp)
-
-
- xorl %eax,%eax
- jmp .Lsqrx8x_loop
-
-.align 32
-.Lsqrx8x_loop:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rbp),%rax,%r14
- movq %rbx,(%rdi,%rcx,8)
- movl $0,%ebx
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
- movq 8(%rsi,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rbx,%r15
- adcxq %rbx,%r15
-
-.byte 0x67
- incq %rcx
- jnz .Lsqrx8x_loop
-
- leaq 64(%rbp),%rbp
- movq $-8,%rcx
- cmpq 8+8(%rsp),%rbp
- je .Lsqrx8x_break
-
- subq 16+8(%rsp),%rbx
-.byte 0x66
- movq -64(%rsi),%rdx
- adcxq 0(%rdi),%r8
- adcxq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
-.byte 0x67
- sbbq %rax,%rax
- xorl %ebx,%ebx
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_loop
-
-.align 32
-.Lsqrx8x_break:
- xorq %rbp,%rbp
- subq 16+8(%rsp),%rbx
- adcxq %rbp,%r8
- movq 24+8(%rsp),%rcx
- adcxq %rbp,%r9
- movq 0(%rsi),%rdx
- adcq $0,%r10
- movq %r8,0(%rdi)
- adcq $0,%r11
- adcq $0,%r12
- adcq $0,%r13
- adcq $0,%r14
- adcq $0,%r15
- cmpq %rcx,%rdi
- je .Lsqrx8x_outer_loop
-
- movq %r9,8(%rdi)
- movq 8(%rcx),%r9
- movq %r10,16(%rdi)
- movq 16(%rcx),%r10
- movq %r11,24(%rdi)
- movq 24(%rcx),%r11
- movq %r12,32(%rdi)
- movq 32(%rcx),%r12
- movq %r13,40(%rdi)
- movq 40(%rcx),%r13
- movq %r14,48(%rdi)
- movq 48(%rcx),%r14
- movq %r15,56(%rdi)
- movq 56(%rcx),%r15
- movq %rcx,%rdi
- jmp .Lsqrx8x_outer_loop
-
-.align 32
-.Lsqrx8x_outer_break:
- movq %r9,72(%rdi)
-.byte 102,72,15,126,217
- movq %r10,80(%rdi)
- movq %r11,88(%rdi)
- movq %r12,96(%rdi)
- movq %r13,104(%rdi)
- movq %r14,112(%rdi)
- leaq 48+8(%rsp),%rdi
- movq (%rsi,%rcx,1),%rdx
-
- movq 8(%rdi),%r11
- xorq %r10,%r10
- movq 0+8(%rsp),%r9
- adoxq %r11,%r11
- movq 16(%rdi),%r12
- movq 24(%rdi),%r13
-
-
-.align 32
-.Lsqrx4x_shift_n_add:
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
-.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
-.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 40(%rdi),%r11
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- movq 16(%rsi,%rcx,1),%rdx
- movq 48(%rdi),%r12
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 56(%rdi),%r13
- movq %rax,16(%rdi)
- movq %rbx,24(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r12,%r12
- adcxq %r10,%rax
- movq 24(%rsi,%rcx,1),%rdx
- leaq 32(%rcx),%rcx
- movq 64(%rdi),%r10
- adoxq %r13,%r13
- adcxq %r11,%rbx
- movq 72(%rdi),%r11
- movq %rax,32(%rdi)
- movq %rbx,40(%rdi)
-
- mulxq %rdx,%rax,%rbx
- adoxq %r10,%r10
- adcxq %r12,%rax
- jrcxz .Lsqrx4x_shift_n_add_break
-.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
- adoxq %r11,%r11
- adcxq %r13,%rbx
- movq 80(%rdi),%r12
- movq 88(%rdi),%r13
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
- nop
- jmp .Lsqrx4x_shift_n_add
-
-.align 32
-.Lsqrx4x_shift_n_add_break:
- adcxq %r13,%rbx
- movq %rax,48(%rdi)
- movq %rbx,56(%rdi)
- leaq 64(%rdi),%rdi
-.byte 102,72,15,126,213
-__bn_sqrx8x_reduction:
- xorl %eax,%eax
- movq 32+8(%rsp),%rbx
- movq 48+8(%rsp),%rdx
- leaq -64(%rbp,%r9,1),%rcx
-
- movq %rcx,0+8(%rsp)
- movq %rdi,8+8(%rsp)
-
- leaq 48+8(%rsp),%rdi
- jmp .Lsqrx8x_reduction_loop
-
-.align 32
-.Lsqrx8x_reduction_loop:
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
- movq 24(%rdi),%r11
- movq 32(%rdi),%r12
- movq %rdx,%r8
- imulq %rbx,%rdx
- movq 40(%rdi),%r13
- movq 48(%rdi),%r14
- movq 56(%rdi),%r15
- movq %rax,24+8(%rsp)
-
- leaq 64(%rdi),%rdi
- xorq %rsi,%rsi
- movq $-8,%rcx
- jmp .Lsqrx8x_reduce
-
-.align 32
-.Lsqrx8x_reduce:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rbx,%r9
- adcxq %rbx,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 32+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
- movq %rax,64+48+8(%rsp,%rcx,8)
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
-.byte 0x67,0x67,0x67
- incq %rcx
- jnz .Lsqrx8x_reduce
-
- movq %rsi,%rax
- cmpq 0+8(%rsp),%rbp
- jae .Lsqrx8x_no_tail
-
- movq 48+8(%rsp),%rdx
- addq 0(%rdi),%r8
- leaq 64(%rbp),%rbp
- movq $-8,%rcx
- adcxq 8(%rdi),%r9
- adcxq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_tail
-
-.align 32
-.Lsqrx8x_tail:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rbp),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq 72+48+8(%rsp,%rcx,8),%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- movq %rbx,(%rdi,%rcx,8)
- movq %r8,%rbx
- adcxq %rsi,%r15
-
- incq %rcx
- jnz .Lsqrx8x_tail
-
- cmpq 0+8(%rsp),%rbp
- jae .Lsqrx8x_tail_done
-
- subq 16+8(%rsp),%rsi
- movq 48+8(%rsp),%rdx
- leaq 64(%rbp),%rbp
- adcq 0(%rdi),%r8
- adcq 8(%rdi),%r9
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- leaq 64(%rdi),%rdi
- sbbq %rax,%rax
- subq $8,%rcx
-
- xorq %rsi,%rsi
- movq %rax,16+8(%rsp)
- jmp .Lsqrx8x_tail
-
-.align 32
-.Lsqrx8x_tail_done:
- xorq %rax,%rax
- addq 24+8(%rsp),%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%r11
- adcq $0,%r12
- adcq $0,%r13
- adcq $0,%r14
- adcq $0,%r15
- adcq $0,%rax
-
- subq 16+8(%rsp),%rsi
-.Lsqrx8x_no_tail:
- adcq 0(%rdi),%r8
-.byte 102,72,15,126,217
- adcq 8(%rdi),%r9
- movq 56(%rbp),%rsi
-.byte 102,72,15,126,213
- adcq 16(%rdi),%r10
- adcq 24(%rdi),%r11
- adcq 32(%rdi),%r12
- adcq 40(%rdi),%r13
- adcq 48(%rdi),%r14
- adcq 56(%rdi),%r15
- adcq $0,%rax
-
- movq 32+8(%rsp),%rbx
- movq 64(%rdi,%rcx,1),%rdx
-
- movq %r8,0(%rdi)
- leaq 64(%rdi),%r8
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
- movq %r12,32(%rdi)
- movq %r13,40(%rdi)
- movq %r14,48(%rdi)
- movq %r15,56(%rdi)
-
- leaq 64(%rdi,%rcx,1),%rdi
- cmpq 8+8(%rsp),%r8
- jb .Lsqrx8x_reduction_loop
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
-.align 32
-__bn_postx4x_internal:
-.cfi_startproc
- movq 0(%rbp),%r12
- movq %rcx,%r10
- movq %rcx,%r9
- negq %rax
- sarq $3+2,%rcx
-
-.byte 102,72,15,126,202
-.byte 102,72,15,126,206
- decq %r12
- movq 8(%rbp),%r13
- xorq %r8,%r8
- movq 16(%rbp),%r14
- movq 24(%rbp),%r15
- jmp .Lsqrx4x_sub_entry
-
-.align 16
-.Lsqrx4x_sub:
- movq 0(%rbp),%r12
- movq 8(%rbp),%r13
- movq 16(%rbp),%r14
- movq 24(%rbp),%r15
-.Lsqrx4x_sub_entry:
- andnq %rax,%r12,%r12
- leaq 32(%rbp),%rbp
- andnq %rax,%r13,%r13
- andnq %rax,%r14,%r14
- andnq %rax,%r15,%r15
-
- negq %r8
- adcq 0(%rdi),%r12
- adcq 8(%rdi),%r13
- adcq 16(%rdi),%r14
- adcq 24(%rdi),%r15
- movq %r12,0(%rdx)
- leaq 32(%rdi),%rdi
- movq %r13,8(%rdx)
- sbbq %r8,%r8
- movq %r14,16(%rdx)
- movq %r15,24(%rdx)
- leaq 32(%rdx),%rdx
-
- incq %rcx
- jnz .Lsqrx4x_sub
-
- negq %r9
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __bn_postx4x_internal,.-__bn_postx4x_internal
.globl bn_get_bits5
.type bn_get_bits5,@function
.align 16
@@ -3601,7 +2251,7 @@ bn_gather5:
.long 0,0, 1,1
.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s
index 4e05eefb1ee..be22fac090f 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/ecp_nistz256-x86_64.s
@@ -2788,10 +2788,6 @@ ecp_nistz256_neg:
.align 32
ecp_nistz256_ord_mul_mont:
.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $0x80100,%ecx
- je .Lecp_nistz256_ord_mul_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3120,10 +3116,6 @@ ecp_nistz256_ord_mul_mont:
.align 32
ecp_nistz256_ord_sqr_mont:
.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $0x80100,%ecx
- je .Lecp_nistz256_ord_sqr_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3411,462 +3403,6 @@ ecp_nistz256_ord_sqr_mont:
.cfi_endproc
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
-.type ecp_nistz256_ord_mul_montx,@function
-.align 32
-ecp_nistz256_ord_mul_montx:
-.cfi_startproc
-.Lecp_nistz256_ord_mul_montx:
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-.Lord_mulx_body:
-
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
- leaq .Lord-128(%rip),%r14
- movq .LordK(%rip),%r15
-
-
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- mulxq %r11,%rbp,%r11
- addq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- mulxq %r15,%rdx,%rax
- adcq %rbp,%r10
- adcq %rcx,%r11
- adcq $0,%r12
-
-
- xorq %r13,%r13
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 8(%rbx),%rdx
- adcxq %rcx,%r11
- adoxq %rbp,%r12
- adcxq %r8,%r12
- adoxq %r8,%r13
- adcq $0,%r13
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 16(%rbx),%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcxq %r9,%r13
- adoxq %r9,%r8
- adcq $0,%r8
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%r14),%rcx,%rbp
- movq 24(%rbx),%rdx
- adcxq %rcx,%r13
- adoxq %rbp,%r8
- adcxq %r10,%r8
- adoxq %r10,%r9
- adcq $0,%r9
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- mulxq %r15,%rdx,%rax
- adcxq %rcx,%r8
- adoxq %rbp,%r9
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
- mulxq 0+128(%r14),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%r14),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%r14),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%r14),%rcx,%rbp
- leaq 128(%r14),%r14
- movq %r12,%rbx
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- movq %r13,%rdx
- adcxq %r11,%r9
- adoxq %r11,%r10
- adcq $0,%r10
-
-
-
- movq %r8,%rcx
- subq 0(%r14),%r12
- sbbq 8(%r14),%r13
- sbbq 16(%r14),%r8
- movq %r9,%rbp
- sbbq 24(%r14),%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- cmovcq %rcx,%r8
- cmovcq %rbp,%r9
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbx
-.cfi_restore %rbx
- movq 40(%rsp),%rbp
-.cfi_restore %rbp
- leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lord_mulx_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
-
-.type ecp_nistz256_ord_sqr_montx,@function
-.align 32
-ecp_nistz256_ord_sqr_montx:
-.cfi_startproc
-.Lecp_nistz256_ord_sqr_montx:
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-.Lord_sqrx_body:
-
- movq %rdx,%rbx
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq .Lord(%rip),%rsi
- jmp .Loop_ord_sqrx
-
-.align 32
-.Loop_ord_sqrx:
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- movq %rdx,%rax
-.byte 102,73,15,110,206
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- addq %rcx,%r10
-.byte 102,73,15,110,215
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
- mulxq %r8,%rcx,%r14
- movq %rax,%rdx
-.byte 102,73,15,110,216
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
-
- mulxq %rdx,%r8,%rbp
-.byte 102,72,15,126,202
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
-.byte 102,72,15,126,210
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
- mulxq %rdx,%rcx,%rbp
-.byte 0x67
-.byte 102,72,15,126,218
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- adoxq %rbp,%r13
- mulxq %rdx,%rcx,%rax
- adoxq %rcx,%r14
- adoxq %rax,%r15
-
-
- movq %r8,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- xorq %rax,%rax
- mulxq 0(%rsi),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- mulxq 8(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
- mulxq 16(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
- mulxq 24(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r8
- adcxq %rax,%r8
-
-
- movq %r9,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adoxq %rcx,%r9
- adcxq %rbp,%r10
- mulxq 8(%rsi),%rcx,%rbp
- adoxq %rcx,%r10
- adcxq %rbp,%r11
- mulxq 16(%rsi),%rcx,%rbp
- adoxq %rcx,%r11
- adcxq %rbp,%r8
- mulxq 24(%rsi),%rcx,%rbp
- adoxq %rcx,%r8
- adcxq %rbp,%r9
- adoxq %rax,%r9
-
-
- movq %r10,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
- mulxq 8(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r8
- mulxq 16(%rsi),%rcx,%rbp
- adcxq %rcx,%r8
- adoxq %rbp,%r9
- mulxq 24(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
- adcxq %rax,%r10
-
-
- movq %r11,%rdx
- mulxq 32(%rsi),%rdx,%rcx
-
- mulxq 0(%rsi),%rcx,%rbp
- adoxq %rcx,%r11
- adcxq %rbp,%r8
- mulxq 8(%rsi),%rcx,%rbp
- adoxq %rcx,%r8
- adcxq %rbp,%r9
- mulxq 16(%rsi),%rcx,%rbp
- adoxq %rcx,%r9
- adcxq %rbp,%r10
- mulxq 24(%rsi),%rcx,%rbp
- adoxq %rcx,%r10
- adcxq %rbp,%r11
- adoxq %rax,%r11
-
-
- addq %r8,%r12
- adcq %r13,%r9
- movq %r12,%rdx
- adcq %r14,%r10
- adcq %r15,%r11
- movq %r9,%r14
- adcq $0,%rax
-
-
- subq 0(%rsi),%r12
- movq %r10,%r15
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r11,%r8
- sbbq 24(%rsi),%r11
- sbbq $0,%rax
-
- cmovncq %r12,%rdx
- cmovncq %r9,%r14
- cmovncq %r10,%r15
- cmovncq %r11,%r8
-
- decq %rbx
- jnz .Loop_ord_sqrx
-
- movq %rdx,0(%rdi)
- movq %r14,8(%rdi)
- pxor %xmm1,%xmm1
- movq %r15,16(%rdi)
- pxor %xmm2,%xmm2
- movq %r8,24(%rdi)
- pxor %xmm3,%xmm3
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbx
-.cfi_restore %rbx
- movq 40(%rsp),%rbp
-.cfi_restore %rbp
- leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lord_sqrx_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
-
@@ -3875,8 +3411,6 @@ ecp_nistz256_ord_sqr_montx:
.align 32
ecp_nistz256_to_mont:
.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
leaq .LRR(%rip),%rdx
jmp .Lmul_mont
.cfi_endproc
@@ -3893,8 +3427,6 @@ ecp_nistz256_to_mont:
.align 32
ecp_nistz256_mul_mont:
.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
.Lmul_mont:
pushq %rbp
.cfi_adjust_cfa_offset 8
@@ -3915,8 +3447,6 @@ ecp_nistz256_mul_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lmul_body:
- cmpl $0x80100,%ecx
- je .Lmul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -3925,19 +3455,6 @@ ecp_nistz256_mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
- jmp .Lmul_mont_done
-
-.align 32
-.Lmul_montx:
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r9
- movq 8(%rsi),%r10
- movq 16(%rsi),%r11
- movq 24(%rsi),%r12
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_mul_montx
.Lmul_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -4188,8 +3705,6 @@ __ecp_nistz256_mul_montq:
.align 32
ecp_nistz256_sqr_mont:
.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -4209,25 +3724,12 @@ ecp_nistz256_sqr_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lsqr_body:
- cmpl $0x80100,%ecx
- je .Lsqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
- jmp .Lsqr_mont_done
-
-.align 32
-.Lsqr_montx:
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%r8
- leaq -128(%rsi),%rsi
-
- call __ecp_nistz256_sqr_montx
.Lsqr_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -4411,342 +3913,44 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
.cfi_endproc
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
-.type __ecp_nistz256_mul_montx,@function
-.align 32
-__ecp_nistz256_mul_montx:
-.cfi_startproc
- mulxq %r9,%r8,%r9
- mulxq %r10,%rcx,%r10
- movq $32,%r14
- xorq %r13,%r13
- mulxq %r11,%rbp,%r11
- movq .Lpoly+24(%rip),%r15
- adcq %rcx,%r9
- mulxq %r12,%rcx,%r12
- movq %r8,%rdx
- adcq %rbp,%r10
- shlxq %r14,%r8,%rbp
- adcq %rcx,%r11
- shrxq %r14,%r8,%rcx
- adcq $0,%r12
- addq %rbp,%r9
- adcq %rcx,%r10
- mulxq %r15,%rcx,%rbp
- movq 8(%rbx),%rdx
- adcq %rcx,%r11
- adcq %rbp,%r12
- adcq $0,%r13
- xorq %r8,%r8
+.globl ecp_nistz256_from_mont
+.type ecp_nistz256_from_mont,@function
+.align 32
+ecp_nistz256_from_mont:
+.cfi_startproc
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+.Lfrom_body:
+ movq 0(%rsi),%rax
+ movq .Lpoly+24(%rip),%r13
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ movq %rax,%r8
+ movq .Lpoly+8(%rip),%r12
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r9
- adoxq %rbp,%r10
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r9,%rdx
- adcxq %rcx,%r12
- shlxq %r14,%r9,%rcx
- adoxq %rbp,%r13
- shrxq %r14,%r9,%rbp
-
- adcxq %r8,%r13
- adoxq %r8,%r8
- adcq $0,%r8
-
-
-
- addq %rcx,%r10
- adcq %rbp,%r11
-
- mulxq %r15,%rcx,%rbp
- movq 16(%rbx),%rdx
- adcq %rcx,%r12
- adcq %rbp,%r13
- adcq $0,%r8
- xorq %r9,%r9
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r10
- adoxq %rbp,%r11
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r10,%rdx
- adcxq %rcx,%r13
- shlxq %r14,%r10,%rcx
- adoxq %rbp,%r8
- shrxq %r14,%r10,%rbp
-
- adcxq %r9,%r8
- adoxq %r9,%r9
- adcq $0,%r9
-
-
-
- addq %rcx,%r11
- adcq %rbp,%r12
-
- mulxq %r15,%rcx,%rbp
- movq 24(%rbx),%rdx
- adcq %rcx,%r13
- adcq %rbp,%r8
- adcq $0,%r9
- xorq %r10,%r10
-
-
-
- mulxq 0+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq 8+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r12
- adoxq %rbp,%r13
-
- mulxq 16+128(%rsi),%rcx,%rbp
- adcxq %rcx,%r13
- adoxq %rbp,%r8
-
- mulxq 24+128(%rsi),%rcx,%rbp
- movq %r11,%rdx
- adcxq %rcx,%r8
- shlxq %r14,%r11,%rcx
- adoxq %rbp,%r9
- shrxq %r14,%r11,%rbp
-
- adcxq %r10,%r9
- adoxq %r10,%r10
- adcq $0,%r10
-
-
-
- addq %rcx,%r12
- adcq %rbp,%r13
-
- mulxq %r15,%rcx,%rbp
- movq %r12,%rbx
- movq .Lpoly+8(%rip),%r14
- adcq %rcx,%r8
- movq %r13,%rdx
- adcq %rbp,%r9
- adcq $0,%r10
-
-
-
- xorl %eax,%eax
- movq %r8,%rcx
- sbbq $-1,%r12
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%rbp
- sbbq %r15,%r9
- sbbq $0,%r10
-
- cmovcq %rbx,%r12
- cmovcq %rdx,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %rbp,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
-
-.type __ecp_nistz256_sqr_montx,@function
-.align 32
-__ecp_nistz256_sqr_montx:
-.cfi_startproc
- mulxq %r14,%r9,%r10
- mulxq %r15,%rcx,%r11
- xorl %eax,%eax
- adcq %rcx,%r10
- mulxq %r8,%rbp,%r12
- movq %r14,%rdx
- adcq %rbp,%r11
- adcq $0,%r12
- xorq %r13,%r13
-
-
- mulxq %r15,%rcx,%rbp
- adcxq %rcx,%r11
- adoxq %rbp,%r12
-
- mulxq %r8,%rcx,%rbp
- movq %r15,%rdx
- adcxq %rcx,%r12
- adoxq %rbp,%r13
- adcq $0,%r13
-
-
- mulxq %r8,%rcx,%r14
- movq 0+128(%rsi),%rdx
- xorq %r15,%r15
- adcxq %r9,%r9
- adoxq %rcx,%r13
- adcxq %r10,%r10
- adoxq %r15,%r14
-
- mulxq %rdx,%r8,%rbp
- movq 8+128(%rsi),%rdx
- adcxq %r11,%r11
- adoxq %rbp,%r9
- adcxq %r12,%r12
- mulxq %rdx,%rcx,%rax
- movq 16+128(%rsi),%rdx
- adcxq %r13,%r13
- adoxq %rcx,%r10
- adcxq %r14,%r14
-.byte 0x67
- mulxq %rdx,%rcx,%rbp
- movq 24+128(%rsi),%rdx
- adoxq %rax,%r11
- adcxq %r15,%r15
- adoxq %rcx,%r12
- movq $32,%rsi
- adoxq %rbp,%r13
-.byte 0x67,0x67
- mulxq %rdx,%rcx,%rax
- movq .Lpoly+24(%rip),%rdx
- adoxq %rcx,%r14
- shlxq %rsi,%r8,%rcx
- adoxq %rax,%r15
- shrxq %rsi,%r8,%rax
- movq %rdx,%rbp
-
-
- addq %rcx,%r9
- adcq %rax,%r10
-
- mulxq %r8,%rcx,%r8
- adcq %rcx,%r11
- shlxq %rsi,%r9,%rcx
- adcq $0,%r8
- shrxq %rsi,%r9,%rax
-
-
- addq %rcx,%r10
- adcq %rax,%r11
-
- mulxq %r9,%rcx,%r9
- adcq %rcx,%r8
- shlxq %rsi,%r10,%rcx
- adcq $0,%r9
- shrxq %rsi,%r10,%rax
-
-
- addq %rcx,%r11
- adcq %rax,%r8
-
- mulxq %r10,%rcx,%r10
- adcq %rcx,%r9
- shlxq %rsi,%r11,%rcx
- adcq $0,%r10
- shrxq %rsi,%r11,%rax
-
-
- addq %rcx,%r8
- adcq %rax,%r9
-
- mulxq %r11,%rcx,%r11
- adcq %rcx,%r10
- adcq $0,%r11
-
- xorq %rdx,%rdx
- addq %r8,%r12
- movq .Lpoly+8(%rip),%rsi
- adcq %r9,%r13
- movq %r12,%r8
- adcq %r10,%r14
- adcq %r11,%r15
- movq %r13,%r9
- adcq $0,%rdx
-
- subq $-1,%r12
- movq %r14,%r10
- sbbq %rsi,%r13
- sbbq $0,%r14
- movq %r15,%r11
- sbbq %rbp,%r15
- sbbq $0,%rdx
-
- cmovcq %r8,%r12
- cmovcq %r9,%r13
- movq %r12,0(%rdi)
- cmovcq %r10,%r14
- movq %r13,8(%rdi)
- cmovcq %r11,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
-
-
-
-
-
-
-.globl ecp_nistz256_from_mont
-.type ecp_nistz256_from_mont,@function
-.align 32
-ecp_nistz256_from_mont:
-.cfi_startproc
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-16
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-24
-.Lfrom_body:
-
- movq 0(%rsi),%rax
- movq .Lpoly+24(%rip),%r13
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq %rax,%r8
- movq .Lpoly+8(%rip),%r12
-
-
-
- movq %rax,%rcx
- shlq $32,%r8
- mulq %r13
- shrq $32,%rcx
- addq %r8,%r9
- adcq %rcx,%r10
- adcq %rax,%r11
- movq %r9,%rax
- adcq $0,%rdx
+ movq %rax,%rcx
+ shlq $32,%r8
+ mulq %r13
+ shrq $32,%rcx
+ addq %r8,%r9
+ adcq %rcx,%r10
+ adcq %rax,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
@@ -4850,9 +4054,6 @@ ecp_nistz256_scatter_w5:
.align 32
ecp_nistz256_gather_w5:
.cfi_startproc
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz .Lavx2_gather_w5
movdqa .LOne(%rip),%xmm0
movd %edx,%xmm1
@@ -4936,9 +4137,6 @@ ecp_nistz256_scatter_w7:
.align 32
ecp_nistz256_gather_w7:
.cfi_startproc
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- testl $32,%eax
- jnz .Lavx2_gather_w7
movdqa .LOne(%rip),%xmm8
movd %edx,%xmm1
@@ -4957,1291 +4155,46 @@ ecp_nistz256_gather_w7:
movdqa 0(%rsi),%xmm9
movdqa 16(%rsi),%xmm10
pcmpeqd %xmm1,%xmm15
- movdqa 32(%rsi),%xmm11
- movdqa 48(%rsi),%xmm12
- leaq 64(%rsi),%rsi
-
- pand %xmm15,%xmm9
- pand %xmm15,%xmm10
- por %xmm9,%xmm2
- pand %xmm15,%xmm11
- por %xmm10,%xmm3
- pand %xmm15,%xmm12
- por %xmm11,%xmm4
- prefetcht0 255(%rsi)
- por %xmm12,%xmm5
-
- decq %rax
- jnz .Lselect_loop_sse_w7
-
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
- movdqu %xmm4,32(%rdi)
- movdqu %xmm5,48(%rdi)
- .byte 0xf3,0xc3
-.cfi_endproc
-.LSEH_end_ecp_nistz256_gather_w7:
-.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
-
-
-.type ecp_nistz256_avx2_gather_w5,@function
-.align 32
-ecp_nistz256_avx2_gather_w5:
-.cfi_startproc
-.Lavx2_gather_w5:
- vzeroupper
- vmovdqa .LTwo(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
- vpxor %ymm4,%ymm4,%ymm4
-
- vmovdqa .LOne(%rip),%ymm5
- vmovdqa .LTwo(%rip),%ymm10
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
- movq $8,%rax
-.Lselect_loop_avx2_w5:
-
- vmovdqa 0(%rsi),%ymm6
- vmovdqa 32(%rsi),%ymm7
- vmovdqa 64(%rsi),%ymm8
-
- vmovdqa 96(%rsi),%ymm11
- vmovdqa 128(%rsi),%ymm12
- vmovdqa 160(%rsi),%ymm13
-
- vpcmpeqd %ymm1,%ymm5,%ymm9
- vpcmpeqd %ymm1,%ymm10,%ymm14
-
- vpaddd %ymm0,%ymm5,%ymm5
- vpaddd %ymm0,%ymm10,%ymm10
- leaq 192(%rsi),%rsi
-
- vpand %ymm9,%ymm6,%ymm6
- vpand %ymm9,%ymm7,%ymm7
- vpand %ymm9,%ymm8,%ymm8
- vpand %ymm14,%ymm11,%ymm11
- vpand %ymm14,%ymm12,%ymm12
- vpand %ymm14,%ymm13,%ymm13
-
- vpxor %ymm6,%ymm2,%ymm2
- vpxor %ymm7,%ymm3,%ymm3
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm11,%ymm2,%ymm2
- vpxor %ymm12,%ymm3,%ymm3
- vpxor %ymm13,%ymm4,%ymm4
-
- decq %rax
- jnz .Lselect_loop_avx2_w5
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vmovdqu %ymm4,64(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.LSEH_end_ecp_nistz256_avx2_gather_w5:
-.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
-
-
-
-.globl ecp_nistz256_avx2_gather_w7
-.type ecp_nistz256_avx2_gather_w7,@function
-.align 32
-ecp_nistz256_avx2_gather_w7:
-.cfi_startproc
-.Lavx2_gather_w7:
- vzeroupper
- vmovdqa .LThree(%rip),%ymm0
-
- vpxor %ymm2,%ymm2,%ymm2
- vpxor %ymm3,%ymm3,%ymm3
-
- vmovdqa .LOne(%rip),%ymm4
- vmovdqa .LTwo(%rip),%ymm8
- vmovdqa .LThree(%rip),%ymm12
-
- vmovd %edx,%xmm1
- vpermd %ymm1,%ymm2,%ymm1
-
-
- movq $21,%rax
-.Lselect_loop_avx2_w7:
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vmovdqa 64(%rsi),%ymm9
- vmovdqa 96(%rsi),%ymm10
-
- vmovdqa 128(%rsi),%ymm13
- vmovdqa 160(%rsi),%ymm14
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
- vpcmpeqd %ymm1,%ymm8,%ymm11
- vpcmpeqd %ymm1,%ymm12,%ymm15
-
- vpaddd %ymm0,%ymm4,%ymm4
- vpaddd %ymm0,%ymm8,%ymm8
- vpaddd %ymm0,%ymm12,%ymm12
- leaq 192(%rsi),%rsi
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
- vpand %ymm11,%ymm9,%ymm9
- vpand %ymm11,%ymm10,%ymm10
- vpand %ymm15,%ymm13,%ymm13
- vpand %ymm15,%ymm14,%ymm14
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
- vpxor %ymm9,%ymm2,%ymm2
- vpxor %ymm10,%ymm3,%ymm3
- vpxor %ymm13,%ymm2,%ymm2
- vpxor %ymm14,%ymm3,%ymm3
-
- decq %rax
- jnz .Lselect_loop_avx2_w7
-
-
- vmovdqa 0(%rsi),%ymm5
- vmovdqa 32(%rsi),%ymm6
-
- vpcmpeqd %ymm1,%ymm4,%ymm7
-
- vpand %ymm7,%ymm5,%ymm5
- vpand %ymm7,%ymm6,%ymm6
-
- vpxor %ymm5,%ymm2,%ymm2
- vpxor %ymm6,%ymm3,%ymm3
-
- vmovdqu %ymm2,0(%rdi)
- vmovdqu %ymm3,32(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.LSEH_end_ecp_nistz256_avx2_gather_w7:
-.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
-.type __ecp_nistz256_add_toq,@function
-.align 32
-__ecp_nistz256_add_toq:
-.cfi_startproc
- xorq %r11,%r11
- addq 0(%rbx),%r12
- adcq 8(%rbx),%r13
- movq %r12,%rax
- adcq 16(%rbx),%r8
- adcq 24(%rbx),%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
-
-.type __ecp_nistz256_sub_fromq,@function
-.align 32
-__ecp_nistz256_sub_fromq:
-.cfi_startproc
- subq 0(%rbx),%r12
- sbbq 8(%rbx),%r13
- movq %r12,%rax
- sbbq 16(%rbx),%r8
- sbbq 24(%rbx),%r9
- movq %r13,%rbp
- sbbq %r11,%r11
-
- addq $-1,%r12
- movq %r8,%rcx
- adcq %r14,%r13
- adcq $0,%r8
- movq %r9,%r10
- adcq %r15,%r9
- testq %r11,%r11
-
- cmovzq %rax,%r12
- cmovzq %rbp,%r13
- movq %r12,0(%rdi)
- cmovzq %rcx,%r8
- movq %r13,8(%rdi)
- cmovzq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
-
-.type __ecp_nistz256_subq,@function
-.align 32
-__ecp_nistz256_subq:
-.cfi_startproc
- subq %r12,%rax
- sbbq %r13,%rbp
- movq %rax,%r12
- sbbq %r8,%rcx
- sbbq %r9,%r10
- movq %rbp,%r13
- sbbq %r11,%r11
-
- addq $-1,%rax
- movq %rcx,%r8
- adcq %r14,%rbp
- adcq $0,%rcx
- movq %r10,%r9
- adcq %r15,%r10
- testq %r11,%r11
-
- cmovnzq %rax,%r12
- cmovnzq %rbp,%r13
- cmovnzq %rcx,%r8
- cmovnzq %r10,%r9
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
-
-.type __ecp_nistz256_mul_by_2q,@function
-.align 32
-__ecp_nistz256_mul_by_2q:
-.cfi_startproc
- xorq %r11,%r11
- addq %r12,%r12
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- movq %r12,0(%rdi)
- cmovcq %rcx,%r8
- movq %r13,8(%rdi)
- cmovcq %r10,%r9
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
-
- .byte 0xf3,0xc3
-.cfi_endproc
-.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
-.globl ecp_nistz256_point_double
-.type ecp_nistz256_point_double,@function
-.align 32
-ecp_nistz256_point_double:
-.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $0x80100,%ecx
- je .Lpoint_doublex
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- subq $160+8,%rsp
-.cfi_adjust_cfa_offset 32*5+8
-.Lpoint_doubleq_body:
-
-.Lpoint_double_shortcutq:
- movdqu 0(%rsi),%xmm0
- movq %rsi,%rbx
- movdqu 16(%rsi),%xmm1
- movq 32+0(%rsi),%r12
- movq 32+8(%rsi),%r13
- movq 32+16(%rsi),%r8
- movq 32+24(%rsi),%r9
- movq .Lpoly+8(%rip),%r14
- movq .Lpoly+24(%rip),%r15
- movdqa %xmm0,96(%rsp)
- movdqa %xmm1,96+16(%rsp)
- leaq 32(%rdi),%r10
- leaq 64(%rdi),%r11
-.byte 102,72,15,110,199
-.byte 102,73,15,110,202
-.byte 102,73,15,110,211
-
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- leaq 64-0(%rsi),%rsi
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 32(%rbx),%rax
- movq 64+0(%rbx),%r9
- movq 64+8(%rbx),%r10
- movq 64+16(%rbx),%r11
- movq 64+24(%rbx),%r12
- leaq 64-0(%rbx),%rsi
- leaq 32(%rbx),%rbx
-.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montq
- call __ecp_nistz256_mul_by_2q
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_toq
-
- movq 96+0(%rsp),%r12
- movq 96+8(%rsp),%r13
- leaq 64(%rsp),%rbx
- movq 96+16(%rsp),%r8
- movq 96+24(%rsp),%r9
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
-.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montq
- xorq %r9,%r9
- movq %r12,%rax
- addq $-1,%r12
- movq %r13,%r10
- adcq %rsi,%r13
- movq %r14,%rcx
- adcq $0,%r14
- movq %r15,%r8
- adcq %rbp,%r15
- adcq $0,%r9
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r12
- cmovzq %r10,%r13
- cmovzq %rcx,%r14
- cmovzq %r8,%r15
- cmovzq %rsi,%r9
-
- movq %r13,%rax
- shrq $1,%r12
- shlq $63,%rax
- movq %r14,%r10
- shrq $1,%r13
- orq %rax,%r12
- shlq $63,%r10
- movq %r15,%rcx
- shrq $1,%r14
- orq %r10,%r13
- shlq $63,%rcx
- movq %r12,0(%rdi)
- shrq $1,%r15
- movq %r13,8(%rdi)
- shlq $63,%r9
- orq %rcx,%r14
- orq %r9,%r15
- movq %r14,16(%rdi)
- movq %r15,24(%rdi)
- movq 64(%rsp),%rax
- leaq 64(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- leaq 32(%rsp),%rbx
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_toq
-
- movq 96(%rsp),%rax
- leaq 96(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2q
-
- movq 0+32(%rsp),%rax
- movq 8+32(%rsp),%r14
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r15
- movq 24+32(%rsp),%r8
-.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montq
-
- leaq 128(%rsp),%rbx
- movq %r14,%r8
- movq %r15,%r9
- movq %rsi,%r14
- movq %rbp,%r15
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_subq
-
- movq 32(%rsp),%rax
- leaq 32(%rsp),%rbx
- movq %r12,%r14
- xorl %ecx,%ecx
- movq %r12,0+0(%rsp)
- movq %r13,%r10
- movq %r13,0+8(%rsp)
- cmovzq %r8,%r11
- movq %r8,0+16(%rsp)
- leaq 0-0(%rsp),%rsi
- cmovzq %r9,%r12
- movq %r9,0+24(%rsp)
- movq %r14,%r9
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-.byte 102,72,15,126,203
-.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromq
-
- leaq 160+56(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbx
-.cfi_restore %rbx
- movq -8(%rsi),%rbp
-.cfi_restore %rbp
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lpoint_doubleq_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
-.globl ecp_nistz256_point_add
-.type ecp_nistz256_point_add,@function
-.align 32
-ecp_nistz256_point_add:
-.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $0x80100,%ecx
- je .Lpoint_addx
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- subq $576+8,%rsp
-.cfi_adjust_cfa_offset 32*18+8
-.Lpoint_addq_body:
-
- movdqu 0(%rsi),%xmm0
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq %rsi,%rbx
- movq %rdx,%rsi
- movdqa %xmm0,384(%rsp)
- movdqa %xmm1,384+16(%rsp)
- movdqa %xmm2,416(%rsp)
- movdqa %xmm3,416+16(%rsp)
- movdqa %xmm4,448(%rsp)
- movdqa %xmm5,448+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rsi),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,480(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,480+16(%rsp)
- movdqu 64(%rsi),%xmm0
- movdqu 80(%rsi),%xmm1
- movdqa %xmm2,512(%rsp)
- movdqa %xmm3,512+16(%rsp)
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
-
- leaq 64-0(%rsi),%rsi
- movq %rax,544+0(%rsp)
- movq %r14,544+8(%rsp)
- movq %r15,544+16(%rsp)
- movq %r8,544+24(%rsp)
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm1,%xmm4
- por %xmm1,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rax
- movq 64+8(%rbx),%r14
- movq 64+16(%rbx),%r15
- movq 64+24(%rbx),%r8
-.byte 102,72,15,110,203
-
- leaq 64-0(%rbx),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 544(%rsp),%rax
- leaq 544(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 416(%rsp),%rax
- leaq 416(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq 0+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 512(%rsp),%rax
- leaq 512(%rsp),%rbx
- movq 0+256(%rsp),%r9
- movq 8+256(%rsp),%r10
- leaq 0+256(%rsp),%rsi
- movq 16+256(%rsp),%r11
- movq 24+256(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 224(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- orq %r13,%r12
- movdqa %xmm4,%xmm2
- orq %r8,%r12
- orq %r9,%r12
- por %xmm5,%xmm2
-.byte 102,73,15,110,220
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+96(%rsp),%r9
- movq 8+96(%rsp),%r10
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r11
- movq 24+96(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 480(%rsp),%rax
- leaq 480(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 160(%rsp),%rbx
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- orq %r13,%r12
- orq %r8,%r12
- orq %r9,%r12
-
-.byte 102,73,15,126,208
-.byte 102,73,15,126,217
-
- orq %r8,%r12
- orq %r9,%r12
-
-
-.byte 0x3e
- jnz .Ladd_proceedq
-
-.Ladd_doubleq:
-.byte 102,72,15,126,206
-.byte 102,72,15,126,199
- addq $416,%rsp
-.cfi_adjust_cfa_offset -416
- jmp .Lpoint_double_shortcutq
-.cfi_adjust_cfa_offset 416
-
-.align 32
-.Ladd_proceedq:
- movq 0+64(%rsp),%rax
- movq 8+64(%rsp),%r14
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+0(%rsp),%r9
- movq 8+0(%rsp),%r10
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r11
- movq 24+0(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 0+0(%rsp),%rax
- movq 8+0(%rsp),%r14
- leaq 0+0(%rsp),%rsi
- movq 16+0(%rsp),%r15
- movq 24+0(%rsp),%r8
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 544(%rsp),%rax
- leaq 544(%rsp),%rbx
- movq 0+352(%rsp),%r9
- movq 8+352(%rsp),%r10
- leaq 0+352(%rsp),%rsi
- movq 16+352(%rsp),%r11
- movq 24+352(%rsp),%r12
- leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 0(%rsp),%rax
- leaq 0(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 160(%rsp),%rax
- leaq 160(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 96(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subq
-
- leaq 128(%rsp),%rbx
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 192+0(%rsp),%rax
- movq 192+8(%rsp),%rbp
- movq 192+16(%rsp),%rcx
- movq 192+24(%rsp),%r10
- leaq 320(%rsp),%rdi
-
- call __ecp_nistz256_subq
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 128(%rsp),%rax
- leaq 128(%rsp),%rbx
- movq 0+224(%rsp),%r9
- movq 8+224(%rsp),%r10
- leaq 0+224(%rsp),%rsi
- movq 16+224(%rsp),%r11
- movq 24+224(%rsp),%r12
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 320(%rsp),%rax
- leaq 320(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 256(%rsp),%rbx
- leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 352(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 352+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 544(%rsp),%xmm2
- pand 544+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 480(%rsp),%xmm2
- pand 480+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 320(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 320+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 512(%rsp),%xmm2
- pand 512+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
-
-.Ladd_doneq:
- leaq 576+56(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbx
-.cfi_restore %rbx
- movq -8(%rsi),%rbp
-.cfi_restore %rbp
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lpoint_addq_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
-.globl ecp_nistz256_point_add_affine
-.type ecp_nistz256_point_add_affine,@function
-.align 32
-ecp_nistz256_point_add_affine:
-.cfi_startproc
- movl $0x80100,%ecx
- andl OPENSSL_ia32cap_P+8(%rip),%ecx
- cmpl $0x80100,%ecx
- je .Lpoint_add_affinex
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- subq $480+8,%rsp
-.cfi_adjust_cfa_offset 32*15+8
-.Ladd_affineq_body:
-
- movdqu 0(%rsi),%xmm0
- movq %rdx,%rbx
- movdqu 16(%rsi),%xmm1
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm3
- movdqu 64(%rsi),%xmm4
- movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rax
- movq 64+8(%rsi),%r14
- movq 64+16(%rsi),%r15
- movq 64+24(%rsi),%r8
- movdqa %xmm0,320(%rsp)
- movdqa %xmm1,320+16(%rsp)
- movdqa %xmm2,352(%rsp)
- movdqa %xmm3,352+16(%rsp)
- movdqa %xmm4,384(%rsp)
- movdqa %xmm5,384+16(%rsp)
- por %xmm4,%xmm5
-
- movdqu 0(%rbx),%xmm0
- pshufd $0xb1,%xmm5,%xmm3
- movdqu 16(%rbx),%xmm1
- movdqu 32(%rbx),%xmm2
- por %xmm3,%xmm5
- movdqu 48(%rbx),%xmm3
- movdqa %xmm0,416(%rsp)
- pshufd $0x1e,%xmm5,%xmm4
- movdqa %xmm1,416+16(%rsp)
- por %xmm0,%xmm1
-.byte 102,72,15,110,199
- movdqa %xmm2,448(%rsp)
- movdqa %xmm3,448+16(%rsp)
- por %xmm2,%xmm3
- por %xmm4,%xmm5
- pxor %xmm4,%xmm4
- por %xmm1,%xmm3
-
- leaq 64-0(%rsi),%rsi
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd %xmm4,%xmm5
- pshufd $0xb1,%xmm3,%xmm4
- movq 0(%rbx),%rax
-
- movq %r12,%r9
- por %xmm3,%xmm4
- pshufd $0,%xmm5,%xmm5
- pshufd $0x1e,%xmm4,%xmm3
- movq %r13,%r10
- por %xmm3,%xmm4
- pxor %xmm3,%xmm3
- movq %r14,%r11
- pcmpeqd %xmm3,%xmm4
- pshufd $0,%xmm4,%xmm4
-
- leaq 32-0(%rsp),%rsi
- movq %r15,%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 320(%rsp),%rbx
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 384(%rsp),%rax
- leaq 384(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 448(%rsp),%rax
- leaq 448(%rsp),%rbx
- movq 0+32(%rsp),%r9
- movq 8+32(%rsp),%r10
- leaq 0+32(%rsp),%rsi
- movq 16+32(%rsp),%r11
- movq 24+32(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 352(%rsp),%rbx
- leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+64(%rsp),%rax
- movq 8+64(%rsp),%r14
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r15
- movq 24+64(%rsp),%r8
- leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 0+96(%rsp),%rax
- movq 8+96(%rsp),%r14
- leaq 0+96(%rsp),%rsi
- movq 16+96(%rsp),%r15
- movq 24+96(%rsp),%r8
- leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montq
-
- movq 128(%rsp),%rax
- leaq 128(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 320(%rsp),%rax
- leaq 320(%rsp),%rbx
- movq 0+128(%rsp),%r9
- movq 8+128(%rsp),%r10
- leaq 0+128(%rsp),%rsi
- movq 16+128(%rsp),%r11
- movq 24+128(%rsp),%r12
- leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
-
-
-
- xorq %r11,%r11
- addq %r12,%r12
- leaq 192(%rsp),%rsi
- adcq %r13,%r13
- movq %r12,%rax
- adcq %r8,%r8
- adcq %r9,%r9
- movq %r13,%rbp
- adcq $0,%r11
-
- subq $-1,%r12
- movq %r8,%rcx
- sbbq %r14,%r13
- sbbq $0,%r8
- movq %r9,%r10
- sbbq %r15,%r9
- sbbq $0,%r11
-
- cmovcq %rax,%r12
- movq 0(%rsi),%rax
- cmovcq %rbp,%r13
- movq 8(%rsi),%rbp
- cmovcq %rcx,%r8
- movq 16(%rsi),%rcx
- cmovcq %r10,%r9
- movq 24(%rsi),%r10
-
- call __ecp_nistz256_subq
-
- leaq 160(%rsp),%rbx
- leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
- movq 0+0(%rsp),%rax
- movq 0+8(%rsp),%rbp
- movq 0+16(%rsp),%rcx
- movq 0+24(%rsp),%r10
- leaq 64(%rsp),%rdi
-
- call __ecp_nistz256_subq
-
- movq %r12,0(%rdi)
- movq %r13,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq 352(%rsp),%rax
- leaq 352(%rsp),%rbx
- movq 0+160(%rsp),%r9
- movq 8+160(%rsp),%r10
- leaq 0+160(%rsp),%rsi
- movq 16+160(%rsp),%r11
- movq 24+160(%rsp),%r12
- leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- movq 96(%rsp),%rax
- leaq 96(%rsp),%rbx
- movq 0+64(%rsp),%r9
- movq 8+64(%rsp),%r10
- leaq 0+64(%rsp),%rsi
- movq 16+64(%rsp),%r11
- movq 24+64(%rsp),%r12
- leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montq
-
- leaq 32(%rsp),%rbx
- leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromq
-
-.byte 102,72,15,126,199
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 288(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 288+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand .LONE_mont(%rip),%xmm2
- pand .LONE_mont+16(%rip),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 384(%rsp),%xmm2
- pand 384+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,64(%rdi)
- movdqu %xmm3,80(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 224(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 224+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 416(%rsp),%xmm2
- pand 416+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
-
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 320(%rsp),%xmm2
- pand 320+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,0(%rdi)
- movdqu %xmm3,16(%rdi)
-
- movdqa %xmm5,%xmm0
- movdqa %xmm5,%xmm1
- pandn 256(%rsp),%xmm0
- movdqa %xmm5,%xmm2
- pandn 256+16(%rsp),%xmm1
- movdqa %xmm5,%xmm3
- pand 448(%rsp),%xmm2
- pand 448+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ leaq 64(%rsi),%rsi
- movdqa %xmm4,%xmm0
- movdqa %xmm4,%xmm1
- pandn %xmm2,%xmm0
- movdqa %xmm4,%xmm2
- pandn %xmm3,%xmm1
- movdqa %xmm4,%xmm3
- pand 352(%rsp),%xmm2
- pand 352+16(%rsp),%xmm3
- por %xmm0,%xmm2
- por %xmm1,%xmm3
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ prefetcht0 255(%rsi)
+ por %xmm12,%xmm5
- leaq 480+56(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbx
-.cfi_restore %rbx
- movq -8(%rsi),%rbp
-.cfi_restore %rbp
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Ladd_affineq_epilogue:
+ decq %rax
+ jnz .Lselect_loop_sse_w7
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
-.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
-.type __ecp_nistz256_add_tox,@function
+.LSEH_end_ecp_nistz256_gather_w7:
+.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
+.globl ecp_nistz256_avx2_gather_w7
+.type ecp_nistz256_avx2_gather_w7,@function
+.align 32
+ecp_nistz256_avx2_gather_w7:
+.cfi_startproc
+.byte 0x0f,0x0b
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
+.type __ecp_nistz256_add_toq,@function
.align 32
-__ecp_nistz256_add_tox:
+__ecp_nistz256_add_toq:
.cfi_startproc
xorq %r11,%r11
- adcq 0(%rbx),%r12
+ addq 0(%rbx),%r12
adcq 8(%rbx),%r13
movq %r12,%rax
adcq 16(%rbx),%r8
@@ -6249,8 +4202,7 @@ __ecp_nistz256_add_tox:
movq %r13,%rbp
adcq $0,%r11
- xorq %r10,%r10
- sbbq $-1,%r12
+ subq $-1,%r12
movq %r8,%rcx
sbbq %r14,%r13
sbbq $0,%r8
@@ -6269,80 +4221,76 @@ __ecp_nistz256_add_tox:
.byte 0xf3,0xc3
.cfi_endproc
-.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
-.type __ecp_nistz256_sub_fromx,@function
+.type __ecp_nistz256_sub_fromq,@function
.align 32
-__ecp_nistz256_sub_fromx:
+__ecp_nistz256_sub_fromq:
.cfi_startproc
- xorq %r11,%r11
- sbbq 0(%rbx),%r12
+ subq 0(%rbx),%r12
sbbq 8(%rbx),%r13
movq %r12,%rax
sbbq 16(%rbx),%r8
sbbq 24(%rbx),%r9
movq %r13,%rbp
- sbbq $0,%r11
+ sbbq %r11,%r11
- xorq %r10,%r10
- adcq $-1,%r12
+ addq $-1,%r12
movq %r8,%rcx
adcq %r14,%r13
adcq $0,%r8
movq %r9,%r10
adcq %r15,%r9
+ testq %r11,%r11
- btq $0,%r11
- cmovncq %rax,%r12
- cmovncq %rbp,%r13
+ cmovzq %rax,%r12
+ cmovzq %rbp,%r13
movq %r12,0(%rdi)
- cmovncq %rcx,%r8
+ cmovzq %rcx,%r8
movq %r13,8(%rdi)
- cmovncq %r10,%r9
+ cmovzq %r10,%r9
movq %r8,16(%rdi)
movq %r9,24(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
-.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
-.type __ecp_nistz256_subx,@function
+.type __ecp_nistz256_subq,@function
.align 32
-__ecp_nistz256_subx:
+__ecp_nistz256_subq:
.cfi_startproc
- xorq %r11,%r11
- sbbq %r12,%rax
+ subq %r12,%rax
sbbq %r13,%rbp
movq %rax,%r12
sbbq %r8,%rcx
sbbq %r9,%r10
movq %rbp,%r13
- sbbq $0,%r11
+ sbbq %r11,%r11
- xorq %r9,%r9
- adcq $-1,%rax
+ addq $-1,%rax
movq %rcx,%r8
adcq %r14,%rbp
adcq $0,%rcx
movq %r10,%r9
adcq %r15,%r10
+ testq %r11,%r11
- btq $0,%r11
- cmovcq %rax,%r12
- cmovcq %rbp,%r13
- cmovcq %rcx,%r8
- cmovcq %r10,%r9
+ cmovnzq %rax,%r12
+ cmovnzq %rbp,%r13
+ cmovnzq %rcx,%r8
+ cmovnzq %r10,%r9
.byte 0xf3,0xc3
.cfi_endproc
-.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
-.type __ecp_nistz256_mul_by_2x,@function
+.type __ecp_nistz256_mul_by_2q,@function
.align 32
-__ecp_nistz256_mul_by_2x:
+__ecp_nistz256_mul_by_2q:
.cfi_startproc
xorq %r11,%r11
- adcq %r12,%r12
+ addq %r12,%r12
adcq %r13,%r13
movq %r12,%rax
adcq %r8,%r8
@@ -6350,8 +4298,7 @@ __ecp_nistz256_mul_by_2x:
movq %r13,%rbp
adcq $0,%r11
- xorq %r10,%r10
- sbbq $-1,%r12
+ subq $-1,%r12
movq %r8,%rcx
sbbq %r14,%r13
sbbq $0,%r8
@@ -6370,12 +4317,12 @@ __ecp_nistz256_mul_by_2x:
.byte 0xf3,0xc3
.cfi_endproc
-.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
-.type ecp_nistz256_point_doublex,@function
+.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+.globl ecp_nistz256_point_double
+.type ecp_nistz256_point_double,@function
.align 32
-ecp_nistz256_point_doublex:
+ecp_nistz256_point_double:
.cfi_startproc
-.Lpoint_doublex:
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -6396,9 +4343,9 @@ ecp_nistz256_point_doublex:
.cfi_offset %r15,-56
subq $160+8,%rsp
.cfi_adjust_cfa_offset 32*5+8
-.Lpoint_doublex_body:
+.Lpoint_doubleq_body:
-.Lpoint_double_shortcutx:
+.Lpoint_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -6417,34 +4364,34 @@ ecp_nistz256_point_doublex:
.byte 102,73,15,110,211
leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
- movq 64+0(%rsi),%rdx
+ movq 64+0(%rsi),%rax
movq 64+8(%rsi),%r14
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
- leaq 64-128(%rsi),%rsi
+ leaq 64-0(%rsi),%rsi
leaq 64(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 0+0(%rsp),%rdx
+ movq 0+0(%rsp),%rax
movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
+ leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
leaq 0(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 32(%rbx),%rdx
+ movq 32(%rbx),%rax
movq 64+0(%rbx),%r9
movq 64+8(%rbx),%r10
movq 64+16(%rbx),%r11
movq 64+24(%rbx),%r12
- leaq 64-128(%rbx),%rsi
+ leaq 64-0(%rbx),%rsi
leaq 32(%rbx),%rbx
.byte 102,72,15,126,215
- call __ecp_nistz256_mul_montx
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
movq 96+0(%rsp),%r12
movq 96+8(%rsp),%r13
@@ -6452,7 +4399,7 @@ ecp_nistz256_point_doublex:
movq 96+16(%rsp),%r8
movq 96+24(%rsp),%r9
leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
+ call __ecp_nistz256_add_toq
movq 96+0(%rsp),%r12
movq 96+8(%rsp),%r13
@@ -6460,15 +4407,15 @@ ecp_nistz256_point_doublex:
movq 96+16(%rsp),%r8
movq 96+24(%rsp),%r9
leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- movq 0+0(%rsp),%rdx
+ movq 0+0(%rsp),%rax
movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
+ leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
.byte 102,72,15,126,207
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
xorq %r9,%r9
movq %r12,%rax
addq $-1,%r12
@@ -6507,59 +4454,59 @@ ecp_nistz256_point_doublex:
orq %r9,%r15
movq %r14,16(%rdi)
movq %r15,24(%rdi)
- movq 64(%rsp),%rdx
+ movq 64(%rsp),%rax
leaq 64(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
leaq 32(%rsp),%rbx
leaq 32(%rsp),%rdi
- call __ecp_nistz256_add_tox
+ call __ecp_nistz256_add_toq
- movq 96(%rsp),%rdx
+ movq 96(%rsp),%rax
leaq 96(%rsp),%rbx
movq 0+0(%rsp),%r9
movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
+ leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r11
movq 24+0(%rsp),%r12
leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
- movq 0+32(%rsp),%rdx
+ movq 0+32(%rsp),%rax
movq 8+32(%rsp),%r14
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r15
movq 24+32(%rsp),%r8
.byte 102,72,15,126,199
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
leaq 128(%rsp),%rbx
movq %r14,%r8
movq %r15,%r9
movq %rsi,%r14
movq %rbp,%r15
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
movq 0+0(%rsp),%rax
movq 0+8(%rsp),%rbp
movq 0+16(%rsp),%rcx
movq 0+24(%rsp),%r10
leaq 0(%rsp),%rdi
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
- movq 32(%rsp),%rdx
+ movq 32(%rsp),%rax
leaq 32(%rsp),%rbx
movq %r12,%r14
xorl %ecx,%ecx
@@ -6568,16 +4515,16 @@ ecp_nistz256_point_doublex:
movq %r13,0+8(%rsp)
cmovzq %r8,%r11
movq %r8,0+16(%rsp)
- leaq 0-128(%rsp),%rsi
+ leaq 0-0(%rsp),%rsi
cmovzq %r9,%r12
movq %r9,0+24(%rsp)
movq %r14,%r9
leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
.byte 102,72,15,126,203
.byte 102,72,15,126,207
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
leaq 160+56(%rsp),%rsi
.cfi_def_cfa %rsi,8
@@ -6595,15 +4542,15 @@ ecp_nistz256_point_doublex:
.cfi_restore %rbp
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
-.Lpoint_doublex_epilogue:
+.Lpoint_doubleq_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
-.type ecp_nistz256_point_addx,@function
+.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl ecp_nistz256_point_add
+.type ecp_nistz256_point_add,@function
.align 32
-ecp_nistz256_point_addx:
+ecp_nistz256_point_add:
.cfi_startproc
-.Lpoint_addx:
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -6624,7 +4571,7 @@ ecp_nistz256_point_addx:
.cfi_offset %r15,-56
subq $576+8,%rsp
.cfi_adjust_cfa_offset 32*18+8
-.Lpoint_addx_body:
+.Lpoint_addq_body:
movdqu 0(%rsi),%xmm0
movdqu 16(%rsi),%xmm1
@@ -6648,7 +4595,7 @@ ecp_nistz256_point_addx:
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
movdqu 48(%rsi),%xmm3
- movq 64+0(%rsi),%rdx
+ movq 64+0(%rsi),%rax
movq 64+8(%rsi),%r14
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
@@ -6664,13 +4611,13 @@ ecp_nistz256_point_addx:
por %xmm0,%xmm1
.byte 102,72,15,110,199
- leaq 64-128(%rsi),%rsi
- movq %rdx,544+0(%rsp)
+ leaq 64-0(%rsi),%rsi
+ movq %rax,544+0(%rsp)
movq %r14,544+8(%rsp)
movq %r15,544+16(%rsp)
movq %r8,544+24(%rsp)
leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
pshufd $0xb1,%xmm1,%xmm4
@@ -6681,59 +4628,59 @@ ecp_nistz256_point_addx:
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
pshufd $0,%xmm4,%xmm4
- movq 64+0(%rbx),%rdx
+ movq 64+0(%rbx),%rax
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
.byte 102,72,15,110,203
- leaq 64-128(%rbx),%rsi
+ leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 544(%rsp),%rdx
+ movq 544(%rsp),%rax
leaq 544(%rsp),%rbx
movq 0+96(%rsp),%r9
movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
+ leaq 0+96(%rsp),%rsi
movq 16+96(%rsp),%r11
movq 24+96(%rsp),%r12
leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 448(%rsp),%rdx
+ movq 448(%rsp),%rax
leaq 448(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 416(%rsp),%rdx
+ movq 416(%rsp),%rax
leaq 416(%rsp),%rbx
movq 0+224(%rsp),%r9
movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
+ leaq 0+224(%rsp),%rsi
movq 16+224(%rsp),%r11
movq 24+224(%rsp),%r12
leaq 224(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 512(%rsp),%rdx
+ movq 512(%rsp),%rax
leaq 512(%rsp),%rbx
movq 0+256(%rsp),%r9
movq 8+256(%rsp),%r10
- leaq -128+256(%rsp),%rsi
+ leaq 0+256(%rsp),%rsi
movq 16+256(%rsp),%r11
movq 24+256(%rsp),%r12
leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 224(%rsp),%rbx
leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
orq %r13,%r12
movdqa %xmm4,%xmm2
@@ -6742,29 +4689,29 @@ ecp_nistz256_point_addx:
por %xmm5,%xmm2
.byte 102,73,15,110,220
- movq 384(%rsp),%rdx
+ movq 384(%rsp),%rax
leaq 384(%rsp),%rbx
movq 0+96(%rsp),%r9
movq 8+96(%rsp),%r10
- leaq -128+96(%rsp),%rsi
+ leaq 0+96(%rsp),%rsi
movq 16+96(%rsp),%r11
movq 24+96(%rsp),%r12
leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 480(%rsp),%rdx
+ movq 480(%rsp),%rax
leaq 480(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 160(%rsp),%rbx
leaq 0(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
orq %r13,%r12
orq %r8,%r12
@@ -6778,73 +4725,73 @@ ecp_nistz256_point_addx:
.byte 0x3e
- jnz .Ladd_proceedx
+ jnz .Ladd_proceedq
-.Ladd_doublex:
+.Ladd_doubleq:
.byte 102,72,15,126,206
.byte 102,72,15,126,199
addq $416,%rsp
.cfi_adjust_cfa_offset -416
- jmp .Lpoint_double_shortcutx
+ jmp .Lpoint_double_shortcutq
.cfi_adjust_cfa_offset 416
.align 32
-.Ladd_proceedx:
- movq 0+64(%rsp),%rdx
+.Ladd_proceedq:
+ movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r15
movq 24+64(%rsp),%r8
leaq 96(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 448(%rsp),%rdx
+ movq 448(%rsp),%rax
leaq 448(%rsp),%rbx
movq 0+0(%rsp),%r9
movq 8+0(%rsp),%r10
- leaq -128+0(%rsp),%rsi
+ leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r11
movq 24+0(%rsp),%r12
leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 0+0(%rsp),%rdx
+ movq 0+0(%rsp),%rax
movq 8+0(%rsp),%r14
- leaq -128+0(%rsp),%rsi
+ leaq 0+0(%rsp),%rsi
movq 16+0(%rsp),%r15
movq 24+0(%rsp),%r8
leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 544(%rsp),%rdx
+ movq 544(%rsp),%rax
leaq 544(%rsp),%rbx
movq 0+352(%rsp),%r9
movq 8+352(%rsp),%r10
- leaq -128+352(%rsp),%rsi
+ leaq 0+352(%rsp),%rsi
movq 16+352(%rsp),%r11
movq 24+352(%rsp),%r12
leaq 352(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 0(%rsp),%rdx
+ movq 0(%rsp),%rax
leaq 0(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 128(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 160(%rsp),%rdx
+ movq 160(%rsp),%rax
leaq 160(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 192(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
@@ -6876,11 +4823,11 @@ ecp_nistz256_point_addx:
cmovcq %r10,%r9
movq 24(%rsi),%r10
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
leaq 128(%rsp),%rbx
leaq 288(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
movq 192+0(%rsp),%rax
movq 192+8(%rsp),%rbp
@@ -6888,35 +4835,35 @@ ecp_nistz256_point_addx:
movq 192+24(%rsp),%r10
leaq 320(%rsp),%rdi
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
movq %r12,0(%rdi)
movq %r13,8(%rdi)
movq %r8,16(%rdi)
movq %r9,24(%rdi)
- movq 128(%rsp),%rdx
+ movq 128(%rsp),%rax
leaq 128(%rsp),%rbx
movq 0+224(%rsp),%r9
movq 8+224(%rsp),%r10
- leaq -128+224(%rsp),%rsi
+ leaq 0+224(%rsp),%rsi
movq 16+224(%rsp),%r11
movq 24+224(%rsp),%r12
leaq 256(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 320(%rsp),%rdx
+ movq 320(%rsp),%rax
leaq 320(%rsp),%rbx
movq 0+64(%rsp),%r9
movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r11
movq 24+64(%rsp),%r12
leaq 320(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 256(%rsp),%rbx
leaq 320(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
.byte 102,72,15,126,199
@@ -6992,7 +4939,7 @@ ecp_nistz256_point_addx:
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
-.Ladd_donex:
+.Ladd_doneq:
leaq 576+56(%rsp),%rsi
.cfi_def_cfa %rsi,8
movq -48(%rsi),%r15
@@ -7009,15 +4956,15 @@ ecp_nistz256_point_addx:
.cfi_restore %rbp
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
-.Lpoint_addx_epilogue:
+.Lpoint_addq_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
-.type ecp_nistz256_point_add_affinex,@function
+.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+.globl ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,@function
.align 32
-ecp_nistz256_point_add_affinex:
+ecp_nistz256_point_add_affine:
.cfi_startproc
-.Lpoint_add_affinex:
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -7038,7 +4985,7 @@ ecp_nistz256_point_add_affinex:
.cfi_offset %r15,-56
subq $480+8,%rsp
.cfi_adjust_cfa_offset 32*15+8
-.Ladd_affinex_body:
+.Ladd_affineq_body:
movdqu 0(%rsi),%xmm0
movq %rdx,%rbx
@@ -7047,7 +4994,7 @@ ecp_nistz256_point_add_affinex:
movdqu 48(%rsi),%xmm3
movdqu 64(%rsi),%xmm4
movdqu 80(%rsi),%xmm5
- movq 64+0(%rsi),%rdx
+ movq 64+0(%rsi),%rax
movq 64+8(%rsi),%r14
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
@@ -7077,13 +5024,13 @@ ecp_nistz256_point_add_affinex:
pxor %xmm4,%xmm4
por %xmm1,%xmm3
- leaq 64-128(%rsi),%rsi
+ leaq 64-0(%rsi),%rsi
leaq 32(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
pshufd $0xb1,%xmm3,%xmm4
- movq 0(%rbx),%rdx
+ movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
@@ -7096,84 +5043,84 @@ ecp_nistz256_point_add_affinex:
pcmpeqd %xmm3,%xmm4
pshufd $0,%xmm4,%xmm4
- leaq 32-128(%rsp),%rsi
+ leaq 32-0(%rsp),%rsi
movq %r15,%r12
leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 320(%rsp),%rbx
leaq 64(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- movq 384(%rsp),%rdx
+ movq 384(%rsp),%rax
leaq 384(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 384(%rsp),%rdx
+ movq 384(%rsp),%rax
leaq 384(%rsp),%rbx
movq 0+64(%rsp),%r9
movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r11
movq 24+64(%rsp),%r12
leaq 288(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 448(%rsp),%rdx
+ movq 448(%rsp),%rax
leaq 448(%rsp),%rbx
movq 0+32(%rsp),%r9
movq 8+32(%rsp),%r10
- leaq -128+32(%rsp),%rsi
+ leaq 0+32(%rsp),%rsi
movq 16+32(%rsp),%r11
movq 24+32(%rsp),%r12
leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 352(%rsp),%rbx
leaq 96(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- movq 0+64(%rsp),%rdx
+ movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r15
movq 24+64(%rsp),%r8
leaq 128(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 0+96(%rsp),%rdx
+ movq 0+96(%rsp),%rax
movq 8+96(%rsp),%r14
- leaq -128+96(%rsp),%rsi
+ leaq 0+96(%rsp),%rsi
movq 16+96(%rsp),%r15
movq 24+96(%rsp),%r8
leaq 192(%rsp),%rdi
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- movq 128(%rsp),%rdx
+ movq 128(%rsp),%rax
leaq 128(%rsp),%rbx
movq 0+64(%rsp),%r9
movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r11
movq 24+64(%rsp),%r12
leaq 160(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 320(%rsp),%rdx
+ movq 320(%rsp),%rax
leaq 320(%rsp),%rbx
movq 0+128(%rsp),%r9
movq 8+128(%rsp),%r10
- leaq -128+128(%rsp),%rsi
+ leaq 0+128(%rsp),%rsi
movq 16+128(%rsp),%r11
movq 24+128(%rsp),%r12
leaq 0(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
@@ -7205,11 +5152,11 @@ ecp_nistz256_point_add_affinex:
cmovcq %r10,%r9
movq 24(%rsi),%r10
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
leaq 160(%rsp),%rbx
leaq 224(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
movq 0+0(%rsp),%rax
movq 0+8(%rsp),%rbp
@@ -7217,35 +5164,35 @@ ecp_nistz256_point_add_affinex:
movq 0+24(%rsp),%r10
leaq 64(%rsp),%rdi
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
movq %r12,0(%rdi)
movq %r13,8(%rdi)
movq %r8,16(%rdi)
movq %r9,24(%rdi)
- movq 352(%rsp),%rdx
+ movq 352(%rsp),%rax
leaq 352(%rsp),%rbx
movq 0+160(%rsp),%r9
movq 8+160(%rsp),%r10
- leaq -128+160(%rsp),%rsi
+ leaq 0+160(%rsp),%rsi
movq 16+160(%rsp),%r11
movq 24+160(%rsp),%r12
leaq 32(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- movq 96(%rsp),%rdx
+ movq 96(%rsp),%rax
leaq 96(%rsp),%rbx
movq 0+64(%rsp),%r9
movq 8+64(%rsp),%r10
- leaq -128+64(%rsp),%rsi
+ leaq 0+64(%rsp),%rsi
movq 16+64(%rsp),%r11
movq 24+64(%rsp),%r12
leaq 64(%rsp),%rdi
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
leaq 32(%rsp),%rbx
leaq 256(%rsp),%rdi
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
.byte 102,72,15,126,199
@@ -7337,11 +5284,11 @@ ecp_nistz256_point_add_affinex:
.cfi_restore %rbp
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
-.Ladd_affinex_epilogue:
+.Ladd_affineq_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
- .section ".note.gnu.property", "a"
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s
index dd5a6efce58..3ee9bc6bbb8 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/ec/x25519-x86_64.s
@@ -395,412 +395,36 @@ x25519_fe51_mul121666:
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size x25519_fe51_mul121666,.-x25519_fe51_mul121666
-
.globl x25519_fe64_eligible
.type x25519_fe64_eligible,@function
.align 32
x25519_fe64_eligible:
.cfi_startproc
- movl OPENSSL_ia32cap_P+8(%rip),%ecx
xorl %eax,%eax
- andl $0x80100,%ecx
- cmpl $0x80100,%ecx
- cmovel %ecx,%eax
.byte 0xf3,0xc3
.cfi_endproc
.size x25519_fe64_eligible,.-x25519_fe64_eligible
.globl x25519_fe64_mul
.type x25519_fe64_mul,@function
-.align 32
-x25519_fe64_mul:
-.cfi_startproc
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- pushq %rdi
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rdi,-64
- leaq -16(%rsp),%rsp
-.cfi_adjust_cfa_offset 16
-.Lfe64_mul_body:
-
- movq %rdx,%rax
- movq 0(%rdx),%rbp
- movq 0(%rsi),%rdx
- movq 8(%rax),%rcx
- movq 16(%rax),%r14
- movq 24(%rax),%r15
-
- mulxq %rbp,%r8,%rax
- xorl %edi,%edi
- mulxq %rcx,%r9,%rbx
- adcxq %rax,%r9
- mulxq %r14,%r10,%rax
- adcxq %rbx,%r10
- mulxq %r15,%r11,%r12
- movq 8(%rsi),%rdx
- adcxq %rax,%r11
- movq %r14,(%rsp)
- adcxq %rdi,%r12
-
- mulxq %rbp,%rax,%rbx
- adoxq %rax,%r9
- adcxq %rbx,%r10
- mulxq %rcx,%rax,%rbx
- adoxq %rax,%r10
- adcxq %rbx,%r11
- mulxq %r14,%rax,%rbx
- adoxq %rax,%r11
- adcxq %rbx,%r12
- mulxq %r15,%rax,%r13
- movq 16(%rsi),%rdx
- adoxq %rax,%r12
- adcxq %rdi,%r13
- adoxq %rdi,%r13
-
- mulxq %rbp,%rax,%rbx
- adcxq %rax,%r10
- adoxq %rbx,%r11
- mulxq %rcx,%rax,%rbx
- adcxq %rax,%r11
- adoxq %rbx,%r12
- mulxq %r14,%rax,%rbx
- adcxq %rax,%r12
- adoxq %rbx,%r13
- mulxq %r15,%rax,%r14
- movq 24(%rsi),%rdx
- adcxq %rax,%r13
- adoxq %rdi,%r14
- adcxq %rdi,%r14
-
- mulxq %rbp,%rax,%rbx
- adoxq %rax,%r11
- adcxq %rbx,%r12
- mulxq %rcx,%rax,%rbx
- adoxq %rax,%r12
- adcxq %rbx,%r13
- mulxq (%rsp),%rax,%rbx
- adoxq %rax,%r13
- adcxq %rbx,%r14
- mulxq %r15,%rax,%r15
- movl $38,%edx
- adoxq %rax,%r14
- adcxq %rdi,%r15
- adoxq %rdi,%r15
-
- jmp .Lreduce64
-.Lfe64_mul_epilogue:
-.cfi_endproc
-.size x25519_fe64_mul,.-x25519_fe64_mul
-
.globl x25519_fe64_sqr
-.type x25519_fe64_sqr,@function
-.align 32
-x25519_fe64_sqr:
-.cfi_startproc
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- pushq %rdi
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rdi,-64
- leaq -16(%rsp),%rsp
-.cfi_adjust_cfa_offset 16
-.Lfe64_sqr_body:
-
- movq 0(%rsi),%rdx
- movq 8(%rsi),%rcx
- movq 16(%rsi),%rbp
- movq 24(%rsi),%rsi
-
-
- mulxq %rdx,%r8,%r15
- mulxq %rcx,%r9,%rax
- xorl %edi,%edi
- mulxq %rbp,%r10,%rbx
- adcxq %rax,%r10
- mulxq %rsi,%r11,%r12
- movq %rcx,%rdx
- adcxq %rbx,%r11
- adcxq %rdi,%r12
-
-
- mulxq %rbp,%rax,%rbx
- adoxq %rax,%r11
- adcxq %rbx,%r12
- mulxq %rsi,%rax,%r13
- movq %rbp,%rdx
- adoxq %rax,%r12
- adcxq %rdi,%r13
-
-
- mulxq %rsi,%rax,%r14
- movq %rcx,%rdx
- adoxq %rax,%r13
- adcxq %rdi,%r14
- adoxq %rdi,%r14
-
- adcxq %r9,%r9
- adoxq %r15,%r9
- adcxq %r10,%r10
- mulxq %rdx,%rax,%rbx
- movq %rbp,%rdx
- adcxq %r11,%r11
- adoxq %rax,%r10
- adcxq %r12,%r12
- adoxq %rbx,%r11
- mulxq %rdx,%rax,%rbx
- movq %rsi,%rdx
- adcxq %r13,%r13
- adoxq %rax,%r12
- adcxq %r14,%r14
- adoxq %rbx,%r13
- mulxq %rdx,%rax,%r15
- movl $38,%edx
- adoxq %rax,%r14
- adcxq %rdi,%r15
- adoxq %rdi,%r15
- jmp .Lreduce64
-
-.align 32
-.Lreduce64:
- mulxq %r12,%rax,%rbx
- adcxq %rax,%r8
- adoxq %rbx,%r9
- mulxq %r13,%rax,%rbx
- adcxq %rax,%r9
- adoxq %rbx,%r10
- mulxq %r14,%rax,%rbx
- adcxq %rax,%r10
- adoxq %rbx,%r11
- mulxq %r15,%rax,%r12
- adcxq %rax,%r11
- adoxq %rdi,%r12
- adcxq %rdi,%r12
-
- movq 16(%rsp),%rdi
- imulq %rdx,%r12
-
- addq %r12,%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%r11
-
- sbbq %rax,%rax
- andq $38,%rax
-
- addq %rax,%r8
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
- movq %r8,0(%rdi)
-
- movq 24(%rsp),%r15
-.cfi_restore %r15
- movq 32(%rsp),%r14
-.cfi_restore %r14
- movq 40(%rsp),%r13
-.cfi_restore %r13
- movq 48(%rsp),%r12
-.cfi_restore %r12
- movq 56(%rsp),%rbx
-.cfi_restore %rbx
- movq 64(%rsp),%rbp
-.cfi_restore %rbp
- leaq 72(%rsp),%rsp
-.cfi_adjust_cfa_offset 88
-.Lfe64_sqr_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size x25519_fe64_sqr,.-x25519_fe64_sqr
-
.globl x25519_fe64_mul121666
-.type x25519_fe64_mul121666,@function
-.align 32
-x25519_fe64_mul121666:
-.Lfe64_mul121666_body:
-.cfi_startproc
- movl $121666,%edx
- mulxq 0(%rsi),%r8,%rcx
- mulxq 8(%rsi),%r9,%rax
- addq %rcx,%r9
- mulxq 16(%rsi),%r10,%rcx
- adcq %rax,%r10
- mulxq 24(%rsi),%r11,%rax
- adcq %rcx,%r11
- adcq $0,%rax
-
- imulq $38,%rax,%rax
-
- addq %rax,%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%r11
-
- sbbq %rax,%rax
- andq $38,%rax
-
- addq %rax,%r8
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
- movq %r8,0(%rdi)
-
-.Lfe64_mul121666_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size x25519_fe64_mul121666,.-x25519_fe64_mul121666
-
.globl x25519_fe64_add
-.type x25519_fe64_add,@function
-.align 32
-x25519_fe64_add:
-.Lfe64_add_body:
-.cfi_startproc
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
-
- addq 0(%rdx),%r8
- adcq 8(%rdx),%r9
- adcq 16(%rdx),%r10
- adcq 24(%rdx),%r11
-
- sbbq %rax,%rax
- andq $38,%rax
-
- addq %rax,%r8
- adcq $0,%r9
- adcq $0,%r10
- movq %r9,8(%rdi)
- adcq $0,%r11
- movq %r10,16(%rdi)
- sbbq %rax,%rax
- movq %r11,24(%rdi)
- andq $38,%rax
-
- addq %rax,%r8
- movq %r8,0(%rdi)
-
-.Lfe64_add_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size x25519_fe64_add,.-x25519_fe64_add
-
.globl x25519_fe64_sub
-.type x25519_fe64_sub,@function
-.align 32
-x25519_fe64_sub:
-.Lfe64_sub_body:
-.cfi_startproc
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
-
- subq 0(%rdx),%r8
- sbbq 8(%rdx),%r9
- sbbq 16(%rdx),%r10
- sbbq 24(%rdx),%r11
-
- sbbq %rax,%rax
- andq $38,%rax
-
- subq %rax,%r8
- sbbq $0,%r9
- sbbq $0,%r10
- movq %r9,8(%rdi)
- sbbq $0,%r11
- movq %r10,16(%rdi)
- sbbq %rax,%rax
- movq %r11,24(%rdi)
- andq $38,%rax
-
- subq %rax,%r8
- movq %r8,0(%rdi)
-
-.Lfe64_sub_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size x25519_fe64_sub,.-x25519_fe64_sub
-
.globl x25519_fe64_tobytes
-.type x25519_fe64_tobytes,@function
-.align 32
+x25519_fe64_mul:
+x25519_fe64_sqr:
+x25519_fe64_mul121666:
+x25519_fe64_add:
+x25519_fe64_sub:
x25519_fe64_tobytes:
-.Lfe64_to_body:
.cfi_startproc
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
-
-
- leaq (%r11,%r11,1),%rax
- sarq $63,%r11
- shrq $1,%rax
- andq $19,%r11
- addq $19,%r11
-
- addq %r11,%r8
- adcq $0,%r9
- adcq $0,%r10
- adcq $0,%rax
-
- leaq (%rax,%rax,1),%r11
- sarq $63,%rax
- shrq $1,%r11
- notq %rax
- andq $19,%rax
-
- subq %rax,%r8
- sbbq $0,%r9
- sbbq $0,%r10
- sbbq $0,%r11
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
-.Lfe64_to_epilogue:
+.byte 0x0f,0x0b
.byte 0xf3,0xc3
.cfi_endproc
-.size x25519_fe64_tobytes,.-x25519_fe64_tobytes
+.size x25519_fe64_mul,.-x25519_fe64_mul
.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s
index 40bfc69f380..3d1a966de9b 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/md5/md5-x86_64.s
@@ -681,7 +681,7 @@ ossl_md5_block_asm_data_order:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s
index 288f44af921..19e0b738366 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/aesni-gcm-x86_64.s
@@ -1,793 +1,23 @@
.text
-.type _aesni_ctr32_ghash_6x,@function
-.align 32
-_aesni_ctr32_ghash_6x:
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+aesni_gcm_encrypt:
.cfi_startproc
- vmovdqu 32(%r11),%xmm2
- subq $6,%rdx
- vpxor %xmm4,%xmm4,%xmm4
- vmovdqu 0-128(%rcx),%xmm15
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpaddb %xmm2,%xmm11,%xmm12
- vpaddb %xmm2,%xmm12,%xmm13
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm15,%xmm1,%xmm9
- vmovdqu %xmm4,16+8(%rsp)
- jmp .Loop6x
-
-.align 32
-.Loop6x:
- addl $100663296,%ebx
- jc .Lhandle_ctr32
- vmovdqu 0-32(%r9),%xmm3
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm15,%xmm10,%xmm10
- vpxor %xmm15,%xmm11,%xmm11
-
-.Lresume_ctr32:
- vmovdqu %xmm1,(%r8)
- vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
- vpxor %xmm15,%xmm12,%xmm12
- vmovups 16-128(%rcx),%xmm2
- vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
- xorq %r12,%r12
- cmpq %r14,%r15
-
- vaesenc %xmm2,%xmm9,%xmm9
- vmovdqu 48+8(%rsp),%xmm0
- vpxor %xmm15,%xmm13,%xmm13
- vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
- vaesenc %xmm2,%xmm10,%xmm10
- vpxor %xmm15,%xmm14,%xmm14
- setnc %r12b
- vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
- vaesenc %xmm2,%xmm11,%xmm11
- vmovdqu 16-32(%r9),%xmm3
- negq %r12
- vaesenc %xmm2,%xmm12,%xmm12
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
- vpxor %xmm4,%xmm8,%xmm8
- vaesenc %xmm2,%xmm13,%xmm13
- vpxor %xmm5,%xmm1,%xmm4
- andq $0x60,%r12
- vmovups 32-128(%rcx),%xmm15
- vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
- vaesenc %xmm2,%xmm14,%xmm14
-
- vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
- leaq (%r14,%r12,1),%r14
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
- vmovdqu 64+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 88(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 80(%r14),%r12
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,32+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,40+8(%rsp)
- vmovdqu 48-32(%r9),%xmm5
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 48-128(%rcx),%xmm15
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm3,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
- vaesenc %xmm15,%xmm11,%xmm11
- vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
- vmovdqu 80+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqu 64-32(%r9),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 64-128(%rcx),%xmm15
- vpxor %xmm2,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 72(%r14),%r13
- vpxor %xmm5,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 64(%r14),%r12
- vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
- vmovdqu 96+8(%rsp),%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,48+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,56+8(%rsp)
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 96-32(%r9),%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 80-128(%rcx),%xmm15
- vpxor %xmm3,%xmm6,%xmm6
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 56(%r14),%r13
- vpxor %xmm1,%xmm7,%xmm7
- vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
- vpxor 112+8(%rsp),%xmm8,%xmm8
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 48(%r14),%r12
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,64+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,72+8(%rsp)
- vpxor %xmm3,%xmm4,%xmm4
- vmovdqu 112-32(%r9),%xmm3
- vaesenc %xmm15,%xmm14,%xmm14
-
- vmovups 96-128(%rcx),%xmm15
- vpxor %xmm5,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm1,%xmm6,%xmm6
- vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
- vaesenc %xmm15,%xmm10,%xmm10
- movbeq 40(%r14),%r13
- vpxor %xmm2,%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 32(%r14),%r12
- vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r13,80+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- movq %r12,88+8(%rsp)
- vpxor %xmm5,%xmm6,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor %xmm1,%xmm6,%xmm6
-
- vmovups 112-128(%rcx),%xmm15
- vpslldq $8,%xmm6,%xmm5
- vpxor %xmm2,%xmm4,%xmm4
- vmovdqu 16(%r11),%xmm3
-
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor %xmm8,%xmm7,%xmm7
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor %xmm5,%xmm4,%xmm4
- movbeq 24(%r14),%r13
- vaesenc %xmm15,%xmm11,%xmm11
- movbeq 16(%r14),%r12
- vpalignr $8,%xmm4,%xmm4,%xmm0
- vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
- movq %r13,96+8(%rsp)
- vaesenc %xmm15,%xmm12,%xmm12
- movq %r12,104+8(%rsp)
- vaesenc %xmm15,%xmm13,%xmm13
- vmovups 128-128(%rcx),%xmm1
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vmovups 144-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm10,%xmm10
- vpsrldq $8,%xmm6,%xmm6
- vaesenc %xmm1,%xmm11,%xmm11
- vpxor %xmm6,%xmm7,%xmm7
- vaesenc %xmm1,%xmm12,%xmm12
- vpxor %xmm0,%xmm4,%xmm4
- movbeq 8(%r14),%r13
- vaesenc %xmm1,%xmm13,%xmm13
- movbeq 0(%r14),%r12
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 160-128(%rcx),%xmm1
- cmpl $11,%ebp
- jb .Lenc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 176-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 192-128(%rcx),%xmm1
- je .Lenc_tail
-
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
-
- vaesenc %xmm1,%xmm9,%xmm9
- vaesenc %xmm1,%xmm10,%xmm10
- vaesenc %xmm1,%xmm11,%xmm11
- vaesenc %xmm1,%xmm12,%xmm12
- vaesenc %xmm1,%xmm13,%xmm13
- vmovups 208-128(%rcx),%xmm15
- vaesenc %xmm1,%xmm14,%xmm14
- vmovups 224-128(%rcx),%xmm1
- jmp .Lenc_tail
-
-.align 32
-.Lhandle_ctr32:
- vmovdqu (%r11),%xmm0
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vmovdqu 0-32(%r9),%xmm3
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm15,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm15,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpshufb %xmm0,%xmm14,%xmm14
- vpshufb %xmm0,%xmm1,%xmm1
- jmp .Lresume_ctr32
-
-.align 32
-.Lenc_tail:
- vaesenc %xmm15,%xmm9,%xmm9
- vmovdqu %xmm7,16+8(%rsp)
- vpalignr $8,%xmm4,%xmm4,%xmm8
- vaesenc %xmm15,%xmm10,%xmm10
- vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
- vpxor 0(%rdi),%xmm1,%xmm2
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 16(%rdi),%xmm1,%xmm0
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 32(%rdi),%xmm1,%xmm5
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 48(%rdi),%xmm1,%xmm6
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 64(%rdi),%xmm1,%xmm7
- vpxor 80(%rdi),%xmm1,%xmm3
- vmovdqu (%r8),%xmm1
-
- vaesenclast %xmm2,%xmm9,%xmm9
- vmovdqu 32(%r11),%xmm2
- vaesenclast %xmm0,%xmm10,%xmm10
- vpaddb %xmm2,%xmm1,%xmm0
- movq %r13,112+8(%rsp)
- leaq 96(%rdi),%rdi
- vaesenclast %xmm5,%xmm11,%xmm11
- vpaddb %xmm2,%xmm0,%xmm5
- movq %r12,120+8(%rsp)
- leaq 96(%rsi),%rsi
- vmovdqu 0-128(%rcx),%xmm15
- vaesenclast %xmm6,%xmm12,%xmm12
- vpaddb %xmm2,%xmm5,%xmm6
- vaesenclast %xmm7,%xmm13,%xmm13
- vpaddb %xmm2,%xmm6,%xmm7
- vaesenclast %xmm3,%xmm14,%xmm14
- vpaddb %xmm2,%xmm7,%xmm3
-
- addq $0x60,%r10
- subq $0x6,%rdx
- jc .L6x_done
-
- vmovups %xmm9,-96(%rsi)
- vpxor %xmm15,%xmm1,%xmm9
- vmovups %xmm10,-80(%rsi)
- vmovdqa %xmm0,%xmm10
- vmovups %xmm11,-64(%rsi)
- vmovdqa %xmm5,%xmm11
- vmovups %xmm12,-48(%rsi)
- vmovdqa %xmm6,%xmm12
- vmovups %xmm13,-32(%rsi)
- vmovdqa %xmm7,%xmm13
- vmovups %xmm14,-16(%rsi)
- vmovdqa %xmm3,%xmm14
- vmovdqu 32+8(%rsp),%xmm7
- jmp .Loop6x
-
-.L6x_done:
- vpxor 16+8(%rsp),%xmm8,%xmm8
- vpxor %xmm4,%xmm8,%xmm8
-
+ xorl %eax,%eax
.byte 0xf3,0xc3
.cfi_endproc
-.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
-.align 32
aesni_gcm_decrypt:
.cfi_startproc
- xorq %r10,%r10
- cmpq $0x60,%rdx
- jb .Lgcm_dec_abort
-
- leaq (%rsp),%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq .Lbswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $0xf80,%r15
- vmovdqu (%r9),%xmm8
- andq $-128,%rsp
- vmovdqu (%r11),%xmm0
- leaq 128(%rcx),%rcx
- leaq 32+32(%r9),%r9
- movl 240-128(%rcx),%ebp
- vpshufb %xmm0,%xmm8,%xmm8
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc .Ldec_no_key_aliasing
- cmpq $768,%r15
- jnc .Ldec_no_key_aliasing
- subq %r15,%rsp
-.Ldec_no_key_aliasing:
-
- vmovdqu 80(%rdi),%xmm7
- leaq (%rdi),%r14
- vmovdqu 64(%rdi),%xmm4
- leaq -192(%rdi,%rdx,1),%r15
- vmovdqu 48(%rdi),%xmm5
- shrq $4,%rdx
- xorq %r10,%r10
- vmovdqu 32(%rdi),%xmm6
- vpshufb %xmm0,%xmm7,%xmm7
- vmovdqu 16(%rdi),%xmm2
- vpshufb %xmm0,%xmm4,%xmm4
- vmovdqu (%rdi),%xmm3
- vpshufb %xmm0,%xmm5,%xmm5
- vmovdqu %xmm4,48(%rsp)
- vpshufb %xmm0,%xmm6,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm2,%xmm2
- vmovdqu %xmm6,80(%rsp)
- vpshufb %xmm0,%xmm3,%xmm3
- vmovdqu %xmm2,96(%rsp)
- vmovdqu %xmm3,112(%rsp)
-
- call _aesni_ctr32_ghash_6x
-
- vmovups %xmm9,-96(%rsi)
- vmovups %xmm10,-80(%rsi)
- vmovups %xmm11,-64(%rsi)
- vmovups %xmm12,-48(%rsi)
- vmovups %xmm13,-32(%rsi)
- vmovups %xmm14,-16(%rsi)
-
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
-
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lgcm_dec_abort:
- movq %r10,%rax
+ xorl %eax,%eax
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
-.type _aesni_ctr32_6x,@function
-.align 32
-_aesni_ctr32_6x:
-.cfi_startproc
- vmovdqu 0-128(%rcx),%xmm4
- vmovdqu 32(%r11),%xmm2
- leaq -1(%rbp),%r13
- vmovups 16-128(%rcx),%xmm15
- leaq 32-128(%rcx),%r12
- vpxor %xmm4,%xmm1,%xmm9
- addl $100663296,%ebx
- jc .Lhandle_ctr32_2
- vpaddb %xmm2,%xmm1,%xmm10
- vpaddb %xmm2,%xmm10,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddb %xmm2,%xmm11,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddb %xmm2,%xmm12,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpaddb %xmm2,%xmm13,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpaddb %xmm2,%xmm14,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp .Loop_ctr32
-
-.align 16
-.Loop_ctr32:
- vaesenc %xmm15,%xmm9,%xmm9
- vaesenc %xmm15,%xmm10,%xmm10
- vaesenc %xmm15,%xmm11,%xmm11
- vaesenc %xmm15,%xmm12,%xmm12
- vaesenc %xmm15,%xmm13,%xmm13
- vaesenc %xmm15,%xmm14,%xmm14
- vmovups (%r12),%xmm15
- leaq 16(%r12),%r12
- decl %r13d
- jnz .Loop_ctr32
-
- vmovdqu (%r12),%xmm3
- vaesenc %xmm15,%xmm9,%xmm9
- vpxor 0(%rdi),%xmm3,%xmm4
- vaesenc %xmm15,%xmm10,%xmm10
- vpxor 16(%rdi),%xmm3,%xmm5
- vaesenc %xmm15,%xmm11,%xmm11
- vpxor 32(%rdi),%xmm3,%xmm6
- vaesenc %xmm15,%xmm12,%xmm12
- vpxor 48(%rdi),%xmm3,%xmm8
- vaesenc %xmm15,%xmm13,%xmm13
- vpxor 64(%rdi),%xmm3,%xmm2
- vaesenc %xmm15,%xmm14,%xmm14
- vpxor 80(%rdi),%xmm3,%xmm3
- leaq 96(%rdi),%rdi
-
- vaesenclast %xmm4,%xmm9,%xmm9
- vaesenclast %xmm5,%xmm10,%xmm10
- vaesenclast %xmm6,%xmm11,%xmm11
- vaesenclast %xmm8,%xmm12,%xmm12
- vaesenclast %xmm2,%xmm13,%xmm13
- vaesenclast %xmm3,%xmm14,%xmm14
- vmovups %xmm9,0(%rsi)
- vmovups %xmm10,16(%rsi)
- vmovups %xmm11,32(%rsi)
- vmovups %xmm12,48(%rsi)
- vmovups %xmm13,64(%rsi)
- vmovups %xmm14,80(%rsi)
- leaq 96(%rsi),%rsi
-
- .byte 0xf3,0xc3
-.align 32
-.Lhandle_ctr32_2:
- vpshufb %xmm0,%xmm1,%xmm6
- vmovdqu 48(%r11),%xmm5
- vpaddd 64(%r11),%xmm6,%xmm10
- vpaddd %xmm5,%xmm6,%xmm11
- vpaddd %xmm5,%xmm10,%xmm12
- vpshufb %xmm0,%xmm10,%xmm10
- vpaddd %xmm5,%xmm11,%xmm13
- vpshufb %xmm0,%xmm11,%xmm11
- vpxor %xmm4,%xmm10,%xmm10
- vpaddd %xmm5,%xmm12,%xmm14
- vpshufb %xmm0,%xmm12,%xmm12
- vpxor %xmm4,%xmm11,%xmm11
- vpaddd %xmm5,%xmm13,%xmm1
- vpshufb %xmm0,%xmm13,%xmm13
- vpxor %xmm4,%xmm12,%xmm12
- vpshufb %xmm0,%xmm14,%xmm14
- vpxor %xmm4,%xmm13,%xmm13
- vpshufb %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm14,%xmm14
- jmp .Loop_ctr32
-.cfi_endproc
-.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
-
-.globl aesni_gcm_encrypt
-.type aesni_gcm_encrypt,@function
-.align 32
-aesni_gcm_encrypt:
-.cfi_startproc
- xorq %r10,%r10
- cmpq $288,%rdx
- jb .Lgcm_enc_abort
-
- leaq (%rsp),%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- vzeroupper
-
- vmovdqu (%r8),%xmm1
- addq $-128,%rsp
- movl 12(%r8),%ebx
- leaq .Lbswap_mask(%rip),%r11
- leaq -128(%rcx),%r14
- movq $0xf80,%r15
- leaq 128(%rcx),%rcx
- vmovdqu (%r11),%xmm0
- andq $-128,%rsp
- movl 240-128(%rcx),%ebp
-
- andq %r15,%r14
- andq %rsp,%r15
- subq %r14,%r15
- jc .Lenc_no_key_aliasing
- cmpq $768,%r15
- jnc .Lenc_no_key_aliasing
- subq %r15,%rsp
-.Lenc_no_key_aliasing:
-
- leaq (%rsi),%r14
- leaq -192(%rsi,%rdx,1),%r15
- shrq $4,%rdx
-
- call _aesni_ctr32_6x
- vpshufb %xmm0,%xmm9,%xmm8
- vpshufb %xmm0,%xmm10,%xmm2
- vmovdqu %xmm8,112(%rsp)
- vpshufb %xmm0,%xmm11,%xmm4
- vmovdqu %xmm2,96(%rsp)
- vpshufb %xmm0,%xmm12,%xmm5
- vmovdqu %xmm4,80(%rsp)
- vpshufb %xmm0,%xmm13,%xmm6
- vmovdqu %xmm5,64(%rsp)
- vpshufb %xmm0,%xmm14,%xmm7
- vmovdqu %xmm6,48(%rsp)
-
- call _aesni_ctr32_6x
-
- vmovdqu (%r9),%xmm8
- leaq 32+32(%r9),%r9
- subq $12,%rdx
- movq $192,%r10
- vpshufb %xmm0,%xmm8,%xmm8
-
- call _aesni_ctr32_ghash_6x
- vmovdqu 32(%rsp),%xmm7
- vmovdqu (%r11),%xmm0
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm7,%xmm7,%xmm1
- vmovdqu 32-32(%r9),%xmm15
- vmovups %xmm9,-96(%rsi)
- vpshufb %xmm0,%xmm9,%xmm9
- vpxor %xmm7,%xmm1,%xmm1
- vmovups %xmm10,-80(%rsi)
- vpshufb %xmm0,%xmm10,%xmm10
- vmovups %xmm11,-64(%rsi)
- vpshufb %xmm0,%xmm11,%xmm11
- vmovups %xmm12,-48(%rsi)
- vpshufb %xmm0,%xmm12,%xmm12
- vmovups %xmm13,-32(%rsi)
- vpshufb %xmm0,%xmm13,%xmm13
- vmovups %xmm14,-16(%rsi)
- vpshufb %xmm0,%xmm14,%xmm14
- vmovdqu %xmm9,16(%rsp)
- vmovdqu 48(%rsp),%xmm6
- vmovdqu 16-32(%r9),%xmm0
- vpunpckhqdq %xmm6,%xmm6,%xmm2
- vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
- vpxor %xmm6,%xmm2,%xmm2
- vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
-
- vmovdqu 64(%rsp),%xmm9
- vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm9,%xmm9,%xmm5
- vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
- vpxor %xmm9,%xmm5,%xmm5
- vpxor %xmm7,%xmm6,%xmm6
- vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vmovdqu 80(%rsp),%xmm1
- vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm4,%xmm7,%xmm7
- vpunpckhqdq %xmm1,%xmm1,%xmm4
- vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpxor %xmm6,%xmm9,%xmm9
- vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 96(%rsp),%xmm2
- vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm2,%xmm2,%xmm7
- vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpxor %xmm9,%xmm1,%xmm1
- vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm5,%xmm4,%xmm4
-
- vpxor 112(%rsp),%xmm8,%xmm8
- vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
- vmovdqu 112-32(%r9),%xmm0
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpxor %xmm6,%xmm5,%xmm5
- vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm1,%xmm2,%xmm2
- vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
- vpxor %xmm4,%xmm7,%xmm4
-
- vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
- vmovdqu 0-32(%r9),%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm1
- vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
- vpxor %xmm14,%xmm1,%xmm1
- vpxor %xmm5,%xmm6,%xmm5
- vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
- vmovdqu 32-32(%r9),%xmm15
- vpxor %xmm2,%xmm8,%xmm7
- vpxor %xmm4,%xmm9,%xmm6
-
- vmovdqu 16-32(%r9),%xmm0
- vpxor %xmm5,%xmm7,%xmm9
- vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
- vpxor %xmm9,%xmm6,%xmm6
- vpunpckhqdq %xmm13,%xmm13,%xmm2
- vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
- vpxor %xmm13,%xmm2,%xmm2
- vpslldq $8,%xmm6,%xmm9
- vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
- vpxor %xmm9,%xmm5,%xmm8
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm6,%xmm7,%xmm7
-
- vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
- vmovdqu 48-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm12,%xmm12,%xmm9
- vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
- vpxor %xmm12,%xmm9,%xmm9
- vpxor %xmm14,%xmm13,%xmm13
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
- vmovdqu 80-32(%r9),%xmm15
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
- vmovdqu 64-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm11,%xmm11,%xmm1
- vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vxorps 16(%rsp),%xmm7,%xmm7
- vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
- vpxor %xmm2,%xmm9,%xmm9
-
- vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
- vmovdqu 96-32(%r9),%xmm3
- vpxor %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm10,%xmm10,%xmm2
- vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
- vpxor %xmm10,%xmm2,%xmm2
- vpalignr $8,%xmm8,%xmm8,%xmm14
- vpxor %xmm12,%xmm11,%xmm11
- vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
- vmovdqu 128-32(%r9),%xmm15
- vpxor %xmm9,%xmm1,%xmm1
-
- vxorps %xmm7,%xmm14,%xmm14
- vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
- vxorps %xmm14,%xmm8,%xmm8
-
- vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
- vmovdqu 112-32(%r9),%xmm0
- vpxor %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm8,%xmm8,%xmm9
- vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
- vpxor %xmm8,%xmm9,%xmm9
- vpxor %xmm11,%xmm10,%xmm10
- vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
- vpxor %xmm1,%xmm2,%xmm2
-
- vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
- vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
- vpxor %xmm4,%xmm5,%xmm5
- vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
- vpxor %xmm10,%xmm7,%xmm7
- vpxor %xmm2,%xmm6,%xmm6
-
- vpxor %xmm5,%xmm7,%xmm4
- vpxor %xmm4,%xmm6,%xmm6
- vpslldq $8,%xmm6,%xmm1
- vmovdqu 16(%r11),%xmm3
- vpsrldq $8,%xmm6,%xmm6
- vpxor %xmm1,%xmm5,%xmm8
- vpxor %xmm6,%xmm7,%xmm7
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
- vpxor %xmm2,%xmm8,%xmm8
-
- vpalignr $8,%xmm8,%xmm8,%xmm2
- vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
- vpxor %xmm7,%xmm2,%xmm2
- vpxor %xmm2,%xmm8,%xmm8
- vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
-
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lgcm_enc_abort:
- movq %r10,%rax
- .byte 0xf3,0xc3
-.cfi_endproc
-.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
-.align 64
-.Lbswap_mask:
-.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.Lpoly:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-.Lone_msb:
-.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-.Ltwo_lsb:
-.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.Lone_lsb:
-.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s
index ac4823fe589..3aa9f6c1784 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/modes/ghash-x86_64.s
@@ -1306,108 +1306,7 @@ gcm_ghash_clmul:
.align 32
gcm_init_avx:
.cfi_startproc
- vzeroupper
-
- vmovdqu (%rsi),%xmm2
- vpshufd $78,%xmm2,%xmm2
-
-
- vpshufd $255,%xmm2,%xmm4
- vpsrlq $63,%xmm2,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpxor %xmm5,%xmm5,%xmm5
- vpcmpgtd %xmm4,%xmm5,%xmm5
- vpslldq $8,%xmm3,%xmm3
- vpor %xmm3,%xmm2,%xmm2
-
-
- vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
- vpxor %xmm5,%xmm2,%xmm2
-
- vpunpckhqdq %xmm2,%xmm2,%xmm6
- vmovdqa %xmm2,%xmm0
- vpxor %xmm2,%xmm6,%xmm6
- movq $4,%r10
- jmp .Linit_start_avx
-.align 32
-.Linit_loop_avx:
- vpalignr $8,%xmm3,%xmm4,%xmm5
- vmovdqu %xmm5,-16(%rdi)
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
-.Linit_start_avx:
- vmovdqa %xmm0,%xmm5
- vpunpckhqdq %xmm0,%xmm0,%xmm3
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
- vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
- vpxor %xmm0,%xmm1,%xmm4
- vpxor %xmm4,%xmm3,%xmm3
-
- vpslldq $8,%xmm3,%xmm4
- vpsrldq $8,%xmm3,%xmm3
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- vpsllq $57,%xmm0,%xmm3
- vpsllq $62,%xmm0,%xmm4
- vpxor %xmm3,%xmm4,%xmm4
- vpsllq $63,%xmm0,%xmm3
- vpxor %xmm3,%xmm4,%xmm4
- vpslldq $8,%xmm4,%xmm3
- vpsrldq $8,%xmm4,%xmm4
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm4,%xmm1,%xmm1
-
- vpsrlq $1,%xmm0,%xmm4
- vpxor %xmm0,%xmm1,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $5,%xmm4,%xmm4
- vpxor %xmm4,%xmm0,%xmm0
- vpsrlq $1,%xmm0,%xmm0
- vpxor %xmm1,%xmm0,%xmm0
- vpshufd $78,%xmm5,%xmm3
- vpshufd $78,%xmm0,%xmm4
- vpxor %xmm5,%xmm3,%xmm3
- vmovdqu %xmm5,0(%rdi)
- vpxor %xmm0,%xmm4,%xmm4
- vmovdqu %xmm0,16(%rdi)
- leaq 48(%rdi),%rdi
- subq $1,%r10
- jnz .Linit_loop_avx
-
- vpalignr $8,%xmm4,%xmm3,%xmm5
- vmovdqu %xmm5,-16(%rdi)
-
- vzeroupper
- .byte 0xf3,0xc3
+ jmp .L_init_clmul
.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
@@ -1425,377 +1324,7 @@ gcm_gmult_avx:
gcm_ghash_avx:
.cfi_startproc
.byte 243,15,30,250
- vzeroupper
-
- vmovdqu (%rdi),%xmm10
- leaq .L0x1c2_polynomial(%rip),%r10
- leaq 64(%rsi),%rsi
- vmovdqu .Lbswap_mask(%rip),%xmm13
- vpshufb %xmm13,%xmm10,%xmm10
- cmpq $0x80,%rcx
- jb .Lshort_avx
- subq $0x80,%rcx
-
- vmovdqu 112(%rdx),%xmm14
- vmovdqu 0-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vmovdqu 32-64(%rsi),%xmm7
-
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm14,%xmm9,%xmm9
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 80(%rdx),%xmm14
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 48-64(%rsi),%xmm6
- vpxor %xmm14,%xmm9,%xmm9
- vmovdqu 64(%rdx),%xmm15
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
-
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 48(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 32(%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
-
- vmovdqu 16(%rdx),%xmm14
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm4,%xmm1,%xmm1
- vpshufb %xmm13,%xmm14,%xmm14
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpxor %xmm5,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu (%rdx),%xmm15
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm1,%xmm4,%xmm4
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
-
- leaq 128(%rdx),%rdx
- cmpq $0x80,%rcx
- jb .Ltail_avx
-
- vpxor %xmm10,%xmm15,%xmm15
- subq $0x80,%rcx
- jmp .Loop8x_avx
-
-.align 32
-.Loop8x_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vmovdqu 112(%rdx),%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpxor %xmm15,%xmm8,%xmm8
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
- vmovdqu 0-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
- vmovdqu 32-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
-
- vmovdqu 96(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpxor %xmm3,%xmm10,%xmm10
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vxorps %xmm4,%xmm11,%xmm11
- vmovdqu 16-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm5,%xmm12,%xmm12
- vxorps %xmm15,%xmm8,%xmm8
-
- vmovdqu 80(%rdx),%xmm14
- vpxor %xmm10,%xmm12,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpxor %xmm11,%xmm12,%xmm12
- vpslldq $8,%xmm12,%xmm9
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vpsrldq $8,%xmm12,%xmm12
- vpxor %xmm9,%xmm10,%xmm10
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm14
- vxorps %xmm12,%xmm11,%xmm11
- vpxor %xmm1,%xmm4,%xmm4
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 80-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 64(%rdx),%xmm15
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vxorps %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
-
- vmovdqu 48(%rdx),%xmm14
- vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 96-64(%rsi),%xmm6
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 128-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu 32(%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpxor %xmm3,%xmm0,%xmm0
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm4,%xmm1,%xmm1
- vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm5,%xmm2,%xmm2
- vxorps %xmm12,%xmm10,%xmm10
-
- vmovdqu 16(%rdx),%xmm14
- vpalignr $8,%xmm10,%xmm10,%xmm12
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
- vpshufb %xmm13,%xmm14,%xmm14
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
- vmovdqu 144-64(%rsi),%xmm6
- vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
- vxorps %xmm11,%xmm12,%xmm12
- vpunpckhqdq %xmm14,%xmm14,%xmm9
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
- vmovdqu 176-64(%rsi),%xmm7
- vpxor %xmm14,%xmm9,%xmm9
- vpxor %xmm2,%xmm5,%xmm5
-
- vmovdqu (%rdx),%xmm15
- vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
- vpshufb %xmm13,%xmm15,%xmm15
- vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
- vmovdqu 160-64(%rsi),%xmm6
- vpxor %xmm12,%xmm15,%xmm15
- vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
- vpxor %xmm10,%xmm15,%xmm15
-
- leaq 128(%rdx),%rdx
- subq $0x80,%rcx
- jnc .Loop8x_avx
-
- addq $0x80,%rcx
- jmp .Ltail_no_xor_avx
-
-.align 32
-.Lshort_avx:
- vmovdqu -16(%rdx,%rcx,1),%xmm14
- leaq (%rdx,%rcx,1),%rdx
- vmovdqu 0-64(%rsi),%xmm6
- vmovdqu 32-64(%rsi),%xmm7
- vpshufb %xmm13,%xmm14,%xmm15
-
- vmovdqa %xmm0,%xmm3
- vmovdqa %xmm1,%xmm4
- vmovdqa %xmm2,%xmm5
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -32(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 16-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -48(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 48-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovdqu 80-64(%rsi),%xmm7
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -64(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 64-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -80(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 96-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovdqu 128-64(%rsi),%xmm7
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -96(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 112-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vpsrldq $8,%xmm7,%xmm7
- subq $0x10,%rcx
- jz .Ltail_avx
-
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vmovdqu -112(%rdx),%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vmovdqu 144-64(%rsi),%xmm6
- vpshufb %xmm13,%xmm14,%xmm15
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
- vmovq 184-64(%rsi),%xmm7
- subq $0x10,%rcx
- jmp .Ltail_avx
-
-.align 32
-.Ltail_avx:
- vpxor %xmm10,%xmm15,%xmm15
-.Ltail_no_xor_avx:
- vpunpckhqdq %xmm15,%xmm15,%xmm8
- vpxor %xmm0,%xmm3,%xmm3
- vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm1,%xmm4,%xmm4
- vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
- vpxor %xmm2,%xmm5,%xmm5
- vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
-
- vmovdqu (%r10),%xmm12
-
- vpxor %xmm0,%xmm3,%xmm10
- vpxor %xmm1,%xmm4,%xmm11
- vpxor %xmm2,%xmm5,%xmm5
-
- vpxor %xmm10,%xmm5,%xmm5
- vpxor %xmm11,%xmm5,%xmm5
- vpslldq $8,%xmm5,%xmm9
- vpsrldq $8,%xmm5,%xmm5
- vpxor %xmm9,%xmm10,%xmm10
- vpxor %xmm5,%xmm11,%xmm11
-
- vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
- vpalignr $8,%xmm10,%xmm10,%xmm10
- vpxor %xmm11,%xmm10,%xmm10
- vpxor %xmm9,%xmm10,%xmm10
-
- cmpq $0,%rcx
- jne .Lshort_avx
-
- vpshufb %xmm13,%xmm10,%xmm10
- vmovdqu %xmm10,(%rdi)
- vzeroupper
- .byte 0xf3,0xc3
+ jmp .L_ghash_clmul
.cfi_endproc
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
@@ -1851,7 +1380,7 @@ gcm_ghash_avx:
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s
index 9c0054aa175..6beb92e69e3 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/keccak1600-x86_64.s
@@ -522,7 +522,7 @@ iotas:
.quad 0x8000000080008008
.size iotas,.-iotas
.byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s
index 589ffb37468..76135fdbb10 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-mb-x86_64.s
@@ -10,8 +10,6 @@ sha1_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -2949,4343 +2947,6 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
-.type sha1_multi_block_avx,@function
-.align 32
-sha1_multi_block_avx:
-.cfi_startproc
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb .Lavx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp .Lavx
-.align 32
-.Lavx:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
-.Lbody_avx:
- leaq K_XX_XX(%rip),%rbp
- leaq 256(%rsp),%rbx
-
- vzeroupper
-.Loop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
-
- movq 0(%rsi),%r8
-
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
-
- movq 16(%rsi),%r9
-
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
-
- movq 32(%rsi),%r10
-
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
-
- movq 48(%rsi),%r11
-
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz .Ldone_avx
-
- vmovdqu 0(%rdi),%xmm10
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%xmm11
- vmovdqu 64(%rdi),%xmm12
- vmovdqu 96(%rdi),%xmm13
- vmovdqu 128(%rdi),%xmm14
- vmovdqu 96(%rbp),%xmm5
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- vmovdqa -32(%rbp),%xmm15
- vmovd (%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd (%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vmovd -60(%r8),%xmm1
- vpunpckldq %xmm2,%xmm0,%xmm0
- vmovd -60(%r9),%xmm9
- vpshufb %xmm5,%xmm0,%xmm0
- vpinsrd $1,-60(%r10),%xmm1,%xmm1
- vpinsrd $1,-60(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,0-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -56(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -56(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-56(%r10),%xmm2,%xmm2
- vpinsrd $1,-56(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,16-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -52(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -52(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-52(%r10),%xmm3,%xmm3
- vpinsrd $1,-52(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,32-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -48(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -48(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm4,%xmm4
- vpinsrd $1,-48(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,48-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -44(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -44(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-44(%r10),%xmm0,%xmm0
- vpinsrd $1,-44(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,64-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -40(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -40(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-40(%r10),%xmm1,%xmm1
- vpinsrd $1,-40(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,80-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -36(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -36(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-36(%r10),%xmm2,%xmm2
- vpinsrd $1,-36(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,96-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -32(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -32(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-32(%r10),%xmm3,%xmm3
- vpinsrd $1,-32(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,112-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -28(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -28(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm4,%xmm4
- vpinsrd $1,-28(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,128-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -24(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -24(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpinsrd $1,-24(%r10),%xmm0,%xmm0
- vpinsrd $1,-24(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,144-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -20(%r8),%xmm1
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -20(%r9),%xmm9
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpinsrd $1,-20(%r10),%xmm1,%xmm1
- vpinsrd $1,-20(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,160-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpunpckldq %xmm9,%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -16(%r8),%xmm2
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -16(%r9),%xmm9
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpshufb %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpinsrd $1,-16(%r10),%xmm2,%xmm2
- vpinsrd $1,-16(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,176-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpunpckldq %xmm9,%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -12(%r8),%xmm3
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -12(%r9),%xmm9
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpshufb %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpinsrd $1,-12(%r10),%xmm3,%xmm3
- vpinsrd $1,-12(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,192-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpunpckldq %xmm9,%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -8(%r8),%xmm4
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -8(%r9),%xmm9
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpshufb %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm4,%xmm4
- vpinsrd $1,-8(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,208-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpunpckldq %xmm9,%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vmovd -4(%r8),%xmm0
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vmovd -4(%r9),%xmm9
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpshufb %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vmovdqa 0-128(%rax),%xmm1
- vpinsrd $1,-4(%r10),%xmm0,%xmm0
- vpinsrd $1,-4(%r11),%xmm9,%xmm9
- vpaddd %xmm15,%xmm10,%xmm10
- prefetcht0 63(%r8)
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,224-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpunpckldq %xmm9,%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- prefetcht0 63(%r9)
- vpxor %xmm7,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- prefetcht0 63(%r10)
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- prefetcht0 63(%r11)
- vpshufb %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 16-128(%rax),%xmm2
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 32-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpandn %xmm13,%xmm11,%xmm7
-
- vpand %xmm12,%xmm11,%xmm6
-
- vmovdqa %xmm0,240-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 128-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
-
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 48-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpandn %xmm12,%xmm10,%xmm7
-
- vpand %xmm11,%xmm10,%xmm6
-
- vmovdqa %xmm1,0-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 144-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
-
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 64-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpandn %xmm11,%xmm14,%xmm7
-
- vpand %xmm10,%xmm14,%xmm6
-
- vmovdqa %xmm2,16-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 160-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
-
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 80-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpandn %xmm10,%xmm13,%xmm7
-
- vpand %xmm14,%xmm13,%xmm6
-
- vmovdqa %xmm3,32-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 176-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
-
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 96-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpandn %xmm14,%xmm12,%xmm7
-
- vpand %xmm13,%xmm12,%xmm6
-
- vmovdqa %xmm4,48-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 192-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm7,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
-
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 0(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 112-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,64-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 208-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 128-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,80-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 224-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 144-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,96-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 240-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 160-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,112-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 0-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 176-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,128-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 16-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 192-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,144-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 32-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 208-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,160-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 48-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 224-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,176-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 64-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 240-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,192-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 80-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 0-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,208-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 96-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 16-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,224-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 112-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 32-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,240-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 128-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 48-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,0-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 144-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 64-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,16-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 160-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 80-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,32-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 176-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 96-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,48-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 192-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 112-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,64-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 208-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 128-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,80-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 224-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 144-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,96-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 240-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 160-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,112-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 0-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 32(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 176-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 16-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,128-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 192-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 32-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,144-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 208-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 48-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,160-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 224-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 64-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,176-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 240-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 80-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,192-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 0-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 96-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,208-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 16-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 112-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,224-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 32-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 128-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,240-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 48-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 144-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,0-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 64-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 160-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,16-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 80-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 176-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,32-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 96-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 192-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,48-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 112-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 208-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,64-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 128-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 224-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,80-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 144-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 240-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,96-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 160-128(%rax),%xmm3
-
- vpaddd %xmm15,%xmm14,%xmm14
- vpslld $5,%xmm10,%xmm8
- vpand %xmm12,%xmm13,%xmm7
- vpxor 0-128(%rax),%xmm1,%xmm1
-
- vpaddd %xmm7,%xmm14,%xmm14
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm13,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vmovdqu %xmm0,112-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm1,%xmm5
- vpand %xmm11,%xmm6,%xmm6
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 176-128(%rax),%xmm4
-
- vpaddd %xmm15,%xmm13,%xmm13
- vpslld $5,%xmm14,%xmm8
- vpand %xmm11,%xmm12,%xmm7
- vpxor 16-128(%rax),%xmm2,%xmm2
-
- vpaddd %xmm7,%xmm13,%xmm13
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm12,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vmovdqu %xmm1,128-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm2,%xmm5
- vpand %xmm10,%xmm6,%xmm6
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpaddd %xmm6,%xmm13,%xmm13
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 192-128(%rax),%xmm0
-
- vpaddd %xmm15,%xmm12,%xmm12
- vpslld $5,%xmm13,%xmm8
- vpand %xmm10,%xmm11,%xmm7
- vpxor 32-128(%rax),%xmm3,%xmm3
-
- vpaddd %xmm7,%xmm12,%xmm12
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm11,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vmovdqu %xmm2,144-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm3,%xmm5
- vpand %xmm14,%xmm6,%xmm6
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 208-128(%rax),%xmm1
-
- vpaddd %xmm15,%xmm11,%xmm11
- vpslld $5,%xmm12,%xmm8
- vpand %xmm14,%xmm10,%xmm7
- vpxor 48-128(%rax),%xmm4,%xmm4
-
- vpaddd %xmm7,%xmm11,%xmm11
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm10,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vmovdqu %xmm3,160-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm4,%xmm5
- vpand %xmm13,%xmm6,%xmm6
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpaddd %xmm6,%xmm11,%xmm11
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 224-128(%rax),%xmm2
-
- vpaddd %xmm15,%xmm10,%xmm10
- vpslld $5,%xmm11,%xmm8
- vpand %xmm13,%xmm14,%xmm7
- vpxor 64-128(%rax),%xmm0,%xmm0
-
- vpaddd %xmm7,%xmm10,%xmm10
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm14,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vmovdqu %xmm4,176-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpor %xmm9,%xmm8,%xmm8
- vpsrld $31,%xmm0,%xmm5
- vpand %xmm12,%xmm6,%xmm6
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vmovdqa 64(%rbp),%xmm15
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 240-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,192-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 80-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 0-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,208-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 96-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 16-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,224-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 112-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 32-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,240-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 128-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 48-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,0-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 144-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 64-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,16-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 160-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 80-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,32-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 176-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 96-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vmovdqa %xmm2,48-128(%rax)
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 192-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 112-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vmovdqa %xmm3,64-128(%rax)
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 208-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 128-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vmovdqa %xmm4,80-128(%rax)
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 224-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 144-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vmovdqa %xmm0,96-128(%rax)
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 240-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 160-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vmovdqa %xmm1,112-128(%rax)
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 0-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 176-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 16-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 192-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 32-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpxor %xmm2,%xmm0,%xmm0
- vmovdqa 208-128(%rax),%xmm2
-
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor 48-128(%rax),%xmm0,%xmm0
- vpsrld $27,%xmm11,%xmm9
- vpxor %xmm13,%xmm6,%xmm6
- vpxor %xmm2,%xmm0,%xmm0
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
- vpsrld $31,%xmm0,%xmm5
- vpaddd %xmm0,%xmm0,%xmm0
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm5,%xmm0,%xmm0
- vpor %xmm7,%xmm12,%xmm12
- vpxor %xmm3,%xmm1,%xmm1
- vmovdqa 224-128(%rax),%xmm3
-
- vpslld $5,%xmm10,%xmm8
- vpaddd %xmm15,%xmm14,%xmm14
- vpxor %xmm11,%xmm13,%xmm6
- vpaddd %xmm0,%xmm14,%xmm14
- vpxor 64-128(%rax),%xmm1,%xmm1
- vpsrld $27,%xmm10,%xmm9
- vpxor %xmm12,%xmm6,%xmm6
- vpxor %xmm3,%xmm1,%xmm1
-
- vpslld $30,%xmm11,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm14,%xmm14
- vpsrld $31,%xmm1,%xmm5
- vpaddd %xmm1,%xmm1,%xmm1
-
- vpsrld $2,%xmm11,%xmm11
- vpaddd %xmm8,%xmm14,%xmm14
- vpor %xmm5,%xmm1,%xmm1
- vpor %xmm7,%xmm11,%xmm11
- vpxor %xmm4,%xmm2,%xmm2
- vmovdqa 240-128(%rax),%xmm4
-
- vpslld $5,%xmm14,%xmm8
- vpaddd %xmm15,%xmm13,%xmm13
- vpxor %xmm10,%xmm12,%xmm6
- vpaddd %xmm1,%xmm13,%xmm13
- vpxor 80-128(%rax),%xmm2,%xmm2
- vpsrld $27,%xmm14,%xmm9
- vpxor %xmm11,%xmm6,%xmm6
- vpxor %xmm4,%xmm2,%xmm2
-
- vpslld $30,%xmm10,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm13,%xmm13
- vpsrld $31,%xmm2,%xmm5
- vpaddd %xmm2,%xmm2,%xmm2
-
- vpsrld $2,%xmm10,%xmm10
- vpaddd %xmm8,%xmm13,%xmm13
- vpor %xmm5,%xmm2,%xmm2
- vpor %xmm7,%xmm10,%xmm10
- vpxor %xmm0,%xmm3,%xmm3
- vmovdqa 0-128(%rax),%xmm0
-
- vpslld $5,%xmm13,%xmm8
- vpaddd %xmm15,%xmm12,%xmm12
- vpxor %xmm14,%xmm11,%xmm6
- vpaddd %xmm2,%xmm12,%xmm12
- vpxor 96-128(%rax),%xmm3,%xmm3
- vpsrld $27,%xmm13,%xmm9
- vpxor %xmm10,%xmm6,%xmm6
- vpxor %xmm0,%xmm3,%xmm3
-
- vpslld $30,%xmm14,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
- vpsrld $31,%xmm3,%xmm5
- vpaddd %xmm3,%xmm3,%xmm3
-
- vpsrld $2,%xmm14,%xmm14
- vpaddd %xmm8,%xmm12,%xmm12
- vpor %xmm5,%xmm3,%xmm3
- vpor %xmm7,%xmm14,%xmm14
- vpxor %xmm1,%xmm4,%xmm4
- vmovdqa 16-128(%rax),%xmm1
-
- vpslld $5,%xmm12,%xmm8
- vpaddd %xmm15,%xmm11,%xmm11
- vpxor %xmm13,%xmm10,%xmm6
- vpaddd %xmm3,%xmm11,%xmm11
- vpxor 112-128(%rax),%xmm4,%xmm4
- vpsrld $27,%xmm12,%xmm9
- vpxor %xmm14,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm4
-
- vpslld $30,%xmm13,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm11,%xmm11
- vpsrld $31,%xmm4,%xmm5
- vpaddd %xmm4,%xmm4,%xmm4
-
- vpsrld $2,%xmm13,%xmm13
- vpaddd %xmm8,%xmm11,%xmm11
- vpor %xmm5,%xmm4,%xmm4
- vpor %xmm7,%xmm13,%xmm13
- vpslld $5,%xmm11,%xmm8
- vpaddd %xmm15,%xmm10,%xmm10
- vpxor %xmm12,%xmm14,%xmm6
-
- vpsrld $27,%xmm11,%xmm9
- vpaddd %xmm4,%xmm10,%xmm10
- vpxor %xmm13,%xmm6,%xmm6
-
- vpslld $30,%xmm12,%xmm7
- vpor %xmm9,%xmm8,%xmm8
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpsrld $2,%xmm12,%xmm12
- vpaddd %xmm8,%xmm10,%xmm10
- vpor %xmm7,%xmm12,%xmm12
- movl $1,%ecx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%xmm6
- vpxor %xmm8,%xmm8,%xmm8
- vmovdqa %xmm6,%xmm7
- vpcmpgtd %xmm8,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpand %xmm7,%xmm10,%xmm10
- vpand %xmm7,%xmm11,%xmm11
- vpaddd 0(%rdi),%xmm10,%xmm10
- vpand %xmm7,%xmm12,%xmm12
- vpaddd 32(%rdi),%xmm11,%xmm11
- vpand %xmm7,%xmm13,%xmm13
- vpaddd 64(%rdi),%xmm12,%xmm12
- vpand %xmm7,%xmm14,%xmm14
- vpaddd 96(%rdi),%xmm13,%xmm13
- vpaddd 128(%rdi),%xmm14,%xmm14
- vmovdqu %xmm10,0(%rdi)
- vmovdqu %xmm11,32(%rdi)
- vmovdqu %xmm12,64(%rdi)
- vmovdqu %xmm13,96(%rdi)
- vmovdqu %xmm14,128(%rdi)
-
- vmovdqu %xmm6,(%rbx)
- vmovdqu 96(%rbp),%xmm5
- decl %edx
- jnz .Loop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz .Loop_grande_avx
-
-.Ldone_avx:
- movq 272(%rsp),%rax
-.cfi_def_cfa %rax,8
- vzeroupper
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha1_multi_block_avx,.-sha1_multi_block_avx
-.type sha1_multi_block_avx2,@function
-.align 32
-sha1_multi_block_avx2:
-.cfi_startproc
-_avx2_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
-.Lbody_avx2:
- leaq K_XX_XX(%rip),%rbp
- shrl $1,%edx
-
- vzeroupper
-.Loop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
-
- movq 0(%rsi),%r12
-
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
-
- movq 16(%rsi),%r13
-
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
-
- movq 32(%rsi),%r14
-
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
-
- movq 48(%rsi),%r15
-
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
-
- movq 64(%rsi),%r8
-
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
-
- movq 80(%rsi),%r9
-
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
-
- movq 96(%rsi),%r10
-
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
-
- movq 112(%rsi),%r11
-
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0(%rdi),%ymm0
- leaq 128(%rsp),%rax
- vmovdqu 32(%rdi),%ymm1
- leaq 256+128(%rsp),%rbx
- vmovdqu 64(%rdi),%ymm2
- vmovdqu 96(%rdi),%ymm3
- vmovdqu 128(%rdi),%ymm4
- vmovdqu 96(%rbp),%ymm9
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
- vmovdqa -32(%rbp),%ymm15
- vmovd (%r12),%xmm10
- leaq 64(%r12),%r12
- vmovd (%r8),%xmm12
- leaq 64(%r8),%r8
- vmovd (%r13),%xmm7
- leaq 64(%r13),%r13
- vmovd (%r9),%xmm6
- leaq 64(%r9),%r9
- vpinsrd $1,(%r14),%xmm10,%xmm10
- leaq 64(%r14),%r14
- vpinsrd $1,(%r10),%xmm12,%xmm12
- leaq 64(%r10),%r10
- vpinsrd $1,(%r15),%xmm7,%xmm7
- leaq 64(%r15),%r15
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,(%r11),%xmm6,%xmm6
- leaq 64(%r11),%r11
- vpunpckldq %ymm6,%ymm12,%ymm12
- vmovd -60(%r12),%xmm11
- vinserti128 $1,%xmm12,%ymm10,%ymm10
- vmovd -60(%r8),%xmm8
- vpshufb %ymm9,%ymm10,%ymm10
- vmovd -60(%r13),%xmm7
- vmovd -60(%r9),%xmm6
- vpinsrd $1,-60(%r14),%xmm11,%xmm11
- vpinsrd $1,-60(%r10),%xmm8,%xmm8
- vpinsrd $1,-60(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-60(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,0-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -56(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -56(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -56(%r13),%xmm7
- vmovd -56(%r9),%xmm6
- vpinsrd $1,-56(%r14),%xmm12,%xmm12
- vpinsrd $1,-56(%r10),%xmm8,%xmm8
- vpinsrd $1,-56(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-56(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,32-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -52(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -52(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -52(%r13),%xmm7
- vmovd -52(%r9),%xmm6
- vpinsrd $1,-52(%r14),%xmm13,%xmm13
- vpinsrd $1,-52(%r10),%xmm8,%xmm8
- vpinsrd $1,-52(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-52(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,64-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -48(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -48(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -48(%r13),%xmm7
- vmovd -48(%r9),%xmm6
- vpinsrd $1,-48(%r14),%xmm14,%xmm14
- vpinsrd $1,-48(%r10),%xmm8,%xmm8
- vpinsrd $1,-48(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-48(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,96-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -44(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -44(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -44(%r13),%xmm7
- vmovd -44(%r9),%xmm6
- vpinsrd $1,-44(%r14),%xmm10,%xmm10
- vpinsrd $1,-44(%r10),%xmm8,%xmm8
- vpinsrd $1,-44(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-44(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,128-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -40(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -40(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -40(%r13),%xmm7
- vmovd -40(%r9),%xmm6
- vpinsrd $1,-40(%r14),%xmm11,%xmm11
- vpinsrd $1,-40(%r10),%xmm8,%xmm8
- vpinsrd $1,-40(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-40(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,160-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -36(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -36(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -36(%r13),%xmm7
- vmovd -36(%r9),%xmm6
- vpinsrd $1,-36(%r14),%xmm12,%xmm12
- vpinsrd $1,-36(%r10),%xmm8,%xmm8
- vpinsrd $1,-36(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-36(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,192-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -32(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -32(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -32(%r13),%xmm7
- vmovd -32(%r9),%xmm6
- vpinsrd $1,-32(%r14),%xmm13,%xmm13
- vpinsrd $1,-32(%r10),%xmm8,%xmm8
- vpinsrd $1,-32(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-32(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,224-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -28(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -28(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -28(%r13),%xmm7
- vmovd -28(%r9),%xmm6
- vpinsrd $1,-28(%r14),%xmm14,%xmm14
- vpinsrd $1,-28(%r10),%xmm8,%xmm8
- vpinsrd $1,-28(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-28(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,256-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -24(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -24(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovd -24(%r13),%xmm7
- vmovd -24(%r9),%xmm6
- vpinsrd $1,-24(%r14),%xmm10,%xmm10
- vpinsrd $1,-24(%r10),%xmm8,%xmm8
- vpinsrd $1,-24(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-24(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,288-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -20(%r12),%xmm11
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -20(%r8),%xmm8
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovd -20(%r13),%xmm7
- vmovd -20(%r9),%xmm6
- vpinsrd $1,-20(%r14),%xmm11,%xmm11
- vpinsrd $1,-20(%r10),%xmm8,%xmm8
- vpinsrd $1,-20(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm11,%ymm11
- vpinsrd $1,-20(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,320-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vinserti128 $1,%xmm8,%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -16(%r12),%xmm12
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -16(%r8),%xmm8
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpshufb %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vmovd -16(%r13),%xmm7
- vmovd -16(%r9),%xmm6
- vpinsrd $1,-16(%r14),%xmm12,%xmm12
- vpinsrd $1,-16(%r10),%xmm8,%xmm8
- vpinsrd $1,-16(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm12,%ymm12
- vpinsrd $1,-16(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,352-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vinserti128 $1,%xmm8,%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -12(%r12),%xmm13
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -12(%r8),%xmm8
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpshufb %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vmovd -12(%r13),%xmm7
- vmovd -12(%r9),%xmm6
- vpinsrd $1,-12(%r14),%xmm13,%xmm13
- vpinsrd $1,-12(%r10),%xmm8,%xmm8
- vpinsrd $1,-12(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm13,%ymm13
- vpinsrd $1,-12(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,384-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vinserti128 $1,%xmm8,%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -8(%r12),%xmm14
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -8(%r8),%xmm8
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpshufb %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vmovd -8(%r13),%xmm7
- vmovd -8(%r9),%xmm6
- vpinsrd $1,-8(%r14),%xmm14,%xmm14
- vpinsrd $1,-8(%r10),%xmm8,%xmm8
- vpinsrd $1,-8(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm14,%ymm14
- vpinsrd $1,-8(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,416-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vinserti128 $1,%xmm8,%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vmovd -4(%r12),%xmm10
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vmovd -4(%r8),%xmm8
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpshufb %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vmovdqa 0-128(%rax),%ymm11
- vmovd -4(%r13),%xmm7
- vmovd -4(%r9),%xmm6
- vpinsrd $1,-4(%r14),%xmm10,%xmm10
- vpinsrd $1,-4(%r10),%xmm8,%xmm8
- vpinsrd $1,-4(%r15),%xmm7,%xmm7
- vpunpckldq %ymm7,%ymm10,%ymm10
- vpinsrd $1,-4(%r11),%xmm6,%xmm6
- vpunpckldq %ymm6,%ymm8,%ymm8
- vpaddd %ymm15,%ymm0,%ymm0
- prefetcht0 63(%r12)
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,448-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vinserti128 $1,%xmm8,%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- prefetcht0 63(%r13)
- vpxor %ymm6,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- prefetcht0 63(%r15)
- vpshufb %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32-128(%rax),%ymm12
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 64-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpandn %ymm3,%ymm1,%ymm6
- prefetcht0 63(%r8)
- vpand %ymm2,%ymm1,%ymm5
-
- vmovdqa %ymm10,480-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 256-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
- prefetcht0 63(%r9)
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- prefetcht0 63(%r10)
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- prefetcht0 63(%r11)
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 96-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpandn %ymm2,%ymm0,%ymm6
-
- vpand %ymm1,%ymm0,%ymm5
-
- vmovdqa %ymm11,0-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 288-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
-
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 128-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpandn %ymm1,%ymm4,%ymm6
-
- vpand %ymm0,%ymm4,%ymm5
-
- vmovdqa %ymm12,32-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 320-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
-
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 160-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpandn %ymm0,%ymm3,%ymm6
-
- vpand %ymm4,%ymm3,%ymm5
-
- vmovdqa %ymm13,64-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 352-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
-
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 192-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpandn %ymm4,%ymm2,%ymm6
-
- vpand %ymm3,%ymm2,%ymm5
-
- vmovdqa %ymm14,96-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 384-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm6,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
-
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 0(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 224-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,128-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 416-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 256-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,160-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 448-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 288-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,192-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 480-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 320-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,224-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 0-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 352-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,256-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 32-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 384-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,288-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 64-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 416-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,320-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 96-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 448-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,352-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 128-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 480-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,384-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 160-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 0-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,416-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 192-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 32-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,448-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 224-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 64-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,480-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 256-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 96-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,0-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 288-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 128-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,32-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 320-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 160-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,64-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 352-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 192-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,96-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 384-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 224-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,128-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 416-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 256-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,160-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 448-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 288-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,192-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 480-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 320-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,224-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 0-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 32(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 352-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 32-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,256-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 384-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 64-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,288-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 416-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 96-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,320-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 448-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 128-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,352-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 480-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 160-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,384-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 0-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 192-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,416-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 32-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 224-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,448-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 64-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 256-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,480-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 96-128(%rax),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 288-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,0-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 128-128(%rax),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 320-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,32-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 160-128(%rax),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 352-256-128(%rbx),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,64-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 192-128(%rax),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 384-256-128(%rbx),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,96-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 224-128(%rax),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 416-256-128(%rbx),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,128-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 256-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 448-256-128(%rbx),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,160-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 288-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 480-256-128(%rbx),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,192-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 320-256-128(%rbx),%ymm13
-
- vpaddd %ymm15,%ymm4,%ymm4
- vpslld $5,%ymm0,%ymm7
- vpand %ymm2,%ymm3,%ymm6
- vpxor 0-128(%rax),%ymm11,%ymm11
-
- vpaddd %ymm6,%ymm4,%ymm4
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm3,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vmovdqu %ymm10,224-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm11,%ymm9
- vpand %ymm1,%ymm5,%ymm5
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpaddd %ymm5,%ymm4,%ymm4
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 352-256-128(%rbx),%ymm14
-
- vpaddd %ymm15,%ymm3,%ymm3
- vpslld $5,%ymm4,%ymm7
- vpand %ymm1,%ymm2,%ymm6
- vpxor 32-128(%rax),%ymm12,%ymm12
-
- vpaddd %ymm6,%ymm3,%ymm3
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm2,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vmovdqu %ymm11,256-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm12,%ymm9
- vpand %ymm0,%ymm5,%ymm5
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpaddd %ymm5,%ymm3,%ymm3
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 384-256-128(%rbx),%ymm10
-
- vpaddd %ymm15,%ymm2,%ymm2
- vpslld $5,%ymm3,%ymm7
- vpand %ymm0,%ymm1,%ymm6
- vpxor 64-128(%rax),%ymm13,%ymm13
-
- vpaddd %ymm6,%ymm2,%ymm2
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm1,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vmovdqu %ymm12,288-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm13,%ymm9
- vpand %ymm4,%ymm5,%ymm5
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpaddd %ymm5,%ymm2,%ymm2
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 416-256-128(%rbx),%ymm11
-
- vpaddd %ymm15,%ymm1,%ymm1
- vpslld $5,%ymm2,%ymm7
- vpand %ymm4,%ymm0,%ymm6
- vpxor 96-128(%rax),%ymm14,%ymm14
-
- vpaddd %ymm6,%ymm1,%ymm1
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm0,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vmovdqu %ymm13,320-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm14,%ymm9
- vpand %ymm3,%ymm5,%ymm5
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpaddd %ymm5,%ymm1,%ymm1
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 448-256-128(%rbx),%ymm12
-
- vpaddd %ymm15,%ymm0,%ymm0
- vpslld $5,%ymm1,%ymm7
- vpand %ymm3,%ymm4,%ymm6
- vpxor 128-128(%rax),%ymm10,%ymm10
-
- vpaddd %ymm6,%ymm0,%ymm0
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm4,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vmovdqu %ymm14,352-256-128(%rbx)
- vpaddd %ymm14,%ymm0,%ymm0
- vpor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm10,%ymm9
- vpand %ymm2,%ymm5,%ymm5
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vmovdqa 64(%rbp),%ymm15
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 480-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,384-256-128(%rbx)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 160-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 0-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,416-256-128(%rbx)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 192-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 32-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,448-256-128(%rbx)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 224-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 64-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,480-256-128(%rbx)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 256-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 96-128(%rax),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,0-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 288-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 128-128(%rax),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,32-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 320-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 160-128(%rax),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,64-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 352-256-128(%rbx),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 192-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vmovdqa %ymm12,96-128(%rax)
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 384-256-128(%rbx),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 224-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vmovdqa %ymm13,128-128(%rax)
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 416-256-128(%rbx),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 256-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vmovdqa %ymm14,160-128(%rax)
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 448-256-128(%rbx),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 288-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vmovdqa %ymm10,192-128(%rax)
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 480-256-128(%rbx),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 320-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vmovdqa %ymm11,224-128(%rax)
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 0-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 352-256-128(%rbx),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 32-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 384-256-128(%rbx),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 64-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpxor %ymm12,%ymm10,%ymm10
- vmovdqa 416-256-128(%rbx),%ymm12
-
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor 96-128(%rax),%ymm10,%ymm10
- vpsrld $27,%ymm1,%ymm8
- vpxor %ymm3,%ymm5,%ymm5
- vpxor %ymm12,%ymm10,%ymm10
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpsrld $31,%ymm10,%ymm9
- vpaddd %ymm10,%ymm10,%ymm10
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm9,%ymm10,%ymm10
- vpor %ymm6,%ymm2,%ymm2
- vpxor %ymm13,%ymm11,%ymm11
- vmovdqa 448-256-128(%rbx),%ymm13
-
- vpslld $5,%ymm0,%ymm7
- vpaddd %ymm15,%ymm4,%ymm4
- vpxor %ymm1,%ymm3,%ymm5
- vpaddd %ymm10,%ymm4,%ymm4
- vpxor 128-128(%rax),%ymm11,%ymm11
- vpsrld $27,%ymm0,%ymm8
- vpxor %ymm2,%ymm5,%ymm5
- vpxor %ymm13,%ymm11,%ymm11
-
- vpslld $30,%ymm1,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm4,%ymm4
- vpsrld $31,%ymm11,%ymm9
- vpaddd %ymm11,%ymm11,%ymm11
-
- vpsrld $2,%ymm1,%ymm1
- vpaddd %ymm7,%ymm4,%ymm4
- vpor %ymm9,%ymm11,%ymm11
- vpor %ymm6,%ymm1,%ymm1
- vpxor %ymm14,%ymm12,%ymm12
- vmovdqa 480-256-128(%rbx),%ymm14
-
- vpslld $5,%ymm4,%ymm7
- vpaddd %ymm15,%ymm3,%ymm3
- vpxor %ymm0,%ymm2,%ymm5
- vpaddd %ymm11,%ymm3,%ymm3
- vpxor 160-128(%rax),%ymm12,%ymm12
- vpsrld $27,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm14,%ymm12,%ymm12
-
- vpslld $30,%ymm0,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm3,%ymm3
- vpsrld $31,%ymm12,%ymm9
- vpaddd %ymm12,%ymm12,%ymm12
-
- vpsrld $2,%ymm0,%ymm0
- vpaddd %ymm7,%ymm3,%ymm3
- vpor %ymm9,%ymm12,%ymm12
- vpor %ymm6,%ymm0,%ymm0
- vpxor %ymm10,%ymm13,%ymm13
- vmovdqa 0-128(%rax),%ymm10
-
- vpslld $5,%ymm3,%ymm7
- vpaddd %ymm15,%ymm2,%ymm2
- vpxor %ymm4,%ymm1,%ymm5
- vpaddd %ymm12,%ymm2,%ymm2
- vpxor 192-128(%rax),%ymm13,%ymm13
- vpsrld $27,%ymm3,%ymm8
- vpxor %ymm0,%ymm5,%ymm5
- vpxor %ymm10,%ymm13,%ymm13
-
- vpslld $30,%ymm4,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm2,%ymm2
- vpsrld $31,%ymm13,%ymm9
- vpaddd %ymm13,%ymm13,%ymm13
-
- vpsrld $2,%ymm4,%ymm4
- vpaddd %ymm7,%ymm2,%ymm2
- vpor %ymm9,%ymm13,%ymm13
- vpor %ymm6,%ymm4,%ymm4
- vpxor %ymm11,%ymm14,%ymm14
- vmovdqa 32-128(%rax),%ymm11
-
- vpslld $5,%ymm2,%ymm7
- vpaddd %ymm15,%ymm1,%ymm1
- vpxor %ymm3,%ymm0,%ymm5
- vpaddd %ymm13,%ymm1,%ymm1
- vpxor 224-128(%rax),%ymm14,%ymm14
- vpsrld $27,%ymm2,%ymm8
- vpxor %ymm4,%ymm5,%ymm5
- vpxor %ymm11,%ymm14,%ymm14
-
- vpslld $30,%ymm3,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm1,%ymm1
- vpsrld $31,%ymm14,%ymm9
- vpaddd %ymm14,%ymm14,%ymm14
-
- vpsrld $2,%ymm3,%ymm3
- vpaddd %ymm7,%ymm1,%ymm1
- vpor %ymm9,%ymm14,%ymm14
- vpor %ymm6,%ymm3,%ymm3
- vpslld $5,%ymm1,%ymm7
- vpaddd %ymm15,%ymm0,%ymm0
- vpxor %ymm2,%ymm4,%ymm5
-
- vpsrld $27,%ymm1,%ymm8
- vpaddd %ymm14,%ymm0,%ymm0
- vpxor %ymm3,%ymm5,%ymm5
-
- vpslld $30,%ymm2,%ymm6
- vpor %ymm8,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
-
- vpsrld $2,%ymm2,%ymm2
- vpaddd %ymm7,%ymm0,%ymm0
- vpor %ymm6,%ymm2,%ymm2
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqu (%rbx),%ymm5
- vpxor %ymm7,%ymm7,%ymm7
- vmovdqa %ymm5,%ymm6
- vpcmpgtd %ymm7,%ymm6,%ymm6
- vpaddd %ymm6,%ymm5,%ymm5
-
- vpand %ymm6,%ymm0,%ymm0
- vpand %ymm6,%ymm1,%ymm1
- vpaddd 0(%rdi),%ymm0,%ymm0
- vpand %ymm6,%ymm2,%ymm2
- vpaddd 32(%rdi),%ymm1,%ymm1
- vpand %ymm6,%ymm3,%ymm3
- vpaddd 64(%rdi),%ymm2,%ymm2
- vpand %ymm6,%ymm4,%ymm4
- vpaddd 96(%rdi),%ymm3,%ymm3
- vpaddd 128(%rdi),%ymm4,%ymm4
- vmovdqu %ymm0,0(%rdi)
- vmovdqu %ymm1,32(%rdi)
- vmovdqu %ymm2,64(%rdi)
- vmovdqu %ymm3,96(%rdi)
- vmovdqu %ymm4,128(%rdi)
-
- vmovdqu %ymm5,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu 96(%rbp),%ymm9
- decl %edx
- jnz .Loop_avx2
-
-
-
-
-
-
-
-.Ldone_avx2:
- movq 544(%rsp),%rax
-.cfi_def_cfa %rax,8
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
.align 256
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -7301,7 +2962,7 @@ K_XX_XX:
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
index 3a03212f8b6..e730222f30b 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha1-x86_64.s
@@ -13,14 +13,6 @@ sha1_block_data_order:
jz .Lialu
testl $536870912,%r10d
jnz _shaext_shortcut
- andl $296,%r10d
- cmpl $296,%r10d
- je _avx2_shortcut
- andl $268435456,%r8d
- andl $1073741824,%r9d
- orl %r9d,%r8d
- cmpl $1342177280,%r8d
- je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -2612,2827 +2604,6 @@ _ssse3_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-.type sha1_block_data_order_avx,@function
-.align 16
-sha1_block_data_order_avx:
-_avx_shortcut:
-.cfi_startproc
- movq %rsp,%r11
-.cfi_def_cfa_register %r11
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- leaq -64(%rsp),%rsp
- vzeroupper
- andq $-64,%rsp
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- shlq $6,%r10
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- movl 4(%r8),%ebx
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl %ebx,%esi
- movl 16(%r8),%ebp
- movl %ecx,%edi
- xorl %edx,%edi
- andl %edi,%esi
-
- vmovdqa 64(%r14),%xmm6
- vmovdqa -64(%r14),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- vpshufb %xmm6,%xmm1,%xmm1
- vpshufb %xmm6,%xmm2,%xmm2
- vpshufb %xmm6,%xmm3,%xmm3
- vpaddd %xmm11,%xmm0,%xmm4
- vpaddd %xmm11,%xmm1,%xmm5
- vpaddd %xmm11,%xmm2,%xmm6
- vmovdqa %xmm4,0(%rsp)
- vmovdqa %xmm5,16(%rsp)
- vmovdqa %xmm6,32(%rsp)
- jmp .Loop_avx
-.align 16
-.Loop_avx:
- shrdl $2,%ebx,%ebx
- xorl %edx,%esi
- vpalignr $8,%xmm0,%xmm1,%xmm4
- movl %eax,%edi
- addl 0(%rsp),%ebp
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrldq $4,%xmm3,%xmm8
- addl %esi,%ebp
- andl %ebx,%edi
- vpxor %xmm0,%xmm4,%xmm4
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpxor %xmm2,%xmm8,%xmm8
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 4(%rsp),%edx
- vpxor %xmm8,%xmm4,%xmm4
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%edx
- andl %eax,%esi
- vpsrld $31,%xmm4,%xmm8
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpslldq $12,%xmm4,%xmm10
- vpaddd %xmm4,%xmm4,%xmm4
- movl %edx,%edi
- addl 8(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm4,%xmm4
- addl %esi,%ecx
- andl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm4,%xmm4
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 12(%rsp),%ebx
- vpxor %xmm10,%xmm4,%xmm4
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- andl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpalignr $8,%xmm1,%xmm2,%xmm5
- movl %ebx,%edi
- addl 16(%rsp),%eax
- vpaddd %xmm4,%xmm11,%xmm9
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrldq $4,%xmm4,%xmm8
- addl %esi,%eax
- andl %ecx,%edi
- vpxor %xmm1,%xmm5,%xmm5
- xorl %edx,%ecx
- addl %ebx,%eax
- vpxor %xmm3,%xmm8,%xmm8
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 20(%rsp),%ebp
- vpxor %xmm8,%xmm5,%xmm5
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ebp
- andl %ebx,%esi
- vpsrld $31,%xmm5,%xmm8
- xorl %ecx,%ebx
- addl %eax,%ebp
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- vpslldq $12,%xmm5,%xmm10
- vpaddd %xmm5,%xmm5,%xmm5
- movl %ebp,%edi
- addl 24(%rsp),%edx
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm5,%xmm5
- addl %esi,%edx
- andl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm5,%xmm5
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- movl %edx,%esi
- addl 28(%rsp),%ecx
- vpxor %xmm10,%xmm5,%xmm5
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vmovdqa -32(%r14),%xmm11
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- vpalignr $8,%xmm2,%xmm3,%xmm6
- movl %ecx,%edi
- addl 32(%rsp),%ebx
- vpaddd %xmm5,%xmm11,%xmm9
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vpsrldq $4,%xmm5,%xmm8
- addl %esi,%ebx
- andl %edx,%edi
- vpxor %xmm2,%xmm6,%xmm6
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpxor %xmm4,%xmm8,%xmm8
- shrdl $7,%ecx,%ecx
- xorl %ebp,%edi
- movl %ebx,%esi
- addl 36(%rsp),%eax
- vpxor %xmm8,%xmm6,%xmm6
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%eax
- andl %ecx,%esi
- vpsrld $31,%xmm6,%xmm8
- xorl %edx,%ecx
- addl %ebx,%eax
- shrdl $7,%ebx,%ebx
- xorl %edx,%esi
- vpslldq $12,%xmm6,%xmm10
- vpaddd %xmm6,%xmm6,%xmm6
- movl %eax,%edi
- addl 40(%rsp),%ebp
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm6,%xmm6
- addl %esi,%ebp
- andl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm6,%xmm6
- shrdl $7,%eax,%eax
- xorl %ecx,%edi
- movl %ebp,%esi
- addl 44(%rsp),%edx
- vpxor %xmm10,%xmm6,%xmm6
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- andl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%esi
- vpalignr $8,%xmm3,%xmm4,%xmm7
- movl %edx,%edi
- addl 48(%rsp),%ecx
- vpaddd %xmm6,%xmm11,%xmm9
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpsrldq $4,%xmm6,%xmm8
- addl %esi,%ecx
- andl %ebp,%edi
- vpxor %xmm3,%xmm7,%xmm7
- xorl %eax,%ebp
- addl %edx,%ecx
- vpxor %xmm5,%xmm8,%xmm8
- shrdl $7,%edx,%edx
- xorl %eax,%edi
- movl %ecx,%esi
- addl 52(%rsp),%ebx
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%ebx
- andl %edx,%esi
- vpsrld $31,%xmm7,%xmm8
- xorl %ebp,%edx
- addl %ecx,%ebx
- shrdl $7,%ecx,%ecx
- xorl %ebp,%esi
- vpslldq $12,%xmm7,%xmm10
- vpaddd %xmm7,%xmm7,%xmm7
- movl %ebx,%edi
- addl 56(%rsp),%eax
- xorl %edx,%ecx
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm10,%xmm9
- vpor %xmm8,%xmm7,%xmm7
- addl %esi,%eax
- andl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm10,%xmm10
- vpxor %xmm9,%xmm7,%xmm7
- shrdl $7,%ebx,%ebx
- xorl %edx,%edi
- movl %eax,%esi
- addl 60(%rsp),%ebp
- vpxor %xmm10,%xmm7,%xmm7
- xorl %ecx,%ebx
- shldl $5,%eax,%eax
- addl %edi,%ebp
- andl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- shrdl $7,%eax,%eax
- xorl %ecx,%esi
- movl %ebp,%edi
- addl 0(%rsp),%edx
- vpxor %xmm1,%xmm0,%xmm0
- xorl %ebx,%eax
- shldl $5,%ebp,%ebp
- vpaddd %xmm7,%xmm11,%xmm9
- addl %esi,%edx
- andl %eax,%edi
- vpxor %xmm8,%xmm0,%xmm0
- xorl %ebx,%eax
- addl %ebp,%edx
- shrdl $7,%ebp,%ebp
- xorl %ebx,%edi
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- movl %edx,%esi
- addl 4(%rsp),%ecx
- xorl %eax,%ebp
- shldl $5,%edx,%edx
- vpslld $2,%xmm0,%xmm0
- addl %edi,%ecx
- andl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- shrdl $7,%edx,%edx
- xorl %eax,%esi
- movl %ecx,%edi
- addl 8(%rsp),%ebx
- vpor %xmm8,%xmm0,%xmm0
- xorl %ebp,%edx
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- andl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 12(%rsp),%eax
- xorl %ebp,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm2,%xmm1,%xmm1
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm0,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm1,%xmm1
- addl 20(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm1,%xmm1
- addl 24(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm1,%xmm1
- addl 28(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- vpxor %xmm3,%xmm2,%xmm2
- addl %esi,%eax
- xorl %edx,%edi
- vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r14),%xmm11
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpxor %xmm8,%xmm2,%xmm2
- addl 36(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpslld $2,%xmm2,%xmm2
- addl 40(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpor %xmm8,%xmm2,%xmm2
- addl 44(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebx
- xorl %ebp,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm2,%xmm3,%xmm8
- vpxor %xmm0,%xmm4,%xmm4
- addl 0(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpxor %xmm5,%xmm4,%xmm4
- addl %esi,%ecx
- xorl %eax,%edi
- vpaddd %xmm3,%xmm11,%xmm9
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpxor %xmm8,%xmm4,%xmm4
- addl 4(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- vpsrld $30,%xmm4,%xmm8
- vmovdqa %xmm9,48(%rsp)
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpslld $2,%xmm4,%xmm4
- addl 8(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vpor %xmm8,%xmm4,%xmm4
- addl 12(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm3,%xmm4,%xmm8
- vpxor %xmm1,%xmm5,%xmm5
- addl 16(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpxor %xmm6,%xmm5,%xmm5
- addl %esi,%edx
- xorl %ebx,%edi
- vpaddd %xmm4,%xmm11,%xmm9
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpxor %xmm8,%xmm5,%xmm5
- addl 20(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- vpsrld $30,%xmm5,%xmm8
- vmovdqa %xmm9,0(%rsp)
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpslld $2,%xmm5,%xmm5
- addl 24(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vpor %xmm8,%xmm5,%xmm5
- addl 28(%rsp),%eax
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- vpalignr $8,%xmm4,%xmm5,%xmm8
- vpxor %xmm2,%xmm6,%xmm6
- addl 32(%rsp),%ebp
- andl %ecx,%esi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- movl %eax,%edi
- xorl %ecx,%esi
- vpaddd %xmm5,%xmm11,%xmm9
- shldl $5,%eax,%eax
- addl %esi,%ebp
- vpxor %xmm8,%xmm6,%xmm6
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 36(%rsp),%edx
- vpsrld $30,%xmm6,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- vpslld $2,%xmm6,%xmm6
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 40(%rsp),%ecx
- andl %eax,%esi
- vpor %xmm8,%xmm6,%xmm6
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%edi
- xorl %eax,%esi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 44(%rsp),%ebx
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- vpalignr $8,%xmm5,%xmm6,%xmm8
- vpxor %xmm3,%xmm7,%xmm7
- addl 48(%rsp),%eax
- andl %edx,%esi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- vpxor %xmm0,%xmm7,%xmm7
- movl %ebx,%edi
- xorl %edx,%esi
- vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r14),%xmm11
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vpxor %xmm8,%xmm7,%xmm7
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 52(%rsp),%ebp
- vpsrld $30,%xmm7,%xmm8
- vmovdqa %xmm9,32(%rsp)
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- vpslld $2,%xmm7,%xmm7
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 56(%rsp),%edx
- andl %ebx,%esi
- vpor %xmm8,%xmm7,%xmm7
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%edi
- xorl %ebx,%esi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 60(%rsp),%ecx
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- vpalignr $8,%xmm6,%xmm7,%xmm8
- vpxor %xmm4,%xmm0,%xmm0
- addl 0(%rsp),%ebx
- andl %ebp,%esi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- vpxor %xmm1,%xmm0,%xmm0
- movl %ecx,%edi
- xorl %ebp,%esi
- vpaddd %xmm7,%xmm11,%xmm9
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- vpxor %xmm8,%xmm0,%xmm0
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 4(%rsp),%eax
- vpsrld $30,%xmm0,%xmm8
- vmovdqa %xmm9,48(%rsp)
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- vpslld $2,%xmm0,%xmm0
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %ecx,%esi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 8(%rsp),%ebp
- andl %ecx,%esi
- vpor %xmm8,%xmm0,%xmm0
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%edi
- xorl %ecx,%esi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ebx,%edi
- xorl %ecx,%ebx
- addl %eax,%ebp
- addl 12(%rsp),%edx
- andl %ebx,%edi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- movl %ebp,%esi
- xorl %ebx,%edi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %eax,%esi
- xorl %ebx,%eax
- addl %ebp,%edx
- vpalignr $8,%xmm7,%xmm0,%xmm8
- vpxor %xmm5,%xmm1,%xmm1
- addl 16(%rsp),%ecx
- andl %eax,%esi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- vpxor %xmm2,%xmm1,%xmm1
- movl %edx,%edi
- xorl %eax,%esi
- vpaddd %xmm0,%xmm11,%xmm9
- shldl $5,%edx,%edx
- addl %esi,%ecx
- vpxor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 20(%rsp),%ebx
- vpsrld $30,%xmm1,%xmm8
- vmovdqa %xmm9,0(%rsp)
- andl %ebp,%edi
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%esi
- vpslld $2,%xmm1,%xmm1
- xorl %ebp,%edi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %edx,%esi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 24(%rsp),%eax
- andl %edx,%esi
- vpor %xmm8,%xmm1,%xmm1
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%edi
- xorl %edx,%esi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %ecx,%edi
- xorl %edx,%ecx
- addl %ebx,%eax
- addl 28(%rsp),%ebp
- andl %ecx,%edi
- xorl %edx,%ecx
- shrdl $7,%ebx,%ebx
- movl %eax,%esi
- xorl %ecx,%edi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ebx,%esi
- xorl %ecx,%ebx
- addl %eax,%ebp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- vpxor %xmm6,%xmm2,%xmm2
- addl 32(%rsp),%edx
- andl %ebx,%esi
- xorl %ecx,%ebx
- shrdl $7,%eax,%eax
- vpxor %xmm3,%xmm2,%xmm2
- movl %ebp,%edi
- xorl %ebx,%esi
- vpaddd %xmm1,%xmm11,%xmm9
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- vpxor %xmm8,%xmm2,%xmm2
- xorl %eax,%edi
- xorl %ebx,%eax
- addl %ebp,%edx
- addl 36(%rsp),%ecx
- vpsrld $30,%xmm2,%xmm8
- vmovdqa %xmm9,16(%rsp)
- andl %eax,%edi
- xorl %ebx,%eax
- shrdl $7,%ebp,%ebp
- movl %edx,%esi
- vpslld $2,%xmm2,%xmm2
- xorl %eax,%edi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %ebp,%esi
- xorl %eax,%ebp
- addl %edx,%ecx
- addl 40(%rsp),%ebx
- andl %ebp,%esi
- vpor %xmm8,%xmm2,%xmm2
- xorl %eax,%ebp
- shrdl $7,%edx,%edx
- movl %ecx,%edi
- xorl %ebp,%esi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %edx,%edi
- xorl %ebp,%edx
- addl %ecx,%ebx
- addl 44(%rsp),%eax
- andl %edx,%edi
- xorl %ebp,%edx
- shrdl $7,%ecx,%ecx
- movl %ebx,%esi
- xorl %edx,%edi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- addl %ebx,%eax
- vpalignr $8,%xmm1,%xmm2,%xmm8
- vpxor %xmm7,%xmm3,%xmm3
- addl 48(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- vpxor %xmm4,%xmm3,%xmm3
- addl %esi,%ebp
- xorl %ecx,%edi
- vpaddd %xmm2,%xmm11,%xmm9
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- vpxor %xmm8,%xmm3,%xmm3
- addl 52(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- vpsrld $30,%xmm3,%xmm8
- vmovdqa %xmm9,32(%rsp)
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vpslld $2,%xmm3,%xmm3
- addl 56(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vpor %xmm8,%xmm3,%xmm3
- addl 60(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 0(%rsp),%eax
- vpaddd %xmm3,%xmm11,%xmm9
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- vmovdqa %xmm9,48(%rsp)
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 4(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 8(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 12(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- cmpq %r10,%r9
- je .Ldone_avx
- vmovdqa 64(%r14),%xmm6
- vmovdqa -64(%r14),%xmm11
- vmovdqu 0(%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- vpshufb %xmm6,%xmm0,%xmm0
- addq $64,%r9
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- vpshufb %xmm6,%xmm1,%xmm1
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- vpaddd %xmm11,%xmm0,%xmm4
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- vmovdqa %xmm4,0(%rsp)
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- vpshufb %xmm6,%xmm2,%xmm2
- movl %edx,%edi
- shldl $5,%edx,%edx
- vpaddd %xmm11,%xmm1,%xmm5
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- vmovdqa %xmm5,16(%rsp)
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- vpshufb %xmm6,%xmm3,%xmm3
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- vpaddd %xmm11,%xmm2,%xmm6
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- vmovdqa %xmm6,32(%rsp)
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- addl 12(%r8),%edx
- movl %eax,0(%r8)
- addl 16(%r8),%ebp
- movl %esi,4(%r8)
- movl %esi,%ebx
- movl %ecx,8(%r8)
- movl %ecx,%edi
- movl %edx,12(%r8)
- xorl %edx,%edi
- movl %ebp,16(%r8)
- andl %edi,%esi
- jmp .Loop_avx
-
-.align 16
-.Ldone_avx:
- addl 16(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 20(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- xorl %edx,%esi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 24(%rsp),%ebp
- xorl %ecx,%esi
- movl %eax,%edi
- shldl $5,%eax,%eax
- addl %esi,%ebp
- xorl %ecx,%edi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 28(%rsp),%edx
- xorl %ebx,%edi
- movl %ebp,%esi
- shldl $5,%ebp,%ebp
- addl %edi,%edx
- xorl %ebx,%esi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 32(%rsp),%ecx
- xorl %eax,%esi
- movl %edx,%edi
- shldl $5,%edx,%edx
- addl %esi,%ecx
- xorl %eax,%edi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 36(%rsp),%ebx
- xorl %ebp,%edi
- movl %ecx,%esi
- shldl $5,%ecx,%ecx
- addl %edi,%ebx
- xorl %ebp,%esi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 40(%rsp),%eax
- xorl %edx,%esi
- movl %ebx,%edi
- shldl $5,%ebx,%ebx
- addl %esi,%eax
- xorl %edx,%edi
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- addl 44(%rsp),%ebp
- xorl %ecx,%edi
- movl %eax,%esi
- shldl $5,%eax,%eax
- addl %edi,%ebp
- xorl %ecx,%esi
- shrdl $7,%ebx,%ebx
- addl %eax,%ebp
- addl 48(%rsp),%edx
- xorl %ebx,%esi
- movl %ebp,%edi
- shldl $5,%ebp,%ebp
- addl %esi,%edx
- xorl %ebx,%edi
- shrdl $7,%eax,%eax
- addl %ebp,%edx
- addl 52(%rsp),%ecx
- xorl %eax,%edi
- movl %edx,%esi
- shldl $5,%edx,%edx
- addl %edi,%ecx
- xorl %eax,%esi
- shrdl $7,%ebp,%ebp
- addl %edx,%ecx
- addl 56(%rsp),%ebx
- xorl %ebp,%esi
- movl %ecx,%edi
- shldl $5,%ecx,%ecx
- addl %esi,%ebx
- xorl %ebp,%edi
- shrdl $7,%edx,%edx
- addl %ecx,%ebx
- addl 60(%rsp),%eax
- xorl %edx,%edi
- movl %ebx,%esi
- shldl $5,%ebx,%ebx
- addl %edi,%eax
- shrdl $7,%ecx,%ecx
- addl %ebx,%eax
- vzeroupper
-
- addl 0(%r8),%eax
- addl 4(%r8),%esi
- addl 8(%r8),%ecx
- movl %eax,0(%r8)
- addl 12(%r8),%edx
- movl %esi,4(%r8)
- addl 16(%r8),%ebp
- movl %ecx,8(%r8)
- movl %edx,12(%r8)
- movl %ebp,16(%r8)
- movq -40(%r11),%r14
-.cfi_restore %r14
- movq -32(%r11),%r13
-.cfi_restore %r13
- movq -24(%r11),%r12
-.cfi_restore %r12
- movq -16(%r11),%rbp
-.cfi_restore %rbp
- movq -8(%r11),%rbx
-.cfi_restore %rbx
- leaq (%r11),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
-.type sha1_block_data_order_avx2,@function
-.align 16
-sha1_block_data_order_avx2:
-_avx2_shortcut:
-.cfi_startproc
- movq %rsp,%r11
-.cfi_def_cfa_register %r11
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- vzeroupper
- movq %rdi,%r8
- movq %rsi,%r9
- movq %rdx,%r10
-
- leaq -640(%rsp),%rsp
- shlq $6,%r10
- leaq 64(%r9),%r13
- andq $-128,%rsp
- addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r14
-
- movl 0(%r8),%eax
- cmpq %r10,%r13
- cmovaeq %r9,%r13
- movl 4(%r8),%ebp
- movl 8(%r8),%ecx
- movl 12(%r8),%edx
- movl 16(%r8),%esi
- vmovdqu 64(%r14),%ymm6
-
- vmovdqu (%r9),%xmm0
- vmovdqu 16(%r9),%xmm1
- vmovdqu 32(%r9),%xmm2
- vmovdqu 48(%r9),%xmm3
- leaq 64(%r9),%r9
- vinserti128 $1,(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vpshufb %ymm6,%ymm0,%ymm0
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vpshufb %ymm6,%ymm1,%ymm1
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- vpshufb %ymm6,%ymm2,%ymm2
- vmovdqu -64(%r14),%ymm11
- vpshufb %ymm6,%ymm3,%ymm3
-
- vpaddd %ymm11,%ymm0,%ymm4
- vpaddd %ymm11,%ymm1,%ymm5
- vmovdqu %ymm4,0(%rsp)
- vpaddd %ymm11,%ymm2,%ymm6
- vmovdqu %ymm5,32(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- vmovdqu %ymm6,64(%rsp)
- vmovdqu %ymm7,96(%rsp)
- vpalignr $8,%ymm0,%ymm1,%ymm4
- vpsrldq $4,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpsrld $31,%ymm4,%ymm8
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- vpxor %ymm10,%ymm4,%ymm4
- vpaddd %ymm11,%ymm4,%ymm9
- vmovdqu %ymm9,128(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm5
- vpsrldq $4,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- vpxor %ymm8,%ymm5,%ymm5
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r14),%ymm11
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm5,%ymm5
- vpaddd %ymm11,%ymm5,%ymm9
- vmovdqu %ymm9,160(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm6
- vpsrldq $4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- vpxor %ymm8,%ymm6,%ymm6
- vpsrld $31,%ymm6,%ymm8
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- vpxor %ymm10,%ymm6,%ymm6
- vpaddd %ymm11,%ymm6,%ymm9
- vmovdqu %ymm9,192(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm7
- vpsrldq $4,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpsrld $31,%ymm7,%ymm8
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- vpxor %ymm10,%ymm7,%ymm7
- vpaddd %ymm11,%ymm7,%ymm9
- vmovdqu %ymm9,224(%rsp)
- leaq 128(%rsp),%r13
- jmp .Loop_avx2
-.align 32
-.Loop_avx2:
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- jmp .Lalign32_1
-.align 32
-.Lalign32_1:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- vpxor %ymm1,%ymm0,%ymm0
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpxor %ymm8,%ymm0,%ymm0
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vpor %ymm8,%ymm0,%ymm0
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- vpaddd %ymm11,%ymm0,%ymm9
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- vmovdqu %ymm9,256(%rsp)
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- vpxor %ymm2,%ymm1,%ymm1
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpxor %ymm8,%ymm1,%ymm1
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vpor %ymm8,%ymm1,%ymm1
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- vpaddd %ymm11,%ymm1,%ymm9
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- vmovdqu %ymm9,288(%rsp)
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- vpxor %ymm3,%ymm2,%ymm2
- vmovdqu 0(%r14),%ymm11
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpxor %ymm8,%ymm2,%ymm2
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vpor %ymm8,%ymm2,%ymm2
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- vpaddd %ymm11,%ymm2,%ymm9
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- vmovdqu %ymm9,320(%rsp)
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- vpxor %ymm4,%ymm3,%ymm3
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpxor %ymm8,%ymm3,%ymm3
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- vpor %ymm8,%ymm3,%ymm3
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- vpaddd %ymm11,%ymm3,%ymm9
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- vmovdqu %ymm9,352(%rsp)
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpalignr $8,%ymm2,%ymm3,%ymm8
- vpxor %ymm0,%ymm4,%ymm4
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpxor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- vpsrld $30,%ymm4,%ymm8
- vpslld $2,%ymm4,%ymm4
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpor %ymm8,%ymm4,%ymm4
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpaddd %ymm11,%ymm4,%ymm9
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- vmovdqu %ymm9,384(%rsp)
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpalignr $8,%ymm3,%ymm4,%ymm8
- vpxor %ymm1,%ymm5,%ymm5
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm6,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpxor %ymm8,%ymm5,%ymm5
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- vpsrld $30,%ymm5,%ymm8
- vpslld $2,%ymm5,%ymm5
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vpor %ymm8,%ymm5,%ymm5
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- vmovdqu %ymm9,416(%rsp)
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm4,%ymm5,%ymm8
- vpxor %ymm2,%ymm6,%ymm6
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm7,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- vpxor %ymm8,%ymm6,%ymm6
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- vpsrld $30,%ymm6,%ymm8
- vpslld $2,%ymm6,%ymm6
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vpor %ymm8,%ymm6,%ymm6
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- vmovdqu %ymm9,448(%rsp)
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm5,%ymm6,%ymm8
- vpxor %ymm3,%ymm7,%ymm7
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm0,%ymm7,%ymm7
- vmovdqu 32(%r14),%ymm11
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpxor %ymm8,%ymm7,%ymm7
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- vpsrld $30,%ymm7,%ymm8
- vpslld $2,%ymm7,%ymm7
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpor %ymm8,%ymm7,%ymm7
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- vmovdqu %ymm9,480(%rsp)
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- jmp .Lalign32_2
-.align 32
-.Lalign32_2:
- vpalignr $8,%ymm6,%ymm7,%ymm8
- vpxor %ymm4,%ymm0,%ymm0
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- vpxor %ymm1,%ymm0,%ymm0
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- vpxor %ymm8,%ymm0,%ymm0
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpsrld $30,%ymm0,%ymm8
- vpslld $2,%ymm0,%ymm0
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- vpor %ymm8,%ymm0,%ymm0
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- vpaddd %ymm11,%ymm0,%ymm9
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- vmovdqu %ymm9,512(%rsp)
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- vpalignr $8,%ymm7,%ymm0,%ymm8
- vpxor %ymm5,%ymm1,%ymm1
- addl -28(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm2,%ymm1,%ymm1
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpxor %ymm8,%ymm1,%ymm1
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpsrld $30,%ymm1,%ymm8
- vpslld $2,%ymm1,%ymm1
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- vpor %ymm8,%ymm1,%ymm1
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- vpaddd %ymm11,%ymm1,%ymm9
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- vmovdqu %ymm9,544(%rsp)
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vpalignr $8,%ymm0,%ymm1,%ymm8
- vpxor %ymm6,%ymm2,%ymm2
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- vpxor %ymm3,%ymm2,%ymm2
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- vpxor %ymm8,%ymm2,%ymm2
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm2,%ymm8
- vpslld $2,%ymm2,%ymm2
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- vpor %ymm8,%ymm2,%ymm2
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vpaddd %ymm11,%ymm2,%ymm9
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- vmovdqu %ymm9,576(%rsp)
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm1,%ymm2,%ymm8
- vpxor %ymm7,%ymm3,%ymm3
- addl 44(%r13),%edx
- xorl %ebx,%eax
- vpxor %ymm4,%ymm3,%ymm3
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm3,%ymm3
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- vpsrld $30,%ymm3,%ymm8
- vpslld $2,%ymm3,%ymm3
- addl %r12d,%edx
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- vpor %ymm8,%ymm3,%ymm3
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpaddd %ymm11,%ymm3,%ymm9
- addl %r12d,%ecx
- andl %edi,%edx
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vmovdqu %ymm9,608(%rsp)
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -60(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%r9),%r13
- leaq 128(%r9),%rdi
- cmpq %r10,%r13
- cmovaeq %r9,%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- je .Ldone_avx2
- vmovdqu 64(%r14),%ymm6
- cmpq %r10,%rdi
- ja .Last_avx2
-
- vmovdqu -64(%rdi),%xmm0
- vmovdqu -48(%rdi),%xmm1
- vmovdqu -32(%rdi),%xmm2
- vmovdqu -16(%rdi),%xmm3
- vinserti128 $1,0(%r13),%ymm0,%ymm0
- vinserti128 $1,16(%r13),%ymm1,%ymm1
- vinserti128 $1,32(%r13),%ymm2,%ymm2
- vinserti128 $1,48(%r13),%ymm3,%ymm3
- jmp .Last_avx2
-
-.align 32
-.Last_avx2:
- leaq 128+16(%rsp),%r13
- rorxl $2,%ebp,%ebx
- andnl %edx,%ebp,%edi
- andl %ecx,%ebp
- xorl %edi,%ebp
- subq $-128,%r9
- addl -128(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -124(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -120(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -116(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -96(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -92(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -88(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -84(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -64(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -60(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl -56(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl -52(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl -32(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl -28(%r13),%edx
- andnl %ebx,%esi,%edi
- addl %eax,%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- andl %ebp,%esi
- addl %r12d,%edx
- xorl %edi,%esi
- addl -24(%r13),%ecx
- andnl %ebp,%edx,%edi
- addl %esi,%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- andl %eax,%edx
- addl %r12d,%ecx
- xorl %edi,%edx
- addl -20(%r13),%ebx
- andnl %eax,%ecx,%edi
- addl %edx,%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- andl %esi,%ecx
- addl %r12d,%ebx
- xorl %edi,%ecx
- addl 0(%r13),%ebp
- andnl %esi,%ebx,%edi
- addl %ecx,%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- andl %edx,%ebx
- addl %r12d,%ebp
- xorl %edi,%ebx
- addl 4(%r13),%eax
- andnl %edx,%ebp,%edi
- addl %ebx,%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- andl %ecx,%ebp
- addl %r12d,%eax
- xorl %edi,%ebp
- addl 8(%r13),%esi
- andnl %ecx,%eax,%edi
- addl %ebp,%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- andl %ebx,%eax
- addl %r12d,%esi
- xorl %edi,%eax
- addl 12(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 32(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 36(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 40(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 44(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl 64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vmovdqu -64(%r14),%ymm11
- vpshufb %ymm6,%ymm0,%ymm0
- addl 68(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl 72(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl 76(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl 96(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl 100(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpshufb %ymm6,%ymm1,%ymm1
- vpaddd %ymm11,%ymm0,%ymm8
- addl 104(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl 108(%r13),%edx
- leaq 256(%r13),%r13
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -128(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -124(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -120(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vmovdqu %ymm8,0(%rsp)
- vpshufb %ymm6,%ymm2,%ymm2
- vpaddd %ymm11,%ymm1,%ymm9
- addl -116(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -92(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- addl -88(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -84(%r13),%ebx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- vmovdqu %ymm9,32(%rsp)
- vpshufb %ymm6,%ymm3,%ymm3
- vpaddd %ymm11,%ymm2,%ymm6
- addl -64(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -60(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl -56(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl -52(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- addl -32(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- jmp .Lalign32_3
-.align 32
-.Lalign32_3:
- vmovdqu %ymm6,64(%rsp)
- vpaddd %ymm11,%ymm3,%ymm7
- addl -28(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl -24(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl -20(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 0(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- addl 4(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- andl %edi,%esi
- vmovdqu %ymm7,96(%rsp)
- addl 8(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- andl %edi,%edx
- addl 12(%r13),%ebx
- xorl %eax,%edx
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 32(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 36(%r13),%eax
- xorl %edx,%ebx
- movl %ecx,%edi
- xorl %edx,%edi
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- andl %edi,%ebp
- addl 40(%r13),%esi
- xorl %ecx,%ebp
- movl %ebx,%edi
- xorl %ecx,%edi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- andl %edi,%eax
- vpalignr $8,%ymm0,%ymm1,%ymm4
- addl 44(%r13),%edx
- xorl %ebx,%eax
- movl %ebp,%edi
- xorl %ebx,%edi
- vpsrldq $4,%ymm3,%ymm8
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpxor %ymm0,%ymm4,%ymm4
- vpxor %ymm2,%ymm8,%ymm8
- xorl %ebp,%esi
- addl %r12d,%edx
- vpxor %ymm8,%ymm4,%ymm4
- andl %edi,%esi
- addl 64(%r13),%ecx
- xorl %ebp,%esi
- movl %eax,%edi
- vpsrld $31,%ymm4,%ymm8
- xorl %ebp,%edi
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- vpslldq $12,%ymm4,%ymm10
- vpaddd %ymm4,%ymm4,%ymm4
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm4,%ymm4
- addl %r12d,%ecx
- andl %edi,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm4,%ymm4
- addl 68(%r13),%ebx
- xorl %eax,%edx
- vpxor %ymm10,%ymm4,%ymm4
- movl %esi,%edi
- xorl %eax,%edi
- leal (%rbx,%rdx,1),%ebx
- vpaddd %ymm11,%ymm4,%ymm9
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- vmovdqu %ymm9,128(%rsp)
- addl %r12d,%ebx
- andl %edi,%ecx
- addl 72(%r13),%ebp
- xorl %esi,%ecx
- movl %edx,%edi
- xorl %esi,%edi
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- andl %edi,%ebx
- addl 76(%r13),%eax
- xorl %edx,%ebx
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpalignr $8,%ymm1,%ymm2,%ymm5
- addl 96(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrldq $4,%ymm4,%ymm8
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- vpxor %ymm1,%ymm5,%ymm5
- vpxor %ymm3,%ymm8,%ymm8
- addl 100(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpxor %ymm8,%ymm5,%ymm5
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- xorl %ebp,%esi
- addl %r12d,%edx
- vpsrld $31,%ymm5,%ymm8
- vmovdqu -32(%r14),%ymm11
- xorl %ebx,%esi
- addl 104(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- vpslldq $12,%ymm5,%ymm10
- vpaddd %ymm5,%ymm5,%ymm5
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm5,%ymm5
- xorl %eax,%edx
- addl %r12d,%ecx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm5,%ymm5
- xorl %ebp,%edx
- addl 108(%r13),%ebx
- leaq 256(%r13),%r13
- vpxor %ymm10,%ymm5,%ymm5
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- vpaddd %ymm11,%ymm5,%ymm9
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vmovdqu %ymm9,160(%rsp)
- addl -128(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpalignr $8,%ymm2,%ymm3,%ymm6
- addl -124(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- vpsrldq $4,%ymm5,%ymm8
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- vpxor %ymm2,%ymm6,%ymm6
- vpxor %ymm4,%ymm8,%ymm8
- addl -120(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpxor %ymm8,%ymm6,%ymm6
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- vpsrld $31,%ymm6,%ymm8
- xorl %ecx,%eax
- addl -116(%r13),%edx
- leal (%rdx,%rax,1),%edx
- vpslldq $12,%ymm6,%ymm10
- vpaddd %ymm6,%ymm6,%ymm6
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm6,%ymm6
- xorl %ebp,%esi
- addl %r12d,%edx
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm6,%ymm6
- xorl %ebx,%esi
- addl -96(%r13),%ecx
- vpxor %ymm10,%ymm6,%ymm6
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- vpaddd %ymm11,%ymm6,%ymm9
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- vmovdqu %ymm9,192(%rsp)
- addl -92(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- vpalignr $8,%ymm3,%ymm4,%ymm7
- addl -88(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- vpsrldq $4,%ymm6,%ymm8
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- vpxor %ymm3,%ymm7,%ymm7
- vpxor %ymm5,%ymm8,%ymm8
- addl -84(%r13),%eax
- leal (%rax,%rbx,1),%eax
- vpxor %ymm8,%ymm7,%ymm7
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- vpsrld $31,%ymm7,%ymm8
- xorl %edx,%ebp
- addl -64(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- vpslldq $12,%ymm7,%ymm10
- vpaddd %ymm7,%ymm7,%ymm7
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- vpsrld $30,%ymm10,%ymm9
- vpor %ymm8,%ymm7,%ymm7
- xorl %ebx,%eax
- addl %r12d,%esi
- vpslld $2,%ymm10,%ymm10
- vpxor %ymm9,%ymm7,%ymm7
- xorl %ecx,%eax
- addl -60(%r13),%edx
- vpxor %ymm10,%ymm7,%ymm7
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- rorxl $2,%esi,%eax
- vpaddd %ymm11,%ymm7,%ymm9
- xorl %ebp,%esi
- addl %r12d,%edx
- xorl %ebx,%esi
- vmovdqu %ymm9,224(%rsp)
- addl -56(%r13),%ecx
- leal (%rcx,%rsi,1),%ecx
- rorxl $27,%edx,%r12d
- rorxl $2,%edx,%esi
- xorl %eax,%edx
- addl %r12d,%ecx
- xorl %ebp,%edx
- addl -52(%r13),%ebx
- leal (%rbx,%rdx,1),%ebx
- rorxl $27,%ecx,%r12d
- rorxl $2,%ecx,%edx
- xorl %esi,%ecx
- addl %r12d,%ebx
- xorl %eax,%ecx
- addl -32(%r13),%ebp
- leal (%rcx,%rbp,1),%ebp
- rorxl $27,%ebx,%r12d
- rorxl $2,%ebx,%ecx
- xorl %edx,%ebx
- addl %r12d,%ebp
- xorl %esi,%ebx
- addl -28(%r13),%eax
- leal (%rax,%rbx,1),%eax
- rorxl $27,%ebp,%r12d
- rorxl $2,%ebp,%ebx
- xorl %ecx,%ebp
- addl %r12d,%eax
- xorl %edx,%ebp
- addl -24(%r13),%esi
- leal (%rsi,%rbp,1),%esi
- rorxl $27,%eax,%r12d
- rorxl $2,%eax,%ebp
- xorl %ebx,%eax
- addl %r12d,%esi
- xorl %ecx,%eax
- addl -20(%r13),%edx
- leal (%rdx,%rax,1),%edx
- rorxl $27,%esi,%r12d
- addl %r12d,%edx
- leaq 128(%rsp),%r13
-
-
- addl 0(%r8),%edx
- addl 4(%r8),%esi
- addl 8(%r8),%ebp
- movl %edx,0(%r8)
- addl 12(%r8),%ebx
- movl %esi,4(%r8)
- movl %edx,%eax
- addl 16(%r8),%ecx
- movl %ebp,%r12d
- movl %ebp,8(%r8)
- movl %ebx,%edx
-
- movl %ebx,12(%r8)
- movl %esi,%ebp
- movl %ecx,16(%r8)
-
- movl %ecx,%esi
- movl %r12d,%ecx
-
-
- cmpq %r10,%r9
- jbe .Loop_avx2
-
-.Ldone_avx2:
- vzeroupper
- movq -40(%r11),%r14
-.cfi_restore %r14
- movq -32(%r11),%r13
-.cfi_restore %r13
- movq -24(%r11),%r12
-.cfi_restore %r12
- movq -16(%r11),%rbp
-.cfi_restore %rbp
- movq -8(%r11),%rbx
-.cfi_restore %rbx
- leaq (%r11),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -5448,7 +2619,7 @@ K_XX_XX:
.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s
index 8f9e4bfe5cf..5bb4ca7ed49 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-mb-x86_64.s
@@ -10,8 +10,6 @@ sha256_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
- testl $268435456,%ecx
- jnz _avx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -3137,4700 +3135,6 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
-.type sha256_multi_block_avx,@function
-.align 32
-sha256_multi_block_avx:
-.cfi_startproc
-_avx_shortcut:
- shrq $32,%rcx
- cmpl $2,%edx
- jb .Lavx
- testl $32,%ecx
- jnz _avx2_shortcut
- jmp .Lavx
-.align 32
-.Lavx:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- subq $288,%rsp
- andq $-256,%rsp
- movq %rax,272(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
-.Lbody_avx:
- leaq K256+128(%rip),%rbp
- leaq 256(%rsp),%rbx
- leaq 128(%rdi),%rdi
-
-.Loop_grande_avx:
- movl %edx,280(%rsp)
- xorl %edx,%edx
-
- movq 0(%rsi),%r8
-
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r8
-
- movq 16(%rsi),%r9
-
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r9
-
- movq 32(%rsi),%r10
-
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r10
-
- movq 48(%rsi),%r11
-
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r11
- testl %edx,%edx
- jz .Ldone_avx
-
- vmovdqu 0-128(%rdi),%xmm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%xmm9
- vmovdqu 64-128(%rdi),%xmm10
- vmovdqu 96-128(%rdi),%xmm11
- vmovdqu 128-128(%rdi),%xmm12
- vmovdqu 160-128(%rdi),%xmm13
- vmovdqu 192-128(%rdi),%xmm14
- vmovdqu 224-128(%rdi),%xmm15
- vmovdqu .Lpbswap(%rip),%xmm6
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- vpxor %xmm9,%xmm10,%xmm4
- vmovd 0(%r8),%xmm5
- vmovd 0(%r9),%xmm0
- vpinsrd $1,0(%r10),%xmm5,%xmm5
- vpinsrd $1,0(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 4(%r8),%xmm5
- vmovd 4(%r9),%xmm0
- vpinsrd $1,4(%r10),%xmm5,%xmm5
- vpinsrd $1,4(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,16-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 8(%r8),%xmm5
- vmovd 8(%r9),%xmm0
- vpinsrd $1,8(%r10),%xmm5,%xmm5
- vpinsrd $1,8(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 12(%r8),%xmm5
- vmovd 12(%r9),%xmm0
- vpinsrd $1,12(%r10),%xmm5,%xmm5
- vpinsrd $1,12(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,48-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 16(%r8),%xmm5
- vmovd 16(%r9),%xmm0
- vpinsrd $1,16(%r10),%xmm5,%xmm5
- vpinsrd $1,16(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 20(%r8),%xmm5
- vmovd 20(%r9),%xmm0
- vpinsrd $1,20(%r10),%xmm5,%xmm5
- vpinsrd $1,20(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,80-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 24(%r8),%xmm5
- vmovd 24(%r9),%xmm0
- vpinsrd $1,24(%r10),%xmm5,%xmm5
- vpinsrd $1,24(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 28(%r8),%xmm5
- vmovd 28(%r9),%xmm0
- vpinsrd $1,28(%r10),%xmm5,%xmm5
- vpinsrd $1,28(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,112-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovd 32(%r8),%xmm5
- vmovd 32(%r9),%xmm0
- vpinsrd $1,32(%r10),%xmm5,%xmm5
- vpinsrd $1,32(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovd 36(%r8),%xmm5
- vmovd 36(%r9),%xmm0
- vpinsrd $1,36(%r10),%xmm5,%xmm5
- vpinsrd $1,36(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm5,144-128(%rax)
- vpaddd %xmm14,%xmm5,%xmm5
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm5,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovd 40(%r8),%xmm5
- vmovd 40(%r9),%xmm0
- vpinsrd $1,40(%r10),%xmm5,%xmm5
- vpinsrd $1,40(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovd 44(%r8),%xmm5
- vmovd 44(%r9),%xmm0
- vpinsrd $1,44(%r10),%xmm5,%xmm5
- vpinsrd $1,44(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm5,176-128(%rax)
- vpaddd %xmm12,%xmm5,%xmm5
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm5,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovd 48(%r8),%xmm5
- vmovd 48(%r9),%xmm0
- vpinsrd $1,48(%r10),%xmm5,%xmm5
- vpinsrd $1,48(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovd 52(%r8),%xmm5
- vmovd 52(%r9),%xmm0
- vpinsrd $1,52(%r10),%xmm5,%xmm5
- vpinsrd $1,52(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm5,208-128(%rax)
- vpaddd %xmm10,%xmm5,%xmm5
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm5,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovd 56(%r8),%xmm5
- vmovd 56(%r9),%xmm0
- vpinsrd $1,56(%r10),%xmm5,%xmm5
- vpinsrd $1,56(%r11),%xmm0,%xmm0
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovd 60(%r8),%xmm5
- leaq 64(%r8),%r8
- vmovd 60(%r9),%xmm0
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r10),%xmm5,%xmm5
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r11),%xmm0,%xmm0
- leaq 64(%r11),%r11
- vpunpckldq %xmm0,%xmm5,%xmm5
- vpshufb %xmm6,%xmm5,%xmm5
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm5,240-128(%rax)
- vpaddd %xmm8,%xmm5,%xmm5
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r8)
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
- prefetcht0 63(%r9)
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
- prefetcht0 63(%r10)
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
- prefetcht0 63(%r11)
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm5,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%xmm5
- movl $3,%ecx
- jmp .Loop_16_xx_avx
-.align 32
-.Loop_16_xx_avx:
- vmovdqu 16-128(%rax),%xmm6
- vpaddd 144-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 224-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,0-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 32-128(%rax),%xmm5
- vpaddd 160-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 240-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,16-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 48-128(%rax),%xmm6
- vpaddd 176-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 0-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,32-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 64-128(%rax),%xmm5
- vpaddd 192-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 16-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,48-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 80-128(%rax),%xmm6
- vpaddd 208-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 32-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,64-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 96-128(%rax),%xmm5
- vpaddd 224-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 48-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,80-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 112-128(%rax),%xmm6
- vpaddd 240-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 64-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,96-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 128-128(%rax),%xmm5
- vpaddd 0-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 80-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,112-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- vmovdqu 144-128(%rax),%xmm6
- vpaddd 16-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 96-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm12,%xmm7
- vpslld $26,%xmm12,%xmm2
- vmovdqu %xmm5,128-128(%rax)
- vpaddd %xmm15,%xmm5,%xmm5
-
- vpsrld $11,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm12,%xmm2
- vpaddd -128(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm12,%xmm2
- vpandn %xmm14,%xmm12,%xmm0
- vpand %xmm13,%xmm12,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm8,%xmm15
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm8,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm8,%xmm9,%xmm3
-
- vpxor %xmm1,%xmm15,%xmm15
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm8,%xmm1
-
- vpslld $19,%xmm8,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm15,%xmm7
-
- vpsrld $22,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm8,%xmm2
- vpxor %xmm4,%xmm9,%xmm15
- vpaddd %xmm5,%xmm11,%xmm11
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm15,%xmm15
- vpaddd %xmm7,%xmm15,%xmm15
- vmovdqu 160-128(%rax),%xmm5
- vpaddd 32-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 112-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm11,%xmm7
- vpslld $26,%xmm11,%xmm2
- vmovdqu %xmm6,144-128(%rax)
- vpaddd %xmm14,%xmm6,%xmm6
-
- vpsrld $11,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm11,%xmm2
- vpaddd -96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm11,%xmm2
- vpandn %xmm13,%xmm11,%xmm0
- vpand %xmm12,%xmm11,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm15,%xmm14
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm15,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm15,%xmm8,%xmm4
-
- vpxor %xmm1,%xmm14,%xmm14
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm15,%xmm1
-
- vpslld $19,%xmm15,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm14,%xmm7
-
- vpsrld $22,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm15,%xmm2
- vpxor %xmm3,%xmm8,%xmm14
- vpaddd %xmm6,%xmm10,%xmm10
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm14,%xmm14
- vpaddd %xmm7,%xmm14,%xmm14
- vmovdqu 176-128(%rax),%xmm6
- vpaddd 48-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 128-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm10,%xmm7
- vpslld $26,%xmm10,%xmm2
- vmovdqu %xmm5,160-128(%rax)
- vpaddd %xmm13,%xmm5,%xmm5
-
- vpsrld $11,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm10,%xmm2
- vpaddd -64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm10,%xmm2
- vpandn %xmm12,%xmm10,%xmm0
- vpand %xmm11,%xmm10,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm14,%xmm13
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm14,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm14,%xmm15,%xmm3
-
- vpxor %xmm1,%xmm13,%xmm13
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm14,%xmm1
-
- vpslld $19,%xmm14,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm13,%xmm7
-
- vpsrld $22,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm14,%xmm2
- vpxor %xmm4,%xmm15,%xmm13
- vpaddd %xmm5,%xmm9,%xmm9
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm13,%xmm13
- vpaddd %xmm7,%xmm13,%xmm13
- vmovdqu 192-128(%rax),%xmm5
- vpaddd 64-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 144-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm9,%xmm7
- vpslld $26,%xmm9,%xmm2
- vmovdqu %xmm6,176-128(%rax)
- vpaddd %xmm12,%xmm6,%xmm6
-
- vpsrld $11,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm9,%xmm2
- vpaddd -32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm9,%xmm2
- vpandn %xmm11,%xmm9,%xmm0
- vpand %xmm10,%xmm9,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm13,%xmm12
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm13,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm13,%xmm14,%xmm4
-
- vpxor %xmm1,%xmm12,%xmm12
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm13,%xmm1
-
- vpslld $19,%xmm13,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm12,%xmm7
-
- vpsrld $22,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm13,%xmm2
- vpxor %xmm3,%xmm14,%xmm12
- vpaddd %xmm6,%xmm8,%xmm8
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm12,%xmm12
- vpaddd %xmm7,%xmm12,%xmm12
- vmovdqu 208-128(%rax),%xmm6
- vpaddd 80-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 160-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm8,%xmm7
- vpslld $26,%xmm8,%xmm2
- vmovdqu %xmm5,192-128(%rax)
- vpaddd %xmm11,%xmm5,%xmm5
-
- vpsrld $11,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm8,%xmm2
- vpaddd 0(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm8,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm8,%xmm2
- vpandn %xmm10,%xmm8,%xmm0
- vpand %xmm9,%xmm8,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm12,%xmm11
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm12,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm12,%xmm13,%xmm3
-
- vpxor %xmm1,%xmm11,%xmm11
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm12,%xmm1
-
- vpslld $19,%xmm12,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm11,%xmm7
-
- vpsrld $22,%xmm12,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm12,%xmm2
- vpxor %xmm4,%xmm13,%xmm11
- vpaddd %xmm5,%xmm15,%xmm15
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm11,%xmm11
- vpaddd %xmm7,%xmm11,%xmm11
- vmovdqu 224-128(%rax),%xmm5
- vpaddd 96-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 176-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm15,%xmm7
- vpslld $26,%xmm15,%xmm2
- vmovdqu %xmm6,208-128(%rax)
- vpaddd %xmm10,%xmm6,%xmm6
-
- vpsrld $11,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm15,%xmm2
- vpaddd 32(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm15,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm15,%xmm2
- vpandn %xmm9,%xmm15,%xmm0
- vpand %xmm8,%xmm15,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm11,%xmm10
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm11,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm11,%xmm12,%xmm4
-
- vpxor %xmm1,%xmm10,%xmm10
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm11,%xmm1
-
- vpslld $19,%xmm11,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm10,%xmm7
-
- vpsrld $22,%xmm11,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm11,%xmm2
- vpxor %xmm3,%xmm12,%xmm10
- vpaddd %xmm6,%xmm14,%xmm14
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm10,%xmm10
- vpaddd %xmm7,%xmm10,%xmm10
- vmovdqu 240-128(%rax),%xmm6
- vpaddd 112-128(%rax),%xmm5,%xmm5
-
- vpsrld $3,%xmm6,%xmm7
- vpsrld $7,%xmm6,%xmm1
- vpslld $25,%xmm6,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm6,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm6,%xmm2
- vmovdqu 192-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm5,%xmm5
- vpxor %xmm1,%xmm3,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm5,%xmm5
- vpsrld $6,%xmm14,%xmm7
- vpslld $26,%xmm14,%xmm2
- vmovdqu %xmm5,224-128(%rax)
- vpaddd %xmm9,%xmm5,%xmm5
-
- vpsrld $11,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm14,%xmm2
- vpaddd 64(%rbp),%xmm5,%xmm5
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm14,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm14,%xmm2
- vpandn %xmm8,%xmm14,%xmm0
- vpand %xmm15,%xmm14,%xmm3
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm10,%xmm9
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm10,%xmm1
- vpxor %xmm3,%xmm0,%xmm0
- vpxor %xmm10,%xmm11,%xmm3
-
- vpxor %xmm1,%xmm9,%xmm9
- vpaddd %xmm7,%xmm5,%xmm5
-
- vpsrld $13,%xmm10,%xmm1
-
- vpslld $19,%xmm10,%xmm2
- vpaddd %xmm0,%xmm5,%xmm5
- vpand %xmm3,%xmm4,%xmm4
-
- vpxor %xmm1,%xmm9,%xmm7
-
- vpsrld $22,%xmm10,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm10,%xmm2
- vpxor %xmm4,%xmm11,%xmm9
- vpaddd %xmm5,%xmm13,%xmm13
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm5,%xmm9,%xmm9
- vpaddd %xmm7,%xmm9,%xmm9
- vmovdqu 0-128(%rax),%xmm5
- vpaddd 128-128(%rax),%xmm6,%xmm6
-
- vpsrld $3,%xmm5,%xmm7
- vpsrld $7,%xmm5,%xmm1
- vpslld $25,%xmm5,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $18,%xmm5,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $14,%xmm5,%xmm2
- vmovdqu 208-128(%rax),%xmm0
- vpsrld $10,%xmm0,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
- vpsrld $17,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $15,%xmm0,%xmm2
- vpaddd %xmm7,%xmm6,%xmm6
- vpxor %xmm1,%xmm4,%xmm7
- vpsrld $19,%xmm0,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $13,%xmm0,%xmm2
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
- vpaddd %xmm7,%xmm6,%xmm6
- vpsrld $6,%xmm13,%xmm7
- vpslld $26,%xmm13,%xmm2
- vmovdqu %xmm6,240-128(%rax)
- vpaddd %xmm8,%xmm6,%xmm6
-
- vpsrld $11,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
- vpslld $21,%xmm13,%xmm2
- vpaddd 96(%rbp),%xmm6,%xmm6
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $25,%xmm13,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $7,%xmm13,%xmm2
- vpandn %xmm15,%xmm13,%xmm0
- vpand %xmm14,%xmm13,%xmm4
-
- vpxor %xmm1,%xmm7,%xmm7
-
- vpsrld $2,%xmm9,%xmm8
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $30,%xmm9,%xmm1
- vpxor %xmm4,%xmm0,%xmm0
- vpxor %xmm9,%xmm10,%xmm4
-
- vpxor %xmm1,%xmm8,%xmm8
- vpaddd %xmm7,%xmm6,%xmm6
-
- vpsrld $13,%xmm9,%xmm1
-
- vpslld $19,%xmm9,%xmm2
- vpaddd %xmm0,%xmm6,%xmm6
- vpand %xmm4,%xmm3,%xmm3
-
- vpxor %xmm1,%xmm8,%xmm7
-
- vpsrld $22,%xmm9,%xmm1
- vpxor %xmm2,%xmm7,%xmm7
-
- vpslld $10,%xmm9,%xmm2
- vpxor %xmm3,%xmm10,%xmm8
- vpaddd %xmm6,%xmm12,%xmm12
-
- vpxor %xmm1,%xmm7,%xmm7
- vpxor %xmm2,%xmm7,%xmm7
-
- vpaddd %xmm6,%xmm8,%xmm8
- vpaddd %xmm7,%xmm8,%xmm8
- addq $256,%rbp
- decl %ecx
- jnz .Loop_16_xx_avx
-
- movl $1,%ecx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%xmm7
- vpxor %xmm0,%xmm0,%xmm0
- vmovdqa %xmm7,%xmm6
- vpcmpgtd %xmm0,%xmm6,%xmm6
- vpaddd %xmm6,%xmm7,%xmm7
-
- vmovdqu 0-128(%rdi),%xmm0
- vpand %xmm6,%xmm8,%xmm8
- vmovdqu 32-128(%rdi),%xmm1
- vpand %xmm6,%xmm9,%xmm9
- vmovdqu 64-128(%rdi),%xmm2
- vpand %xmm6,%xmm10,%xmm10
- vmovdqu 96-128(%rdi),%xmm5
- vpand %xmm6,%xmm11,%xmm11
- vpaddd %xmm0,%xmm8,%xmm8
- vmovdqu 128-128(%rdi),%xmm0
- vpand %xmm6,%xmm12,%xmm12
- vpaddd %xmm1,%xmm9,%xmm9
- vmovdqu 160-128(%rdi),%xmm1
- vpand %xmm6,%xmm13,%xmm13
- vpaddd %xmm2,%xmm10,%xmm10
- vmovdqu 192-128(%rdi),%xmm2
- vpand %xmm6,%xmm14,%xmm14
- vpaddd %xmm5,%xmm11,%xmm11
- vmovdqu 224-128(%rdi),%xmm5
- vpand %xmm6,%xmm15,%xmm15
- vpaddd %xmm0,%xmm12,%xmm12
- vpaddd %xmm1,%xmm13,%xmm13
- vmovdqu %xmm8,0-128(%rdi)
- vpaddd %xmm2,%xmm14,%xmm14
- vmovdqu %xmm9,32-128(%rdi)
- vpaddd %xmm5,%xmm15,%xmm15
- vmovdqu %xmm10,64-128(%rdi)
- vmovdqu %xmm11,96-128(%rdi)
- vmovdqu %xmm12,128-128(%rdi)
- vmovdqu %xmm13,160-128(%rdi)
- vmovdqu %xmm14,192-128(%rdi)
- vmovdqu %xmm15,224-128(%rdi)
-
- vmovdqu %xmm7,(%rbx)
- vmovdqu .Lpbswap(%rip),%xmm6
- decl %edx
- jnz .Loop_avx
-
- movl 280(%rsp),%edx
- leaq 16(%rdi),%rdi
- leaq 64(%rsi),%rsi
- decl %edx
- jnz .Loop_grande_avx
-
-.Ldone_avx:
- movq 272(%rsp),%rax
-.cfi_def_cfa %rax,8
- vzeroupper
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha256_multi_block_avx,.-sha256_multi_block_avx
-.type sha256_multi_block_avx2,@function
-.align 32
-sha256_multi_block_avx2:
-.cfi_startproc
-_avx2_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $576,%rsp
- andq $-256,%rsp
- movq %rax,544(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
-.Lbody_avx2:
- leaq K256+128(%rip),%rbp
- leaq 128(%rdi),%rdi
-
-.Loop_grande_avx2:
- movl %edx,552(%rsp)
- xorl %edx,%edx
- leaq 512(%rsp),%rbx
-
- movq 0(%rsi),%r12
-
- movl 8(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,0(%rbx)
- cmovleq %rbp,%r12
-
- movq 16(%rsi),%r13
-
- movl 24(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,4(%rbx)
- cmovleq %rbp,%r13
-
- movq 32(%rsi),%r14
-
- movl 40(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,8(%rbx)
- cmovleq %rbp,%r14
-
- movq 48(%rsi),%r15
-
- movl 56(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,12(%rbx)
- cmovleq %rbp,%r15
-
- movq 64(%rsi),%r8
-
- movl 72(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,16(%rbx)
- cmovleq %rbp,%r8
-
- movq 80(%rsi),%r9
-
- movl 88(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,20(%rbx)
- cmovleq %rbp,%r9
-
- movq 96(%rsi),%r10
-
- movl 104(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,24(%rbx)
- cmovleq %rbp,%r10
-
- movq 112(%rsi),%r11
-
- movl 120(%rsi),%ecx
- cmpl %edx,%ecx
- cmovgl %ecx,%edx
- testl %ecx,%ecx
- movl %ecx,28(%rbx)
- cmovleq %rbp,%r11
- vmovdqu 0-128(%rdi),%ymm8
- leaq 128(%rsp),%rax
- vmovdqu 32-128(%rdi),%ymm9
- leaq 256+128(%rsp),%rbx
- vmovdqu 64-128(%rdi),%ymm10
- vmovdqu 96-128(%rdi),%ymm11
- vmovdqu 128-128(%rdi),%ymm12
- vmovdqu 160-128(%rdi),%ymm13
- vmovdqu 192-128(%rdi),%ymm14
- vmovdqu 224-128(%rdi),%ymm15
- vmovdqu .Lpbswap(%rip),%ymm6
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
- vpxor %ymm9,%ymm10,%ymm4
- vmovd 0(%r12),%xmm5
- vmovd 0(%r8),%xmm0
- vmovd 0(%r13),%xmm1
- vmovd 0(%r9),%xmm2
- vpinsrd $1,0(%r14),%xmm5,%xmm5
- vpinsrd $1,0(%r10),%xmm0,%xmm0
- vpinsrd $1,0(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,0(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 4(%r12),%xmm5
- vmovd 4(%r8),%xmm0
- vmovd 4(%r13),%xmm1
- vmovd 4(%r9),%xmm2
- vpinsrd $1,4(%r14),%xmm5,%xmm5
- vpinsrd $1,4(%r10),%xmm0,%xmm0
- vpinsrd $1,4(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,4(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,32-128(%rax)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 8(%r12),%xmm5
- vmovd 8(%r8),%xmm0
- vmovd 8(%r13),%xmm1
- vmovd 8(%r9),%xmm2
- vpinsrd $1,8(%r14),%xmm5,%xmm5
- vpinsrd $1,8(%r10),%xmm0,%xmm0
- vpinsrd $1,8(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,8(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 12(%r12),%xmm5
- vmovd 12(%r8),%xmm0
- vmovd 12(%r13),%xmm1
- vmovd 12(%r9),%xmm2
- vpinsrd $1,12(%r14),%xmm5,%xmm5
- vpinsrd $1,12(%r10),%xmm0,%xmm0
- vpinsrd $1,12(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,12(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,96-128(%rax)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 16(%r12),%xmm5
- vmovd 16(%r8),%xmm0
- vmovd 16(%r13),%xmm1
- vmovd 16(%r9),%xmm2
- vpinsrd $1,16(%r14),%xmm5,%xmm5
- vpinsrd $1,16(%r10),%xmm0,%xmm0
- vpinsrd $1,16(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,16(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 20(%r12),%xmm5
- vmovd 20(%r8),%xmm0
- vmovd 20(%r13),%xmm1
- vmovd 20(%r9),%xmm2
- vpinsrd $1,20(%r14),%xmm5,%xmm5
- vpinsrd $1,20(%r10),%xmm0,%xmm0
- vpinsrd $1,20(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,20(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,160-128(%rax)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 24(%r12),%xmm5
- vmovd 24(%r8),%xmm0
- vmovd 24(%r13),%xmm1
- vmovd 24(%r9),%xmm2
- vpinsrd $1,24(%r14),%xmm5,%xmm5
- vpinsrd $1,24(%r10),%xmm0,%xmm0
- vpinsrd $1,24(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,24(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 28(%r12),%xmm5
- vmovd 28(%r8),%xmm0
- vmovd 28(%r13),%xmm1
- vmovd 28(%r9),%xmm2
- vpinsrd $1,28(%r14),%xmm5,%xmm5
- vpinsrd $1,28(%r10),%xmm0,%xmm0
- vpinsrd $1,28(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,28(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,224-128(%rax)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovd 32(%r12),%xmm5
- vmovd 32(%r8),%xmm0
- vmovd 32(%r13),%xmm1
- vmovd 32(%r9),%xmm2
- vpinsrd $1,32(%r14),%xmm5,%xmm5
- vpinsrd $1,32(%r10),%xmm0,%xmm0
- vpinsrd $1,32(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,32(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovd 36(%r12),%xmm5
- vmovd 36(%r8),%xmm0
- vmovd 36(%r13),%xmm1
- vmovd 36(%r9),%xmm2
- vpinsrd $1,36(%r14),%xmm5,%xmm5
- vpinsrd $1,36(%r10),%xmm0,%xmm0
- vpinsrd $1,36(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,36(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm5,288-256-128(%rbx)
- vpaddd %ymm14,%ymm5,%ymm5
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm5,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovd 40(%r12),%xmm5
- vmovd 40(%r8),%xmm0
- vmovd 40(%r13),%xmm1
- vmovd 40(%r9),%xmm2
- vpinsrd $1,40(%r14),%xmm5,%xmm5
- vpinsrd $1,40(%r10),%xmm0,%xmm0
- vpinsrd $1,40(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,40(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovd 44(%r12),%xmm5
- vmovd 44(%r8),%xmm0
- vmovd 44(%r13),%xmm1
- vmovd 44(%r9),%xmm2
- vpinsrd $1,44(%r14),%xmm5,%xmm5
- vpinsrd $1,44(%r10),%xmm0,%xmm0
- vpinsrd $1,44(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,44(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm5,352-256-128(%rbx)
- vpaddd %ymm12,%ymm5,%ymm5
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm5,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovd 48(%r12),%xmm5
- vmovd 48(%r8),%xmm0
- vmovd 48(%r13),%xmm1
- vmovd 48(%r9),%xmm2
- vpinsrd $1,48(%r14),%xmm5,%xmm5
- vpinsrd $1,48(%r10),%xmm0,%xmm0
- vpinsrd $1,48(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,48(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovd 52(%r12),%xmm5
- vmovd 52(%r8),%xmm0
- vmovd 52(%r13),%xmm1
- vmovd 52(%r9),%xmm2
- vpinsrd $1,52(%r14),%xmm5,%xmm5
- vpinsrd $1,52(%r10),%xmm0,%xmm0
- vpinsrd $1,52(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,52(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm5,416-256-128(%rbx)
- vpaddd %ymm10,%ymm5,%ymm5
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm5,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovd 56(%r12),%xmm5
- vmovd 56(%r8),%xmm0
- vmovd 56(%r13),%xmm1
- vmovd 56(%r9),%xmm2
- vpinsrd $1,56(%r14),%xmm5,%xmm5
- vpinsrd $1,56(%r10),%xmm0,%xmm0
- vpinsrd $1,56(%r15),%xmm1,%xmm1
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,56(%r11),%xmm2,%xmm2
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovd 60(%r12),%xmm5
- leaq 64(%r12),%r12
- vmovd 60(%r8),%xmm0
- leaq 64(%r8),%r8
- vmovd 60(%r13),%xmm1
- leaq 64(%r13),%r13
- vmovd 60(%r9),%xmm2
- leaq 64(%r9),%r9
- vpinsrd $1,60(%r14),%xmm5,%xmm5
- leaq 64(%r14),%r14
- vpinsrd $1,60(%r10),%xmm0,%xmm0
- leaq 64(%r10),%r10
- vpinsrd $1,60(%r15),%xmm1,%xmm1
- leaq 64(%r15),%r15
- vpunpckldq %ymm1,%ymm5,%ymm5
- vpinsrd $1,60(%r11),%xmm2,%xmm2
- leaq 64(%r11),%r11
- vpunpckldq %ymm2,%ymm0,%ymm0
- vinserti128 $1,%xmm0,%ymm5,%ymm5
- vpshufb %ymm6,%ymm5,%ymm5
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm5,480-256-128(%rbx)
- vpaddd %ymm8,%ymm5,%ymm5
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r12)
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
- prefetcht0 63(%r13)
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r14)
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
- prefetcht0 63(%r15)
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm9,%ymm1
- prefetcht0 63(%r8)
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm4,%ymm3,%ymm3
- prefetcht0 63(%r9)
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- prefetcht0 63(%r10)
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm5,%ymm12,%ymm12
- prefetcht0 63(%r11)
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 0-128(%rax),%ymm5
- movl $3,%ecx
- jmp .Loop_16_xx_avx2
-.align 32
-.Loop_16_xx_avx2:
- vmovdqu 32-128(%rax),%ymm6
- vpaddd 288-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 448-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,0-128(%rax)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 64-128(%rax),%ymm5
- vpaddd 320-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 480-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,32-128(%rax)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 96-128(%rax),%ymm6
- vpaddd 352-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 0-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,64-128(%rax)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 128-128(%rax),%ymm5
- vpaddd 384-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 32-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,96-128(%rax)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 160-128(%rax),%ymm6
- vpaddd 416-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 64-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,128-128(%rax)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 192-128(%rax),%ymm5
- vpaddd 448-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 96-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,160-128(%rax)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 224-128(%rax),%ymm6
- vpaddd 480-256-128(%rbx),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 128-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,192-128(%rax)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 256-256-128(%rbx),%ymm5
- vpaddd 0-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 160-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,224-128(%rax)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- vmovdqu 288-256-128(%rbx),%ymm6
- vpaddd 32-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 192-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm12,%ymm7
- vpslld $26,%ymm12,%ymm2
- vmovdqu %ymm5,256-256-128(%rbx)
- vpaddd %ymm15,%ymm5,%ymm5
-
- vpsrld $11,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm12,%ymm2
- vpaddd -128(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm12,%ymm2
- vpandn %ymm14,%ymm12,%ymm0
- vpand %ymm13,%ymm12,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm8,%ymm15
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm8,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm8,%ymm9,%ymm3
-
- vpxor %ymm1,%ymm15,%ymm15
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm8,%ymm1
-
- vpslld $19,%ymm8,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm15,%ymm7
-
- vpsrld $22,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm8,%ymm2
- vpxor %ymm4,%ymm9,%ymm15
- vpaddd %ymm5,%ymm11,%ymm11
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm15,%ymm15
- vpaddd %ymm7,%ymm15,%ymm15
- vmovdqu 320-256-128(%rbx),%ymm5
- vpaddd 64-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 224-128(%rax),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm11,%ymm7
- vpslld $26,%ymm11,%ymm2
- vmovdqu %ymm6,288-256-128(%rbx)
- vpaddd %ymm14,%ymm6,%ymm6
-
- vpsrld $11,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm11,%ymm2
- vpaddd -96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm11,%ymm2
- vpandn %ymm13,%ymm11,%ymm0
- vpand %ymm12,%ymm11,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm15,%ymm14
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm15,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm15,%ymm8,%ymm4
-
- vpxor %ymm1,%ymm14,%ymm14
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm15,%ymm1
-
- vpslld $19,%ymm15,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm14,%ymm7
-
- vpsrld $22,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm15,%ymm2
- vpxor %ymm3,%ymm8,%ymm14
- vpaddd %ymm6,%ymm10,%ymm10
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm14,%ymm14
- vpaddd %ymm7,%ymm14,%ymm14
- vmovdqu 352-256-128(%rbx),%ymm6
- vpaddd 96-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 256-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm10,%ymm7
- vpslld $26,%ymm10,%ymm2
- vmovdqu %ymm5,320-256-128(%rbx)
- vpaddd %ymm13,%ymm5,%ymm5
-
- vpsrld $11,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm10,%ymm2
- vpaddd -64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm10,%ymm2
- vpandn %ymm12,%ymm10,%ymm0
- vpand %ymm11,%ymm10,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm14,%ymm13
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm14,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm14,%ymm15,%ymm3
-
- vpxor %ymm1,%ymm13,%ymm13
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm14,%ymm1
-
- vpslld $19,%ymm14,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm13,%ymm7
-
- vpsrld $22,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm14,%ymm2
- vpxor %ymm4,%ymm15,%ymm13
- vpaddd %ymm5,%ymm9,%ymm9
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm13,%ymm13
- vpaddd %ymm7,%ymm13,%ymm13
- vmovdqu 384-256-128(%rbx),%ymm5
- vpaddd 128-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 288-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm9,%ymm7
- vpslld $26,%ymm9,%ymm2
- vmovdqu %ymm6,352-256-128(%rbx)
- vpaddd %ymm12,%ymm6,%ymm6
-
- vpsrld $11,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm9,%ymm2
- vpaddd -32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm9,%ymm2
- vpandn %ymm11,%ymm9,%ymm0
- vpand %ymm10,%ymm9,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm13,%ymm12
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm13,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm13,%ymm14,%ymm4
-
- vpxor %ymm1,%ymm12,%ymm12
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm13,%ymm1
-
- vpslld $19,%ymm13,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm12,%ymm7
-
- vpsrld $22,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm13,%ymm2
- vpxor %ymm3,%ymm14,%ymm12
- vpaddd %ymm6,%ymm8,%ymm8
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm12,%ymm12
- vpaddd %ymm7,%ymm12,%ymm12
- vmovdqu 416-256-128(%rbx),%ymm6
- vpaddd 160-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 320-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm8,%ymm7
- vpslld $26,%ymm8,%ymm2
- vmovdqu %ymm5,384-256-128(%rbx)
- vpaddd %ymm11,%ymm5,%ymm5
-
- vpsrld $11,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm8,%ymm2
- vpaddd 0(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm8,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm8,%ymm2
- vpandn %ymm10,%ymm8,%ymm0
- vpand %ymm9,%ymm8,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm12,%ymm11
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm12,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm12,%ymm13,%ymm3
-
- vpxor %ymm1,%ymm11,%ymm11
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm12,%ymm1
-
- vpslld $19,%ymm12,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm11,%ymm7
-
- vpsrld $22,%ymm12,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm12,%ymm2
- vpxor %ymm4,%ymm13,%ymm11
- vpaddd %ymm5,%ymm15,%ymm15
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm11,%ymm11
- vpaddd %ymm7,%ymm11,%ymm11
- vmovdqu 448-256-128(%rbx),%ymm5
- vpaddd 192-128(%rax),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 352-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm15,%ymm7
- vpslld $26,%ymm15,%ymm2
- vmovdqu %ymm6,416-256-128(%rbx)
- vpaddd %ymm10,%ymm6,%ymm6
-
- vpsrld $11,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm15,%ymm2
- vpaddd 32(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm15,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm15,%ymm2
- vpandn %ymm9,%ymm15,%ymm0
- vpand %ymm8,%ymm15,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm11,%ymm10
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm11,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm11,%ymm12,%ymm4
-
- vpxor %ymm1,%ymm10,%ymm10
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm11,%ymm1
-
- vpslld $19,%ymm11,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm10,%ymm7
-
- vpsrld $22,%ymm11,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm11,%ymm2
- vpxor %ymm3,%ymm12,%ymm10
- vpaddd %ymm6,%ymm14,%ymm14
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm10,%ymm10
- vpaddd %ymm7,%ymm10,%ymm10
- vmovdqu 480-256-128(%rbx),%ymm6
- vpaddd 224-128(%rax),%ymm5,%ymm5
-
- vpsrld $3,%ymm6,%ymm7
- vpsrld $7,%ymm6,%ymm1
- vpslld $25,%ymm6,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm6,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm6,%ymm2
- vmovdqu 384-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm5,%ymm5
- vpxor %ymm1,%ymm3,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm5,%ymm5
- vpsrld $6,%ymm14,%ymm7
- vpslld $26,%ymm14,%ymm2
- vmovdqu %ymm5,448-256-128(%rbx)
- vpaddd %ymm9,%ymm5,%ymm5
-
- vpsrld $11,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm14,%ymm2
- vpaddd 64(%rbp),%ymm5,%ymm5
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm14,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm14,%ymm2
- vpandn %ymm8,%ymm14,%ymm0
- vpand %ymm15,%ymm14,%ymm3
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm10,%ymm9
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm10,%ymm1
- vpxor %ymm3,%ymm0,%ymm0
- vpxor %ymm10,%ymm11,%ymm3
-
- vpxor %ymm1,%ymm9,%ymm9
- vpaddd %ymm7,%ymm5,%ymm5
-
- vpsrld $13,%ymm10,%ymm1
-
- vpslld $19,%ymm10,%ymm2
- vpaddd %ymm0,%ymm5,%ymm5
- vpand %ymm3,%ymm4,%ymm4
-
- vpxor %ymm1,%ymm9,%ymm7
-
- vpsrld $22,%ymm10,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm10,%ymm2
- vpxor %ymm4,%ymm11,%ymm9
- vpaddd %ymm5,%ymm13,%ymm13
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm5,%ymm9,%ymm9
- vpaddd %ymm7,%ymm9,%ymm9
- vmovdqu 0-128(%rax),%ymm5
- vpaddd 256-256-128(%rbx),%ymm6,%ymm6
-
- vpsrld $3,%ymm5,%ymm7
- vpsrld $7,%ymm5,%ymm1
- vpslld $25,%ymm5,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $18,%ymm5,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $14,%ymm5,%ymm2
- vmovdqu 416-256-128(%rbx),%ymm0
- vpsrld $10,%ymm0,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
- vpsrld $17,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $15,%ymm0,%ymm2
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm1,%ymm4,%ymm7
- vpsrld $19,%ymm0,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $13,%ymm0,%ymm2
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
- vpaddd %ymm7,%ymm6,%ymm6
- vpsrld $6,%ymm13,%ymm7
- vpslld $26,%ymm13,%ymm2
- vmovdqu %ymm6,480-256-128(%rbx)
- vpaddd %ymm8,%ymm6,%ymm6
-
- vpsrld $11,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
- vpslld $21,%ymm13,%ymm2
- vpaddd 96(%rbp),%ymm6,%ymm6
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $25,%ymm13,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $7,%ymm13,%ymm2
- vpandn %ymm15,%ymm13,%ymm0
- vpand %ymm14,%ymm13,%ymm4
-
- vpxor %ymm1,%ymm7,%ymm7
-
- vpsrld $2,%ymm9,%ymm8
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $30,%ymm9,%ymm1
- vpxor %ymm4,%ymm0,%ymm0
- vpxor %ymm9,%ymm10,%ymm4
-
- vpxor %ymm1,%ymm8,%ymm8
- vpaddd %ymm7,%ymm6,%ymm6
-
- vpsrld $13,%ymm9,%ymm1
-
- vpslld $19,%ymm9,%ymm2
- vpaddd %ymm0,%ymm6,%ymm6
- vpand %ymm4,%ymm3,%ymm3
-
- vpxor %ymm1,%ymm8,%ymm7
-
- vpsrld $22,%ymm9,%ymm1
- vpxor %ymm2,%ymm7,%ymm7
-
- vpslld $10,%ymm9,%ymm2
- vpxor %ymm3,%ymm10,%ymm8
- vpaddd %ymm6,%ymm12,%ymm12
-
- vpxor %ymm1,%ymm7,%ymm7
- vpxor %ymm2,%ymm7,%ymm7
-
- vpaddd %ymm6,%ymm8,%ymm8
- vpaddd %ymm7,%ymm8,%ymm8
- addq $256,%rbp
- decl %ecx
- jnz .Loop_16_xx_avx2
-
- movl $1,%ecx
- leaq 512(%rsp),%rbx
- leaq K256+128(%rip),%rbp
- cmpl 0(%rbx),%ecx
- cmovgeq %rbp,%r12
- cmpl 4(%rbx),%ecx
- cmovgeq %rbp,%r13
- cmpl 8(%rbx),%ecx
- cmovgeq %rbp,%r14
- cmpl 12(%rbx),%ecx
- cmovgeq %rbp,%r15
- cmpl 16(%rbx),%ecx
- cmovgeq %rbp,%r8
- cmpl 20(%rbx),%ecx
- cmovgeq %rbp,%r9
- cmpl 24(%rbx),%ecx
- cmovgeq %rbp,%r10
- cmpl 28(%rbx),%ecx
- cmovgeq %rbp,%r11
- vmovdqa (%rbx),%ymm7
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqa %ymm7,%ymm6
- vpcmpgtd %ymm0,%ymm6,%ymm6
- vpaddd %ymm6,%ymm7,%ymm7
-
- vmovdqu 0-128(%rdi),%ymm0
- vpand %ymm6,%ymm8,%ymm8
- vmovdqu 32-128(%rdi),%ymm1
- vpand %ymm6,%ymm9,%ymm9
- vmovdqu 64-128(%rdi),%ymm2
- vpand %ymm6,%ymm10,%ymm10
- vmovdqu 96-128(%rdi),%ymm5
- vpand %ymm6,%ymm11,%ymm11
- vpaddd %ymm0,%ymm8,%ymm8
- vmovdqu 128-128(%rdi),%ymm0
- vpand %ymm6,%ymm12,%ymm12
- vpaddd %ymm1,%ymm9,%ymm9
- vmovdqu 160-128(%rdi),%ymm1
- vpand %ymm6,%ymm13,%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vmovdqu 192-128(%rdi),%ymm2
- vpand %ymm6,%ymm14,%ymm14
- vpaddd %ymm5,%ymm11,%ymm11
- vmovdqu 224-128(%rdi),%ymm5
- vpand %ymm6,%ymm15,%ymm15
- vpaddd %ymm0,%ymm12,%ymm12
- vpaddd %ymm1,%ymm13,%ymm13
- vmovdqu %ymm8,0-128(%rdi)
- vpaddd %ymm2,%ymm14,%ymm14
- vmovdqu %ymm9,32-128(%rdi)
- vpaddd %ymm5,%ymm15,%ymm15
- vmovdqu %ymm10,64-128(%rdi)
- vmovdqu %ymm11,96-128(%rdi)
- vmovdqu %ymm12,128-128(%rdi)
- vmovdqu %ymm13,160-128(%rdi)
- vmovdqu %ymm14,192-128(%rdi)
- vmovdqu %ymm15,224-128(%rdi)
-
- vmovdqu %ymm7,(%rbx)
- leaq 256+128(%rsp),%rbx
- vmovdqu .Lpbswap(%rip),%ymm6
- decl %edx
- jnz .Loop_avx2
-
-
-
-
-
-
-
-.Ldone_avx2:
- movq 544(%rsp),%rax
-.cfi_def_cfa %rax,8
- vzeroupper
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbp
-.cfi_restore %rbp
- movq -8(%rax),%rbx
-.cfi_restore %rbx
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
.align 256
K256:
.long 1116352408,1116352408,1116352408,1116352408
@@ -7982,7 +3286,7 @@ K256_shaext:
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s
index 1b03ce39b99..f2c864d92ec 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha256-x86_64.s
@@ -12,14 +12,6 @@ sha256_block_data_order:
movl 8(%r11),%r11d
testl $536870912,%r11d
jnz _shaext_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je .Lavx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je .Lavx_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
movq %rsp,%rax
@@ -3093,2368 +3085,7 @@ sha256_block_data_order_ssse3:
.byte 0xf3,0xc3
.cfi_endproc
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
-.type sha256_block_data_order_avx,@function
-.align 64
-sha256_block_data_order_avx:
-.cfi_startproc
-.Lavx_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- shlq $4,%rdx
- subq $96,%rsp
- leaq (%rsi,%rdx,4),%rdx
- andq $-64,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %rax,88(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
-.Lprologue_avx:
-
- vzeroupper
- movl 0(%rdi),%eax
- movl 4(%rdi),%ebx
- movl 8(%rdi),%ecx
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%xmm8
- vmovdqa K256+512+64(%rip),%xmm9
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K256+512(%rip),%xmm7
- vmovdqu 0(%rsi),%xmm0
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm7,%xmm0,%xmm0
- leaq K256(%rip),%rbp
- vpshufb %xmm7,%xmm1,%xmm1
- vpshufb %xmm7,%xmm2,%xmm2
- vpaddd 0(%rbp),%xmm0,%xmm4
- vpshufb %xmm7,%xmm3,%xmm3
- vpaddd 32(%rbp),%xmm1,%xmm5
- vpaddd 64(%rbp),%xmm2,%xmm6
- vpaddd 96(%rbp),%xmm3,%xmm7
- vmovdqa %xmm4,0(%rsp)
- movl %eax,%r14d
- vmovdqa %xmm5,16(%rsp)
- movl %ebx,%edi
- vmovdqa %xmm6,32(%rsp)
- xorl %ecx,%edi
- vmovdqa %xmm7,48(%rsp)
- movl %r8d,%r13d
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- subq $-128,%rbp
- vpalignr $4,%xmm0,%xmm1,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm2,%xmm3,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm0,%xmm0
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm3,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm0,%xmm0
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm0,%xmm0
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- vpshufd $80,%xmm0,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm0,%xmm0
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 0(%rbp),%xmm0,%xmm6
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,0(%rsp)
- vpalignr $4,%xmm1,%xmm2,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm3,%xmm0,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm1,%xmm1
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm0,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm1,%xmm1
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm1,%xmm1
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- vpshufd $80,%xmm1,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm1,%xmm1
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 32(%rbp),%xmm1,%xmm6
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,16(%rsp)
- vpalignr $4,%xmm2,%xmm3,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- vpalignr $4,%xmm0,%xmm1,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- vpaddd %xmm7,%xmm2,%xmm2
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- vpshufd $250,%xmm1,%xmm7
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- vpsrld $11,%xmm6,%xmm6
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- vpaddd %xmm4,%xmm2,%xmm2
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- vpxor %xmm7,%xmm6,%xmm6
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- vpaddd %xmm6,%xmm2,%xmm2
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- vpshufd $80,%xmm2,%xmm7
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- vpxor %xmm7,%xmm6,%xmm6
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- vpaddd %xmm6,%xmm2,%xmm2
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- vpaddd 64(%rbp),%xmm2,%xmm6
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- vmovdqa %xmm6,32(%rsp)
- vpalignr $4,%xmm3,%xmm0,%xmm4
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- vpalignr $4,%xmm1,%xmm2,%xmm7
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- vpsrld $7,%xmm4,%xmm6
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- vpaddd %xmm7,%xmm3,%xmm3
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- vpsrld $3,%xmm4,%xmm7
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- vpslld $14,%xmm4,%xmm5
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- vpxor %xmm6,%xmm7,%xmm4
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- vpshufd $250,%xmm2,%xmm7
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- vpsrld $11,%xmm6,%xmm6
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- vpxor %xmm5,%xmm4,%xmm4
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- vpslld $11,%xmm5,%xmm5
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- vpxor %xmm6,%xmm4,%xmm4
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- vpsrld $10,%xmm7,%xmm6
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- vpxor %xmm5,%xmm4,%xmm4
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- vpsrlq $17,%xmm7,%xmm7
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- vpaddd %xmm4,%xmm3,%xmm3
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- vpsrlq $2,%xmm7,%xmm7
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- vpxor %xmm7,%xmm6,%xmm6
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- vpshufb %xmm8,%xmm6,%xmm6
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- vpaddd %xmm6,%xmm3,%xmm3
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- vpshufd $80,%xmm3,%xmm7
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- vpsrld $10,%xmm7,%xmm6
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- vpsrlq $17,%xmm7,%xmm7
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- vpxor %xmm7,%xmm6,%xmm6
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- vpsrlq $2,%xmm7,%xmm7
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- vpxor %xmm7,%xmm6,%xmm6
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- vpshufb %xmm9,%xmm6,%xmm6
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- vpaddd %xmm6,%xmm3,%xmm3
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- vpaddd 96(%rbp),%xmm3,%xmm6
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- vmovdqa %xmm6,48(%rsp)
- cmpb $0,131(%rbp)
- jne .Lavx_00_47
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 0(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 4(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 8(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 12(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 16(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 20(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 24(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 28(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%eax
- movl %r9d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r8d,%r13d
- xorl %r10d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %eax,%r14d
- andl %r8d,%r12d
- xorl %r8d,%r13d
- addl 32(%rsp),%r11d
- movl %eax,%r15d
- xorl %r10d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ebx,%r15d
- addl %r12d,%r11d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %eax,%r14d
- addl %r13d,%r11d
- xorl %ebx,%edi
- shrdl $2,%r14d,%r14d
- addl %r11d,%edx
- addl %edi,%r11d
- movl %edx,%r13d
- addl %r11d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r11d
- movl %r8d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %edx,%r13d
- xorl %r9d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r11d,%r14d
- andl %edx,%r12d
- xorl %edx,%r13d
- addl 36(%rsp),%r10d
- movl %r11d,%edi
- xorl %r9d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %eax,%edi
- addl %r12d,%r10d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r11d,%r14d
- addl %r13d,%r10d
- xorl %eax,%r15d
- shrdl $2,%r14d,%r14d
- addl %r10d,%ecx
- addl %r15d,%r10d
- movl %ecx,%r13d
- addl %r10d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r10d
- movl %edx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ecx,%r13d
- xorl %r8d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r10d,%r14d
- andl %ecx,%r12d
- xorl %ecx,%r13d
- addl 40(%rsp),%r9d
- movl %r10d,%r15d
- xorl %r8d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r11d,%r15d
- addl %r12d,%r9d
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r10d,%r14d
- addl %r13d,%r9d
- xorl %r11d,%edi
- shrdl $2,%r14d,%r14d
- addl %r9d,%ebx
- addl %edi,%r9d
- movl %ebx,%r13d
- addl %r9d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r9d
- movl %ecx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %ebx,%r13d
- xorl %edx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r9d,%r14d
- andl %ebx,%r12d
- xorl %ebx,%r13d
- addl 44(%rsp),%r8d
- movl %r9d,%edi
- xorl %edx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r10d,%edi
- addl %r12d,%r8d
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %r9d,%r14d
- addl %r13d,%r8d
- xorl %r10d,%r15d
- shrdl $2,%r14d,%r14d
- addl %r8d,%eax
- addl %r15d,%r8d
- movl %eax,%r13d
- addl %r8d,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%r8d
- movl %ebx,%r12d
- shrdl $9,%r14d,%r14d
- xorl %eax,%r13d
- xorl %ecx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %r8d,%r14d
- andl %eax,%r12d
- xorl %eax,%r13d
- addl 48(%rsp),%edx
- movl %r8d,%r15d
- xorl %ecx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r9d,%r15d
- addl %r12d,%edx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %r8d,%r14d
- addl %r13d,%edx
- xorl %r9d,%edi
- shrdl $2,%r14d,%r14d
- addl %edx,%r11d
- addl %edi,%edx
- movl %r11d,%r13d
- addl %edx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%edx
- movl %eax,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r11d,%r13d
- xorl %ebx,%r12d
- shrdl $5,%r13d,%r13d
- xorl %edx,%r14d
- andl %r11d,%r12d
- xorl %r11d,%r13d
- addl 52(%rsp),%ecx
- movl %edx,%edi
- xorl %ebx,%r12d
- shrdl $11,%r14d,%r14d
- xorl %r8d,%edi
- addl %r12d,%ecx
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %edx,%r14d
- addl %r13d,%ecx
- xorl %r8d,%r15d
- shrdl $2,%r14d,%r14d
- addl %ecx,%r10d
- addl %r15d,%ecx
- movl %r10d,%r13d
- addl %ecx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ecx
- movl %r11d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r10d,%r13d
- xorl %eax,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ecx,%r14d
- andl %r10d,%r12d
- xorl %r10d,%r13d
- addl 56(%rsp),%ebx
- movl %ecx,%r15d
- xorl %eax,%r12d
- shrdl $11,%r14d,%r14d
- xorl %edx,%r15d
- addl %r12d,%ebx
- shrdl $6,%r13d,%r13d
- andl %r15d,%edi
- xorl %ecx,%r14d
- addl %r13d,%ebx
- xorl %edx,%edi
- shrdl $2,%r14d,%r14d
- addl %ebx,%r9d
- addl %edi,%ebx
- movl %r9d,%r13d
- addl %ebx,%r14d
- shrdl $14,%r13d,%r13d
- movl %r14d,%ebx
- movl %r10d,%r12d
- shrdl $9,%r14d,%r14d
- xorl %r9d,%r13d
- xorl %r11d,%r12d
- shrdl $5,%r13d,%r13d
- xorl %ebx,%r14d
- andl %r9d,%r12d
- xorl %r9d,%r13d
- addl 60(%rsp),%eax
- movl %ebx,%edi
- xorl %r11d,%r12d
- shrdl $11,%r14d,%r14d
- xorl %ecx,%edi
- addl %r12d,%eax
- shrdl $6,%r13d,%r13d
- andl %edi,%r15d
- xorl %ebx,%r14d
- addl %r13d,%eax
- xorl %ecx,%r15d
- shrdl $2,%r14d,%r14d
- addl %eax,%r8d
- addl %r15d,%eax
- movl %r8d,%r13d
- addl %eax,%r14d
- movq 64+0(%rsp),%rdi
- movl %r14d,%eax
-
- addl 0(%rdi),%eax
- leaq 64(%rsi),%rsi
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- jb .Lloop_avx
-
- movq 88(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
-.type sha256_block_data_order_avx2,@function
-.align 64
-sha256_block_data_order_avx2:
-.cfi_startproc
-.Lavx2_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $544,%rsp
- shlq $4,%rdx
- andq $-1024,%rsp
- leaq (%rsi,%rdx,4),%rdx
- addq $448,%rsp
- movq %rdi,64+0(%rsp)
- movq %rsi,64+8(%rsp)
- movq %rdx,64+16(%rsp)
- movq %rax,88(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
-.Lprologue_avx2:
-
- vzeroupper
- subq $-64,%rsi
- movl 0(%rdi),%eax
- movq %rsi,%r12
- movl 4(%rdi),%ebx
- cmpq %rdx,%rsi
- movl 8(%rdi),%ecx
- cmoveq %rsp,%r12
- movl 12(%rdi),%edx
- movl 16(%rdi),%r8d
- movl 20(%rdi),%r9d
- movl 24(%rdi),%r10d
- movl 28(%rdi),%r11d
- vmovdqa K256+512+32(%rip),%ymm8
- vmovdqa K256+512+64(%rip),%ymm9
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqa K256+512(%rip),%ymm7
- vmovdqu -64+0(%rsi),%xmm0
- vmovdqu -64+16(%rsi),%xmm1
- vmovdqu -64+32(%rsi),%xmm2
- vmovdqu -64+48(%rsi),%xmm3
-
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm7,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm7,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
-
- leaq K256(%rip),%rbp
- vpshufb %ymm7,%ymm2,%ymm2
- vpaddd 0(%rbp),%ymm0,%ymm4
- vpshufb %ymm7,%ymm3,%ymm3
- vpaddd 32(%rbp),%ymm1,%ymm5
- vpaddd 64(%rbp),%ymm2,%ymm6
- vpaddd 96(%rbp),%ymm3,%ymm7
- vmovdqa %ymm4,0(%rsp)
- xorl %r14d,%r14d
- vmovdqa %ymm5,32(%rsp)
-
- movq 88(%rsp),%rdi
-.cfi_def_cfa %rdi,8
- leaq -64(%rsp),%rsp
-
-
-
- movq %rdi,-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- movl %ebx,%edi
- vmovdqa %ymm6,0(%rsp)
- xorl %ecx,%edi
- vmovdqa %ymm7,32(%rsp)
- movl %r9d,%r12d
- subq $-32*4,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- leaq -64(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
-
- pushq 64-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $4,%ymm0,%ymm1,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm2,%ymm3,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm0,%ymm0
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm3,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm0,%ymm0
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm0,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm0,%ymm0
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 0(%rbp),%ymm0,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm1,%ymm2,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm3,%ymm0,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm1,%ymm1
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm0,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm1,%ymm1
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm1,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm1,%ymm1
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 32(%rbp),%ymm1,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq -64(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
-
- pushq 64-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $4,%ymm2,%ymm3,%ymm4
- addl 0+128(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- vpalignr $4,%ymm0,%ymm1,%ymm7
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- vpsrld $7,%ymm4,%ymm6
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- vpaddd %ymm7,%ymm2,%ymm2
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- vpshufd $250,%ymm1,%ymm7
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 4+128(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- vpslld $11,%ymm5,%ymm5
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- vpaddd %ymm4,%ymm2,%ymm2
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 8+128(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- vpshufd $80,%ymm2,%ymm7
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 12+128(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- vpxor %ymm7,%ymm6,%ymm6
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- vpaddd %ymm6,%ymm2,%ymm2
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- vpaddd 64(%rbp),%ymm2,%ymm6
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- vmovdqa %ymm6,0(%rsp)
- vpalignr $4,%ymm3,%ymm0,%ymm4
- addl 32+128(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- vpalignr $4,%ymm1,%ymm2,%ymm7
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- vpsrld $7,%ymm4,%ymm6
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- vpaddd %ymm7,%ymm3,%ymm3
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- vpsrld $3,%ymm4,%ymm7
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- vpslld $14,%ymm4,%ymm5
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- vpxor %ymm6,%ymm7,%ymm4
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- vpshufd $250,%ymm2,%ymm7
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- vpsrld $11,%ymm6,%ymm6
- addl 36+128(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- vpslld $11,%ymm5,%ymm5
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- vpxor %ymm6,%ymm4,%ymm4
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- vpsrld $10,%ymm7,%ymm6
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- vpxor %ymm5,%ymm4,%ymm4
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- vpsrlq $17,%ymm7,%ymm7
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- vpaddd %ymm4,%ymm3,%ymm3
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 40+128(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- vpxor %ymm7,%ymm6,%ymm6
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- vpshufb %ymm8,%ymm6,%ymm6
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- vpshufd $80,%ymm3,%ymm7
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- vpsrld $10,%ymm7,%ymm6
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- vpsrlq $17,%ymm7,%ymm7
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- vpxor %ymm7,%ymm6,%ymm6
- addl 44+128(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- vpsrlq $2,%ymm7,%ymm7
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- vpxor %ymm7,%ymm6,%ymm6
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- vpshufb %ymm9,%ymm6,%ymm6
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- vpaddd %ymm6,%ymm3,%ymm3
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- vpaddd 96(%rbp),%ymm3,%ymm6
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- vmovdqa %ymm6,32(%rsp)
- leaq 128(%rbp),%rbp
- cmpb $0,3(%rbp)
- jne .Lavx2_00_47
- addl 0+64(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+64(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+64(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+64(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+64(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+64(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+64(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+64(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- addl 0(%rsp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4(%rsp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8(%rsp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12(%rsp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32(%rsp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36(%rsp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40(%rsp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44(%rsp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rbp
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- addl 24(%rdi),%r10d
- addl 28(%rdi),%r11d
-
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- cmpq 80(%rbp),%rsi
- je .Ldone_avx2
-
- xorl %r14d,%r14d
- movl %ebx,%edi
- xorl %ecx,%edi
- movl %r9d,%r12d
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- addl 0+16(%rbp),%r11d
- andl %r8d,%r12d
- rorxl $25,%r8d,%r13d
- rorxl $11,%r8d,%r15d
- leal (%rax,%r14,1),%eax
- leal (%r11,%r12,1),%r11d
- andnl %r10d,%r8d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r8d,%r14d
- leal (%r11,%r12,1),%r11d
- xorl %r14d,%r13d
- movl %eax,%r15d
- rorxl $22,%eax,%r12d
- leal (%r11,%r13,1),%r11d
- xorl %ebx,%r15d
- rorxl $13,%eax,%r14d
- rorxl $2,%eax,%r13d
- leal (%rdx,%r11,1),%edx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %ebx,%edi
- xorl %r13d,%r14d
- leal (%r11,%rdi,1),%r11d
- movl %r8d,%r12d
- addl 4+16(%rbp),%r10d
- andl %edx,%r12d
- rorxl $25,%edx,%r13d
- rorxl $11,%edx,%edi
- leal (%r11,%r14,1),%r11d
- leal (%r10,%r12,1),%r10d
- andnl %r9d,%edx,%r12d
- xorl %edi,%r13d
- rorxl $6,%edx,%r14d
- leal (%r10,%r12,1),%r10d
- xorl %r14d,%r13d
- movl %r11d,%edi
- rorxl $22,%r11d,%r12d
- leal (%r10,%r13,1),%r10d
- xorl %eax,%edi
- rorxl $13,%r11d,%r14d
- rorxl $2,%r11d,%r13d
- leal (%rcx,%r10,1),%ecx
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %eax,%r15d
- xorl %r13d,%r14d
- leal (%r10,%r15,1),%r10d
- movl %edx,%r12d
- addl 8+16(%rbp),%r9d
- andl %ecx,%r12d
- rorxl $25,%ecx,%r13d
- rorxl $11,%ecx,%r15d
- leal (%r10,%r14,1),%r10d
- leal (%r9,%r12,1),%r9d
- andnl %r8d,%ecx,%r12d
- xorl %r15d,%r13d
- rorxl $6,%ecx,%r14d
- leal (%r9,%r12,1),%r9d
- xorl %r14d,%r13d
- movl %r10d,%r15d
- rorxl $22,%r10d,%r12d
- leal (%r9,%r13,1),%r9d
- xorl %r11d,%r15d
- rorxl $13,%r10d,%r14d
- rorxl $2,%r10d,%r13d
- leal (%rbx,%r9,1),%ebx
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r11d,%edi
- xorl %r13d,%r14d
- leal (%r9,%rdi,1),%r9d
- movl %ecx,%r12d
- addl 12+16(%rbp),%r8d
- andl %ebx,%r12d
- rorxl $25,%ebx,%r13d
- rorxl $11,%ebx,%edi
- leal (%r9,%r14,1),%r9d
- leal (%r8,%r12,1),%r8d
- andnl %edx,%ebx,%r12d
- xorl %edi,%r13d
- rorxl $6,%ebx,%r14d
- leal (%r8,%r12,1),%r8d
- xorl %r14d,%r13d
- movl %r9d,%edi
- rorxl $22,%r9d,%r12d
- leal (%r8,%r13,1),%r8d
- xorl %r10d,%edi
- rorxl $13,%r9d,%r14d
- rorxl $2,%r9d,%r13d
- leal (%rax,%r8,1),%eax
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r10d,%r15d
- xorl %r13d,%r14d
- leal (%r8,%r15,1),%r8d
- movl %ebx,%r12d
- addl 32+16(%rbp),%edx
- andl %eax,%r12d
- rorxl $25,%eax,%r13d
- rorxl $11,%eax,%r15d
- leal (%r8,%r14,1),%r8d
- leal (%rdx,%r12,1),%edx
- andnl %ecx,%eax,%r12d
- xorl %r15d,%r13d
- rorxl $6,%eax,%r14d
- leal (%rdx,%r12,1),%edx
- xorl %r14d,%r13d
- movl %r8d,%r15d
- rorxl $22,%r8d,%r12d
- leal (%rdx,%r13,1),%edx
- xorl %r9d,%r15d
- rorxl $13,%r8d,%r14d
- rorxl $2,%r8d,%r13d
- leal (%r11,%rdx,1),%r11d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %r9d,%edi
- xorl %r13d,%r14d
- leal (%rdx,%rdi,1),%edx
- movl %eax,%r12d
- addl 36+16(%rbp),%ecx
- andl %r11d,%r12d
- rorxl $25,%r11d,%r13d
- rorxl $11,%r11d,%edi
- leal (%rdx,%r14,1),%edx
- leal (%rcx,%r12,1),%ecx
- andnl %ebx,%r11d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r11d,%r14d
- leal (%rcx,%r12,1),%ecx
- xorl %r14d,%r13d
- movl %edx,%edi
- rorxl $22,%edx,%r12d
- leal (%rcx,%r13,1),%ecx
- xorl %r8d,%edi
- rorxl $13,%edx,%r14d
- rorxl $2,%edx,%r13d
- leal (%r10,%rcx,1),%r10d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %r8d,%r15d
- xorl %r13d,%r14d
- leal (%rcx,%r15,1),%ecx
- movl %r11d,%r12d
- addl 40+16(%rbp),%ebx
- andl %r10d,%r12d
- rorxl $25,%r10d,%r13d
- rorxl $11,%r10d,%r15d
- leal (%rcx,%r14,1),%ecx
- leal (%rbx,%r12,1),%ebx
- andnl %eax,%r10d,%r12d
- xorl %r15d,%r13d
- rorxl $6,%r10d,%r14d
- leal (%rbx,%r12,1),%ebx
- xorl %r14d,%r13d
- movl %ecx,%r15d
- rorxl $22,%ecx,%r12d
- leal (%rbx,%r13,1),%ebx
- xorl %edx,%r15d
- rorxl $13,%ecx,%r14d
- rorxl $2,%ecx,%r13d
- leal (%r9,%rbx,1),%r9d
- andl %r15d,%edi
- xorl %r12d,%r14d
- xorl %edx,%edi
- xorl %r13d,%r14d
- leal (%rbx,%rdi,1),%ebx
- movl %r10d,%r12d
- addl 44+16(%rbp),%eax
- andl %r9d,%r12d
- rorxl $25,%r9d,%r13d
- rorxl $11,%r9d,%edi
- leal (%rbx,%r14,1),%ebx
- leal (%rax,%r12,1),%eax
- andnl %r11d,%r9d,%r12d
- xorl %edi,%r13d
- rorxl $6,%r9d,%r14d
- leal (%rax,%r12,1),%eax
- xorl %r14d,%r13d
- movl %ebx,%edi
- rorxl $22,%ebx,%r12d
- leal (%rax,%r13,1),%eax
- xorl %ecx,%edi
- rorxl $13,%ebx,%r14d
- rorxl $2,%ebx,%r13d
- leal (%r8,%rax,1),%r8d
- andl %edi,%r15d
- xorl %r12d,%r14d
- xorl %ecx,%r15d
- xorl %r13d,%r14d
- leal (%rax,%r15,1),%eax
- movl %r9d,%r12d
- leaq -64(%rbp),%rbp
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 512(%rsp),%rdi
- addl %r14d,%eax
-
- leaq 448(%rsp),%rsp
-
-.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
-
- addl 0(%rdi),%eax
- addl 4(%rdi),%ebx
- addl 8(%rdi),%ecx
- addl 12(%rdi),%edx
- addl 16(%rdi),%r8d
- addl 20(%rdi),%r9d
- leaq 128(%rsi),%rsi
- addl 24(%rdi),%r10d
- movq %rsi,%r12
- addl 28(%rdi),%r11d
- cmpq 64+16(%rsp),%rsi
-
- movl %eax,0(%rdi)
- cmoveq %rsp,%r12
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-
-.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08
-
-.Ldone_avx2:
- movq 88(%rbp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s
index 3744b830146..72f59523725 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/sha/sha512-x86_64.s
@@ -6,20 +6,6 @@
.align 16
sha512_block_data_order:
.cfi_startproc
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl 0(%r11),%r9d
- movl 4(%r11),%r10d
- movl 8(%r11),%r11d
- testl $2048,%r10d
- jnz .Lxop_shortcut
- andl $296,%r11d
- cmpl $296,%r11d
- je .Lavx2_shortcut
- andl $1073741824,%r9d
- andl $268435968,%r10d
- orl %r9d,%r10d
- cmpl $1342177792,%r10d
- je .Lavx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -1813,3653 +1799,7 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.type sha512_block_data_order_xop,@function
-.align 64
-sha512_block_data_order_xop:
-.cfi_startproc
-.Lxop_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
-.Lprologue_xop:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Lloop_xop
-.align 16
-.Lloop_xop:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm0,%xmm0
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,223,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm7,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm0,%xmm0
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm1,%xmm1
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,216,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm0,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm1,%xmm1
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm2,%xmm2
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,217,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm1,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm2,%xmm2
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm3,%xmm3
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,218,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm2,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm3,%xmm3
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm4,%xmm4
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,219,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm3,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm4,%xmm4
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm5,%xmm5
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,220,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm4,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm5,%xmm5
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm6,%xmm6
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,221,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm5,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm6,%xmm6
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm7,%xmm7
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,222,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm6,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm7,%xmm7
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne .Lxop_00_47
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb .Lloop_xop
-
- movq 152(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_xop:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
-.type sha512_block_data_order_avx,@function
-.align 64
-sha512_block_data_order_avx:
-.cfi_startproc
-.Lavx_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
-.Lprologue_avx:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Lloop_avx
-.align 16
-.Lloop_avx:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp .Lavx_00_47
-
-.align 16
-.Lavx_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm0,%xmm0
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 0(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm7,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm7,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm7,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 8(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm0,%xmm0
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm1,%xmm1
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 16(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm0,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm0,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm0,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 24(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm1,%xmm1
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm2,%xmm2
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 32(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm1,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm1,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm1,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm2,%xmm2
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm3,%xmm3
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm2,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm2,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm2,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm3,%xmm3
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r8,%r13
- xorq %r10,%r12
- vpaddq %xmm11,%xmm4,%xmm4
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r8,%r12
- xorq %r8,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 64(%rsp),%r11
- movq %rax,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rbx,%r15
- addq %r12,%r11
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm3,%xmm11
- addq %r11,%rdx
- addq %rdi,%r11
- vpxor %xmm9,%xmm8,%xmm8
- movq %rdx,%r13
- addq %r11,%r14
- vpsllq $3,%xmm3,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %r8,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm3,%xmm9
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rdx,%r12
- xorq %rdx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 72(%rsp),%r10
- movq %r11,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rax,%rdi
- addq %r12,%r10
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm4,%xmm4
- xorq %r11,%r14
- addq %r13,%r10
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rcx,%r13
- xorq %r8,%r12
- vpaddq %xmm11,%xmm5,%xmm5
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rcx,%r12
- xorq %rcx,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 80(%rsp),%r9
- movq %r10,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r11,%r15
- addq %r12,%r9
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm4,%xmm11
- addq %r9,%rbx
- addq %rdi,%r9
- vpxor %xmm9,%xmm8,%xmm8
- movq %rbx,%r13
- addq %r9,%r14
- vpsllq $3,%xmm4,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm4,%xmm9
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %rbx,%r12
- xorq %rbx,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 88(%rsp),%r8
- movq %r9,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r10,%rdi
- addq %r12,%r8
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm5,%xmm5
- xorq %r9,%r14
- addq %r13,%r8
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %rax,%r13
- xorq %rcx,%r12
- vpaddq %xmm11,%xmm6,%xmm6
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %rax,%r12
- xorq %rax,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 96(%rsp),%rdx
- movq %r8,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %r9,%r15
- addq %r12,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm5,%xmm11
- addq %rdx,%r11
- addq %rdi,%rdx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r11,%r13
- addq %rdx,%r14
- vpsllq $3,%xmm5,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %rax,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm5,%xmm9
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r11,%r12
- xorq %r11,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %r8,%rdi
- addq %r12,%rcx
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm6,%xmm6
- xorq %rdx,%r14
- addq %r13,%rcx
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- shrdq $5,%r14,%r14
- vpsrlq $1,%xmm8,%xmm10
- xorq %r10,%r13
- xorq %rax,%r12
- vpaddq %xmm11,%xmm7,%xmm7
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- vpsrlq $7,%xmm8,%xmm11
- andq %r10,%r12
- xorq %r10,%r13
- vpsllq $56,%xmm8,%xmm9
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- vpxor %xmm10,%xmm11,%xmm8
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- vpsrlq $7,%xmm10,%xmm10
- xorq %rdx,%r15
- addq %r12,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- vpsllq $7,%xmm9,%xmm9
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- vpsrlq $6,%xmm6,%xmm11
- addq %rbx,%r9
- addq %rdi,%rbx
- vpxor %xmm9,%xmm8,%xmm8
- movq %r9,%r13
- addq %rbx,%r14
- vpsllq $3,%xmm6,%xmm10
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r10,%r12
- shrdq $5,%r14,%r14
- vpsrlq $19,%xmm6,%xmm9
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm10,%xmm11,%xmm11
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- vpsllq $42,%xmm10,%xmm10
- andq %r9,%r12
- xorq %r9,%r13
- vpxor %xmm9,%xmm11,%xmm11
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- vpsrlq $42,%xmm9,%xmm9
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- vpxor %xmm10,%xmm11,%xmm11
- xorq %rcx,%rdi
- addq %r12,%rax
- vpxor %xmm9,%xmm11,%xmm11
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- vpaddq %xmm11,%xmm7,%xmm7
- xorq %rbx,%r14
- addq %r13,%rax
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne .Lavx_00_47
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rax
- movq %r9,%r12
- shrdq $5,%r14,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- shrdq $4,%r13,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- shrdq $6,%r14,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- shrdq $28,%r14,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r11
- movq %r8,%r12
- shrdq $5,%r14,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- shrdq $4,%r13,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- shrdq $6,%r14,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- shrdq $28,%r14,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r10
- movq %rdx,%r12
- shrdq $5,%r14,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- shrdq $4,%r13,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- shrdq $6,%r14,%r14
- xorq %r11,%r15
- addq %r12,%r9
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- shrdq $28,%r14,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r9
- movq %rcx,%r12
- shrdq $5,%r14,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- shrdq $4,%r13,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- shrdq $6,%r14,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- shrdq $28,%r14,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- shrdq $23,%r13,%r13
- movq %r14,%r8
- movq %rbx,%r12
- shrdq $5,%r14,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- shrdq $4,%r13,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- shrdq $6,%r14,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- shrdq $28,%r14,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rdx
- movq %rax,%r12
- shrdq $5,%r14,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- shrdq $4,%r13,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- shrdq $6,%r14,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- shrdq $28,%r14,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rcx
- movq %r11,%r12
- shrdq $5,%r14,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- shrdq $4,%r13,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- shrdq $6,%r14,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- shrdq $14,%r13,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- shrdq $28,%r14,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- shrdq $23,%r13,%r13
- movq %r14,%rbx
- movq %r10,%r12
- shrdq $5,%r14,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- shrdq $4,%r13,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- shrdq $6,%r14,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- shrdq $14,%r13,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- shrdq $28,%r14,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb .Lloop_avx
-
- movq 152(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
-.type sha512_block_data_order_avx2,@function
-.align 64
-sha512_block_data_order_avx2:
-.cfi_startproc
-.Lavx2_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- subq $1312,%rsp
- shlq $4,%rdx
- andq $-2048,%rsp
- leaq (%rsi,%rdx,8),%rdx
- addq $1152,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
-.Lprologue_avx2:
-
- vzeroupper
- subq $-128,%rsi
- movq 0(%rdi),%rax
- movq %rsi,%r12
- movq 8(%rdi),%rbx
- cmpq %rdx,%rsi
- movq 16(%rdi),%rcx
- cmoveq %rsp,%r12
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Loop_avx2
-.align 16
-.Loop_avx2:
- vmovdqu -128(%rsi),%xmm0
- vmovdqu -128+16(%rsi),%xmm1
- vmovdqu -128+32(%rsi),%xmm2
- leaq K512+128(%rip),%rbp
- vmovdqu -128+48(%rsi),%xmm3
- vmovdqu -128+64(%rsi),%xmm4
- vmovdqu -128+80(%rsi),%xmm5
- vmovdqu -128+96(%rsi),%xmm6
- vmovdqu -128+112(%rsi),%xmm7
-
- vmovdqa 1152(%rbp),%ymm10
- vinserti128 $1,(%r12),%ymm0,%ymm0
- vinserti128 $1,16(%r12),%ymm1,%ymm1
- vpshufb %ymm10,%ymm0,%ymm0
- vinserti128 $1,32(%r12),%ymm2,%ymm2
- vpshufb %ymm10,%ymm1,%ymm1
- vinserti128 $1,48(%r12),%ymm3,%ymm3
- vpshufb %ymm10,%ymm2,%ymm2
- vinserti128 $1,64(%r12),%ymm4,%ymm4
- vpshufb %ymm10,%ymm3,%ymm3
- vinserti128 $1,80(%r12),%ymm5,%ymm5
- vpshufb %ymm10,%ymm4,%ymm4
- vinserti128 $1,96(%r12),%ymm6,%ymm6
- vpshufb %ymm10,%ymm5,%ymm5
- vinserti128 $1,112(%r12),%ymm7,%ymm7
-
- vpaddq -128(%rbp),%ymm0,%ymm8
- vpshufb %ymm10,%ymm6,%ymm6
- vpaddq -96(%rbp),%ymm1,%ymm9
- vpshufb %ymm10,%ymm7,%ymm7
- vpaddq -64(%rbp),%ymm2,%ymm10
- vpaddq -32(%rbp),%ymm3,%ymm11
- vmovdqa %ymm8,0(%rsp)
- vpaddq 0(%rbp),%ymm4,%ymm8
- vmovdqa %ymm9,32(%rsp)
- vpaddq 32(%rbp),%ymm5,%ymm9
- vmovdqa %ymm10,64(%rsp)
- vpaddq 64(%rbp),%ymm6,%ymm10
- vmovdqa %ymm11,96(%rsp)
-
- movq 152(%rsp),%rdi
-.cfi_def_cfa %rdi,8
- leaq -128(%rsp),%rsp
-
-
-
- movq %rdi,-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpaddq 96(%rbp),%ymm7,%ymm11
- vmovdqa %ymm8,0(%rsp)
- xorq %r14,%r14
- vmovdqa %ymm9,32(%rsp)
- movq %rbx,%rdi
- vmovdqa %ymm10,64(%rsp)
- xorq %rcx,%rdi
- vmovdqa %ymm11,96(%rsp)
- movq %r9,%r12
- addq $32*8,%rbp
- jmp .Lavx2_00_47
-
-.align 16
-.Lavx2_00_47:
- leaq -128(%rsp),%rsp
-.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
-
- pushq 128-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $8,%ymm0,%ymm1,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm4,%ymm5,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm0,%ymm0
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm7,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm7,%ymm10
- vpaddq %ymm8,%ymm0,%ymm0
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm7,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm0,%ymm0
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq -128(%rbp),%ymm0,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm1,%ymm2,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm5,%ymm6,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm1,%ymm1
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm0,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm0,%ymm10
- vpaddq %ymm8,%ymm1,%ymm1
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm0,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm1,%ymm1
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq -96(%rbp),%ymm1,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm2,%ymm3,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm6,%ymm7,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm2,%ymm2
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm1,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm1,%ymm10
- vpaddq %ymm8,%ymm2,%ymm2
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm1,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm2,%ymm2
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq -64(%rbp),%ymm2,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm3,%ymm4,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm7,%ymm0,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm3,%ymm3
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm2,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm2,%ymm10
- vpaddq %ymm8,%ymm3,%ymm3
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm2,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm3,%ymm3
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq -32(%rbp),%ymm3,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq -128(%rsp),%rsp
-.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
-
- pushq 128-8(%rsp)
-.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
- leaq 8(%rsp),%rsp
-.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
- vpalignr $8,%ymm4,%ymm5,%ymm8
- addq 0+256(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- vpalignr $8,%ymm0,%ymm1,%ymm11
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- vpsrlq $1,%ymm8,%ymm10
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- vpaddq %ymm11,%ymm4,%ymm4
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- vpsrlq $6,%ymm3,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- vpsllq $3,%ymm3,%ymm10
- vpaddq %ymm8,%ymm4,%ymm4
- addq 8+256(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- vpsrlq $19,%ymm3,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- vpaddq %ymm11,%ymm4,%ymm4
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- vpaddq 0(%rbp),%ymm4,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- vmovdqa %ymm10,0(%rsp)
- vpalignr $8,%ymm5,%ymm6,%ymm8
- addq 32+256(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- vpalignr $8,%ymm1,%ymm2,%ymm11
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- vpsrlq $1,%ymm8,%ymm10
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- vpaddq %ymm11,%ymm5,%ymm5
- vpsrlq $7,%ymm8,%ymm11
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- vpsrlq $6,%ymm4,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- vpsllq $3,%ymm4,%ymm10
- vpaddq %ymm8,%ymm5,%ymm5
- addq 40+256(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- vpsrlq $19,%ymm4,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- vpaddq %ymm11,%ymm5,%ymm5
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- vpaddq 32(%rbp),%ymm5,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- vmovdqa %ymm10,32(%rsp)
- vpalignr $8,%ymm6,%ymm7,%ymm8
- addq 64+256(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- vpalignr $8,%ymm2,%ymm3,%ymm11
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- vpaddq %ymm11,%ymm6,%ymm6
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- vpsrlq $6,%ymm5,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- vpsllq $3,%ymm5,%ymm10
- vpaddq %ymm8,%ymm6,%ymm6
- addq 72+256(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- vpsrlq $19,%ymm5,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- vpaddq %ymm11,%ymm6,%ymm6
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- vpaddq 64(%rbp),%ymm6,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- vmovdqa %ymm10,64(%rsp)
- vpalignr $8,%ymm7,%ymm0,%ymm8
- addq 96+256(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- vpalignr $8,%ymm3,%ymm4,%ymm11
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- vpsrlq $1,%ymm8,%ymm10
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- vpaddq %ymm11,%ymm7,%ymm7
- vpsrlq $7,%ymm8,%ymm11
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- vpsllq $56,%ymm8,%ymm9
- vpxor %ymm10,%ymm11,%ymm8
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- vpsrlq $7,%ymm10,%ymm10
- vpxor %ymm9,%ymm8,%ymm8
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- vpsllq $7,%ymm9,%ymm9
- vpxor %ymm10,%ymm8,%ymm8
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- vpsrlq $6,%ymm6,%ymm11
- vpxor %ymm9,%ymm8,%ymm8
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- vpsllq $3,%ymm6,%ymm10
- vpaddq %ymm8,%ymm7,%ymm7
- addq 104+256(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- vpsrlq $19,%ymm6,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- vpsllq $42,%ymm10,%ymm10
- vpxor %ymm9,%ymm11,%ymm11
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- vpsrlq $42,%ymm9,%ymm9
- vpxor %ymm10,%ymm11,%ymm11
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- vpxor %ymm9,%ymm11,%ymm11
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- vpaddq %ymm11,%ymm7,%ymm7
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- vpaddq 96(%rbp),%ymm7,%ymm10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- vmovdqa %ymm10,96(%rsp)
- leaq 256(%rbp),%rbp
- cmpb $0,-121(%rbp)
- jne .Lavx2_00_47
- addq 0+128(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+128(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+128(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+128(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+128(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+128(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+128(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+128(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- addq 0(%rsp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8(%rsp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32(%rsp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40(%rsp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64(%rsp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72(%rsp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96(%rsp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104(%rsp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rbp
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- cmpq 144(%rbp),%rsi
- je .Ldone_avx2
-
- xorq %r14,%r14
- movq %rbx,%rdi
- xorq %rcx,%rdi
- movq %r9,%r12
- jmp .Lower_avx2
-.align 16
-.Lower_avx2:
- addq 0+16(%rbp),%r11
- andq %r8,%r12
- rorxq $41,%r8,%r13
- rorxq $18,%r8,%r15
- leaq (%rax,%r14,1),%rax
- leaq (%r11,%r12,1),%r11
- andnq %r10,%r8,%r12
- xorq %r15,%r13
- rorxq $14,%r8,%r14
- leaq (%r11,%r12,1),%r11
- xorq %r14,%r13
- movq %rax,%r15
- rorxq $39,%rax,%r12
- leaq (%r11,%r13,1),%r11
- xorq %rbx,%r15
- rorxq $34,%rax,%r14
- rorxq $28,%rax,%r13
- leaq (%rdx,%r11,1),%rdx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rbx,%rdi
- xorq %r13,%r14
- leaq (%r11,%rdi,1),%r11
- movq %r8,%r12
- addq 8+16(%rbp),%r10
- andq %rdx,%r12
- rorxq $41,%rdx,%r13
- rorxq $18,%rdx,%rdi
- leaq (%r11,%r14,1),%r11
- leaq (%r10,%r12,1),%r10
- andnq %r9,%rdx,%r12
- xorq %rdi,%r13
- rorxq $14,%rdx,%r14
- leaq (%r10,%r12,1),%r10
- xorq %r14,%r13
- movq %r11,%rdi
- rorxq $39,%r11,%r12
- leaq (%r10,%r13,1),%r10
- xorq %rax,%rdi
- rorxq $34,%r11,%r14
- rorxq $28,%r11,%r13
- leaq (%rcx,%r10,1),%rcx
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rax,%r15
- xorq %r13,%r14
- leaq (%r10,%r15,1),%r10
- movq %rdx,%r12
- addq 32+16(%rbp),%r9
- andq %rcx,%r12
- rorxq $41,%rcx,%r13
- rorxq $18,%rcx,%r15
- leaq (%r10,%r14,1),%r10
- leaq (%r9,%r12,1),%r9
- andnq %r8,%rcx,%r12
- xorq %r15,%r13
- rorxq $14,%rcx,%r14
- leaq (%r9,%r12,1),%r9
- xorq %r14,%r13
- movq %r10,%r15
- rorxq $39,%r10,%r12
- leaq (%r9,%r13,1),%r9
- xorq %r11,%r15
- rorxq $34,%r10,%r14
- rorxq $28,%r10,%r13
- leaq (%rbx,%r9,1),%rbx
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r11,%rdi
- xorq %r13,%r14
- leaq (%r9,%rdi,1),%r9
- movq %rcx,%r12
- addq 40+16(%rbp),%r8
- andq %rbx,%r12
- rorxq $41,%rbx,%r13
- rorxq $18,%rbx,%rdi
- leaq (%r9,%r14,1),%r9
- leaq (%r8,%r12,1),%r8
- andnq %rdx,%rbx,%r12
- xorq %rdi,%r13
- rorxq $14,%rbx,%r14
- leaq (%r8,%r12,1),%r8
- xorq %r14,%r13
- movq %r9,%rdi
- rorxq $39,%r9,%r12
- leaq (%r8,%r13,1),%r8
- xorq %r10,%rdi
- rorxq $34,%r9,%r14
- rorxq $28,%r9,%r13
- leaq (%rax,%r8,1),%rax
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r10,%r15
- xorq %r13,%r14
- leaq (%r8,%r15,1),%r8
- movq %rbx,%r12
- addq 64+16(%rbp),%rdx
- andq %rax,%r12
- rorxq $41,%rax,%r13
- rorxq $18,%rax,%r15
- leaq (%r8,%r14,1),%r8
- leaq (%rdx,%r12,1),%rdx
- andnq %rcx,%rax,%r12
- xorq %r15,%r13
- rorxq $14,%rax,%r14
- leaq (%rdx,%r12,1),%rdx
- xorq %r14,%r13
- movq %r8,%r15
- rorxq $39,%r8,%r12
- leaq (%rdx,%r13,1),%rdx
- xorq %r9,%r15
- rorxq $34,%r8,%r14
- rorxq $28,%r8,%r13
- leaq (%r11,%rdx,1),%r11
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %r9,%rdi
- xorq %r13,%r14
- leaq (%rdx,%rdi,1),%rdx
- movq %rax,%r12
- addq 72+16(%rbp),%rcx
- andq %r11,%r12
- rorxq $41,%r11,%r13
- rorxq $18,%r11,%rdi
- leaq (%rdx,%r14,1),%rdx
- leaq (%rcx,%r12,1),%rcx
- andnq %rbx,%r11,%r12
- xorq %rdi,%r13
- rorxq $14,%r11,%r14
- leaq (%rcx,%r12,1),%rcx
- xorq %r14,%r13
- movq %rdx,%rdi
- rorxq $39,%rdx,%r12
- leaq (%rcx,%r13,1),%rcx
- xorq %r8,%rdi
- rorxq $34,%rdx,%r14
- rorxq $28,%rdx,%r13
- leaq (%r10,%rcx,1),%r10
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %r8,%r15
- xorq %r13,%r14
- leaq (%rcx,%r15,1),%rcx
- movq %r11,%r12
- addq 96+16(%rbp),%rbx
- andq %r10,%r12
- rorxq $41,%r10,%r13
- rorxq $18,%r10,%r15
- leaq (%rcx,%r14,1),%rcx
- leaq (%rbx,%r12,1),%rbx
- andnq %rax,%r10,%r12
- xorq %r15,%r13
- rorxq $14,%r10,%r14
- leaq (%rbx,%r12,1),%rbx
- xorq %r14,%r13
- movq %rcx,%r15
- rorxq $39,%rcx,%r12
- leaq (%rbx,%r13,1),%rbx
- xorq %rdx,%r15
- rorxq $34,%rcx,%r14
- rorxq $28,%rcx,%r13
- leaq (%r9,%rbx,1),%r9
- andq %r15,%rdi
- xorq %r12,%r14
- xorq %rdx,%rdi
- xorq %r13,%r14
- leaq (%rbx,%rdi,1),%rbx
- movq %r10,%r12
- addq 104+16(%rbp),%rax
- andq %r9,%r12
- rorxq $41,%r9,%r13
- rorxq $18,%r9,%rdi
- leaq (%rbx,%r14,1),%rbx
- leaq (%rax,%r12,1),%rax
- andnq %r11,%r9,%r12
- xorq %rdi,%r13
- rorxq $14,%r9,%r14
- leaq (%rax,%r12,1),%rax
- xorq %r14,%r13
- movq %rbx,%rdi
- rorxq $39,%rbx,%r12
- leaq (%rax,%r13,1),%rax
- xorq %rcx,%rdi
- rorxq $34,%rbx,%r14
- rorxq $28,%rbx,%r13
- leaq (%r8,%rax,1),%r8
- andq %rdi,%r15
- xorq %r12,%r14
- xorq %rcx,%r15
- xorq %r13,%r14
- leaq (%rax,%r15,1),%rax
- movq %r9,%r12
- leaq -128(%rbp),%rbp
- cmpq %rsp,%rbp
- jae .Lower_avx2
-
- movq 1280(%rsp),%rdi
- addq %r14,%rax
-
- leaq 1152(%rsp),%rsp
-
-.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
-
- addq 0(%rdi),%rax
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- leaq 256(%rsi),%rsi
- addq 48(%rdi),%r10
- movq %rsi,%r12
- addq 56(%rdi),%r11
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- cmoveq %rsp,%r12
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
-
- jbe .Loop_avx2
- leaq (%rsp),%rbp
-
-
-.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08
-
-.Ldone_avx2:
- movq 152(%rbp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_avx2:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s
index 5fda386d1df..e23b53af26e 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-GCC/crypto/x86_64cpuid.s
@@ -489,7 +489,7 @@ OPENSSL_ia32_rdseed_bytes:
.byte 0xf3,0xc3
.cfi_endproc
.size OPENSSL_ia32_rdseed_bytes,.-OPENSSL_ia32_rdseed_bytes
- .section ".note.gnu.property", "a"
+ .section .note.gnu.property, #alloc
.p2align 3
.long 1f - 0f
.long 4f - 1f
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm
index 7908342cf4c..2ec2d3bd1cd 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm
@@ -21,14 +21,6 @@ $L$SEH_begin_aesni_multi_cbc_encrypt:
- cmp edx,2
- jb NEAR $L$enc_non_avx
- mov ecx,DWORD[((OPENSSL_ia32cap_P+4))]
- test ecx,268435456
- jnz NEAR _avx_cbc_enc_shortcut
- jmp NEAR $L$enc_non_avx
-ALIGN 16
-$L$enc_non_avx:
mov rax,rsp
push rbx
@@ -344,14 +336,6 @@ $L$SEH_begin_aesni_multi_cbc_decrypt:
- cmp edx,2
- jb NEAR $L$dec_non_avx
- mov ecx,DWORD[((OPENSSL_ia32cap_P+4))]
- test ecx,268435456
- jnz NEAR _avx_cbc_dec_shortcut
- jmp NEAR $L$dec_non_avx
-ALIGN 16
-$L$dec_non_avx:
mov rax,rsp
push rbx
@@ -642,1083 +626,6 @@ $L$dec4x_epilogue:
DB 0F3h,0C3h ;repret
$L$SEH_end_aesni_multi_cbc_decrypt:
-
-ALIGN 32
-aesni_multi_cbc_encrypt_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_multi_cbc_encrypt_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx_cbc_enc_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[64+rsp],xmm10
- movaps XMMWORD[80+rsp],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
-
-
-
-
-
-
-
-
- sub rsp,192
- and rsp,-128
- mov QWORD[16+rsp],rax
-
-
-$L$enc8x_body:
- vzeroupper
- vmovdqu xmm15,XMMWORD[rsi]
- lea rsi,[120+rsi]
- lea rdi,[160+rdi]
- shr edx,1
-
-$L$enc8x_loop_grande:
-
- xor edx,edx
-
- mov ecx,DWORD[((-144))+rdi]
-
- mov r8,QWORD[((-160))+rdi]
- cmp ecx,edx
-
- mov rbx,QWORD[((-152))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm2,XMMWORD[((-136))+rdi]
- mov DWORD[32+rsp],ecx
- cmovle r8,rsp
- sub rbx,r8
- mov QWORD[64+rsp],rbx
-
- mov ecx,DWORD[((-104))+rdi]
-
- mov r9,QWORD[((-120))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-112))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm3,XMMWORD[((-96))+rdi]
- mov DWORD[36+rsp],ecx
- cmovle r9,rsp
- sub rbp,r9
- mov QWORD[72+rsp],rbp
-
- mov ecx,DWORD[((-64))+rdi]
-
- mov r10,QWORD[((-80))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-72))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm4,XMMWORD[((-56))+rdi]
- mov DWORD[40+rsp],ecx
- cmovle r10,rsp
- sub rbp,r10
- mov QWORD[80+rsp],rbp
-
- mov ecx,DWORD[((-24))+rdi]
-
- mov r11,QWORD[((-40))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-32))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm5,XMMWORD[((-16))+rdi]
- mov DWORD[44+rsp],ecx
- cmovle r11,rsp
- sub rbp,r11
- mov QWORD[88+rsp],rbp
-
- mov ecx,DWORD[16+rdi]
-
- mov r12,QWORD[rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[8+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm6,XMMWORD[24+rdi]
- mov DWORD[48+rsp],ecx
- cmovle r12,rsp
- sub rbp,r12
- mov QWORD[96+rsp],rbp
-
- mov ecx,DWORD[56+rdi]
-
- mov r13,QWORD[40+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[48+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm7,XMMWORD[64+rdi]
- mov DWORD[52+rsp],ecx
- cmovle r13,rsp
- sub rbp,r13
- mov QWORD[104+rsp],rbp
-
- mov ecx,DWORD[96+rdi]
-
- mov r14,QWORD[80+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[88+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm8,XMMWORD[104+rdi]
- mov DWORD[56+rsp],ecx
- cmovle r14,rsp
- sub rbp,r14
- mov QWORD[112+rsp],rbp
-
- mov ecx,DWORD[136+rdi]
-
- mov r15,QWORD[120+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[128+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm9,XMMWORD[144+rdi]
- mov DWORD[60+rsp],ecx
- cmovle r15,rsp
- sub rbp,r15
- mov QWORD[120+rsp],rbp
- test edx,edx
- jz NEAR $L$enc8x_done
-
- vmovups xmm1,XMMWORD[((16-120))+rsi]
- vmovups xmm0,XMMWORD[((32-120))+rsi]
- mov eax,DWORD[((240-120))+rsi]
-
- vpxor xmm10,xmm15,XMMWORD[r8]
- lea rbp,[128+rsp]
- vpxor xmm11,xmm15,XMMWORD[r9]
- vpxor xmm12,xmm15,XMMWORD[r10]
- vpxor xmm13,xmm15,XMMWORD[r11]
- vpxor xmm2,xmm2,xmm10
- vpxor xmm10,xmm15,XMMWORD[r12]
- vpxor xmm3,xmm3,xmm11
- vpxor xmm11,xmm15,XMMWORD[r13]
- vpxor xmm4,xmm4,xmm12
- vpxor xmm12,xmm15,XMMWORD[r14]
- vpxor xmm5,xmm5,xmm13
- vpxor xmm13,xmm15,XMMWORD[r15]
- vpxor xmm6,xmm6,xmm10
- mov ecx,1
- vpxor xmm7,xmm7,xmm11
- vpxor xmm8,xmm8,xmm12
- vpxor xmm9,xmm9,xmm13
- jmp NEAR $L$oop_enc8x
-
-ALIGN 32
-$L$oop_enc8x:
- vaesenc xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+0))+rsp]
- vaesenc xmm3,xmm3,xmm1
- prefetcht0 [31+r8]
- vaesenc xmm4,xmm4,xmm1
- vaesenc xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r8]
- cmovge r8,rsp
- vaesenc xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm1
- sub rbx,r8
- vaesenc xmm8,xmm8,xmm1
- vpxor xmm10,xmm15,XMMWORD[16+r8]
- mov QWORD[((64+0))+rsp],rbx
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-72))+rsi]
- lea r8,[16+rbx*1+r8]
- vmovdqu XMMWORD[rbp],xmm10
- vaesenc xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+4))+rsp]
- mov rbx,QWORD[((64+8))+rsp]
- vaesenc xmm3,xmm3,xmm0
- prefetcht0 [31+r9]
- vaesenc xmm4,xmm4,xmm0
- vaesenc xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r9]
- cmovge r9,rsp
- vaesenc xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm0
- sub rbx,r9
- vaesenc xmm8,xmm8,xmm0
- vpxor xmm11,xmm15,XMMWORD[16+r9]
- mov QWORD[((64+8))+rsp],rbx
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((-56))+rsi]
- lea r9,[16+rbx*1+r9]
- vmovdqu XMMWORD[16+rbp],xmm11
- vaesenc xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+8))+rsp]
- mov rbx,QWORD[((64+16))+rsp]
- vaesenc xmm3,xmm3,xmm1
- prefetcht0 [31+r10]
- vaesenc xmm4,xmm4,xmm1
- prefetcht0 [15+r8]
- vaesenc xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r10]
- cmovge r10,rsp
- vaesenc xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm1
- sub rbx,r10
- vaesenc xmm8,xmm8,xmm1
- vpxor xmm12,xmm15,XMMWORD[16+r10]
- mov QWORD[((64+16))+rsp],rbx
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-40))+rsi]
- lea r10,[16+rbx*1+r10]
- vmovdqu XMMWORD[32+rbp],xmm12
- vaesenc xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+12))+rsp]
- mov rbx,QWORD[((64+24))+rsp]
- vaesenc xmm3,xmm3,xmm0
- prefetcht0 [31+r11]
- vaesenc xmm4,xmm4,xmm0
- prefetcht0 [15+r9]
- vaesenc xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r11]
- cmovge r11,rsp
- vaesenc xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm0
- sub rbx,r11
- vaesenc xmm8,xmm8,xmm0
- vpxor xmm13,xmm15,XMMWORD[16+r11]
- mov QWORD[((64+24))+rsp],rbx
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((-24))+rsi]
- lea r11,[16+rbx*1+r11]
- vmovdqu XMMWORD[48+rbp],xmm13
- vaesenc xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+16))+rsp]
- mov rbx,QWORD[((64+32))+rsp]
- vaesenc xmm3,xmm3,xmm1
- prefetcht0 [31+r12]
- vaesenc xmm4,xmm4,xmm1
- prefetcht0 [15+r10]
- vaesenc xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r12]
- cmovge r12,rsp
- vaesenc xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm1
- sub rbx,r12
- vaesenc xmm8,xmm8,xmm1
- vpxor xmm10,xmm15,XMMWORD[16+r12]
- mov QWORD[((64+32))+rsp],rbx
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-8))+rsi]
- lea r12,[16+rbx*1+r12]
- vaesenc xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+20))+rsp]
- mov rbx,QWORD[((64+40))+rsp]
- vaesenc xmm3,xmm3,xmm0
- prefetcht0 [31+r13]
- vaesenc xmm4,xmm4,xmm0
- prefetcht0 [15+r11]
- vaesenc xmm5,xmm5,xmm0
- lea rbx,[r13*1+rbx]
- cmovge r13,rsp
- vaesenc xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm0
- sub rbx,r13
- vaesenc xmm8,xmm8,xmm0
- vpxor xmm11,xmm15,XMMWORD[16+r13]
- mov QWORD[((64+40))+rsp],rbx
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[8+rsi]
- lea r13,[16+rbx*1+r13]
- vaesenc xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+24))+rsp]
- mov rbx,QWORD[((64+48))+rsp]
- vaesenc xmm3,xmm3,xmm1
- prefetcht0 [31+r14]
- vaesenc xmm4,xmm4,xmm1
- prefetcht0 [15+r12]
- vaesenc xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r14]
- cmovge r14,rsp
- vaesenc xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm1
- sub rbx,r14
- vaesenc xmm8,xmm8,xmm1
- vpxor xmm12,xmm15,XMMWORD[16+r14]
- mov QWORD[((64+48))+rsp],rbx
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[24+rsi]
- lea r14,[16+rbx*1+r14]
- vaesenc xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+28))+rsp]
- mov rbx,QWORD[((64+56))+rsp]
- vaesenc xmm3,xmm3,xmm0
- prefetcht0 [31+r15]
- vaesenc xmm4,xmm4,xmm0
- prefetcht0 [15+r13]
- vaesenc xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r15]
- cmovge r15,rsp
- vaesenc xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesenc xmm7,xmm7,xmm0
- sub rbx,r15
- vaesenc xmm8,xmm8,xmm0
- vpxor xmm13,xmm15,XMMWORD[16+r15]
- mov QWORD[((64+56))+rsp],rbx
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[40+rsi]
- lea r15,[16+rbx*1+r15]
- vmovdqu xmm14,XMMWORD[32+rsp]
- prefetcht0 [15+r14]
- prefetcht0 [15+r15]
- cmp eax,11
- jb NEAR $L$enc8x_tail
-
- vaesenc xmm2,xmm2,xmm1
- vaesenc xmm3,xmm3,xmm1
- vaesenc xmm4,xmm4,xmm1
- vaesenc xmm5,xmm5,xmm1
- vaesenc xmm6,xmm6,xmm1
- vaesenc xmm7,xmm7,xmm1
- vaesenc xmm8,xmm8,xmm1
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((176-120))+rsi]
-
- vaesenc xmm2,xmm2,xmm0
- vaesenc xmm3,xmm3,xmm0
- vaesenc xmm4,xmm4,xmm0
- vaesenc xmm5,xmm5,xmm0
- vaesenc xmm6,xmm6,xmm0
- vaesenc xmm7,xmm7,xmm0
- vaesenc xmm8,xmm8,xmm0
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((192-120))+rsi]
- je NEAR $L$enc8x_tail
-
- vaesenc xmm2,xmm2,xmm1
- vaesenc xmm3,xmm3,xmm1
- vaesenc xmm4,xmm4,xmm1
- vaesenc xmm5,xmm5,xmm1
- vaesenc xmm6,xmm6,xmm1
- vaesenc xmm7,xmm7,xmm1
- vaesenc xmm8,xmm8,xmm1
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((208-120))+rsi]
-
- vaesenc xmm2,xmm2,xmm0
- vaesenc xmm3,xmm3,xmm0
- vaesenc xmm4,xmm4,xmm0
- vaesenc xmm5,xmm5,xmm0
- vaesenc xmm6,xmm6,xmm0
- vaesenc xmm7,xmm7,xmm0
- vaesenc xmm8,xmm8,xmm0
- vaesenc xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((224-120))+rsi]
-
-$L$enc8x_tail:
- vaesenc xmm2,xmm2,xmm1
- vpxor xmm15,xmm15,xmm15
- vaesenc xmm3,xmm3,xmm1
- vaesenc xmm4,xmm4,xmm1
- vpcmpgtd xmm15,xmm14,xmm15
- vaesenc xmm5,xmm5,xmm1
- vaesenc xmm6,xmm6,xmm1
- vpaddd xmm15,xmm15,xmm14
- vmovdqu xmm14,XMMWORD[48+rsp]
- vaesenc xmm7,xmm7,xmm1
- mov rbx,QWORD[64+rsp]
- vaesenc xmm8,xmm8,xmm1
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((16-120))+rsi]
-
- vaesenclast xmm2,xmm2,xmm0
- vmovdqa XMMWORD[32+rsp],xmm15
- vpxor xmm15,xmm15,xmm15
- vaesenclast xmm3,xmm3,xmm0
- vaesenclast xmm4,xmm4,xmm0
- vpcmpgtd xmm15,xmm14,xmm15
- vaesenclast xmm5,xmm5,xmm0
- vaesenclast xmm6,xmm6,xmm0
- vpaddd xmm14,xmm14,xmm15
- vmovdqu xmm15,XMMWORD[((-120))+rsi]
- vaesenclast xmm7,xmm7,xmm0
- vaesenclast xmm8,xmm8,xmm0
- vmovdqa XMMWORD[48+rsp],xmm14
- vaesenclast xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((32-120))+rsi]
-
- vmovups XMMWORD[(-16)+r8],xmm2
- sub r8,rbx
- vpxor xmm2,xmm2,XMMWORD[rbp]
- vmovups XMMWORD[(-16)+r9],xmm3
- sub r9,QWORD[72+rsp]
- vpxor xmm3,xmm3,XMMWORD[16+rbp]
- vmovups XMMWORD[(-16)+r10],xmm4
- sub r10,QWORD[80+rsp]
- vpxor xmm4,xmm4,XMMWORD[32+rbp]
- vmovups XMMWORD[(-16)+r11],xmm5
- sub r11,QWORD[88+rsp]
- vpxor xmm5,xmm5,XMMWORD[48+rbp]
- vmovups XMMWORD[(-16)+r12],xmm6
- sub r12,QWORD[96+rsp]
- vpxor xmm6,xmm6,xmm10
- vmovups XMMWORD[(-16)+r13],xmm7
- sub r13,QWORD[104+rsp]
- vpxor xmm7,xmm7,xmm11
- vmovups XMMWORD[(-16)+r14],xmm8
- sub r14,QWORD[112+rsp]
- vpxor xmm8,xmm8,xmm12
- vmovups XMMWORD[(-16)+r15],xmm9
- sub r15,QWORD[120+rsp]
- vpxor xmm9,xmm9,xmm13
-
- dec edx
- jnz NEAR $L$oop_enc8x
-
- mov rax,QWORD[16+rsp]
-
-
-
-
-
-
-$L$enc8x_done:
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$enc8x_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_multi_cbc_encrypt_avx:
-
-
-ALIGN 32
-aesni_multi_cbc_decrypt_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_multi_cbc_decrypt_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx_cbc_dec_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[64+rsp],xmm10
- movaps XMMWORD[80+rsp],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
-
-
-
-
-
-
-
-
-
- sub rsp,256
- and rsp,-256
- sub rsp,192
- mov QWORD[16+rsp],rax
-
-
-$L$dec8x_body:
- vzeroupper
- vmovdqu xmm15,XMMWORD[rsi]
- lea rsi,[120+rsi]
- lea rdi,[160+rdi]
- shr edx,1
-
-$L$dec8x_loop_grande:
-
- xor edx,edx
-
- mov ecx,DWORD[((-144))+rdi]
-
- mov r8,QWORD[((-160))+rdi]
- cmp ecx,edx
-
- mov rbx,QWORD[((-152))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm2,XMMWORD[((-136))+rdi]
- mov DWORD[32+rsp],ecx
- cmovle r8,rsp
- sub rbx,r8
- mov QWORD[64+rsp],rbx
- vmovdqu XMMWORD[192+rsp],xmm2
-
- mov ecx,DWORD[((-104))+rdi]
-
- mov r9,QWORD[((-120))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-112))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm3,XMMWORD[((-96))+rdi]
- mov DWORD[36+rsp],ecx
- cmovle r9,rsp
- sub rbp,r9
- mov QWORD[72+rsp],rbp
- vmovdqu XMMWORD[208+rsp],xmm3
-
- mov ecx,DWORD[((-64))+rdi]
-
- mov r10,QWORD[((-80))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-72))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm4,XMMWORD[((-56))+rdi]
- mov DWORD[40+rsp],ecx
- cmovle r10,rsp
- sub rbp,r10
- mov QWORD[80+rsp],rbp
- vmovdqu XMMWORD[224+rsp],xmm4
-
- mov ecx,DWORD[((-24))+rdi]
-
- mov r11,QWORD[((-40))+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[((-32))+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm5,XMMWORD[((-16))+rdi]
- mov DWORD[44+rsp],ecx
- cmovle r11,rsp
- sub rbp,r11
- mov QWORD[88+rsp],rbp
- vmovdqu XMMWORD[240+rsp],xmm5
-
- mov ecx,DWORD[16+rdi]
-
- mov r12,QWORD[rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[8+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm6,XMMWORD[24+rdi]
- mov DWORD[48+rsp],ecx
- cmovle r12,rsp
- sub rbp,r12
- mov QWORD[96+rsp],rbp
- vmovdqu XMMWORD[256+rsp],xmm6
-
- mov ecx,DWORD[56+rdi]
-
- mov r13,QWORD[40+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[48+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm7,XMMWORD[64+rdi]
- mov DWORD[52+rsp],ecx
- cmovle r13,rsp
- sub rbp,r13
- mov QWORD[104+rsp],rbp
- vmovdqu XMMWORD[272+rsp],xmm7
-
- mov ecx,DWORD[96+rdi]
-
- mov r14,QWORD[80+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[88+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm8,XMMWORD[104+rdi]
- mov DWORD[56+rsp],ecx
- cmovle r14,rsp
- sub rbp,r14
- mov QWORD[112+rsp],rbp
- vmovdqu XMMWORD[288+rsp],xmm8
-
- mov ecx,DWORD[136+rdi]
-
- mov r15,QWORD[120+rdi]
- cmp ecx,edx
-
- mov rbp,QWORD[128+rdi]
- cmovg edx,ecx
- test ecx,ecx
-
- vmovdqu xmm9,XMMWORD[144+rdi]
- mov DWORD[60+rsp],ecx
- cmovle r15,rsp
- sub rbp,r15
- mov QWORD[120+rsp],rbp
- vmovdqu XMMWORD[304+rsp],xmm9
- test edx,edx
- jz NEAR $L$dec8x_done
-
- vmovups xmm1,XMMWORD[((16-120))+rsi]
- vmovups xmm0,XMMWORD[((32-120))+rsi]
- mov eax,DWORD[((240-120))+rsi]
- lea rbp,[((192+128))+rsp]
-
- vmovdqu xmm2,XMMWORD[r8]
- vmovdqu xmm3,XMMWORD[r9]
- vmovdqu xmm4,XMMWORD[r10]
- vmovdqu xmm5,XMMWORD[r11]
- vmovdqu xmm6,XMMWORD[r12]
- vmovdqu xmm7,XMMWORD[r13]
- vmovdqu xmm8,XMMWORD[r14]
- vmovdqu xmm9,XMMWORD[r15]
- vmovdqu XMMWORD[rbp],xmm2
- vpxor xmm2,xmm2,xmm15
- vmovdqu XMMWORD[16+rbp],xmm3
- vpxor xmm3,xmm3,xmm15
- vmovdqu XMMWORD[32+rbp],xmm4
- vpxor xmm4,xmm4,xmm15
- vmovdqu XMMWORD[48+rbp],xmm5
- vpxor xmm5,xmm5,xmm15
- vmovdqu XMMWORD[64+rbp],xmm6
- vpxor xmm6,xmm6,xmm15
- vmovdqu XMMWORD[80+rbp],xmm7
- vpxor xmm7,xmm7,xmm15
- vmovdqu XMMWORD[96+rbp],xmm8
- vpxor xmm8,xmm8,xmm15
- vmovdqu XMMWORD[112+rbp],xmm9
- vpxor xmm9,xmm9,xmm15
- xor rbp,0x80
- mov ecx,1
- jmp NEAR $L$oop_dec8x
-
-ALIGN 32
-$L$oop_dec8x:
- vaesdec xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+0))+rsp]
- vaesdec xmm3,xmm3,xmm1
- prefetcht0 [31+r8]
- vaesdec xmm4,xmm4,xmm1
- vaesdec xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r8]
- cmovge r8,rsp
- vaesdec xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm1
- sub rbx,r8
- vaesdec xmm8,xmm8,xmm1
- vmovdqu xmm10,XMMWORD[16+r8]
- mov QWORD[((64+0))+rsp],rbx
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-72))+rsi]
- lea r8,[16+rbx*1+r8]
- vmovdqu XMMWORD[128+rsp],xmm10
- vaesdec xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+4))+rsp]
- mov rbx,QWORD[((64+8))+rsp]
- vaesdec xmm3,xmm3,xmm0
- prefetcht0 [31+r9]
- vaesdec xmm4,xmm4,xmm0
- vaesdec xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r9]
- cmovge r9,rsp
- vaesdec xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm0
- sub rbx,r9
- vaesdec xmm8,xmm8,xmm0
- vmovdqu xmm11,XMMWORD[16+r9]
- mov QWORD[((64+8))+rsp],rbx
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((-56))+rsi]
- lea r9,[16+rbx*1+r9]
- vmovdqu XMMWORD[144+rsp],xmm11
- vaesdec xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+8))+rsp]
- mov rbx,QWORD[((64+16))+rsp]
- vaesdec xmm3,xmm3,xmm1
- prefetcht0 [31+r10]
- vaesdec xmm4,xmm4,xmm1
- prefetcht0 [15+r8]
- vaesdec xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r10]
- cmovge r10,rsp
- vaesdec xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm1
- sub rbx,r10
- vaesdec xmm8,xmm8,xmm1
- vmovdqu xmm12,XMMWORD[16+r10]
- mov QWORD[((64+16))+rsp],rbx
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-40))+rsi]
- lea r10,[16+rbx*1+r10]
- vmovdqu XMMWORD[160+rsp],xmm12
- vaesdec xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+12))+rsp]
- mov rbx,QWORD[((64+24))+rsp]
- vaesdec xmm3,xmm3,xmm0
- prefetcht0 [31+r11]
- vaesdec xmm4,xmm4,xmm0
- prefetcht0 [15+r9]
- vaesdec xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r11]
- cmovge r11,rsp
- vaesdec xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm0
- sub rbx,r11
- vaesdec xmm8,xmm8,xmm0
- vmovdqu xmm13,XMMWORD[16+r11]
- mov QWORD[((64+24))+rsp],rbx
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((-24))+rsi]
- lea r11,[16+rbx*1+r11]
- vmovdqu XMMWORD[176+rsp],xmm13
- vaesdec xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+16))+rsp]
- mov rbx,QWORD[((64+32))+rsp]
- vaesdec xmm3,xmm3,xmm1
- prefetcht0 [31+r12]
- vaesdec xmm4,xmm4,xmm1
- prefetcht0 [15+r10]
- vaesdec xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r12]
- cmovge r12,rsp
- vaesdec xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm1
- sub rbx,r12
- vaesdec xmm8,xmm8,xmm1
- vmovdqu xmm10,XMMWORD[16+r12]
- mov QWORD[((64+32))+rsp],rbx
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((-8))+rsi]
- lea r12,[16+rbx*1+r12]
- vaesdec xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+20))+rsp]
- mov rbx,QWORD[((64+40))+rsp]
- vaesdec xmm3,xmm3,xmm0
- prefetcht0 [31+r13]
- vaesdec xmm4,xmm4,xmm0
- prefetcht0 [15+r11]
- vaesdec xmm5,xmm5,xmm0
- lea rbx,[r13*1+rbx]
- cmovge r13,rsp
- vaesdec xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm0
- sub rbx,r13
- vaesdec xmm8,xmm8,xmm0
- vmovdqu xmm11,XMMWORD[16+r13]
- mov QWORD[((64+40))+rsp],rbx
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[8+rsi]
- lea r13,[16+rbx*1+r13]
- vaesdec xmm2,xmm2,xmm1
- cmp ecx,DWORD[((32+24))+rsp]
- mov rbx,QWORD[((64+48))+rsp]
- vaesdec xmm3,xmm3,xmm1
- prefetcht0 [31+r14]
- vaesdec xmm4,xmm4,xmm1
- prefetcht0 [15+r12]
- vaesdec xmm5,xmm5,xmm1
- lea rbx,[rbx*1+r14]
- cmovge r14,rsp
- vaesdec xmm6,xmm6,xmm1
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm1
- sub rbx,r14
- vaesdec xmm8,xmm8,xmm1
- vmovdqu xmm12,XMMWORD[16+r14]
- mov QWORD[((64+48))+rsp],rbx
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[24+rsi]
- lea r14,[16+rbx*1+r14]
- vaesdec xmm2,xmm2,xmm0
- cmp ecx,DWORD[((32+28))+rsp]
- mov rbx,QWORD[((64+56))+rsp]
- vaesdec xmm3,xmm3,xmm0
- prefetcht0 [31+r15]
- vaesdec xmm4,xmm4,xmm0
- prefetcht0 [15+r13]
- vaesdec xmm5,xmm5,xmm0
- lea rbx,[rbx*1+r15]
- cmovge r15,rsp
- vaesdec xmm6,xmm6,xmm0
- cmovg rbx,rsp
- vaesdec xmm7,xmm7,xmm0
- sub rbx,r15
- vaesdec xmm8,xmm8,xmm0
- vmovdqu xmm13,XMMWORD[16+r15]
- mov QWORD[((64+56))+rsp],rbx
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[40+rsi]
- lea r15,[16+rbx*1+r15]
- vmovdqu xmm14,XMMWORD[32+rsp]
- prefetcht0 [15+r14]
- prefetcht0 [15+r15]
- cmp eax,11
- jb NEAR $L$dec8x_tail
-
- vaesdec xmm2,xmm2,xmm1
- vaesdec xmm3,xmm3,xmm1
- vaesdec xmm4,xmm4,xmm1
- vaesdec xmm5,xmm5,xmm1
- vaesdec xmm6,xmm6,xmm1
- vaesdec xmm7,xmm7,xmm1
- vaesdec xmm8,xmm8,xmm1
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((176-120))+rsi]
-
- vaesdec xmm2,xmm2,xmm0
- vaesdec xmm3,xmm3,xmm0
- vaesdec xmm4,xmm4,xmm0
- vaesdec xmm5,xmm5,xmm0
- vaesdec xmm6,xmm6,xmm0
- vaesdec xmm7,xmm7,xmm0
- vaesdec xmm8,xmm8,xmm0
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((192-120))+rsi]
- je NEAR $L$dec8x_tail
-
- vaesdec xmm2,xmm2,xmm1
- vaesdec xmm3,xmm3,xmm1
- vaesdec xmm4,xmm4,xmm1
- vaesdec xmm5,xmm5,xmm1
- vaesdec xmm6,xmm6,xmm1
- vaesdec xmm7,xmm7,xmm1
- vaesdec xmm8,xmm8,xmm1
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((208-120))+rsi]
-
- vaesdec xmm2,xmm2,xmm0
- vaesdec xmm3,xmm3,xmm0
- vaesdec xmm4,xmm4,xmm0
- vaesdec xmm5,xmm5,xmm0
- vaesdec xmm6,xmm6,xmm0
- vaesdec xmm7,xmm7,xmm0
- vaesdec xmm8,xmm8,xmm0
- vaesdec xmm9,xmm9,xmm0
- vmovups xmm0,XMMWORD[((224-120))+rsi]
-
-$L$dec8x_tail:
- vaesdec xmm2,xmm2,xmm1
- vpxor xmm15,xmm15,xmm15
- vaesdec xmm3,xmm3,xmm1
- vaesdec xmm4,xmm4,xmm1
- vpcmpgtd xmm15,xmm14,xmm15
- vaesdec xmm5,xmm5,xmm1
- vaesdec xmm6,xmm6,xmm1
- vpaddd xmm15,xmm15,xmm14
- vmovdqu xmm14,XMMWORD[48+rsp]
- vaesdec xmm7,xmm7,xmm1
- mov rbx,QWORD[64+rsp]
- vaesdec xmm8,xmm8,xmm1
- vaesdec xmm9,xmm9,xmm1
- vmovups xmm1,XMMWORD[((16-120))+rsi]
-
- vaesdeclast xmm2,xmm2,xmm0
- vmovdqa XMMWORD[32+rsp],xmm15
- vpxor xmm15,xmm15,xmm15
- vaesdeclast xmm3,xmm3,xmm0
- vpxor xmm2,xmm2,XMMWORD[rbp]
- vaesdeclast xmm4,xmm4,xmm0
- vpxor xmm3,xmm3,XMMWORD[16+rbp]
- vpcmpgtd xmm15,xmm14,xmm15
- vaesdeclast xmm5,xmm5,xmm0
- vpxor xmm4,xmm4,XMMWORD[32+rbp]
- vaesdeclast xmm6,xmm6,xmm0
- vpxor xmm5,xmm5,XMMWORD[48+rbp]
- vpaddd xmm14,xmm14,xmm15
- vmovdqu xmm15,XMMWORD[((-120))+rsi]
- vaesdeclast xmm7,xmm7,xmm0
- vpxor xmm6,xmm6,XMMWORD[64+rbp]
- vaesdeclast xmm8,xmm8,xmm0
- vpxor xmm7,xmm7,XMMWORD[80+rbp]
- vmovdqa XMMWORD[48+rsp],xmm14
- vaesdeclast xmm9,xmm9,xmm0
- vpxor xmm8,xmm8,XMMWORD[96+rbp]
- vmovups xmm0,XMMWORD[((32-120))+rsi]
-
- vmovups XMMWORD[(-16)+r8],xmm2
- sub r8,rbx
- vmovdqu xmm2,XMMWORD[((128+0))+rsp]
- vpxor xmm9,xmm9,XMMWORD[112+rbp]
- vmovups XMMWORD[(-16)+r9],xmm3
- sub r9,QWORD[72+rsp]
- vmovdqu XMMWORD[rbp],xmm2
- vpxor xmm2,xmm2,xmm15
- vmovdqu xmm3,XMMWORD[((128+16))+rsp]
- vmovups XMMWORD[(-16)+r10],xmm4
- sub r10,QWORD[80+rsp]
- vmovdqu XMMWORD[16+rbp],xmm3
- vpxor xmm3,xmm3,xmm15
- vmovdqu xmm4,XMMWORD[((128+32))+rsp]
- vmovups XMMWORD[(-16)+r11],xmm5
- sub r11,QWORD[88+rsp]
- vmovdqu XMMWORD[32+rbp],xmm4
- vpxor xmm4,xmm4,xmm15
- vmovdqu xmm5,XMMWORD[((128+48))+rsp]
- vmovups XMMWORD[(-16)+r12],xmm6
- sub r12,QWORD[96+rsp]
- vmovdqu XMMWORD[48+rbp],xmm5
- vpxor xmm5,xmm5,xmm15
- vmovdqu XMMWORD[64+rbp],xmm10
- vpxor xmm6,xmm15,xmm10
- vmovups XMMWORD[(-16)+r13],xmm7
- sub r13,QWORD[104+rsp]
- vmovdqu XMMWORD[80+rbp],xmm11
- vpxor xmm7,xmm15,xmm11
- vmovups XMMWORD[(-16)+r14],xmm8
- sub r14,QWORD[112+rsp]
- vmovdqu XMMWORD[96+rbp],xmm12
- vpxor xmm8,xmm15,xmm12
- vmovups XMMWORD[(-16)+r15],xmm9
- sub r15,QWORD[120+rsp]
- vmovdqu XMMWORD[112+rbp],xmm13
- vpxor xmm9,xmm15,xmm13
-
- xor rbp,128
- dec edx
- jnz NEAR $L$oop_dec8x
-
- mov rax,QWORD[16+rsp]
-
-
-
-
-
-
-$L$dec8x_done:
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$dec8x_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_multi_cbc_decrypt_avx:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
@@ -1820,12 +727,6 @@ ALIGN 4
DD $L$SEH_begin_aesni_multi_cbc_decrypt wrt ..imagebase
DD $L$SEH_end_aesni_multi_cbc_decrypt wrt ..imagebase
DD $L$SEH_info_aesni_multi_cbc_decrypt wrt ..imagebase
- DD $L$SEH_begin_aesni_multi_cbc_encrypt_avx wrt ..imagebase
- DD $L$SEH_end_aesni_multi_cbc_encrypt_avx wrt ..imagebase
- DD $L$SEH_info_aesni_multi_cbc_encrypt_avx wrt ..imagebase
- DD $L$SEH_begin_aesni_multi_cbc_decrypt_avx wrt ..imagebase
- DD $L$SEH_end_aesni_multi_cbc_decrypt_avx wrt ..imagebase
- DD $L$SEH_info_aesni_multi_cbc_decrypt_avx wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_aesni_multi_cbc_encrypt:
@@ -1836,11 +737,3 @@ $L$SEH_info_aesni_multi_cbc_decrypt:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$dec4x_body wrt ..imagebase,$L$dec4x_epilogue wrt ..imagebase
-$L$SEH_info_aesni_multi_cbc_encrypt_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$enc8x_body wrt ..imagebase,$L$enc8x_epilogue wrt ..imagebase
-$L$SEH_info_aesni_multi_cbc_decrypt_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$dec8x_body wrt ..imagebase,$L$dec8x_epilogue wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm
index f4ed3f70843..e52174799ef 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm
@@ -16,11 +16,6 @@ aesni_cbc_sha1_enc:
mov r11,QWORD[((OPENSSL_ia32cap_P+4))]
bt r11,61
jc NEAR aesni_cbc_sha1_enc_shaext
- and r11d,268435456
- and r10d,1073741824
- or r10d,r11d
- cmp r10d,1342177280
- je NEAR aesni_cbc_sha1_enc_avx
jmp NEAR aesni_cbc_sha1_enc_ssse3
DB 0F3h,0C3h ;repret
@@ -1431,1356 +1426,6 @@ $L$epilogue_ssse3:
DB 0F3h,0C3h ;repret
$L$SEH_end_aesni_cbc_sha1_enc_ssse3:
-
-ALIGN 32
-aesni_cbc_sha1_enc_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_cbc_sha1_enc_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- mov r10,QWORD[56+rsp]
-
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-264))+rsp]
-
-
-
- movaps XMMWORD[(96+0)+rsp],xmm6
- movaps XMMWORD[(96+16)+rsp],xmm7
- movaps XMMWORD[(96+32)+rsp],xmm8
- movaps XMMWORD[(96+48)+rsp],xmm9
- movaps XMMWORD[(96+64)+rsp],xmm10
- movaps XMMWORD[(96+80)+rsp],xmm11
- movaps XMMWORD[(96+96)+rsp],xmm12
- movaps XMMWORD[(96+112)+rsp],xmm13
- movaps XMMWORD[(96+128)+rsp],xmm14
- movaps XMMWORD[(96+144)+rsp],xmm15
-$L$prologue_avx:
- vzeroall
- mov r12,rdi
- mov r13,rsi
- mov r14,rdx
- lea r15,[112+rcx]
- vmovdqu xmm12,XMMWORD[r8]
- mov QWORD[88+rsp],r8
- shl r14,6
- sub r13,r12
- mov r8d,DWORD[((240-112))+r15]
- add r14,r10
-
- lea r11,[K_XX_XX]
- mov eax,DWORD[r9]
- mov ebx,DWORD[4+r9]
- mov ecx,DWORD[8+r9]
- mov edx,DWORD[12+r9]
- mov esi,ebx
- mov ebp,DWORD[16+r9]
- mov edi,ecx
- xor edi,edx
- and esi,edi
-
- vmovdqa xmm6,XMMWORD[64+r11]
- vmovdqa xmm10,XMMWORD[r11]
- vmovdqu xmm0,XMMWORD[r10]
- vmovdqu xmm1,XMMWORD[16+r10]
- vmovdqu xmm2,XMMWORD[32+r10]
- vmovdqu xmm3,XMMWORD[48+r10]
- vpshufb xmm0,xmm0,xmm6
- add r10,64
- vpshufb xmm1,xmm1,xmm6
- vpshufb xmm2,xmm2,xmm6
- vpshufb xmm3,xmm3,xmm6
- vpaddd xmm4,xmm0,xmm10
- vpaddd xmm5,xmm1,xmm10
- vpaddd xmm6,xmm2,xmm10
- vmovdqa XMMWORD[rsp],xmm4
- vmovdqa XMMWORD[16+rsp],xmm5
- vmovdqa XMMWORD[32+rsp],xmm6
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- jmp NEAR $L$oop_avx
-ALIGN 32
-$L$oop_avx:
- shrd ebx,ebx,2
- vmovdqu xmm13,XMMWORD[r12]
- vpxor xmm13,xmm13,xmm15
- vpxor xmm12,xmm12,xmm13
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-80))+r15]
- xor esi,edx
- vpalignr xmm4,xmm1,xmm0,8
- mov edi,eax
- add ebp,DWORD[rsp]
- vpaddd xmm9,xmm10,xmm3
- xor ebx,ecx
- shld eax,eax,5
- vpsrldq xmm8,xmm3,4
- add ebp,esi
- and edi,ebx
- vpxor xmm4,xmm4,xmm0
- xor ebx,ecx
- add ebp,eax
- vpxor xmm8,xmm8,xmm2
- shrd eax,eax,7
- xor edi,ecx
- mov esi,ebp
- add edx,DWORD[4+rsp]
- vpxor xmm4,xmm4,xmm8
- xor eax,ebx
- shld ebp,ebp,5
- vmovdqa XMMWORD[48+rsp],xmm9
- add edx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-64))+r15]
- and esi,eax
- vpsrld xmm8,xmm4,31
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor esi,ebx
- vpslldq xmm9,xmm4,12
- vpaddd xmm4,xmm4,xmm4
- mov edi,edx
- add ecx,DWORD[8+rsp]
- xor ebp,eax
- shld edx,edx,5
- vpor xmm4,xmm4,xmm8
- vpsrld xmm8,xmm9,30
- add ecx,esi
- and edi,ebp
- xor ebp,eax
- add ecx,edx
- vpslld xmm9,xmm9,2
- vpxor xmm4,xmm4,xmm8
- shrd edx,edx,7
- xor edi,eax
- mov esi,ecx
- add ebx,DWORD[12+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-48))+r15]
- vpxor xmm4,xmm4,xmm9
- xor edx,ebp
- shld ecx,ecx,5
- add ebx,edi
- and esi,edx
- xor edx,ebp
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,ebp
- vpalignr xmm5,xmm2,xmm1,8
- mov edi,ebx
- add eax,DWORD[16+rsp]
- vpaddd xmm9,xmm10,xmm4
- xor ecx,edx
- shld ebx,ebx,5
- vpsrldq xmm8,xmm4,4
- add eax,esi
- and edi,ecx
- vpxor xmm5,xmm5,xmm1
- xor ecx,edx
- add eax,ebx
- vpxor xmm8,xmm8,xmm3
- shrd ebx,ebx,7
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-32))+r15]
- xor edi,edx
- mov esi,eax
- add ebp,DWORD[20+rsp]
- vpxor xmm5,xmm5,xmm8
- xor ebx,ecx
- shld eax,eax,5
- vmovdqa XMMWORD[rsp],xmm9
- add ebp,edi
- and esi,ebx
- vpsrld xmm8,xmm5,31
- xor ebx,ecx
- add ebp,eax
- shrd eax,eax,7
- xor esi,ecx
- vpslldq xmm9,xmm5,12
- vpaddd xmm5,xmm5,xmm5
- mov edi,ebp
- add edx,DWORD[24+rsp]
- xor eax,ebx
- shld ebp,ebp,5
- vpor xmm5,xmm5,xmm8
- vpsrld xmm8,xmm9,30
- add edx,esi
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-16))+r15]
- and edi,eax
- xor eax,ebx
- add edx,ebp
- vpslld xmm9,xmm9,2
- vpxor xmm5,xmm5,xmm8
- shrd ebp,ebp,7
- xor edi,ebx
- mov esi,edx
- add ecx,DWORD[28+rsp]
- vpxor xmm5,xmm5,xmm9
- xor ebp,eax
- shld edx,edx,5
- vmovdqa xmm10,XMMWORD[16+r11]
- add ecx,edi
- and esi,ebp
- xor ebp,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- vpalignr xmm6,xmm3,xmm2,8
- mov edi,ecx
- add ebx,DWORD[32+rsp]
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[r15]
- vpaddd xmm9,xmm10,xmm5
- xor edx,ebp
- shld ecx,ecx,5
- vpsrldq xmm8,xmm5,4
- add ebx,esi
- and edi,edx
- vpxor xmm6,xmm6,xmm2
- xor edx,ebp
- add ebx,ecx
- vpxor xmm8,xmm8,xmm4
- shrd ecx,ecx,7
- xor edi,ebp
- mov esi,ebx
- add eax,DWORD[36+rsp]
- vpxor xmm6,xmm6,xmm8
- xor ecx,edx
- shld ebx,ebx,5
- vmovdqa XMMWORD[16+rsp],xmm9
- add eax,edi
- and esi,ecx
- vpsrld xmm8,xmm6,31
- xor ecx,edx
- add eax,ebx
- shrd ebx,ebx,7
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[16+r15]
- xor esi,edx
- vpslldq xmm9,xmm6,12
- vpaddd xmm6,xmm6,xmm6
- mov edi,eax
- add ebp,DWORD[40+rsp]
- xor ebx,ecx
- shld eax,eax,5
- vpor xmm6,xmm6,xmm8
- vpsrld xmm8,xmm9,30
- add ebp,esi
- and edi,ebx
- xor ebx,ecx
- add ebp,eax
- vpslld xmm9,xmm9,2
- vpxor xmm6,xmm6,xmm8
- shrd eax,eax,7
- xor edi,ecx
- mov esi,ebp
- add edx,DWORD[44+rsp]
- vpxor xmm6,xmm6,xmm9
- xor eax,ebx
- shld ebp,ebp,5
- add edx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[32+r15]
- and esi,eax
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor esi,ebx
- vpalignr xmm7,xmm4,xmm3,8
- mov edi,edx
- add ecx,DWORD[48+rsp]
- vpaddd xmm9,xmm10,xmm6
- xor ebp,eax
- shld edx,edx,5
- vpsrldq xmm8,xmm6,4
- add ecx,esi
- and edi,ebp
- vpxor xmm7,xmm7,xmm3
- xor ebp,eax
- add ecx,edx
- vpxor xmm8,xmm8,xmm5
- shrd edx,edx,7
- xor edi,eax
- mov esi,ecx
- add ebx,DWORD[52+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[48+r15]
- vpxor xmm7,xmm7,xmm8
- xor edx,ebp
- shld ecx,ecx,5
- vmovdqa XMMWORD[32+rsp],xmm9
- add ebx,edi
- and esi,edx
- vpsrld xmm8,xmm7,31
- xor edx,ebp
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,ebp
- vpslldq xmm9,xmm7,12
- vpaddd xmm7,xmm7,xmm7
- mov edi,ebx
- add eax,DWORD[56+rsp]
- xor ecx,edx
- shld ebx,ebx,5
- vpor xmm7,xmm7,xmm8
- vpsrld xmm8,xmm9,30
- add eax,esi
- and edi,ecx
- xor ecx,edx
- add eax,ebx
- vpslld xmm9,xmm9,2
- vpxor xmm7,xmm7,xmm8
- shrd ebx,ebx,7
- cmp r8d,11
- jb NEAR $L$vaesenclast6
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[64+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[80+r15]
- je NEAR $L$vaesenclast6
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[96+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[112+r15]
-$L$vaesenclast6:
- vaesenclast xmm12,xmm12,xmm15
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- xor edi,edx
- mov esi,eax
- add ebp,DWORD[60+rsp]
- vpxor xmm7,xmm7,xmm9
- xor ebx,ecx
- shld eax,eax,5
- add ebp,edi
- and esi,ebx
- xor ebx,ecx
- add ebp,eax
- vpalignr xmm8,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- shrd eax,eax,7
- xor esi,ecx
- mov edi,ebp
- add edx,DWORD[rsp]
- vpxor xmm0,xmm0,xmm1
- xor eax,ebx
- shld ebp,ebp,5
- vpaddd xmm9,xmm10,xmm7
- add edx,esi
- vmovdqu xmm13,XMMWORD[16+r12]
- vpxor xmm13,xmm13,xmm15
- vmovups XMMWORD[r13*1+r12],xmm12
- vpxor xmm12,xmm12,xmm13
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-80))+r15]
- and edi,eax
- vpxor xmm0,xmm0,xmm8
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor edi,ebx
- vpsrld xmm8,xmm0,30
- vmovdqa XMMWORD[48+rsp],xmm9
- mov esi,edx
- add ecx,DWORD[4+rsp]
- xor ebp,eax
- shld edx,edx,5
- vpslld xmm0,xmm0,2
- add ecx,edi
- and esi,ebp
- xor ebp,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- mov edi,ecx
- add ebx,DWORD[8+rsp]
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-64))+r15]
- vpor xmm0,xmm0,xmm8
- xor edx,ebp
- shld ecx,ecx,5
- add ebx,esi
- and edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[12+rsp]
- xor edi,ebp
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpalignr xmm8,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ebp,DWORD[16+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-48))+r15]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- vpxor xmm1,xmm1,xmm2
- add ebp,esi
- xor edi,ecx
- vpaddd xmm9,xmm10,xmm0
- shrd ebx,ebx,7
- add ebp,eax
- vpxor xmm1,xmm1,xmm8
- add edx,DWORD[20+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- vpsrld xmm8,xmm1,30
- vmovdqa XMMWORD[rsp],xmm9
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpslld xmm1,xmm1,2
- add ecx,DWORD[24+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-32))+r15]
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpor xmm1,xmm1,xmm8
- add ebx,DWORD[28+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpalignr xmm8,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add eax,DWORD[32+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- vpxor xmm2,xmm2,xmm3
- add eax,esi
- xor edi,edx
- vpaddd xmm9,xmm10,xmm1
- vmovdqa xmm10,XMMWORD[32+r11]
- shrd ecx,ecx,7
- add eax,ebx
- vpxor xmm2,xmm2,xmm8
- add ebp,DWORD[36+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-16))+r15]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- vpsrld xmm8,xmm2,30
- vmovdqa XMMWORD[16+rsp],xmm9
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpslld xmm2,xmm2,2
- add edx,DWORD[40+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpor xmm2,xmm2,xmm8
- add ecx,DWORD[44+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[r15]
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpalignr xmm8,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebx,DWORD[48+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- vpxor xmm3,xmm3,xmm4
- add ebx,esi
- xor edi,ebp
- vpaddd xmm9,xmm10,xmm2
- shrd edx,edx,7
- add ebx,ecx
- vpxor xmm3,xmm3,xmm8
- add eax,DWORD[52+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- vpsrld xmm8,xmm3,30
- vmovdqa XMMWORD[32+rsp],xmm9
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpslld xmm3,xmm3,2
- add ebp,DWORD[56+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[16+r15]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpor xmm3,xmm3,xmm8
- add edx,DWORD[60+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpalignr xmm8,xmm3,xmm2,8
- vpxor xmm4,xmm4,xmm0
- add ecx,DWORD[rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- vpxor xmm4,xmm4,xmm5
- add ecx,esi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[32+r15]
- xor edi,eax
- vpaddd xmm9,xmm10,xmm3
- shrd ebp,ebp,7
- add ecx,edx
- vpxor xmm4,xmm4,xmm8
- add ebx,DWORD[4+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- vpsrld xmm8,xmm4,30
- vmovdqa XMMWORD[48+rsp],xmm9
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpslld xmm4,xmm4,2
- add eax,DWORD[8+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpor xmm4,xmm4,xmm8
- add ebp,DWORD[12+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[48+r15]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpalignr xmm8,xmm4,xmm3,8
- vpxor xmm5,xmm5,xmm1
- add edx,DWORD[16+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- vpxor xmm5,xmm5,xmm6
- add edx,esi
- xor edi,ebx
- vpaddd xmm9,xmm10,xmm4
- shrd eax,eax,7
- add edx,ebp
- vpxor xmm5,xmm5,xmm8
- add ecx,DWORD[20+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- vpsrld xmm8,xmm5,30
- vmovdqa XMMWORD[rsp],xmm9
- add ecx,edi
- cmp r8d,11
- jb NEAR $L$vaesenclast7
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[64+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[80+r15]
- je NEAR $L$vaesenclast7
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[96+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[112+r15]
-$L$vaesenclast7:
- vaesenclast xmm12,xmm12,xmm15
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpslld xmm5,xmm5,2
- add ebx,DWORD[24+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpor xmm5,xmm5,xmm8
- add eax,DWORD[28+rsp]
- shrd ecx,ecx,7
- mov esi,ebx
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- vpalignr xmm8,xmm5,xmm4,8
- vpxor xmm6,xmm6,xmm2
- add ebp,DWORD[32+rsp]
- vmovdqu xmm13,XMMWORD[32+r12]
- vpxor xmm13,xmm13,xmm15
- vmovups XMMWORD[16+r12*1+r13],xmm12
- vpxor xmm12,xmm12,xmm13
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-80))+r15]
- and esi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- vpxor xmm6,xmm6,xmm7
- mov edi,eax
- xor esi,ecx
- vpaddd xmm9,xmm10,xmm5
- shld eax,eax,5
- add ebp,esi
- vpxor xmm6,xmm6,xmm8
- xor edi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[36+rsp]
- vpsrld xmm8,xmm6,30
- vmovdqa XMMWORD[16+rsp],xmm9
- and edi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,ebp
- vpslld xmm6,xmm6,2
- xor edi,ebx
- shld ebp,ebp,5
- add edx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-64))+r15]
- xor esi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[40+rsp]
- and esi,eax
- vpor xmm6,xmm6,xmm8
- xor eax,ebx
- shrd ebp,ebp,7
- mov edi,edx
- xor esi,eax
- shld edx,edx,5
- add ecx,esi
- xor edi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[44+rsp]
- and edi,ebp
- xor ebp,eax
- shrd edx,edx,7
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-48))+r15]
- mov esi,ecx
- xor edi,ebp
- shld ecx,ecx,5
- add ebx,edi
- xor esi,edx
- xor edx,ebp
- add ebx,ecx
- vpalignr xmm8,xmm6,xmm5,8
- vpxor xmm7,xmm7,xmm3
- add eax,DWORD[48+rsp]
- and esi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- vpxor xmm7,xmm7,xmm0
- mov edi,ebx
- xor esi,edx
- vpaddd xmm9,xmm10,xmm6
- vmovdqa xmm10,XMMWORD[48+r11]
- shld ebx,ebx,5
- add eax,esi
- vpxor xmm7,xmm7,xmm8
- xor edi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[52+rsp]
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-32))+r15]
- vpsrld xmm8,xmm7,30
- vmovdqa XMMWORD[32+rsp],xmm9
- and edi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- vpslld xmm7,xmm7,2
- xor edi,ecx
- shld eax,eax,5
- add ebp,edi
- xor esi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[56+rsp]
- and esi,ebx
- vpor xmm7,xmm7,xmm8
- xor ebx,ecx
- shrd eax,eax,7
- mov edi,ebp
- xor esi,ebx
- shld ebp,ebp,5
- add edx,esi
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-16))+r15]
- xor edi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[60+rsp]
- and edi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- mov esi,edx
- xor edi,eax
- shld edx,edx,5
- add ecx,edi
- xor esi,ebp
- xor ebp,eax
- add ecx,edx
- vpalignr xmm8,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- add ebx,DWORD[rsp]
- and esi,ebp
- xor ebp,eax
- shrd edx,edx,7
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[r15]
- vpxor xmm0,xmm0,xmm1
- mov edi,ecx
- xor esi,ebp
- vpaddd xmm9,xmm10,xmm7
- shld ecx,ecx,5
- add ebx,esi
- vpxor xmm0,xmm0,xmm8
- xor edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[4+rsp]
- vpsrld xmm8,xmm0,30
- vmovdqa XMMWORD[48+rsp],xmm9
- and edi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- mov esi,ebx
- vpslld xmm0,xmm0,2
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[8+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[16+r15]
- and esi,ecx
- vpor xmm0,xmm0,xmm8
- xor ecx,edx
- shrd ebx,ebx,7
- mov edi,eax
- xor esi,ecx
- shld eax,eax,5
- add ebp,esi
- xor edi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[12+rsp]
- and edi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,ebp
- xor edi,ebx
- shld ebp,ebp,5
- add edx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[32+r15]
- xor esi,eax
- xor eax,ebx
- add edx,ebp
- vpalignr xmm8,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ecx,DWORD[16+rsp]
- and esi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- vpxor xmm1,xmm1,xmm2
- mov edi,edx
- xor esi,eax
- vpaddd xmm9,xmm10,xmm0
- shld edx,edx,5
- add ecx,esi
- vpxor xmm1,xmm1,xmm8
- xor edi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[20+rsp]
- vpsrld xmm8,xmm1,30
- vmovdqa XMMWORD[rsp],xmm9
- and edi,ebp
- xor ebp,eax
- shrd edx,edx,7
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[48+r15]
- mov esi,ecx
- vpslld xmm1,xmm1,2
- xor edi,ebp
- shld ecx,ecx,5
- add ebx,edi
- xor esi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[24+rsp]
- and esi,edx
- vpor xmm1,xmm1,xmm8
- xor edx,ebp
- shrd ecx,ecx,7
- mov edi,ebx
- xor esi,edx
- shld ebx,ebx,5
- add eax,esi
- xor edi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[28+rsp]
- cmp r8d,11
- jb NEAR $L$vaesenclast8
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[64+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[80+r15]
- je NEAR $L$vaesenclast8
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[96+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[112+r15]
-$L$vaesenclast8:
- vaesenclast xmm12,xmm12,xmm15
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- and edi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- xor edi,ecx
- shld eax,eax,5
- add ebp,edi
- xor esi,ebx
- xor ebx,ecx
- add ebp,eax
- vpalignr xmm8,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add edx,DWORD[32+rsp]
- and esi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- vpxor xmm2,xmm2,xmm3
- mov edi,ebp
- xor esi,ebx
- vpaddd xmm9,xmm10,xmm1
- shld ebp,ebp,5
- add edx,esi
- vmovdqu xmm13,XMMWORD[48+r12]
- vpxor xmm13,xmm13,xmm15
- vmovups XMMWORD[32+r12*1+r13],xmm12
- vpxor xmm12,xmm12,xmm13
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-80))+r15]
- vpxor xmm2,xmm2,xmm8
- xor edi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[36+rsp]
- vpsrld xmm8,xmm2,30
- vmovdqa XMMWORD[16+rsp],xmm9
- and edi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- mov esi,edx
- vpslld xmm2,xmm2,2
- xor edi,eax
- shld edx,edx,5
- add ecx,edi
- xor esi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[40+rsp]
- and esi,ebp
- vpor xmm2,xmm2,xmm8
- xor ebp,eax
- shrd edx,edx,7
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-64))+r15]
- mov edi,ecx
- xor esi,ebp
- shld ecx,ecx,5
- add ebx,esi
- xor edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[44+rsp]
- and edi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- mov esi,ebx
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- add eax,ebx
- vpalignr xmm8,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebp,DWORD[48+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-48))+r15]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- vpxor xmm3,xmm3,xmm4
- add ebp,esi
- xor edi,ecx
- vpaddd xmm9,xmm10,xmm2
- shrd ebx,ebx,7
- add ebp,eax
- vpxor xmm3,xmm3,xmm8
- add edx,DWORD[52+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- vpsrld xmm8,xmm3,30
- vmovdqa XMMWORD[32+rsp],xmm9
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpslld xmm3,xmm3,2
- add ecx,DWORD[56+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[((-32))+r15]
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpor xmm3,xmm3,xmm8
- add ebx,DWORD[60+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[rsp]
- vpaddd xmm9,xmm10,xmm3
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- vmovdqa XMMWORD[48+rsp],xmm9
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[4+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[((-16))+r15]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[8+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[12+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[r15]
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- cmp r10,r14
- je NEAR $L$done_avx
- vmovdqa xmm9,XMMWORD[64+r11]
- vmovdqa xmm10,XMMWORD[r11]
- vmovdqu xmm0,XMMWORD[r10]
- vmovdqu xmm1,XMMWORD[16+r10]
- vmovdqu xmm2,XMMWORD[32+r10]
- vmovdqu xmm3,XMMWORD[48+r10]
- vpshufb xmm0,xmm0,xmm9
- add r10,64
- add ebx,DWORD[16+rsp]
- xor esi,ebp
- vpshufb xmm1,xmm1,xmm9
- mov edi,ecx
- shld ecx,ecx,5
- vpaddd xmm8,xmm0,xmm10
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vmovdqa XMMWORD[rsp],xmm8
- add eax,DWORD[20+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[24+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[16+r15]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[28+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[32+rsp]
- xor esi,eax
- vpshufb xmm2,xmm2,xmm9
- mov edi,edx
- shld edx,edx,5
- vpaddd xmm8,xmm1,xmm10
- add ecx,esi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[32+r15]
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vmovdqa XMMWORD[16+rsp],xmm8
- add ebx,DWORD[36+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[40+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[44+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[48+r15]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[48+rsp]
- xor esi,ebx
- vpshufb xmm3,xmm3,xmm9
- mov edi,ebp
- shld ebp,ebp,5
- vpaddd xmm8,xmm2,xmm10
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- vmovdqa XMMWORD[32+rsp],xmm8
- add ecx,DWORD[52+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- cmp r8d,11
- jb NEAR $L$vaesenclast9
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[64+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[80+r15]
- je NEAR $L$vaesenclast9
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[96+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[112+r15]
-$L$vaesenclast9:
- vaesenclast xmm12,xmm12,xmm15
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[56+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[60+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- shrd ecx,ecx,7
- add eax,ebx
- vmovups XMMWORD[48+r12*1+r13],xmm12
- lea r12,[64+r12]
-
- add eax,DWORD[r9]
- add esi,DWORD[4+r9]
- add ecx,DWORD[8+r9]
- add edx,DWORD[12+r9]
- mov DWORD[r9],eax
- add ebp,DWORD[16+r9]
- mov DWORD[4+r9],esi
- mov ebx,esi
- mov DWORD[8+r9],ecx
- mov edi,ecx
- mov DWORD[12+r9],edx
- xor edi,edx
- mov DWORD[16+r9],ebp
- and esi,edi
- jmp NEAR $L$oop_avx
-
-$L$done_avx:
- add ebx,DWORD[16+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[20+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[24+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[16+r15]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[28+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[32+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[32+r15]
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[36+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[40+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[44+rsp]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[48+r15]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[48+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[52+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- cmp r8d,11
- jb NEAR $L$vaesenclast10
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[64+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[80+r15]
- je NEAR $L$vaesenclast10
- vaesenc xmm12,xmm12,xmm15
- vmovups xmm14,XMMWORD[96+r15]
- vaesenc xmm12,xmm12,xmm14
- vmovups xmm15,XMMWORD[112+r15]
-$L$vaesenclast10:
- vaesenclast xmm12,xmm12,xmm15
- vmovups xmm15,XMMWORD[((-112))+r15]
- vmovups xmm14,XMMWORD[((16-112))+r15]
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[56+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[60+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- shrd ecx,ecx,7
- add eax,ebx
- vmovups XMMWORD[48+r12*1+r13],xmm12
- mov r8,QWORD[88+rsp]
-
- add eax,DWORD[r9]
- add esi,DWORD[4+r9]
- add ecx,DWORD[8+r9]
- mov DWORD[r9],eax
- add edx,DWORD[12+r9]
- mov DWORD[4+r9],esi
- add ebp,DWORD[16+r9]
- mov DWORD[8+r9],ecx
- mov DWORD[12+r9],edx
- mov DWORD[16+r9],ebp
- vmovups XMMWORD[r8],xmm12
- vzeroall
- movaps xmm6,XMMWORD[((96+0))+rsp]
- movaps xmm7,XMMWORD[((96+16))+rsp]
- movaps xmm8,XMMWORD[((96+32))+rsp]
- movaps xmm9,XMMWORD[((96+48))+rsp]
- movaps xmm10,XMMWORD[((96+64))+rsp]
- movaps xmm11,XMMWORD[((96+80))+rsp]
- movaps xmm12,XMMWORD[((96+96))+rsp]
- movaps xmm13,XMMWORD[((96+112))+rsp]
- movaps xmm14,XMMWORD[((96+128))+rsp]
- movaps xmm15,XMMWORD[((96+144))+rsp]
- lea rsi,[264+rsp]
-
- mov r15,QWORD[rsi]
-
- mov r14,QWORD[8+rsi]
-
- mov r13,QWORD[16+rsi]
-
- mov r12,QWORD[24+rsi]
-
- mov rbp,QWORD[32+rsi]
-
- mov rbx,QWORD[40+rsi]
-
- lea rsp,[48+rsi]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_cbc_sha1_enc_avx:
ALIGN 64
K_XX_XX:
DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2900,17 +1545,17 @@ DB 15,56,202,227
pxor xmm5,xmm3
DB 15,56,201,243
cmp r11d,11
- jb NEAR $L$aesenclast11
+ jb NEAR $L$aesenclast6
movups xmm0,XMMWORD[64+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[80+rcx]
DB 102,15,56,220,208
- je NEAR $L$aesenclast11
+ je NEAR $L$aesenclast6
movups xmm0,XMMWORD[96+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[112+rcx]
DB 102,15,56,220,208
-$L$aesenclast11:
+$L$aesenclast6:
DB 102,15,56,221,209
movups xmm0,XMMWORD[((16-112))+rcx]
movdqa xmm10,xmm8
@@ -2966,17 +1611,17 @@ DB 15,56,202,236
pxor xmm6,xmm4
DB 15,56,201,220
cmp r11d,11
- jb NEAR $L$aesenclast12
+ jb NEAR $L$aesenclast7
movups xmm0,XMMWORD[64+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[80+rcx]
DB 102,15,56,220,208
- je NEAR $L$aesenclast12
+ je NEAR $L$aesenclast7
movups xmm0,XMMWORD[96+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[112+rcx]
DB 102,15,56,220,208
-$L$aesenclast12:
+$L$aesenclast7:
DB 102,15,56,221,209
movups xmm0,XMMWORD[((16-112))+rcx]
movdqa xmm9,xmm8
@@ -3032,17 +1677,17 @@ DB 15,56,202,245
pxor xmm3,xmm5
DB 15,56,201,229
cmp r11d,11
- jb NEAR $L$aesenclast13
+ jb NEAR $L$aesenclast8
movups xmm0,XMMWORD[64+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[80+rcx]
DB 102,15,56,220,208
- je NEAR $L$aesenclast13
+ je NEAR $L$aesenclast8
movups xmm0,XMMWORD[96+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[112+rcx]
DB 102,15,56,220,208
-$L$aesenclast13:
+$L$aesenclast8:
DB 102,15,56,221,209
movups xmm0,XMMWORD[((16-112))+rcx]
movdqa xmm10,xmm8
@@ -3096,17 +1741,17 @@ DB 102,15,56,220,209
movups xmm1,XMMWORD[48+rcx]
DB 102,15,56,220,208
cmp r11d,11
- jb NEAR $L$aesenclast14
+ jb NEAR $L$aesenclast9
movups xmm0,XMMWORD[64+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[80+rcx]
DB 102,15,56,220,208
- je NEAR $L$aesenclast14
+ je NEAR $L$aesenclast9
movups xmm0,XMMWORD[96+rcx]
DB 102,15,56,220,209
movups xmm1,XMMWORD[112+rcx]
DB 102,15,56,220,208
-$L$aesenclast14:
+$L$aesenclast9:
DB 102,15,56,221,209
movups xmm0,XMMWORD[((16-112))+rcx]
dec rdx
@@ -3246,9 +1891,6 @@ ALIGN 4
DD $L$SEH_begin_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase
DD $L$SEH_end_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase
DD $L$SEH_info_aesni_cbc_sha1_enc_ssse3 wrt ..imagebase
- DD $L$SEH_begin_aesni_cbc_sha1_enc_avx wrt ..imagebase
- DD $L$SEH_end_aesni_cbc_sha1_enc_avx wrt ..imagebase
- DD $L$SEH_info_aesni_cbc_sha1_enc_avx wrt ..imagebase
DD $L$SEH_begin_aesni_cbc_sha1_enc_shaext wrt ..imagebase
DD $L$SEH_end_aesni_cbc_sha1_enc_shaext wrt ..imagebase
DD $L$SEH_info_aesni_cbc_sha1_enc_shaext wrt ..imagebase
@@ -3258,10 +1900,6 @@ $L$SEH_info_aesni_cbc_sha1_enc_ssse3:
DB 9,0,0,0
DD ssse3_handler wrt ..imagebase
DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
-$L$SEH_info_aesni_cbc_sha1_enc_avx:
-DB 9,0,0,0
- DD ssse3_handler wrt ..imagebase
- DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
$L$SEH_info_aesni_cbc_sha1_enc_shaext:
DB 9,0,0,0
DD ssse3_handler wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm
index b2a9c65f5d0..38beecde894 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm
@@ -11,25 +11,6 @@ global aesni_cbc_sha256_enc
ALIGN 16
aesni_cbc_sha256_enc:
- lea r11,[OPENSSL_ia32cap_P]
- mov eax,1
- cmp rcx,0
- je NEAR $L$probe
- mov eax,DWORD[r11]
- mov r10,QWORD[4+r11]
- bt r10,61
- jc NEAR aesni_cbc_sha256_enc_shaext
- mov r11,r10
- shr r11,32
-
- test r10d,2048
- jnz NEAR aesni_cbc_sha256_enc_xop
- and r11d,296
- cmp r11d,296
- je NEAR aesni_cbc_sha256_enc_avx2
- and r10d,268435456
- jnz NEAR aesni_cbc_sha256_enc_avx
- ud2
xor eax,eax
cmp rcx,0
je NEAR $L$probe
@@ -85,4624 +66,3 @@ DB 54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98
DB 121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108
DB 46,111,114,103,62,0
ALIGN 64
-
-ALIGN 64
-aesni_cbc_sha256_enc_xop:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_cbc_sha256_enc_xop:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
-$L$xop_shortcut:
- mov r10,QWORD[56+rsp]
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,288
- and rsp,-64
-
- shl rdx,6
- sub rsi,rdi
- sub r10,rdi
- add rdx,rdi
-
-
- mov QWORD[((64+8))+rsp],rsi
- mov QWORD[((64+16))+rsp],rdx
-
- mov QWORD[((64+32))+rsp],r8
- mov QWORD[((64+40))+rsp],r9
- mov QWORD[((64+48))+rsp],r10
- mov QWORD[120+rsp],rax
-
- movaps XMMWORD[128+rsp],xmm6
- movaps XMMWORD[144+rsp],xmm7
- movaps XMMWORD[160+rsp],xmm8
- movaps XMMWORD[176+rsp],xmm9
- movaps XMMWORD[192+rsp],xmm10
- movaps XMMWORD[208+rsp],xmm11
- movaps XMMWORD[224+rsp],xmm12
- movaps XMMWORD[240+rsp],xmm13
- movaps XMMWORD[256+rsp],xmm14
- movaps XMMWORD[272+rsp],xmm15
-$L$prologue_xop:
- vzeroall
-
- mov r12,rdi
- lea rdi,[128+rcx]
- lea r13,[((K256+544))]
- mov r14d,DWORD[((240-128))+rdi]
- mov r15,r9
- mov rsi,r10
- vmovdqu xmm8,XMMWORD[r8]
- sub r14,9
-
- mov eax,DWORD[r15]
- mov ebx,DWORD[4+r15]
- mov ecx,DWORD[8+r15]
- mov edx,DWORD[12+r15]
- mov r8d,DWORD[16+r15]
- mov r9d,DWORD[20+r15]
- mov r10d,DWORD[24+r15]
- mov r11d,DWORD[28+r15]
-
- vmovdqa xmm14,XMMWORD[r14*8+r13]
- vmovdqa xmm13,XMMWORD[16+r14*8+r13]
- vmovdqa xmm12,XMMWORD[32+r14*8+r13]
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- jmp NEAR $L$loop_xop
-ALIGN 16
-$L$loop_xop:
- vmovdqa xmm7,XMMWORD[((K256+512))]
- vmovdqu xmm0,XMMWORD[r12*1+rsi]
- vmovdqu xmm1,XMMWORD[16+r12*1+rsi]
- vmovdqu xmm2,XMMWORD[32+r12*1+rsi]
- vmovdqu xmm3,XMMWORD[48+r12*1+rsi]
- vpshufb xmm0,xmm0,xmm7
- lea rbp,[K256]
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,XMMWORD[rbp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,XMMWORD[32+rbp]
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- vpaddd xmm7,xmm3,XMMWORD[96+rbp]
- vmovdqa XMMWORD[rsp],xmm4
- mov r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm5
- mov esi,ebx
- vmovdqa XMMWORD[32+rsp],xmm6
- xor esi,ecx
- vmovdqa XMMWORD[48+rsp],xmm7
- mov r13d,r8d
- jmp NEAR $L$xop_00_47
-
-ALIGN 16
-$L$xop_00_47:
- sub rbp,-16*2*4
- vmovdqu xmm9,XMMWORD[r12]
- mov QWORD[((64+0))+rsp],r12
- vpalignr xmm4,xmm1,xmm0,4
- ror r13d,14
- mov eax,r14d
- vpalignr xmm7,xmm3,xmm2,4
- mov r12d,r9d
- xor r13d,r8d
-DB 143,232,120,194,236,14
- ror r14d,9
- xor r12d,r10d
- vpsrld xmm4,xmm4,3
- ror r13d,5
- xor r14d,eax
- vpaddd xmm0,xmm0,xmm7
- and r12d,r8d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
-DB 143,232,120,194,245,11
- ror r14d,11
- xor r12d,r10d
- vpxor xmm4,xmm4,xmm5
- xor r15d,ebx
- ror r13d,6
- add r11d,r12d
- and esi,r15d
-DB 143,232,120,194,251,13
- xor r14d,eax
- add r11d,r13d
- vpxor xmm4,xmm4,xmm6
- xor esi,ebx
- add edx,r11d
- vpsrld xmm6,xmm3,10
- ror r14d,2
- add r11d,esi
- vpaddd xmm0,xmm0,xmm4
- mov r13d,edx
- add r14d,r11d
-DB 143,232,120,194,239,2
- ror r13d,14
- mov r11d,r14d
- vpxor xmm7,xmm7,xmm6
- mov r12d,r8d
- xor r13d,edx
- ror r14d,9
- xor r12d,r9d
- vpxor xmm7,xmm7,xmm5
- ror r13d,5
- xor r14d,r11d
- and r12d,edx
- vpxor xmm9,xmm9,xmm8
- xor r13d,edx
- vpsrldq xmm7,xmm7,8
- add r10d,DWORD[4+rsp]
- mov esi,r11d
- ror r14d,11
- xor r12d,r9d
- vpaddd xmm0,xmm0,xmm7
- xor esi,eax
- ror r13d,6
- add r10d,r12d
- and r15d,esi
-DB 143,232,120,194,248,13
- xor r14d,r11d
- add r10d,r13d
- vpsrld xmm6,xmm0,10
- xor r15d,eax
- add ecx,r10d
-DB 143,232,120,194,239,2
- ror r14d,2
- add r10d,r15d
- vpxor xmm7,xmm7,xmm6
- mov r13d,ecx
- add r14d,r10d
- ror r13d,14
- mov r10d,r14d
- vpxor xmm7,xmm7,xmm5
- mov r12d,edx
- xor r13d,ecx
- ror r14d,9
- xor r12d,r8d
- vpslldq xmm7,xmm7,8
- ror r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r13d,ecx
- vpaddd xmm0,xmm0,xmm7
- add r9d,DWORD[8+rsp]
- mov r15d,r10d
- ror r14d,11
- xor r12d,r8d
- vpaddd xmm6,xmm0,XMMWORD[rbp]
- xor r15d,r11d
- ror r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- ror r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- ror r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- ror r14d,9
- xor r12d,edx
- ror r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov esi,r9d
- ror r14d,11
- xor r12d,edx
- xor esi,r10d
- ror r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- ror r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[rsp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- ror r13d,14
- mov r8d,r14d
- vpalignr xmm7,xmm0,xmm3,4
- mov r12d,ebx
- xor r13d,eax
-DB 143,232,120,194,236,14
- ror r14d,9
- xor r12d,ecx
- vpsrld xmm4,xmm4,3
- ror r13d,5
- xor r14d,r8d
- vpaddd xmm1,xmm1,xmm7
- and r12d,eax
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
-DB 143,232,120,194,245,11
- ror r14d,11
- xor r12d,ecx
- vpxor xmm4,xmm4,xmm5
- xor r15d,r9d
- ror r13d,6
- add edx,r12d
- and esi,r15d
-DB 143,232,120,194,248,13
- xor r14d,r8d
- add edx,r13d
- vpxor xmm4,xmm4,xmm6
- xor esi,r9d
- add r11d,edx
- vpsrld xmm6,xmm0,10
- ror r14d,2
- add edx,esi
- vpaddd xmm1,xmm1,xmm4
- mov r13d,r11d
- add r14d,edx
-DB 143,232,120,194,239,2
- ror r13d,14
- mov edx,r14d
- vpxor xmm7,xmm7,xmm6
- mov r12d,eax
- xor r13d,r11d
- ror r14d,9
- xor r12d,ebx
- vpxor xmm7,xmm7,xmm5
- ror r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r13d,r11d
- vpsrldq xmm7,xmm7,8
- add ecx,DWORD[20+rsp]
- mov esi,edx
- ror r14d,11
- xor r12d,ebx
- vpaddd xmm1,xmm1,xmm7
- xor esi,r8d
- ror r13d,6
- add ecx,r12d
- and r15d,esi
-DB 143,232,120,194,249,13
- xor r14d,edx
- add ecx,r13d
- vpsrld xmm6,xmm1,10
- xor r15d,r8d
- add r10d,ecx
-DB 143,232,120,194,239,2
- ror r14d,2
- add ecx,r15d
- vpxor xmm7,xmm7,xmm6
- mov r13d,r10d
- add r14d,ecx
- ror r13d,14
- mov ecx,r14d
- vpxor xmm7,xmm7,xmm5
- mov r12d,r11d
- xor r13d,r10d
- ror r14d,9
- xor r12d,eax
- vpslldq xmm7,xmm7,8
- ror r13d,5
- xor r14d,ecx
- and r12d,r10d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r13d,r10d
- vpaddd xmm1,xmm1,xmm7
- add ebx,DWORD[24+rsp]
- mov r15d,ecx
- ror r14d,11
- xor r12d,eax
- vpaddd xmm6,xmm1,XMMWORD[32+rbp]
- xor r15d,edx
- ror r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- ror r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- ror r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- ror r14d,9
- xor r12d,r11d
- ror r13d,5
- xor r14d,ebx
- and r12d,r9d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov esi,ebx
- ror r14d,11
- xor r12d,r11d
- xor esi,ecx
- ror r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- ror r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- ror r13d,14
- mov eax,r14d
- vpalignr xmm7,xmm1,xmm0,4
- mov r12d,r9d
- xor r13d,r8d
-DB 143,232,120,194,236,14
- ror r14d,9
- xor r12d,r10d
- vpsrld xmm4,xmm4,3
- ror r13d,5
- xor r14d,eax
- vpaddd xmm2,xmm2,xmm7
- and r12d,r8d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
-DB 143,232,120,194,245,11
- ror r14d,11
- xor r12d,r10d
- vpxor xmm4,xmm4,xmm5
- xor r15d,ebx
- ror r13d,6
- add r11d,r12d
- and esi,r15d
-DB 143,232,120,194,249,13
- xor r14d,eax
- add r11d,r13d
- vpxor xmm4,xmm4,xmm6
- xor esi,ebx
- add edx,r11d
- vpsrld xmm6,xmm1,10
- ror r14d,2
- add r11d,esi
- vpaddd xmm2,xmm2,xmm4
- mov r13d,edx
- add r14d,r11d
-DB 143,232,120,194,239,2
- ror r13d,14
- mov r11d,r14d
- vpxor xmm7,xmm7,xmm6
- mov r12d,r8d
- xor r13d,edx
- ror r14d,9
- xor r12d,r9d
- vpxor xmm7,xmm7,xmm5
- ror r13d,5
- xor r14d,r11d
- and r12d,edx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r13d,edx
- vpsrldq xmm7,xmm7,8
- add r10d,DWORD[36+rsp]
- mov esi,r11d
- ror r14d,11
- xor r12d,r9d
- vpaddd xmm2,xmm2,xmm7
- xor esi,eax
- ror r13d,6
- add r10d,r12d
- and r15d,esi
-DB 143,232,120,194,250,13
- xor r14d,r11d
- add r10d,r13d
- vpsrld xmm6,xmm2,10
- xor r15d,eax
- add ecx,r10d
-DB 143,232,120,194,239,2
- ror r14d,2
- add r10d,r15d
- vpxor xmm7,xmm7,xmm6
- mov r13d,ecx
- add r14d,r10d
- ror r13d,14
- mov r10d,r14d
- vpxor xmm7,xmm7,xmm5
- mov r12d,edx
- xor r13d,ecx
- ror r14d,9
- xor r12d,r8d
- vpslldq xmm7,xmm7,8
- ror r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r13d,ecx
- vpaddd xmm2,xmm2,xmm7
- add r9d,DWORD[40+rsp]
- mov r15d,r10d
- ror r14d,11
- xor r12d,r8d
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- xor r15d,r11d
- ror r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- ror r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- ror r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- ror r14d,9
- xor r12d,edx
- ror r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov esi,r9d
- ror r14d,11
- xor r12d,edx
- xor esi,r10d
- ror r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- ror r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[32+rsp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- ror r13d,14
- mov r8d,r14d
- vpalignr xmm7,xmm2,xmm1,4
- mov r12d,ebx
- xor r13d,eax
-DB 143,232,120,194,236,14
- ror r14d,9
- xor r12d,ecx
- vpsrld xmm4,xmm4,3
- ror r13d,5
- xor r14d,r8d
- vpaddd xmm3,xmm3,xmm7
- and r12d,eax
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
-DB 143,232,120,194,245,11
- ror r14d,11
- xor r12d,ecx
- vpxor xmm4,xmm4,xmm5
- xor r15d,r9d
- ror r13d,6
- add edx,r12d
- and esi,r15d
-DB 143,232,120,194,250,13
- xor r14d,r8d
- add edx,r13d
- vpxor xmm4,xmm4,xmm6
- xor esi,r9d
- add r11d,edx
- vpsrld xmm6,xmm2,10
- ror r14d,2
- add edx,esi
- vpaddd xmm3,xmm3,xmm4
- mov r13d,r11d
- add r14d,edx
-DB 143,232,120,194,239,2
- ror r13d,14
- mov edx,r14d
- vpxor xmm7,xmm7,xmm6
- mov r12d,eax
- xor r13d,r11d
- ror r14d,9
- xor r12d,ebx
- vpxor xmm7,xmm7,xmm5
- ror r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r13d,r11d
- vpsrldq xmm7,xmm7,8
- add ecx,DWORD[52+rsp]
- mov esi,edx
- ror r14d,11
- xor r12d,ebx
- vpaddd xmm3,xmm3,xmm7
- xor esi,r8d
- ror r13d,6
- add ecx,r12d
- and r15d,esi
-DB 143,232,120,194,251,13
- xor r14d,edx
- add ecx,r13d
- vpsrld xmm6,xmm3,10
- xor r15d,r8d
- add r10d,ecx
-DB 143,232,120,194,239,2
- ror r14d,2
- add ecx,r15d
- vpxor xmm7,xmm7,xmm6
- mov r13d,r10d
- add r14d,ecx
- ror r13d,14
- mov ecx,r14d
- vpxor xmm7,xmm7,xmm5
- mov r12d,r11d
- xor r13d,r10d
- ror r14d,9
- xor r12d,eax
- vpslldq xmm7,xmm7,8
- ror r13d,5
- xor r14d,ecx
- and r12d,r10d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r13d,r10d
- vpaddd xmm3,xmm3,xmm7
- add ebx,DWORD[56+rsp]
- mov r15d,ecx
- ror r14d,11
- xor r12d,eax
- vpaddd xmm6,xmm3,XMMWORD[96+rbp]
- xor r15d,edx
- ror r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- ror r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- ror r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- ror r14d,9
- xor r12d,r11d
- ror r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov esi,ebx
- ror r14d,11
- xor r12d,r11d
- xor esi,ecx
- ror r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- ror r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[48+rsp],xmm6
- mov r12,QWORD[((64+0))+rsp]
- vpand xmm11,xmm11,xmm14
- mov r15,QWORD[((64+8))+rsp]
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r12*1+r15],xmm8
- lea r12,[16+r12]
- cmp BYTE[131+rbp],0
- jne NEAR $L$xop_00_47
- vmovdqu xmm9,XMMWORD[r12]
- mov QWORD[((64+0))+rsp],r12
- ror r13d,14
- mov eax,r14d
- mov r12d,r9d
- xor r13d,r8d
- ror r14d,9
- xor r12d,r10d
- ror r13d,5
- xor r14d,eax
- and r12d,r8d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
- ror r14d,11
- xor r12d,r10d
- xor r15d,ebx
- ror r13d,6
- add r11d,r12d
- and esi,r15d
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- add edx,r11d
- ror r14d,2
- add r11d,esi
- mov r13d,edx
- add r14d,r11d
- ror r13d,14
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- ror r14d,9
- xor r12d,r9d
- ror r13d,5
- xor r14d,r11d
- and r12d,edx
- vpxor xmm9,xmm9,xmm8
- xor r13d,edx
- add r10d,DWORD[4+rsp]
- mov esi,r11d
- ror r14d,11
- xor r12d,r9d
- xor esi,eax
- ror r13d,6
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- ror r14d,2
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- ror r13d,14
- mov r10d,r14d
- mov r12d,edx
- xor r13d,ecx
- ror r14d,9
- xor r12d,r8d
- ror r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[8+rsp]
- mov r15d,r10d
- ror r14d,11
- xor r12d,r8d
- xor r15d,r11d
- ror r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- ror r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- ror r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- ror r14d,9
- xor r12d,edx
- ror r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov esi,r9d
- ror r14d,11
- xor r12d,edx
- xor esi,r10d
- ror r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- ror r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- ror r13d,14
- mov r8d,r14d
- mov r12d,ebx
- xor r13d,eax
- ror r14d,9
- xor r12d,ecx
- ror r13d,5
- xor r14d,r8d
- and r12d,eax
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
- ror r14d,11
- xor r12d,ecx
- xor r15d,r9d
- ror r13d,6
- add edx,r12d
- and esi,r15d
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- add r11d,edx
- ror r14d,2
- add edx,esi
- mov r13d,r11d
- add r14d,edx
- ror r13d,14
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- ror r14d,9
- xor r12d,ebx
- ror r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r13d,r11d
- add ecx,DWORD[20+rsp]
- mov esi,edx
- ror r14d,11
- xor r12d,ebx
- xor esi,r8d
- ror r13d,6
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- ror r14d,2
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- ror r13d,14
- mov ecx,r14d
- mov r12d,r11d
- xor r13d,r10d
- ror r14d,9
- xor r12d,eax
- ror r13d,5
- xor r14d,ecx
- and r12d,r10d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[24+rsp]
- mov r15d,ecx
- ror r14d,11
- xor r12d,eax
- xor r15d,edx
- ror r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- ror r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- ror r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- ror r14d,9
- xor r12d,r11d
- ror r13d,5
- xor r14d,ebx
- and r12d,r9d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov esi,ebx
- ror r14d,11
- xor r12d,r11d
- xor esi,ecx
- ror r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- ror r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- ror r13d,14
- mov eax,r14d
- mov r12d,r9d
- xor r13d,r8d
- ror r14d,9
- xor r12d,r10d
- ror r13d,5
- xor r14d,eax
- and r12d,r8d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
- ror r14d,11
- xor r12d,r10d
- xor r15d,ebx
- ror r13d,6
- add r11d,r12d
- and esi,r15d
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- add edx,r11d
- ror r14d,2
- add r11d,esi
- mov r13d,edx
- add r14d,r11d
- ror r13d,14
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- ror r14d,9
- xor r12d,r9d
- ror r13d,5
- xor r14d,r11d
- and r12d,edx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r13d,edx
- add r10d,DWORD[36+rsp]
- mov esi,r11d
- ror r14d,11
- xor r12d,r9d
- xor esi,eax
- ror r13d,6
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- ror r14d,2
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- ror r13d,14
- mov r10d,r14d
- mov r12d,edx
- xor r13d,ecx
- ror r14d,9
- xor r12d,r8d
- ror r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[40+rsp]
- mov r15d,r10d
- ror r14d,11
- xor r12d,r8d
- xor r15d,r11d
- ror r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- ror r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- ror r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- ror r14d,9
- xor r12d,edx
- ror r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov esi,r9d
- ror r14d,11
- xor r12d,edx
- xor esi,r10d
- ror r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- ror r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- ror r13d,14
- mov r8d,r14d
- mov r12d,ebx
- xor r13d,eax
- ror r14d,9
- xor r12d,ecx
- ror r13d,5
- xor r14d,r8d
- and r12d,eax
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
- ror r14d,11
- xor r12d,ecx
- xor r15d,r9d
- ror r13d,6
- add edx,r12d
- and esi,r15d
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- add r11d,edx
- ror r14d,2
- add edx,esi
- mov r13d,r11d
- add r14d,edx
- ror r13d,14
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- ror r14d,9
- xor r12d,ebx
- ror r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r13d,r11d
- add ecx,DWORD[52+rsp]
- mov esi,edx
- ror r14d,11
- xor r12d,ebx
- xor esi,r8d
- ror r13d,6
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- ror r14d,2
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- ror r13d,14
- mov ecx,r14d
- mov r12d,r11d
- xor r13d,r10d
- ror r14d,9
- xor r12d,eax
- ror r13d,5
- xor r14d,ecx
- and r12d,r10d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[56+rsp]
- mov r15d,ecx
- ror r14d,11
- xor r12d,eax
- xor r15d,edx
- ror r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- ror r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- ror r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- ror r14d,9
- xor r12d,r11d
- ror r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov esi,ebx
- ror r14d,11
- xor r12d,r11d
- xor esi,ecx
- ror r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- ror r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- mov r12,QWORD[((64+0))+rsp]
- mov r13,QWORD[((64+8))+rsp]
- mov r15,QWORD[((64+40))+rsp]
- mov rsi,QWORD[((64+48))+rsp]
-
- vpand xmm11,xmm11,xmm14
- mov eax,r14d
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r13*1+r12],xmm8
- lea r12,[16+r12]
-
- add eax,DWORD[r15]
- add ebx,DWORD[4+r15]
- add ecx,DWORD[8+r15]
- add edx,DWORD[12+r15]
- add r8d,DWORD[16+r15]
- add r9d,DWORD[20+r15]
- add r10d,DWORD[24+r15]
- add r11d,DWORD[28+r15]
-
- cmp r12,QWORD[((64+16))+rsp]
-
- mov DWORD[r15],eax
- mov DWORD[4+r15],ebx
- mov DWORD[8+r15],ecx
- mov DWORD[12+r15],edx
- mov DWORD[16+r15],r8d
- mov DWORD[20+r15],r9d
- mov DWORD[24+r15],r10d
- mov DWORD[28+r15],r11d
-
- jb NEAR $L$loop_xop
-
- mov r8,QWORD[((64+32))+rsp]
- mov rsi,QWORD[120+rsp]
-
- vmovdqu XMMWORD[r8],xmm8
- vzeroall
- movaps xmm6,XMMWORD[128+rsp]
- movaps xmm7,XMMWORD[144+rsp]
- movaps xmm8,XMMWORD[160+rsp]
- movaps xmm9,XMMWORD[176+rsp]
- movaps xmm10,XMMWORD[192+rsp]
- movaps xmm11,XMMWORD[208+rsp]
- movaps xmm12,XMMWORD[224+rsp]
- movaps xmm13,XMMWORD[240+rsp]
- movaps xmm14,XMMWORD[256+rsp]
- movaps xmm15,XMMWORD[272+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_xop:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_cbc_sha256_enc_xop:
-
-ALIGN 64
-aesni_cbc_sha256_enc_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_cbc_sha256_enc_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
-$L$avx_shortcut:
- mov r10,QWORD[56+rsp]
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,288
- and rsp,-64
-
- shl rdx,6
- sub rsi,rdi
- sub r10,rdi
- add rdx,rdi
-
-
- mov QWORD[((64+8))+rsp],rsi
- mov QWORD[((64+16))+rsp],rdx
-
- mov QWORD[((64+32))+rsp],r8
- mov QWORD[((64+40))+rsp],r9
- mov QWORD[((64+48))+rsp],r10
- mov QWORD[120+rsp],rax
-
- movaps XMMWORD[128+rsp],xmm6
- movaps XMMWORD[144+rsp],xmm7
- movaps XMMWORD[160+rsp],xmm8
- movaps XMMWORD[176+rsp],xmm9
- movaps XMMWORD[192+rsp],xmm10
- movaps XMMWORD[208+rsp],xmm11
- movaps XMMWORD[224+rsp],xmm12
- movaps XMMWORD[240+rsp],xmm13
- movaps XMMWORD[256+rsp],xmm14
- movaps XMMWORD[272+rsp],xmm15
-$L$prologue_avx:
- vzeroall
-
- mov r12,rdi
- lea rdi,[128+rcx]
- lea r13,[((K256+544))]
- mov r14d,DWORD[((240-128))+rdi]
- mov r15,r9
- mov rsi,r10
- vmovdqu xmm8,XMMWORD[r8]
- sub r14,9
-
- mov eax,DWORD[r15]
- mov ebx,DWORD[4+r15]
- mov ecx,DWORD[8+r15]
- mov edx,DWORD[12+r15]
- mov r8d,DWORD[16+r15]
- mov r9d,DWORD[20+r15]
- mov r10d,DWORD[24+r15]
- mov r11d,DWORD[28+r15]
-
- vmovdqa xmm14,XMMWORD[r14*8+r13]
- vmovdqa xmm13,XMMWORD[16+r14*8+r13]
- vmovdqa xmm12,XMMWORD[32+r14*8+r13]
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- jmp NEAR $L$loop_avx
-ALIGN 16
-$L$loop_avx:
- vmovdqa xmm7,XMMWORD[((K256+512))]
- vmovdqu xmm0,XMMWORD[r12*1+rsi]
- vmovdqu xmm1,XMMWORD[16+r12*1+rsi]
- vmovdqu xmm2,XMMWORD[32+r12*1+rsi]
- vmovdqu xmm3,XMMWORD[48+r12*1+rsi]
- vpshufb xmm0,xmm0,xmm7
- lea rbp,[K256]
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,XMMWORD[rbp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,XMMWORD[32+rbp]
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- vpaddd xmm7,xmm3,XMMWORD[96+rbp]
- vmovdqa XMMWORD[rsp],xmm4
- mov r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm5
- mov esi,ebx
- vmovdqa XMMWORD[32+rsp],xmm6
- xor esi,ecx
- vmovdqa XMMWORD[48+rsp],xmm7
- mov r13d,r8d
- jmp NEAR $L$avx_00_47
-
-ALIGN 16
-$L$avx_00_47:
- sub rbp,-16*2*4
- vmovdqu xmm9,XMMWORD[r12]
- mov QWORD[((64+0))+rsp],r12
- vpalignr xmm4,xmm1,xmm0,4
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- vpalignr xmm7,xmm3,xmm2,4
- xor r13d,r8d
- shrd r14d,r14d,9
- xor r12d,r10d
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vpaddd xmm0,xmm0,xmm7
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
- vpsrld xmm7,xmm4,3
- shrd r14d,r14d,11
- xor r12d,r10d
- xor r15d,ebx
- vpslld xmm5,xmm4,14
- shrd r13d,r13d,6
- add r11d,r12d
- and esi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- vpshufd xmm7,xmm3,250
- add edx,r11d
- shrd r14d,r14d,2
- add r11d,esi
- vpsrld xmm6,xmm6,11
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- vpslld xmm5,xmm5,11
- shrd r14d,r14d,9
- xor r12d,r9d
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,r11d
- and r12d,edx
- vpxor xmm9,xmm9,xmm8
- xor r13d,edx
- vpsrld xmm6,xmm7,10
- add r10d,DWORD[4+rsp]
- mov esi,r11d
- shrd r14d,r14d,11
- vpxor xmm4,xmm4,xmm5
- xor r12d,r9d
- xor esi,eax
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- vpaddd xmm0,xmm0,xmm4
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- vpxor xmm6,xmm6,xmm7
- shrd r14d,r14d,2
- add r10d,r15d
- mov r13d,ecx
- vpsrlq xmm7,xmm7,2
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,edx
- xor r13d,ecx
- shrd r14d,r14d,9
- vpshufd xmm6,xmm6,132
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- vpsrldq xmm6,xmm6,8
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[8+rsp]
- vpaddd xmm0,xmm0,xmm6
- mov r15d,r10d
- shrd r14d,r14d,11
- xor r12d,r8d
- vpshufd xmm7,xmm0,80
- xor r15d,r11d
- shrd r13d,r13d,6
- add r9d,r12d
- vpsrld xmm6,xmm7,10
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- vpsrlq xmm7,xmm7,17
- xor esi,r11d
- add ebx,r9d
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- vpsrlq xmm7,xmm7,2
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- vpxor xmm6,xmm6,xmm7
- xor r13d,ebx
- shrd r14d,r14d,9
- xor r12d,edx
- vpshufd xmm6,xmm6,232
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vpslldq xmm6,xmm6,8
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov esi,r9d
- vpaddd xmm0,xmm0,xmm6
- shrd r14d,r14d,11
- xor r12d,edx
- xor esi,r10d
- vpaddd xmm6,xmm0,XMMWORD[rbp]
- shrd r13d,r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- shrd r14d,r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[rsp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- vpalignr xmm7,xmm0,xmm3,4
- xor r13d,eax
- shrd r14d,r14d,9
- xor r12d,ecx
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vpaddd xmm1,xmm1,xmm7
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
- vpsrld xmm7,xmm4,3
- shrd r14d,r14d,11
- xor r12d,ecx
- xor r15d,r9d
- vpslld xmm5,xmm4,14
- shrd r13d,r13d,6
- add edx,r12d
- and esi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- vpshufd xmm7,xmm0,250
- add r11d,edx
- shrd r14d,r14d,2
- add edx,esi
- vpsrld xmm6,xmm6,11
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- vpslld xmm5,xmm5,11
- shrd r14d,r14d,9
- xor r12d,ebx
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,edx
- and r12d,r11d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r13d,r11d
- vpsrld xmm6,xmm7,10
- add ecx,DWORD[20+rsp]
- mov esi,edx
- shrd r14d,r14d,11
- vpxor xmm4,xmm4,xmm5
- xor r12d,ebx
- xor esi,r8d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- vpaddd xmm1,xmm1,xmm4
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- vpxor xmm6,xmm6,xmm7
- shrd r14d,r14d,2
- add ecx,r15d
- mov r13d,r10d
- vpsrlq xmm7,xmm7,2
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,r11d
- xor r13d,r10d
- shrd r14d,r14d,9
- vpshufd xmm6,xmm6,132
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- vpsrldq xmm6,xmm6,8
- and r12d,r10d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[24+rsp]
- vpaddd xmm1,xmm1,xmm6
- mov r15d,ecx
- shrd r14d,r14d,11
- xor r12d,eax
- vpshufd xmm7,xmm1,80
- xor r15d,edx
- shrd r13d,r13d,6
- add ebx,r12d
- vpsrld xmm6,xmm7,10
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- vpsrlq xmm7,xmm7,17
- xor esi,edx
- add r9d,ebx
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- vpsrlq xmm7,xmm7,2
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- vpxor xmm6,xmm6,xmm7
- xor r13d,r9d
- shrd r14d,r14d,9
- xor r12d,r11d
- vpshufd xmm6,xmm6,232
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpslldq xmm6,xmm6,8
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov esi,ebx
- vpaddd xmm1,xmm1,xmm6
- shrd r14d,r14d,11
- xor r12d,r11d
- xor esi,ecx
- vpaddd xmm6,xmm1,XMMWORD[32+rbp]
- shrd r13d,r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- shrd r14d,r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- vpalignr xmm7,xmm1,xmm0,4
- xor r13d,r8d
- shrd r14d,r14d,9
- xor r12d,r10d
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vpaddd xmm2,xmm2,xmm7
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
- vpsrld xmm7,xmm4,3
- shrd r14d,r14d,11
- xor r12d,r10d
- xor r15d,ebx
- vpslld xmm5,xmm4,14
- shrd r13d,r13d,6
- add r11d,r12d
- and esi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- vpshufd xmm7,xmm1,250
- add edx,r11d
- shrd r14d,r14d,2
- add r11d,esi
- vpsrld xmm6,xmm6,11
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- vpslld xmm5,xmm5,11
- shrd r14d,r14d,9
- xor r12d,r9d
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,r11d
- and r12d,edx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r13d,edx
- vpsrld xmm6,xmm7,10
- add r10d,DWORD[36+rsp]
- mov esi,r11d
- shrd r14d,r14d,11
- vpxor xmm4,xmm4,xmm5
- xor r12d,r9d
- xor esi,eax
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- vpaddd xmm2,xmm2,xmm4
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- vpxor xmm6,xmm6,xmm7
- shrd r14d,r14d,2
- add r10d,r15d
- mov r13d,ecx
- vpsrlq xmm7,xmm7,2
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,edx
- xor r13d,ecx
- shrd r14d,r14d,9
- vpshufd xmm6,xmm6,132
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- vpsrldq xmm6,xmm6,8
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[40+rsp]
- vpaddd xmm2,xmm2,xmm6
- mov r15d,r10d
- shrd r14d,r14d,11
- xor r12d,r8d
- vpshufd xmm7,xmm2,80
- xor r15d,r11d
- shrd r13d,r13d,6
- add r9d,r12d
- vpsrld xmm6,xmm7,10
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- vpsrlq xmm7,xmm7,17
- xor esi,r11d
- add ebx,r9d
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- vpsrlq xmm7,xmm7,2
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- vpxor xmm6,xmm6,xmm7
- xor r13d,ebx
- shrd r14d,r14d,9
- xor r12d,edx
- vpshufd xmm6,xmm6,232
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vpslldq xmm6,xmm6,8
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov esi,r9d
- vpaddd xmm2,xmm2,xmm6
- shrd r14d,r14d,11
- xor r12d,edx
- xor esi,r10d
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- shrd r13d,r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- shrd r14d,r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[32+rsp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- vpalignr xmm7,xmm2,xmm1,4
- xor r13d,eax
- shrd r14d,r14d,9
- xor r12d,ecx
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vpaddd xmm3,xmm3,xmm7
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
- vpsrld xmm7,xmm4,3
- shrd r14d,r14d,11
- xor r12d,ecx
- xor r15d,r9d
- vpslld xmm5,xmm4,14
- shrd r13d,r13d,6
- add edx,r12d
- and esi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- vpshufd xmm7,xmm2,250
- add r11d,edx
- shrd r14d,r14d,2
- add edx,esi
- vpsrld xmm6,xmm6,11
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- vpslld xmm5,xmm5,11
- shrd r14d,r14d,9
- xor r12d,ebx
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,edx
- and r12d,r11d
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r13d,r11d
- vpsrld xmm6,xmm7,10
- add ecx,DWORD[52+rsp]
- mov esi,edx
- shrd r14d,r14d,11
- vpxor xmm4,xmm4,xmm5
- xor r12d,ebx
- xor esi,r8d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- vpaddd xmm3,xmm3,xmm4
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- vpxor xmm6,xmm6,xmm7
- shrd r14d,r14d,2
- add ecx,r15d
- mov r13d,r10d
- vpsrlq xmm7,xmm7,2
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,r11d
- xor r13d,r10d
- shrd r14d,r14d,9
- vpshufd xmm6,xmm6,132
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- vpsrldq xmm6,xmm6,8
- and r12d,r10d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[56+rsp]
- vpaddd xmm3,xmm3,xmm6
- mov r15d,ecx
- shrd r14d,r14d,11
- xor r12d,eax
- vpshufd xmm7,xmm3,80
- xor r15d,edx
- shrd r13d,r13d,6
- add ebx,r12d
- vpsrld xmm6,xmm7,10
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- vpsrlq xmm7,xmm7,17
- xor esi,edx
- add r9d,ebx
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- vpsrlq xmm7,xmm7,2
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- vpxor xmm6,xmm6,xmm7
- xor r13d,r9d
- shrd r14d,r14d,9
- xor r12d,r11d
- vpshufd xmm6,xmm6,232
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpslldq xmm6,xmm6,8
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov esi,ebx
- vpaddd xmm3,xmm3,xmm6
- shrd r14d,r14d,11
- xor r12d,r11d
- xor esi,ecx
- vpaddd xmm6,xmm3,XMMWORD[96+rbp]
- shrd r13d,r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- shrd r14d,r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[48+rsp],xmm6
- mov r12,QWORD[((64+0))+rsp]
- vpand xmm11,xmm11,xmm14
- mov r15,QWORD[((64+8))+rsp]
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r12*1+r15],xmm8
- lea r12,[16+r12]
- cmp BYTE[131+rbp],0
- jne NEAR $L$avx_00_47
- vmovdqu xmm9,XMMWORD[r12]
- mov QWORD[((64+0))+rsp],r12
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- xor r13d,r8d
- shrd r14d,r14d,9
- xor r12d,r10d
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
- shrd r14d,r14d,11
- xor r12d,r10d
- xor r15d,ebx
- shrd r13d,r13d,6
- add r11d,r12d
- and esi,r15d
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- add edx,r11d
- shrd r14d,r14d,2
- add r11d,esi
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- shrd r14d,r14d,9
- xor r12d,r9d
- shrd r13d,r13d,5
- xor r14d,r11d
- and r12d,edx
- vpxor xmm9,xmm9,xmm8
- xor r13d,edx
- add r10d,DWORD[4+rsp]
- mov esi,r11d
- shrd r14d,r14d,11
- xor r12d,r9d
- xor esi,eax
- shrd r13d,r13d,6
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- shrd r14d,r14d,2
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- mov r12d,edx
- xor r13d,ecx
- shrd r14d,r14d,9
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[8+rsp]
- mov r15d,r10d
- shrd r14d,r14d,11
- xor r12d,r8d
- xor r15d,r11d
- shrd r13d,r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- shrd r14d,r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- shrd r14d,r14d,9
- xor r12d,edx
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov esi,r9d
- shrd r14d,r14d,11
- xor r12d,edx
- xor esi,r10d
- shrd r13d,r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- shrd r14d,r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- xor r13d,eax
- shrd r14d,r14d,9
- xor r12d,ecx
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
- shrd r14d,r14d,11
- xor r12d,ecx
- xor r15d,r9d
- shrd r13d,r13d,6
- add edx,r12d
- and esi,r15d
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- add r11d,edx
- shrd r14d,r14d,2
- add edx,esi
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- shrd r14d,r14d,9
- xor r12d,ebx
- shrd r13d,r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r13d,r11d
- add ecx,DWORD[20+rsp]
- mov esi,edx
- shrd r14d,r14d,11
- xor r12d,ebx
- xor esi,r8d
- shrd r13d,r13d,6
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- shrd r14d,r14d,2
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- mov r12d,r11d
- xor r13d,r10d
- shrd r14d,r14d,9
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- and r12d,r10d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[24+rsp]
- mov r15d,ecx
- shrd r14d,r14d,11
- xor r12d,eax
- xor r15d,edx
- shrd r13d,r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- shrd r14d,r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- shrd r14d,r14d,9
- xor r12d,r11d
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov esi,ebx
- shrd r14d,r14d,11
- xor r12d,r11d
- xor esi,ecx
- shrd r13d,r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- shrd r14d,r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- xor r13d,r8d
- shrd r14d,r14d,9
- xor r12d,r10d
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
- shrd r14d,r14d,11
- xor r12d,r10d
- xor r15d,ebx
- shrd r13d,r13d,6
- add r11d,r12d
- and esi,r15d
- xor r14d,eax
- add r11d,r13d
- xor esi,ebx
- add edx,r11d
- shrd r14d,r14d,2
- add r11d,esi
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- mov r11d,r14d
- mov r12d,r8d
- xor r13d,edx
- shrd r14d,r14d,9
- xor r12d,r9d
- shrd r13d,r13d,5
- xor r14d,r11d
- and r12d,edx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r13d,edx
- add r10d,DWORD[36+rsp]
- mov esi,r11d
- shrd r14d,r14d,11
- xor r12d,r9d
- xor esi,eax
- shrd r13d,r13d,6
- add r10d,r12d
- and r15d,esi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- add ecx,r10d
- shrd r14d,r14d,2
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- mov r12d,edx
- xor r13d,ecx
- shrd r14d,r14d,9
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- and r12d,ecx
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r13d,ecx
- add r9d,DWORD[40+rsp]
- mov r15d,r10d
- shrd r14d,r14d,11
- xor r12d,r8d
- xor r15d,r11d
- shrd r13d,r13d,6
- add r9d,r12d
- and esi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor esi,r11d
- add ebx,r9d
- shrd r14d,r14d,2
- add r9d,esi
- mov r13d,ebx
- add r14d,r9d
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- xor r13d,ebx
- shrd r14d,r14d,9
- xor r12d,edx
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov esi,r9d
- shrd r14d,r14d,11
- xor r12d,edx
- xor esi,r10d
- shrd r13d,r13d,6
- add r8d,r12d
- and r15d,esi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- add eax,r8d
- shrd r14d,r14d,2
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- xor r13d,eax
- shrd r14d,r14d,9
- xor r12d,ecx
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
- shrd r14d,r14d,11
- xor r12d,ecx
- xor r15d,r9d
- shrd r13d,r13d,6
- add edx,r12d
- and esi,r15d
- xor r14d,r8d
- add edx,r13d
- xor esi,r9d
- add r11d,edx
- shrd r14d,r14d,2
- add edx,esi
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- mov edx,r14d
- mov r12d,eax
- xor r13d,r11d
- shrd r14d,r14d,9
- xor r12d,ebx
- shrd r13d,r13d,5
- xor r14d,edx
- and r12d,r11d
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r13d,r11d
- add ecx,DWORD[52+rsp]
- mov esi,edx
- shrd r14d,r14d,11
- xor r12d,ebx
- xor esi,r8d
- shrd r13d,r13d,6
- add ecx,r12d
- and r15d,esi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- add r10d,ecx
- shrd r14d,r14d,2
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- mov r12d,r11d
- xor r13d,r10d
- shrd r14d,r14d,9
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- and r12d,r10d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r13d,r10d
- add ebx,DWORD[56+rsp]
- mov r15d,ecx
- shrd r14d,r14d,11
- xor r12d,eax
- xor r15d,edx
- shrd r13d,r13d,6
- add ebx,r12d
- and esi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor esi,edx
- add r9d,ebx
- shrd r14d,r14d,2
- add ebx,esi
- mov r13d,r9d
- add r14d,ebx
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- xor r13d,r9d
- shrd r14d,r14d,9
- xor r12d,r11d
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov esi,ebx
- shrd r14d,r14d,11
- xor r12d,r11d
- xor esi,ecx
- shrd r13d,r13d,6
- add eax,r12d
- and r15d,esi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- add r8d,eax
- shrd r14d,r14d,2
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- mov r12,QWORD[((64+0))+rsp]
- mov r13,QWORD[((64+8))+rsp]
- mov r15,QWORD[((64+40))+rsp]
- mov rsi,QWORD[((64+48))+rsp]
-
- vpand xmm11,xmm11,xmm14
- mov eax,r14d
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r13*1+r12],xmm8
- lea r12,[16+r12]
-
- add eax,DWORD[r15]
- add ebx,DWORD[4+r15]
- add ecx,DWORD[8+r15]
- add edx,DWORD[12+r15]
- add r8d,DWORD[16+r15]
- add r9d,DWORD[20+r15]
- add r10d,DWORD[24+r15]
- add r11d,DWORD[28+r15]
-
- cmp r12,QWORD[((64+16))+rsp]
-
- mov DWORD[r15],eax
- mov DWORD[4+r15],ebx
- mov DWORD[8+r15],ecx
- mov DWORD[12+r15],edx
- mov DWORD[16+r15],r8d
- mov DWORD[20+r15],r9d
- mov DWORD[24+r15],r10d
- mov DWORD[28+r15],r11d
- jb NEAR $L$loop_avx
-
- mov r8,QWORD[((64+32))+rsp]
- mov rsi,QWORD[120+rsp]
-
- vmovdqu XMMWORD[r8],xmm8
- vzeroall
- movaps xmm6,XMMWORD[128+rsp]
- movaps xmm7,XMMWORD[144+rsp]
- movaps xmm8,XMMWORD[160+rsp]
- movaps xmm9,XMMWORD[176+rsp]
- movaps xmm10,XMMWORD[192+rsp]
- movaps xmm11,XMMWORD[208+rsp]
- movaps xmm12,XMMWORD[224+rsp]
- movaps xmm13,XMMWORD[240+rsp]
- movaps xmm14,XMMWORD[256+rsp]
- movaps xmm15,XMMWORD[272+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_cbc_sha256_enc_avx:
-
-ALIGN 64
-aesni_cbc_sha256_enc_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_cbc_sha256_enc_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
-$L$avx2_shortcut:
- mov r10,QWORD[56+rsp]
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,736
- and rsp,-256*4
- add rsp,448
-
- shl rdx,6
- sub rsi,rdi
- sub r10,rdi
- add rdx,rdi
-
-
-
- mov QWORD[((64+16))+rsp],rdx
-
- mov QWORD[((64+32))+rsp],r8
- mov QWORD[((64+40))+rsp],r9
- mov QWORD[((64+48))+rsp],r10
- mov QWORD[120+rsp],rax
-
- movaps XMMWORD[128+rsp],xmm6
- movaps XMMWORD[144+rsp],xmm7
- movaps XMMWORD[160+rsp],xmm8
- movaps XMMWORD[176+rsp],xmm9
- movaps XMMWORD[192+rsp],xmm10
- movaps XMMWORD[208+rsp],xmm11
- movaps XMMWORD[224+rsp],xmm12
- movaps XMMWORD[240+rsp],xmm13
- movaps XMMWORD[256+rsp],xmm14
- movaps XMMWORD[272+rsp],xmm15
-$L$prologue_avx2:
- vzeroall
-
- mov r13,rdi
- vpinsrq xmm15,xmm15,rsi,1
- lea rdi,[128+rcx]
- lea r12,[((K256+544))]
- mov r14d,DWORD[((240-128))+rdi]
- mov r15,r9
- mov rsi,r10
- vmovdqu xmm8,XMMWORD[r8]
- lea r14,[((-9))+r14]
-
- vmovdqa xmm14,XMMWORD[r14*8+r12]
- vmovdqa xmm13,XMMWORD[16+r14*8+r12]
- vmovdqa xmm12,XMMWORD[32+r14*8+r12]
-
- sub r13,-16*4
- mov eax,DWORD[r15]
- lea r12,[r13*1+rsi]
- mov ebx,DWORD[4+r15]
- cmp r13,rdx
- mov ecx,DWORD[8+r15]
- cmove r12,rsp
- mov edx,DWORD[12+r15]
- mov r8d,DWORD[16+r15]
- mov r9d,DWORD[20+r15]
- mov r10d,DWORD[24+r15]
- mov r11d,DWORD[28+r15]
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- jmp NEAR $L$oop_avx2
-ALIGN 16
-$L$oop_avx2:
- vmovdqa ymm7,YMMWORD[((K256+512))]
- vmovdqu xmm0,XMMWORD[((-64+0))+r13*1+rsi]
- vmovdqu xmm1,XMMWORD[((-64+16))+r13*1+rsi]
- vmovdqu xmm2,XMMWORD[((-64+32))+r13*1+rsi]
- vmovdqu xmm3,XMMWORD[((-64+48))+r13*1+rsi]
-
- vinserti128 ymm0,ymm0,XMMWORD[r12],1
- vinserti128 ymm1,ymm1,XMMWORD[16+r12],1
- vpshufb ymm0,ymm0,ymm7
- vinserti128 ymm2,ymm2,XMMWORD[32+r12],1
- vpshufb ymm1,ymm1,ymm7
- vinserti128 ymm3,ymm3,XMMWORD[48+r12],1
-
- lea rbp,[K256]
- vpshufb ymm2,ymm2,ymm7
- lea r13,[((-64))+r13]
- vpaddd ymm4,ymm0,YMMWORD[rbp]
- vpshufb ymm3,ymm3,ymm7
- vpaddd ymm5,ymm1,YMMWORD[32+rbp]
- vpaddd ymm6,ymm2,YMMWORD[64+rbp]
- vpaddd ymm7,ymm3,YMMWORD[96+rbp]
- vmovdqa YMMWORD[rsp],ymm4
- xor r14d,r14d
- vmovdqa YMMWORD[32+rsp],ymm5
- lea rsp,[((-64))+rsp]
- mov esi,ebx
- vmovdqa YMMWORD[rsp],ymm6
- xor esi,ecx
- vmovdqa YMMWORD[32+rsp],ymm7
- mov r12d,r9d
- sub rbp,-16*2*4
- jmp NEAR $L$avx2_00_47
-
-ALIGN 16
-$L$avx2_00_47:
- vmovdqu xmm9,XMMWORD[r13]
- vpinsrq xmm15,xmm15,r13,0
- lea rsp,[((-64))+rsp]
- vpalignr ymm4,ymm1,ymm0,4
- add r11d,DWORD[((0+128))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- vpalignr ymm7,ymm3,ymm2,4
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- vpsrld ymm6,ymm4,7
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- vpaddd ymm0,ymm0,ymm7
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- vpsrld ymm7,ymm4,3
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- vpslld ymm5,ymm4,14
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- vpxor ymm4,ymm7,ymm6
- and esi,r15d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- vpshufd ymm7,ymm3,250
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- vpsrld ymm6,ymm6,11
- add r10d,DWORD[((4+128))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- vpxor ymm4,ymm4,ymm5
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- vpslld ymm5,ymm5,11
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- vpxor ymm4,ymm4,ymm6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- vpsrld ymm6,ymm7,10
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- vpxor ymm4,ymm4,ymm5
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- vpsrlq ymm7,ymm7,17
- and r15d,esi
- vpxor xmm9,xmm9,xmm8
- xor r14d,r12d
- xor r15d,eax
- vpaddd ymm0,ymm0,ymm4
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- vpxor ymm6,ymm6,ymm7
- add r9d,DWORD[((8+128))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- vpshufd ymm6,ymm6,132
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- vpsrldq ymm6,ymm6,8
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- vpaddd ymm0,ymm0,ymm6
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- vpshufd ymm7,ymm0,80
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- vpsrld ymm6,ymm7,10
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- vpsrlq ymm7,ymm7,17
- add r8d,DWORD[((12+128))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- vpxor ymm6,ymm6,ymm7
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- vpsrlq ymm7,ymm7,2
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- vpxor ymm6,ymm6,ymm7
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- vpshufd ymm6,ymm6,232
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- vpslldq ymm6,ymm6,8
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- vpaddd ymm0,ymm0,ymm6
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- vpaddd ymm6,ymm0,YMMWORD[rbp]
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- vmovdqa YMMWORD[rsp],ymm6
- vpalignr ymm4,ymm2,ymm1,4
- add edx,DWORD[((32+128))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- vpalignr ymm7,ymm0,ymm3,4
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- vpsrld ymm6,ymm4,7
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- vpaddd ymm1,ymm1,ymm7
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- vpsrld ymm7,ymm4,3
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- vpslld ymm5,ymm4,14
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- vpxor ymm4,ymm7,ymm6
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- vpshufd ymm7,ymm0,250
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- vpsrld ymm6,ymm6,11
- add ecx,DWORD[((36+128))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- vpxor ymm4,ymm4,ymm5
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- vpslld ymm5,ymm5,11
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- vpxor ymm4,ymm4,ymm6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- vpsrld ymm6,ymm7,10
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- vpxor ymm4,ymm4,ymm5
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- vpsrlq ymm7,ymm7,17
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- vpaddd ymm1,ymm1,ymm4
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- vpxor ymm6,ymm6,ymm7
- add ebx,DWORD[((40+128))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- vpshufd ymm6,ymm6,132
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- vpsrldq ymm6,ymm6,8
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- vpaddd ymm1,ymm1,ymm6
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- vpshufd ymm7,ymm1,80
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- vpsrld ymm6,ymm7,10
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- vpsrlq ymm7,ymm7,17
- add eax,DWORD[((44+128))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- vpxor ymm6,ymm6,ymm7
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- vpsrlq ymm7,ymm7,2
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- vpxor ymm6,ymm6,ymm7
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- vpshufd ymm6,ymm6,232
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- vpslldq ymm6,ymm6,8
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- vpaddd ymm1,ymm1,ymm6
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- vpaddd ymm6,ymm1,YMMWORD[32+rbp]
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vmovdqa YMMWORD[32+rsp],ymm6
- lea rsp,[((-64))+rsp]
- vpalignr ymm4,ymm3,ymm2,4
- add r11d,DWORD[((0+128))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- vpalignr ymm7,ymm1,ymm0,4
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- vpsrld ymm6,ymm4,7
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- vpaddd ymm2,ymm2,ymm7
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- vpsrld ymm7,ymm4,3
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- vpslld ymm5,ymm4,14
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- vpxor ymm4,ymm7,ymm6
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- vpshufd ymm7,ymm1,250
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- vpsrld ymm6,ymm6,11
- add r10d,DWORD[((4+128))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- vpxor ymm4,ymm4,ymm5
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- vpslld ymm5,ymm5,11
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- vpxor ymm4,ymm4,ymm6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- vpsrld ymm6,ymm7,10
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- vpxor ymm4,ymm4,ymm5
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- vpsrlq ymm7,ymm7,17
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r14d,r12d
- xor r15d,eax
- vpaddd ymm2,ymm2,ymm4
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- vpxor ymm6,ymm6,ymm7
- add r9d,DWORD[((8+128))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- vpshufd ymm6,ymm6,132
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- vpsrldq ymm6,ymm6,8
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- vpaddd ymm2,ymm2,ymm6
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- vpshufd ymm7,ymm2,80
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- vpsrld ymm6,ymm7,10
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- vpsrlq ymm7,ymm7,17
- add r8d,DWORD[((12+128))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- vpxor ymm6,ymm6,ymm7
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- vpsrlq ymm7,ymm7,2
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- vpxor ymm6,ymm6,ymm7
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- vpshufd ymm6,ymm6,232
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- vpslldq ymm6,ymm6,8
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- vpaddd ymm2,ymm2,ymm6
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- vpaddd ymm6,ymm2,YMMWORD[64+rbp]
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- vmovdqa YMMWORD[rsp],ymm6
- vpalignr ymm4,ymm0,ymm3,4
- add edx,DWORD[((32+128))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- vpalignr ymm7,ymm2,ymm1,4
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- vpsrld ymm6,ymm4,7
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- vpaddd ymm3,ymm3,ymm7
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- vpsrld ymm7,ymm4,3
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- vpslld ymm5,ymm4,14
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- vpxor ymm4,ymm7,ymm6
- and esi,r15d
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- vpshufd ymm7,ymm2,250
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- vpsrld ymm6,ymm6,11
- add ecx,DWORD[((36+128))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- vpxor ymm4,ymm4,ymm5
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- vpslld ymm5,ymm5,11
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- vpxor ymm4,ymm4,ymm6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- vpsrld ymm6,ymm7,10
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- vpxor ymm4,ymm4,ymm5
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- vpsrlq ymm7,ymm7,17
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- vpaddd ymm3,ymm3,ymm4
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- vpxor ymm6,ymm6,ymm7
- add ebx,DWORD[((40+128))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- vpshufd ymm6,ymm6,132
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- vpsrldq ymm6,ymm6,8
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- vpaddd ymm3,ymm3,ymm6
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- vpshufd ymm7,ymm3,80
- and esi,r15d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- vpsrld ymm6,ymm7,10
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- vpsrlq ymm7,ymm7,17
- add eax,DWORD[((44+128))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- vpxor ymm6,ymm6,ymm7
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- vpsrlq ymm7,ymm7,2
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- vpxor ymm6,ymm6,ymm7
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- vpshufd ymm6,ymm6,232
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- vpslldq ymm6,ymm6,8
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- vpaddd ymm3,ymm3,ymm6
- and r15d,esi
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- vpaddd ymm6,ymm3,YMMWORD[96+rbp]
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vmovdqa YMMWORD[32+rsp],ymm6
- vmovq r13,xmm15
- vpextrq r15,xmm15,1
- vpand xmm11,xmm11,xmm14
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r13*1+r15],xmm8
- lea r13,[16+r13]
- lea rbp,[128+rbp]
- cmp BYTE[3+rbp],0
- jne NEAR $L$avx2_00_47
- vmovdqu xmm9,XMMWORD[r13]
- vpinsrq xmm15,xmm15,r13,0
- add r11d,DWORD[((0+64))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and esi,r15d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[((4+64))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,esi
- vpxor xmm9,xmm9,xmm8
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[((8+64))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[((12+64))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[((32+64))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[((36+64))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[((40+64))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[((44+64))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- add r11d,DWORD[rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[4+rsp]
- and r12d,edx
- rorx r13d,edx,25
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[8+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[12+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[32+rsp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and esi,r15d
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[36+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[40+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and esi,r15d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[44+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,esi
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vpextrq r12,xmm15,1
- vmovq r13,xmm15
- mov r15,QWORD[552+rsp]
- add eax,r14d
- lea rbp,[448+rsp]
-
- vpand xmm11,xmm11,xmm14
- vpor xmm8,xmm8,xmm11
- vmovdqu XMMWORD[r13*1+r12],xmm8
- lea r13,[16+r13]
-
- add eax,DWORD[r15]
- add ebx,DWORD[4+r15]
- add ecx,DWORD[8+r15]
- add edx,DWORD[12+r15]
- add r8d,DWORD[16+r15]
- add r9d,DWORD[20+r15]
- add r10d,DWORD[24+r15]
- add r11d,DWORD[28+r15]
-
- mov DWORD[r15],eax
- mov DWORD[4+r15],ebx
- mov DWORD[8+r15],ecx
- mov DWORD[12+r15],edx
- mov DWORD[16+r15],r8d
- mov DWORD[20+r15],r9d
- mov DWORD[24+r15],r10d
- mov DWORD[28+r15],r11d
-
- cmp r13,QWORD[80+rbp]
- je NEAR $L$done_avx2
-
- xor r14d,r14d
- mov esi,ebx
- mov r12d,r9d
- xor esi,ecx
- jmp NEAR $L$ower_avx2
-ALIGN 16
-$L$ower_avx2:
- vmovdqu xmm9,XMMWORD[r13]
- vpinsrq xmm15,xmm15,r13,0
- add r11d,DWORD[((0+16))+rbp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and esi,r15d
- vpxor xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((16-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[((4+16))+rbp]
- and r12d,edx
- rorx r13d,edx,25
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,esi
- vpxor xmm9,xmm9,xmm8
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[((8+16))+rbp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((32-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[((12+16))+rbp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((48-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[((32+16))+rbp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[((36+16))+rbp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((80-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[((40+16))+rbp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((96-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[((44+16))+rbp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((112-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- lea rbp,[((-64))+rbp]
- add r11d,DWORD[((0+16))+rbp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((128-128))+rdi]
- xor r14d,r12d
- xor esi,ebx
- xor r14d,r13d
- lea r11d,[rsi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[((4+16))+rbp]
- and r12d,edx
- rorx r13d,edx,25
- rorx esi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,esi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov esi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor esi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,esi
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((144-128))+rdi]
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[((8+16))+rbp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and esi,r15d
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((160-128))+rdi]
- xor r14d,r12d
- xor esi,r11d
- xor r14d,r13d
- lea r9d,[rsi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[((12+16))+rbp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx esi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,esi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov esi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor esi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((176-128))+rdi]
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[((32+16))+rbp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and esi,r15d
- vpand xmm8,xmm11,xmm12
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((192-128))+rdi]
- xor r14d,r12d
- xor esi,r9d
- xor r14d,r13d
- lea edx,[rsi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[((36+16))+rbp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx esi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,esi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov esi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor esi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,esi
- vaesenclast xmm11,xmm9,xmm10
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((208-128))+rdi]
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[((40+16))+rbp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and esi,r15d
- vpand xmm11,xmm11,xmm13
- vaesenc xmm9,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((224-128))+rdi]
- xor r14d,r12d
- xor esi,edx
- xor r14d,r13d
- lea ebx,[rsi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[((44+16))+rbp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx esi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,esi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov esi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor esi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,esi
- vpor xmm8,xmm8,xmm11
- vaesenclast xmm11,xmm9,xmm10
- vmovdqu xmm10,XMMWORD[((0-128))+rdi]
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vmovq r13,xmm15
- vpextrq r15,xmm15,1
- vpand xmm11,xmm11,xmm14
- vpor xmm8,xmm8,xmm11
- lea rbp,[((-64))+rbp]
- vmovdqu XMMWORD[r13*1+r15],xmm8
- lea r13,[16+r13]
- cmp rbp,rsp
- jae NEAR $L$ower_avx2
-
- mov r15,QWORD[552+rsp]
- lea r13,[64+r13]
- mov rsi,QWORD[560+rsp]
- add eax,r14d
- lea rsp,[448+rsp]
-
- add eax,DWORD[r15]
- add ebx,DWORD[4+r15]
- add ecx,DWORD[8+r15]
- add edx,DWORD[12+r15]
- add r8d,DWORD[16+r15]
- add r9d,DWORD[20+r15]
- add r10d,DWORD[24+r15]
- lea r12,[r13*1+rsi]
- add r11d,DWORD[28+r15]
-
- cmp r13,QWORD[((64+16))+rsp]
-
- mov DWORD[r15],eax
- cmove r12,rsp
- mov DWORD[4+r15],ebx
- mov DWORD[8+r15],ecx
- mov DWORD[12+r15],edx
- mov DWORD[16+r15],r8d
- mov DWORD[20+r15],r9d
- mov DWORD[24+r15],r10d
- mov DWORD[28+r15],r11d
-
- jbe NEAR $L$oop_avx2
- lea rbp,[rsp]
-
-
-
-
-$L$done_avx2:
- mov r8,QWORD[((64+32))+rbp]
- mov rsi,QWORD[((64+56))+rbp]
-
- vmovdqu XMMWORD[r8],xmm8
- vzeroall
- movaps xmm6,XMMWORD[128+rbp]
- movaps xmm7,XMMWORD[144+rbp]
- movaps xmm8,XMMWORD[160+rbp]
- movaps xmm9,XMMWORD[176+rbp]
- movaps xmm10,XMMWORD[192+rbp]
- movaps xmm11,XMMWORD[208+rbp]
- movaps xmm12,XMMWORD[224+rbp]
- movaps xmm13,XMMWORD[240+rbp]
- movaps xmm14,XMMWORD[256+rbp]
- movaps xmm15,XMMWORD[272+rbp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_cbc_sha256_enc_avx2:
-
-ALIGN 32
-aesni_cbc_sha256_enc_shaext:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_cbc_sha256_enc_shaext:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- mov r10,QWORD[56+rsp]
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[(-8-160)+rax],xmm6
- movaps XMMWORD[(-8-144)+rax],xmm7
- movaps XMMWORD[(-8-128)+rax],xmm8
- movaps XMMWORD[(-8-112)+rax],xmm9
- movaps XMMWORD[(-8-96)+rax],xmm10
- movaps XMMWORD[(-8-80)+rax],xmm11
- movaps XMMWORD[(-8-64)+rax],xmm12
- movaps XMMWORD[(-8-48)+rax],xmm13
- movaps XMMWORD[(-8-32)+rax],xmm14
- movaps XMMWORD[(-8-16)+rax],xmm15
-$L$prologue_shaext:
- lea rax,[((K256+128))]
- movdqu xmm1,XMMWORD[r9]
- movdqu xmm2,XMMWORD[16+r9]
- movdqa xmm3,XMMWORD[((512-128))+rax]
-
- mov r11d,DWORD[240+rcx]
- sub rsi,rdi
- movups xmm15,XMMWORD[rcx]
- movups xmm6,XMMWORD[r8]
- movups xmm4,XMMWORD[16+rcx]
- lea rcx,[112+rcx]
-
- pshufd xmm0,xmm1,0x1b
- pshufd xmm1,xmm1,0xb1
- pshufd xmm2,xmm2,0x1b
- movdqa xmm7,xmm3
-DB 102,15,58,15,202,8
- punpcklqdq xmm2,xmm0
-
- jmp NEAR $L$oop_shaext
-
-ALIGN 16
-$L$oop_shaext:
- movdqu xmm10,XMMWORD[r10]
- movdqu xmm11,XMMWORD[16+r10]
- movdqu xmm12,XMMWORD[32+r10]
-DB 102,68,15,56,0,211
- movdqu xmm13,XMMWORD[48+r10]
-
- movdqa xmm0,XMMWORD[((0-128))+rax]
- paddd xmm0,xmm10
-DB 102,68,15,56,0,219
- movdqa xmm9,xmm2
- movdqa xmm8,xmm1
- movups xmm14,XMMWORD[rdi]
- xorps xmm14,xmm15
- xorps xmm6,xmm14
- movups xmm5,XMMWORD[((-80))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movups xmm4,XMMWORD[((-64))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,202
-
- movdqa xmm0,XMMWORD[((32-128))+rax]
- paddd xmm0,xmm11
-DB 102,68,15,56,0,227
- lea r10,[64+r10]
- movups xmm5,XMMWORD[((-48))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movups xmm4,XMMWORD[((-32))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,202
-
- movdqa xmm0,XMMWORD[((64-128))+rax]
- paddd xmm0,xmm12
-DB 102,68,15,56,0,235
-DB 69,15,56,204,211
- movups xmm5,XMMWORD[((-16))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm13
-DB 102,65,15,58,15,220,4
- paddd xmm10,xmm3
- movups xmm4,XMMWORD[rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,202
-
- movdqa xmm0,XMMWORD[((96-128))+rax]
- paddd xmm0,xmm13
-DB 69,15,56,205,213
-DB 69,15,56,204,220
- movups xmm5,XMMWORD[16+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movups xmm4,XMMWORD[32+rcx]
- aesenc xmm6,xmm5
- movdqa xmm3,xmm10
-DB 102,65,15,58,15,221,4
- paddd xmm11,xmm3
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((128-128))+rax]
- paddd xmm0,xmm10
-DB 69,15,56,205,218
-DB 69,15,56,204,229
- movups xmm5,XMMWORD[48+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm11
-DB 102,65,15,58,15,218,4
- paddd xmm12,xmm3
- cmp r11d,11
- jb NEAR $L$aesenclast1
- movups xmm4,XMMWORD[64+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[80+rcx]
- aesenc xmm6,xmm4
- je NEAR $L$aesenclast1
- movups xmm4,XMMWORD[96+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[112+rcx]
- aesenc xmm6,xmm4
-$L$aesenclast1:
- aesenclast xmm6,xmm5
- movups xmm4,XMMWORD[((16-112))+rcx]
- nop
-DB 15,56,203,202
- movups xmm14,XMMWORD[16+rdi]
- xorps xmm14,xmm15
- movups XMMWORD[rdi*1+rsi],xmm6
- xorps xmm6,xmm14
- movups xmm5,XMMWORD[((-80))+rcx]
- aesenc xmm6,xmm4
- movdqa xmm0,XMMWORD[((160-128))+rax]
- paddd xmm0,xmm11
-DB 69,15,56,205,227
-DB 69,15,56,204,234
- movups xmm4,XMMWORD[((-64))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm12
-DB 102,65,15,58,15,219,4
- paddd xmm13,xmm3
- movups xmm5,XMMWORD[((-48))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((192-128))+rax]
- paddd xmm0,xmm12
-DB 69,15,56,205,236
-DB 69,15,56,204,211
- movups xmm4,XMMWORD[((-32))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm13
-DB 102,65,15,58,15,220,4
- paddd xmm10,xmm3
- movups xmm5,XMMWORD[((-16))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((224-128))+rax]
- paddd xmm0,xmm13
-DB 69,15,56,205,213
-DB 69,15,56,204,220
- movups xmm4,XMMWORD[rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm10
-DB 102,65,15,58,15,221,4
- paddd xmm11,xmm3
- movups xmm5,XMMWORD[16+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((256-128))+rax]
- paddd xmm0,xmm10
-DB 69,15,56,205,218
-DB 69,15,56,204,229
- movups xmm4,XMMWORD[32+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm11
-DB 102,65,15,58,15,218,4
- paddd xmm12,xmm3
- movups xmm5,XMMWORD[48+rcx]
- aesenc xmm6,xmm4
- cmp r11d,11
- jb NEAR $L$aesenclast2
- movups xmm4,XMMWORD[64+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[80+rcx]
- aesenc xmm6,xmm4
- je NEAR $L$aesenclast2
- movups xmm4,XMMWORD[96+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[112+rcx]
- aesenc xmm6,xmm4
-$L$aesenclast2:
- aesenclast xmm6,xmm5
- movups xmm4,XMMWORD[((16-112))+rcx]
- nop
-DB 15,56,203,202
- movups xmm14,XMMWORD[32+rdi]
- xorps xmm14,xmm15
- movups XMMWORD[16+rdi*1+rsi],xmm6
- xorps xmm6,xmm14
- movups xmm5,XMMWORD[((-80))+rcx]
- aesenc xmm6,xmm4
- movdqa xmm0,XMMWORD[((288-128))+rax]
- paddd xmm0,xmm11
-DB 69,15,56,205,227
-DB 69,15,56,204,234
- movups xmm4,XMMWORD[((-64))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm12
-DB 102,65,15,58,15,219,4
- paddd xmm13,xmm3
- movups xmm5,XMMWORD[((-48))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((320-128))+rax]
- paddd xmm0,xmm12
-DB 69,15,56,205,236
-DB 69,15,56,204,211
- movups xmm4,XMMWORD[((-32))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm13
-DB 102,65,15,58,15,220,4
- paddd xmm10,xmm3
- movups xmm5,XMMWORD[((-16))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((352-128))+rax]
- paddd xmm0,xmm13
-DB 69,15,56,205,213
-DB 69,15,56,204,220
- movups xmm4,XMMWORD[rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm10
-DB 102,65,15,58,15,221,4
- paddd xmm11,xmm3
- movups xmm5,XMMWORD[16+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((384-128))+rax]
- paddd xmm0,xmm10
-DB 69,15,56,205,218
-DB 69,15,56,204,229
- movups xmm4,XMMWORD[32+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm11
-DB 102,65,15,58,15,218,4
- paddd xmm12,xmm3
- movups xmm5,XMMWORD[48+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
- movdqa xmm0,XMMWORD[((416-128))+rax]
- paddd xmm0,xmm11
-DB 69,15,56,205,227
-DB 69,15,56,204,234
- cmp r11d,11
- jb NEAR $L$aesenclast3
- movups xmm4,XMMWORD[64+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[80+rcx]
- aesenc xmm6,xmm4
- je NEAR $L$aesenclast3
- movups xmm4,XMMWORD[96+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[112+rcx]
- aesenc xmm6,xmm4
-$L$aesenclast3:
- aesenclast xmm6,xmm5
- movups xmm4,XMMWORD[((16-112))+rcx]
- nop
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movdqa xmm3,xmm12
-DB 102,65,15,58,15,219,4
- paddd xmm13,xmm3
- movups xmm14,XMMWORD[48+rdi]
- xorps xmm14,xmm15
- movups XMMWORD[32+rdi*1+rsi],xmm6
- xorps xmm6,xmm14
- movups xmm5,XMMWORD[((-80))+rcx]
- aesenc xmm6,xmm4
- movups xmm4,XMMWORD[((-64))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,202
-
- movdqa xmm0,XMMWORD[((448-128))+rax]
- paddd xmm0,xmm12
-DB 69,15,56,205,236
- movdqa xmm3,xmm7
- movups xmm5,XMMWORD[((-48))+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movups xmm4,XMMWORD[((-32))+rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,202
-
- movdqa xmm0,XMMWORD[((480-128))+rax]
- paddd xmm0,xmm13
- movups xmm5,XMMWORD[((-16))+rcx]
- aesenc xmm6,xmm4
- movups xmm4,XMMWORD[rcx]
- aesenc xmm6,xmm5
-DB 15,56,203,209
- pshufd xmm0,xmm0,0x0e
- movups xmm5,XMMWORD[16+rcx]
- aesenc xmm6,xmm4
-DB 15,56,203,202
-
- movups xmm4,XMMWORD[32+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[48+rcx]
- aesenc xmm6,xmm4
- cmp r11d,11
- jb NEAR $L$aesenclast4
- movups xmm4,XMMWORD[64+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[80+rcx]
- aesenc xmm6,xmm4
- je NEAR $L$aesenclast4
- movups xmm4,XMMWORD[96+rcx]
- aesenc xmm6,xmm5
- movups xmm5,XMMWORD[112+rcx]
- aesenc xmm6,xmm4
-$L$aesenclast4:
- aesenclast xmm6,xmm5
- movups xmm4,XMMWORD[((16-112))+rcx]
- nop
-
- paddd xmm2,xmm9
- paddd xmm1,xmm8
-
- dec rdx
- movups XMMWORD[48+rdi*1+rsi],xmm6
- lea rdi,[64+rdi]
- jnz NEAR $L$oop_shaext
-
- pshufd xmm2,xmm2,0xb1
- pshufd xmm3,xmm1,0x1b
- pshufd xmm1,xmm1,0xb1
- punpckhqdq xmm1,xmm2
-DB 102,15,58,15,211,8
-
- movups XMMWORD[r8],xmm6
- movdqu XMMWORD[r9],xmm1
- movdqu XMMWORD[16+r9],xmm2
- movaps xmm6,XMMWORD[rsp]
- movaps xmm7,XMMWORD[16+rsp]
- movaps xmm8,XMMWORD[32+rsp]
- movaps xmm9,XMMWORD[48+rsp]
- movaps xmm10,XMMWORD[64+rsp]
- movaps xmm11,XMMWORD[80+rsp]
- movaps xmm12,XMMWORD[96+rsp]
- movaps xmm13,XMMWORD[112+rsp]
- movaps xmm14,XMMWORD[128+rsp]
- movaps xmm15,XMMWORD[144+rsp]
- lea rsp,[((8+160))+rsp]
-$L$epilogue_shaext:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_cbc_sha256_enc_shaext:
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$in_prologue
- lea r10,[aesni_cbc_sha256_enc_shaext]
- cmp rbx,r10
- jb NEAR $L$not_in_shaext
-
- lea rsi,[rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
- lea rax,[168+rax]
- jmp NEAR $L$in_prologue
-$L$not_in_shaext:
- lea r10,[$L$avx2_shortcut]
- cmp rbx,r10
- jb NEAR $L$not_in_avx2
-
- and rax,-256*4
- add rax,448
-$L$not_in_avx2:
- mov rsi,rax
- mov rax,QWORD[((64+56))+rax]
-
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
- lea rsi,[((64+64))+rsi]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
-
-$L$in_prologue:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- DB 0F3h,0C3h ;repret
-
-
-section .pdata rdata align=4
- DD $L$SEH_begin_aesni_cbc_sha256_enc_xop wrt ..imagebase
- DD $L$SEH_end_aesni_cbc_sha256_enc_xop wrt ..imagebase
- DD $L$SEH_info_aesni_cbc_sha256_enc_xop wrt ..imagebase
-
- DD $L$SEH_begin_aesni_cbc_sha256_enc_avx wrt ..imagebase
- DD $L$SEH_end_aesni_cbc_sha256_enc_avx wrt ..imagebase
- DD $L$SEH_info_aesni_cbc_sha256_enc_avx wrt ..imagebase
- DD $L$SEH_begin_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
- DD $L$SEH_end_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
- DD $L$SEH_info_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
- DD $L$SEH_begin_aesni_cbc_sha256_enc_shaext wrt ..imagebase
- DD $L$SEH_end_aesni_cbc_sha256_enc_shaext wrt ..imagebase
- DD $L$SEH_info_aesni_cbc_sha256_enc_shaext wrt ..imagebase
-section .xdata rdata align=8
-ALIGN 8
-$L$SEH_info_aesni_cbc_sha256_enc_xop:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
-
-$L$SEH_info_aesni_cbc_sha256_enc_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_aesni_cbc_sha256_enc_avx2:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
-$L$SEH_info_aesni_cbc_sha256_enc_shaext:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm
index 7342e16c22c..f097a539bb1 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx2.nasm
@@ -5,1977 +5,26 @@ default rel
section .text code align=64
-global rsaz_1024_sqr_avx2
-
-ALIGN 64
-rsaz_1024_sqr_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_rsaz_1024_sqr_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
-
-
-
- lea rax,[rsp]
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- vzeroupper
- lea rsp,[((-168))+rsp]
- vmovaps XMMWORD[(-216)+rax],xmm6
- vmovaps XMMWORD[(-200)+rax],xmm7
- vmovaps XMMWORD[(-184)+rax],xmm8
- vmovaps XMMWORD[(-168)+rax],xmm9
- vmovaps XMMWORD[(-152)+rax],xmm10
- vmovaps XMMWORD[(-136)+rax],xmm11
- vmovaps XMMWORD[(-120)+rax],xmm12
- vmovaps XMMWORD[(-104)+rax],xmm13
- vmovaps XMMWORD[(-88)+rax],xmm14
- vmovaps XMMWORD[(-72)+rax],xmm15
-$L$sqr_1024_body:
- mov rbp,rax
-
- mov r13,rdx
- sub rsp,832
- mov r15,r13
- sub rdi,-128
- sub rsi,-128
- sub r13,-128
-
- and r15,4095
- add r15,32*10
- shr r15,12
- vpxor ymm9,ymm9,ymm9
- jz NEAR $L$sqr_1024_no_n_copy
-
-
-
-
-
- sub rsp,32*10
- vmovdqu ymm0,YMMWORD[((0-128))+r13]
- and rsp,-2048
- vmovdqu ymm1,YMMWORD[((32-128))+r13]
- vmovdqu ymm2,YMMWORD[((64-128))+r13]
- vmovdqu ymm3,YMMWORD[((96-128))+r13]
- vmovdqu ymm4,YMMWORD[((128-128))+r13]
- vmovdqu ymm5,YMMWORD[((160-128))+r13]
- vmovdqu ymm6,YMMWORD[((192-128))+r13]
- vmovdqu ymm7,YMMWORD[((224-128))+r13]
- vmovdqu ymm8,YMMWORD[((256-128))+r13]
- lea r13,[((832+128))+rsp]
- vmovdqu YMMWORD[(0-128)+r13],ymm0
- vmovdqu YMMWORD[(32-128)+r13],ymm1
- vmovdqu YMMWORD[(64-128)+r13],ymm2
- vmovdqu YMMWORD[(96-128)+r13],ymm3
- vmovdqu YMMWORD[(128-128)+r13],ymm4
- vmovdqu YMMWORD[(160-128)+r13],ymm5
- vmovdqu YMMWORD[(192-128)+r13],ymm6
- vmovdqu YMMWORD[(224-128)+r13],ymm7
- vmovdqu YMMWORD[(256-128)+r13],ymm8
- vmovdqu YMMWORD[(288-128)+r13],ymm9
-
-$L$sqr_1024_no_n_copy:
- and rsp,-1024
-
- vmovdqu ymm1,YMMWORD[((32-128))+rsi]
- vmovdqu ymm2,YMMWORD[((64-128))+rsi]
- vmovdqu ymm3,YMMWORD[((96-128))+rsi]
- vmovdqu ymm4,YMMWORD[((128-128))+rsi]
- vmovdqu ymm5,YMMWORD[((160-128))+rsi]
- vmovdqu ymm6,YMMWORD[((192-128))+rsi]
- vmovdqu ymm7,YMMWORD[((224-128))+rsi]
- vmovdqu ymm8,YMMWORD[((256-128))+rsi]
-
- lea rbx,[192+rsp]
- vmovdqu ymm15,YMMWORD[$L$and_mask]
- jmp NEAR $L$OOP_GRANDE_SQR_1024
-
-ALIGN 32
-$L$OOP_GRANDE_SQR_1024:
- lea r9,[((576+128))+rsp]
- lea r12,[448+rsp]
-
-
-
-
- vpaddq ymm1,ymm1,ymm1
- vpbroadcastq ymm10,QWORD[((0-128))+rsi]
- vpaddq ymm2,ymm2,ymm2
- vmovdqa YMMWORD[(0-128)+r9],ymm1
- vpaddq ymm3,ymm3,ymm3
- vmovdqa YMMWORD[(32-128)+r9],ymm2
- vpaddq ymm4,ymm4,ymm4
- vmovdqa YMMWORD[(64-128)+r9],ymm3
- vpaddq ymm5,ymm5,ymm5
- vmovdqa YMMWORD[(96-128)+r9],ymm4
- vpaddq ymm6,ymm6,ymm6
- vmovdqa YMMWORD[(128-128)+r9],ymm5
- vpaddq ymm7,ymm7,ymm7
- vmovdqa YMMWORD[(160-128)+r9],ymm6
- vpaddq ymm8,ymm8,ymm8
- vmovdqa YMMWORD[(192-128)+r9],ymm7
- vpxor ymm9,ymm9,ymm9
- vmovdqa YMMWORD[(224-128)+r9],ymm8
-
- vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
- vpbroadcastq ymm11,QWORD[((32-128))+rsi]
- vmovdqu YMMWORD[(288-192)+rbx],ymm9
- vpmuludq ymm1,ymm1,ymm10
- vmovdqu YMMWORD[(320-448)+r12],ymm9
- vpmuludq ymm2,ymm2,ymm10
- vmovdqu YMMWORD[(352-448)+r12],ymm9
- vpmuludq ymm3,ymm3,ymm10
- vmovdqu YMMWORD[(384-448)+r12],ymm9
- vpmuludq ymm4,ymm4,ymm10
- vmovdqu YMMWORD[(416-448)+r12],ymm9
- vpmuludq ymm5,ymm5,ymm10
- vmovdqu YMMWORD[(448-448)+r12],ymm9
- vpmuludq ymm6,ymm6,ymm10
- vmovdqu YMMWORD[(480-448)+r12],ymm9
- vpmuludq ymm7,ymm7,ymm10
- vmovdqu YMMWORD[(512-448)+r12],ymm9
- vpmuludq ymm8,ymm8,ymm10
- vpbroadcastq ymm10,QWORD[((64-128))+rsi]
- vmovdqu YMMWORD[(544-448)+r12],ymm9
-
- mov r15,rsi
- mov r14d,4
- jmp NEAR $L$sqr_entry_1024
-ALIGN 32
-$L$OOP_SQR_1024:
- vpbroadcastq ymm11,QWORD[((32-128))+r15]
- vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
- vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx]
- vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9]
- vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx]
- vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9]
- vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx]
- vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9]
- vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx]
- vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9]
- vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx]
- vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9]
- vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx]
- vpmuludq ymm6,ymm10,YMMWORD[((160-128))+r9]
- vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx]
- vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9]
- vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx]
- vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm10,QWORD[((64-128))+r15]
- vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx]
-$L$sqr_entry_1024:
- vmovdqu YMMWORD[(0-192)+rbx],ymm0
- vmovdqu YMMWORD[(32-192)+rbx],ymm1
-
- vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9]
- vpaddq ymm3,ymm3,ymm14
- vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9]
- vpaddq ymm4,ymm4,ymm13
- vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9]
- vpaddq ymm6,ymm6,ymm14
- vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9]
- vpaddq ymm7,ymm7,ymm13
- vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9]
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm11,QWORD[((96-128))+r15]
- vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
-
- vmovdqu YMMWORD[(64-192)+rbx],ymm2
- vmovdqu YMMWORD[(96-192)+rbx],ymm3
-
- vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi]
- vpaddq ymm4,ymm4,ymm13
- vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm14,ymm10,YMMWORD[((96-128))+r9]
- vpaddq ymm6,ymm6,ymm14
- vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9]
- vpaddq ymm7,ymm7,ymm13
- vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9]
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
- vpaddq ymm0,ymm0,ymm14
- vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm10,QWORD[((128-128))+r15]
- vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
-
- vmovdqu YMMWORD[(128-192)+rbx],ymm4
- vmovdqu YMMWORD[(160-192)+rbx],ymm5
-
- vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi]
- vpaddq ymm6,ymm6,ymm12
- vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9]
- vpaddq ymm7,ymm7,ymm14
- vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9]
- vpaddq ymm8,ymm8,ymm13
- vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
- vpaddq ymm0,ymm0,ymm12
- vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
- vpaddq ymm1,ymm1,ymm14
- vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm11,QWORD[((160-128))+r15]
- vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
-
- vmovdqu YMMWORD[(192-192)+rbx],ymm6
- vmovdqu YMMWORD[(224-192)+rbx],ymm7
-
- vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi]
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9]
- vpaddq ymm0,ymm0,ymm14
- vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9]
- vpaddq ymm1,ymm1,ymm13
- vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm10,QWORD[((192-128))+r15]
- vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
-
- vmovdqu YMMWORD[(256-192)+rbx],ymm8
- vmovdqu YMMWORD[(288-192)+rbx],ymm0
- lea rbx,[8+rbx]
-
- vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi]
- vpaddq ymm1,ymm1,ymm13
- vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
- vpaddq ymm3,ymm3,ymm14
- vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm11,QWORD[((224-128))+r15]
- vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
-
- vmovdqu YMMWORD[(320-448)+r12],ymm1
- vmovdqu YMMWORD[(352-448)+r12],ymm2
-
- vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi]
- vpaddq ymm3,ymm3,ymm12
- vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
- vpbroadcastq ymm0,QWORD[((256-128))+r15]
- vpaddq ymm4,ymm4,ymm14
- vpmuludq ymm5,ymm10,YMMWORD[((224-128))+r9]
- vpbroadcastq ymm10,QWORD[((0+8-128))+r15]
- vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
-
- vmovdqu YMMWORD[(384-448)+r12],ymm3
- vmovdqu YMMWORD[(416-448)+r12],ymm4
- lea r15,[8+r15]
-
- vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9]
- vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
-
- vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi]
- vmovdqu YMMWORD[(448-448)+r12],ymm5
- vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
- vmovdqu YMMWORD[(480-448)+r12],ymm6
- vmovdqu YMMWORD[(512-448)+r12],ymm7
- lea r12,[8+r12]
-
- dec r14d
- jnz NEAR $L$OOP_SQR_1024
-
- vmovdqu ymm8,YMMWORD[256+rsp]
- vmovdqu ymm1,YMMWORD[288+rsp]
- vmovdqu ymm2,YMMWORD[320+rsp]
- lea rbx,[192+rsp]
-
- vpsrlq ymm14,ymm8,29
- vpand ymm8,ymm8,ymm15
- vpsrlq ymm11,ymm1,29
- vpand ymm1,ymm1,ymm15
-
- vpermq ymm14,ymm14,0x93
- vpxor ymm9,ymm9,ymm9
- vpermq ymm11,ymm11,0x93
-
- vpblendd ymm10,ymm14,ymm9,3
- vpblendd ymm14,ymm11,ymm14,3
- vpaddq ymm8,ymm8,ymm10
- vpblendd ymm11,ymm9,ymm11,3
- vpaddq ymm1,ymm1,ymm14
- vpaddq ymm2,ymm2,ymm11
- vmovdqu YMMWORD[(288-192)+rbx],ymm1
- vmovdqu YMMWORD[(320-192)+rbx],ymm2
-
- mov rax,QWORD[rsp]
- mov r10,QWORD[8+rsp]
- mov r11,QWORD[16+rsp]
- mov r12,QWORD[24+rsp]
- vmovdqu ymm1,YMMWORD[32+rsp]
- vmovdqu ymm2,YMMWORD[((64-192))+rbx]
- vmovdqu ymm3,YMMWORD[((96-192))+rbx]
- vmovdqu ymm4,YMMWORD[((128-192))+rbx]
- vmovdqu ymm5,YMMWORD[((160-192))+rbx]
- vmovdqu ymm6,YMMWORD[((192-192))+rbx]
- vmovdqu ymm7,YMMWORD[((224-192))+rbx]
-
- mov r9,rax
- imul eax,ecx
- and eax,0x1fffffff
- vmovd xmm12,eax
-
- mov rdx,rax
- imul rax,QWORD[((-128))+r13]
- vpbroadcastq ymm12,xmm12
- add r9,rax
- mov rax,rdx
- imul rax,QWORD[((8-128))+r13]
- shr r9,29
- add r10,rax
- mov rax,rdx
- imul rax,QWORD[((16-128))+r13]
- add r10,r9
- add r11,rax
- imul rdx,QWORD[((24-128))+r13]
- add r12,rdx
-
- mov rax,r10
- imul eax,ecx
- and eax,0x1fffffff
-
- mov r14d,9
- jmp NEAR $L$OOP_REDUCE_1024
-
-ALIGN 32
-$L$OOP_REDUCE_1024:
- vmovd xmm13,eax
- vpbroadcastq ymm13,xmm13
-
- vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13]
- mov rdx,rax
- imul rax,QWORD[((-128))+r13]
- vpaddq ymm1,ymm1,ymm10
- add r10,rax
- vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13]
- mov rax,rdx
- imul rax,QWORD[((8-128))+r13]
- vpaddq ymm2,ymm2,ymm14
- vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13]
-DB 0x67
- add r11,rax
-DB 0x67
- mov rax,rdx
- imul rax,QWORD[((16-128))+r13]
- shr r10,29
- vpaddq ymm3,ymm3,ymm11
- vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13]
- add r12,rax
- add r11,r10
- vpaddq ymm4,ymm4,ymm10
- vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13]
- mov rax,r11
- imul eax,ecx
- vpaddq ymm5,ymm5,ymm14
- vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13]
- and eax,0x1fffffff
- vpaddq ymm6,ymm6,ymm11
- vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13]
- vpaddq ymm7,ymm7,ymm10
- vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13]
- vmovd xmm12,eax
-
- vpaddq ymm8,ymm8,ymm14
-
- vpbroadcastq ymm12,xmm12
-
- vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13]
- vmovdqu ymm14,YMMWORD[((96-8-128))+r13]
- mov rdx,rax
- imul rax,QWORD[((-128))+r13]
- vpaddq ymm1,ymm1,ymm11
- vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13]
- vmovdqu ymm11,YMMWORD[((128-8-128))+r13]
- add r11,rax
- mov rax,rdx
- imul rax,QWORD[((8-128))+r13]
- vpaddq ymm2,ymm2,ymm10
- add rax,r12
- shr r11,29
- vpmuludq ymm14,ymm14,ymm13
- vmovdqu ymm10,YMMWORD[((160-8-128))+r13]
- add rax,r11
- vpaddq ymm3,ymm3,ymm14
- vpmuludq ymm11,ymm11,ymm13
- vmovdqu ymm14,YMMWORD[((192-8-128))+r13]
-DB 0x67
- mov r12,rax
- imul eax,ecx
- vpaddq ymm4,ymm4,ymm11
- vpmuludq ymm10,ymm10,ymm13
-DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
- and eax,0x1fffffff
- vpaddq ymm5,ymm5,ymm10
- vpmuludq ymm14,ymm14,ymm13
- vmovdqu ymm10,YMMWORD[((256-8-128))+r13]
- vpaddq ymm6,ymm6,ymm14
- vpmuludq ymm11,ymm11,ymm13
- vmovdqu ymm9,YMMWORD[((288-8-128))+r13]
- vmovd xmm0,eax
- imul rax,QWORD[((-128))+r13]
- vpaddq ymm7,ymm7,ymm11
- vpmuludq ymm10,ymm10,ymm13
- vmovdqu ymm14,YMMWORD[((32-16-128))+r13]
- vpbroadcastq ymm0,xmm0
- vpaddq ymm8,ymm8,ymm10
- vpmuludq ymm9,ymm9,ymm13
- vmovdqu ymm11,YMMWORD[((64-16-128))+r13]
- add r12,rax
-
- vmovdqu ymm13,YMMWORD[((32-24-128))+r13]
- vpmuludq ymm14,ymm14,ymm12
- vmovdqu ymm10,YMMWORD[((96-16-128))+r13]
- vpaddq ymm1,ymm1,ymm14
- vpmuludq ymm13,ymm13,ymm0
- vpmuludq ymm11,ymm11,ymm12
-DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
- vpaddq ymm13,ymm13,ymm1
- vpaddq ymm2,ymm2,ymm11
- vpmuludq ymm10,ymm10,ymm12
- vmovdqu ymm11,YMMWORD[((160-16-128))+r13]
-DB 0x67
- vmovq rax,xmm13
- vmovdqu YMMWORD[rsp],ymm13
- vpaddq ymm3,ymm3,ymm10
- vpmuludq ymm14,ymm14,ymm12
- vmovdqu ymm10,YMMWORD[((192-16-128))+r13]
- vpaddq ymm4,ymm4,ymm14
- vpmuludq ymm11,ymm11,ymm12
- vmovdqu ymm14,YMMWORD[((224-16-128))+r13]
- vpaddq ymm5,ymm5,ymm11
- vpmuludq ymm10,ymm10,ymm12
- vmovdqu ymm11,YMMWORD[((256-16-128))+r13]
- vpaddq ymm6,ymm6,ymm10
- vpmuludq ymm14,ymm14,ymm12
- shr r12,29
- vmovdqu ymm10,YMMWORD[((288-16-128))+r13]
- add rax,r12
- vpaddq ymm7,ymm7,ymm14
- vpmuludq ymm11,ymm11,ymm12
-
- mov r9,rax
- imul eax,ecx
- vpaddq ymm8,ymm8,ymm11
- vpmuludq ymm10,ymm10,ymm12
- and eax,0x1fffffff
- vmovd xmm12,eax
- vmovdqu ymm11,YMMWORD[((96-24-128))+r13]
-DB 0x67
- vpaddq ymm9,ymm9,ymm10
- vpbroadcastq ymm12,xmm12
-
- vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13]
- vmovdqu ymm10,YMMWORD[((128-24-128))+r13]
- mov rdx,rax
- imul rax,QWORD[((-128))+r13]
- mov r10,QWORD[8+rsp]
- vpaddq ymm1,ymm2,ymm14
- vpmuludq ymm11,ymm11,ymm0
- vmovdqu ymm14,YMMWORD[((160-24-128))+r13]
- add r9,rax
- mov rax,rdx
- imul rax,QWORD[((8-128))+r13]
-DB 0x67
- shr r9,29
- mov r11,QWORD[16+rsp]
- vpaddq ymm2,ymm3,ymm11
- vpmuludq ymm10,ymm10,ymm0
- vmovdqu ymm11,YMMWORD[((192-24-128))+r13]
- add r10,rax
- mov rax,rdx
- imul rax,QWORD[((16-128))+r13]
- vpaddq ymm3,ymm4,ymm10
- vpmuludq ymm14,ymm14,ymm0
- vmovdqu ymm10,YMMWORD[((224-24-128))+r13]
- imul rdx,QWORD[((24-128))+r13]
- add r11,rax
- lea rax,[r10*1+r9]
- vpaddq ymm4,ymm5,ymm14
- vpmuludq ymm11,ymm11,ymm0
- vmovdqu ymm14,YMMWORD[((256-24-128))+r13]
- mov r10,rax
- imul eax,ecx
- vpmuludq ymm10,ymm10,ymm0
- vpaddq ymm5,ymm6,ymm11
- vmovdqu ymm11,YMMWORD[((288-24-128))+r13]
- and eax,0x1fffffff
- vpaddq ymm6,ymm7,ymm10
- vpmuludq ymm14,ymm14,ymm0
- add rdx,QWORD[24+rsp]
- vpaddq ymm7,ymm8,ymm14
- vpmuludq ymm11,ymm11,ymm0
- vpaddq ymm8,ymm9,ymm11
- vmovq xmm9,r12
- mov r12,rdx
-
- dec r14d
- jnz NEAR $L$OOP_REDUCE_1024
- lea r12,[448+rsp]
- vpaddq ymm0,ymm13,ymm9
- vpxor ymm9,ymm9,ymm9
-
- vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
- vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
- vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
- vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
- vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
- vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
- vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
- vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
- vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12]
-
- vpsrlq ymm14,ymm0,29
- vpand ymm0,ymm0,ymm15
- vpsrlq ymm11,ymm1,29
- vpand ymm1,ymm1,ymm15
- vpsrlq ymm12,ymm2,29
- vpermq ymm14,ymm14,0x93
- vpand ymm2,ymm2,ymm15
- vpsrlq ymm13,ymm3,29
- vpermq ymm11,ymm11,0x93
- vpand ymm3,ymm3,ymm15
- vpermq ymm12,ymm12,0x93
-
- vpblendd ymm10,ymm14,ymm9,3
- vpermq ymm13,ymm13,0x93
- vpblendd ymm14,ymm11,ymm14,3
- vpaddq ymm0,ymm0,ymm10
- vpblendd ymm11,ymm12,ymm11,3
- vpaddq ymm1,ymm1,ymm14
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm2,ymm2,ymm11
- vpblendd ymm13,ymm9,ymm13,3
- vpaddq ymm3,ymm3,ymm12
- vpaddq ymm4,ymm4,ymm13
-
- vpsrlq ymm14,ymm0,29
- vpand ymm0,ymm0,ymm15
- vpsrlq ymm11,ymm1,29
- vpand ymm1,ymm1,ymm15
- vpsrlq ymm12,ymm2,29
- vpermq ymm14,ymm14,0x93
- vpand ymm2,ymm2,ymm15
- vpsrlq ymm13,ymm3,29
- vpermq ymm11,ymm11,0x93
- vpand ymm3,ymm3,ymm15
- vpermq ymm12,ymm12,0x93
-
- vpblendd ymm10,ymm14,ymm9,3
- vpermq ymm13,ymm13,0x93
- vpblendd ymm14,ymm11,ymm14,3
- vpaddq ymm0,ymm0,ymm10
- vpblendd ymm11,ymm12,ymm11,3
- vpaddq ymm1,ymm1,ymm14
- vmovdqu YMMWORD[(0-128)+rdi],ymm0
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm2,ymm2,ymm11
- vmovdqu YMMWORD[(32-128)+rdi],ymm1
- vpblendd ymm13,ymm9,ymm13,3
- vpaddq ymm3,ymm3,ymm12
- vmovdqu YMMWORD[(64-128)+rdi],ymm2
- vpaddq ymm4,ymm4,ymm13
- vmovdqu YMMWORD[(96-128)+rdi],ymm3
- vpsrlq ymm14,ymm4,29
- vpand ymm4,ymm4,ymm15
- vpsrlq ymm11,ymm5,29
- vpand ymm5,ymm5,ymm15
- vpsrlq ymm12,ymm6,29
- vpermq ymm14,ymm14,0x93
- vpand ymm6,ymm6,ymm15
- vpsrlq ymm13,ymm7,29
- vpermq ymm11,ymm11,0x93
- vpand ymm7,ymm7,ymm15
- vpsrlq ymm0,ymm8,29
- vpermq ymm12,ymm12,0x93
- vpand ymm8,ymm8,ymm15
- vpermq ymm13,ymm13,0x93
-
- vpblendd ymm10,ymm14,ymm9,3
- vpermq ymm0,ymm0,0x93
- vpblendd ymm14,ymm11,ymm14,3
- vpaddq ymm4,ymm4,ymm10
- vpblendd ymm11,ymm12,ymm11,3
- vpaddq ymm5,ymm5,ymm14
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm6,ymm6,ymm11
- vpblendd ymm13,ymm0,ymm13,3
- vpaddq ymm7,ymm7,ymm12
- vpaddq ymm8,ymm8,ymm13
-
- vpsrlq ymm14,ymm4,29
- vpand ymm4,ymm4,ymm15
- vpsrlq ymm11,ymm5,29
- vpand ymm5,ymm5,ymm15
- vpsrlq ymm12,ymm6,29
- vpermq ymm14,ymm14,0x93
- vpand ymm6,ymm6,ymm15
- vpsrlq ymm13,ymm7,29
- vpermq ymm11,ymm11,0x93
- vpand ymm7,ymm7,ymm15
- vpsrlq ymm0,ymm8,29
- vpermq ymm12,ymm12,0x93
- vpand ymm8,ymm8,ymm15
- vpermq ymm13,ymm13,0x93
-
- vpblendd ymm10,ymm14,ymm9,3
- vpermq ymm0,ymm0,0x93
- vpblendd ymm14,ymm11,ymm14,3
- vpaddq ymm4,ymm4,ymm10
- vpblendd ymm11,ymm12,ymm11,3
- vpaddq ymm5,ymm5,ymm14
- vmovdqu YMMWORD[(128-128)+rdi],ymm4
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm6,ymm6,ymm11
- vmovdqu YMMWORD[(160-128)+rdi],ymm5
- vpblendd ymm13,ymm0,ymm13,3
- vpaddq ymm7,ymm7,ymm12
- vmovdqu YMMWORD[(192-128)+rdi],ymm6
- vpaddq ymm8,ymm8,ymm13
- vmovdqu YMMWORD[(224-128)+rdi],ymm7
- vmovdqu YMMWORD[(256-128)+rdi],ymm8
-
- mov rsi,rdi
- dec r8d
- jne NEAR $L$OOP_GRANDE_SQR_1024
-
- vzeroall
- mov rax,rbp
-
-$L$sqr_1024_in_tail:
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$sqr_1024_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_rsaz_1024_sqr_avx2:
-global rsaz_1024_mul_avx2
-
-ALIGN 64
-rsaz_1024_mul_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_rsaz_1024_mul_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
-
-
-
- lea rax,[rsp]
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- vzeroupper
- lea rsp,[((-168))+rsp]
- vmovaps XMMWORD[(-216)+rax],xmm6
- vmovaps XMMWORD[(-200)+rax],xmm7
- vmovaps XMMWORD[(-184)+rax],xmm8
- vmovaps XMMWORD[(-168)+rax],xmm9
- vmovaps XMMWORD[(-152)+rax],xmm10
- vmovaps XMMWORD[(-136)+rax],xmm11
- vmovaps XMMWORD[(-120)+rax],xmm12
- vmovaps XMMWORD[(-104)+rax],xmm13
- vmovaps XMMWORD[(-88)+rax],xmm14
- vmovaps XMMWORD[(-72)+rax],xmm15
-$L$mul_1024_body:
- mov rbp,rax
-
- vzeroall
- mov r13,rdx
- sub rsp,64
-
-
-
-
-
-
-DB 0x67,0x67
- mov r15,rsi
- and r15,4095
- add r15,32*10
- shr r15,12
- mov r15,rsi
- cmovnz rsi,r13
- cmovnz r13,r15
-
- mov r15,rcx
- sub rsi,-128
- sub rcx,-128
- sub rdi,-128
-
- and r15,4095
- add r15,32*10
-DB 0x67,0x67
- shr r15,12
- jz NEAR $L$mul_1024_no_n_copy
-
-
-
-
-
- sub rsp,32*10
- vmovdqu ymm0,YMMWORD[((0-128))+rcx]
- and rsp,-512
- vmovdqu ymm1,YMMWORD[((32-128))+rcx]
- vmovdqu ymm2,YMMWORD[((64-128))+rcx]
- vmovdqu ymm3,YMMWORD[((96-128))+rcx]
- vmovdqu ymm4,YMMWORD[((128-128))+rcx]
- vmovdqu ymm5,YMMWORD[((160-128))+rcx]
- vmovdqu ymm6,YMMWORD[((192-128))+rcx]
- vmovdqu ymm7,YMMWORD[((224-128))+rcx]
- vmovdqu ymm8,YMMWORD[((256-128))+rcx]
- lea rcx,[((64+128))+rsp]
- vmovdqu YMMWORD[(0-128)+rcx],ymm0
- vpxor ymm0,ymm0,ymm0
- vmovdqu YMMWORD[(32-128)+rcx],ymm1
- vpxor ymm1,ymm1,ymm1
- vmovdqu YMMWORD[(64-128)+rcx],ymm2
- vpxor ymm2,ymm2,ymm2
- vmovdqu YMMWORD[(96-128)+rcx],ymm3
- vpxor ymm3,ymm3,ymm3
- vmovdqu YMMWORD[(128-128)+rcx],ymm4
- vpxor ymm4,ymm4,ymm4
- vmovdqu YMMWORD[(160-128)+rcx],ymm5
- vpxor ymm5,ymm5,ymm5
- vmovdqu YMMWORD[(192-128)+rcx],ymm6
- vpxor ymm6,ymm6,ymm6
- vmovdqu YMMWORD[(224-128)+rcx],ymm7
- vpxor ymm7,ymm7,ymm7
- vmovdqu YMMWORD[(256-128)+rcx],ymm8
- vmovdqa ymm8,ymm0
- vmovdqu YMMWORD[(288-128)+rcx],ymm9
-$L$mul_1024_no_n_copy:
- and rsp,-64
-
- mov rbx,QWORD[r13]
- vpbroadcastq ymm10,QWORD[r13]
- vmovdqu YMMWORD[rsp],ymm0
- xor r9,r9
-DB 0x67
- xor r10,r10
- xor r11,r11
- xor r12,r12
-
- vmovdqu ymm15,YMMWORD[$L$and_mask]
- mov r14d,9
- vmovdqu YMMWORD[(288-128)+rdi],ymm9
- jmp NEAR $L$oop_mul_1024
-
-ALIGN 32
-$L$oop_mul_1024:
- vpsrlq ymm9,ymm3,29
- mov rax,rbx
- imul rax,QWORD[((-128))+rsi]
- add rax,r9
- mov r10,rbx
- imul r10,QWORD[((8-128))+rsi]
- add r10,QWORD[8+rsp]
-
- mov r9,rax
- imul eax,r8d
- and eax,0x1fffffff
-
- mov r11,rbx
- imul r11,QWORD[((16-128))+rsi]
- add r11,QWORD[16+rsp]
-
- mov r12,rbx
- imul r12,QWORD[((24-128))+rsi]
- add r12,QWORD[24+rsp]
- vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi]
- vmovd xmm11,eax
- vpaddq ymm1,ymm1,ymm0
- vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi]
- vpbroadcastq ymm11,xmm11
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi]
- vpand ymm3,ymm3,ymm15
- vpaddq ymm3,ymm3,ymm13
- vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi]
- vpaddq ymm4,ymm4,ymm0
- vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi]
- vpaddq ymm6,ymm6,ymm13
- vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi]
- vpermq ymm9,ymm9,0x93
- vpaddq ymm7,ymm7,ymm0
- vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi]
- vpbroadcastq ymm10,QWORD[8+r13]
- vpaddq ymm8,ymm8,ymm12
-
- mov rdx,rax
- imul rax,QWORD[((-128))+rcx]
- add r9,rax
- mov rax,rdx
- imul rax,QWORD[((8-128))+rcx]
- add r10,rax
- mov rax,rdx
- imul rax,QWORD[((16-128))+rcx]
- add r11,rax
- shr r9,29
- imul rdx,QWORD[((24-128))+rcx]
- add r12,rdx
- add r10,r9
-
- vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx]
- vmovq rbx,xmm10
- vpaddq ymm1,ymm1,ymm13
- vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx]
- vpaddq ymm2,ymm2,ymm0
- vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx]
- vpaddq ymm3,ymm3,ymm12
- vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx]
- vpaddq ymm4,ymm4,ymm13
- vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx]
- vpaddq ymm5,ymm5,ymm0
- vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx]
- vpaddq ymm6,ymm6,ymm12
- vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx]
- vpblendd ymm12,ymm9,ymm14,3
- vpaddq ymm7,ymm7,ymm13
- vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx]
- vpaddq ymm3,ymm3,ymm12
- vpaddq ymm8,ymm8,ymm0
-
- mov rax,rbx
- imul rax,QWORD[((-128))+rsi]
- add r10,rax
- vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi]
- mov rax,rbx
- imul rax,QWORD[((8-128))+rsi]
- add r11,rax
- vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi]
-
- mov rax,r10
- vpblendd ymm9,ymm9,ymm14,0xfc
- imul eax,r8d
- vpaddq ymm4,ymm4,ymm9
- and eax,0x1fffffff
-
- imul rbx,QWORD[((16-128))+rsi]
- add r12,rbx
- vpmuludq ymm12,ymm12,ymm10
- vmovd xmm11,eax
- vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi]
- vpaddq ymm1,ymm1,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vpbroadcastq ymm11,xmm11
- vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi]
- vpaddq ymm2,ymm2,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi]
- vpaddq ymm3,ymm3,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi]
- vpaddq ymm4,ymm4,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi]
- vpaddq ymm5,ymm5,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi]
- vpaddq ymm6,ymm6,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vmovdqu ymm9,YMMWORD[((-8+288-128))+rsi]
- vpaddq ymm7,ymm7,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vpaddq ymm8,ymm8,ymm13
- vpmuludq ymm9,ymm9,ymm10
- vpbroadcastq ymm10,QWORD[16+r13]
-
- mov rdx,rax
- imul rax,QWORD[((-128))+rcx]
- add r10,rax
- vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx]
- mov rax,rdx
- imul rax,QWORD[((8-128))+rcx]
- add r11,rax
- vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx]
- shr r10,29
- imul rdx,QWORD[((16-128))+rcx]
- add r12,rdx
- add r11,r10
-
- vpmuludq ymm0,ymm0,ymm11
- vmovq rbx,xmm10
- vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx]
- vpaddq ymm1,ymm1,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx]
- vpaddq ymm3,ymm3,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx]
- vpaddq ymm4,ymm4,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx]
- vpaddq ymm6,ymm6,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx]
- vpaddq ymm7,ymm7,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vpaddq ymm9,ymm9,ymm13
-
- vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi]
- mov rax,rbx
- imul rax,QWORD[((-128))+rsi]
- add rax,r11
-
- vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi]
- mov r11,rax
- imul eax,r8d
- and eax,0x1fffffff
-
- imul rbx,QWORD[((8-128))+rsi]
- add r12,rbx
- vpmuludq ymm0,ymm0,ymm10
- vmovd xmm11,eax
- vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi]
- vpaddq ymm1,ymm1,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vpbroadcastq ymm11,xmm11
- vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi]
- vpaddq ymm3,ymm3,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi]
- vpaddq ymm4,ymm4,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vmovdqu ymm0,YMMWORD[((-16+224-128))+rsi]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi]
- vpaddq ymm6,ymm6,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi]
- vpaddq ymm7,ymm7,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vpbroadcastq ymm10,QWORD[24+r13]
- vpaddq ymm9,ymm9,ymm13
-
- vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx]
- mov rdx,rax
- imul rax,QWORD[((-128))+rcx]
- add r11,rax
- vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx]
- imul rdx,QWORD[((8-128))+rcx]
- add r12,rdx
- shr r11,29
-
- vpmuludq ymm0,ymm0,ymm11
- vmovq rbx,xmm10
- vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx]
- vpaddq ymm1,ymm1,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx]
- vpaddq ymm3,ymm3,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx]
- vpaddq ymm4,ymm4,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx]
- vpaddq ymm6,ymm6,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx]
- vpaddq ymm7,ymm7,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi]
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi]
- vpaddq ymm9,ymm9,ymm13
-
- add r12,r11
- imul rbx,QWORD[((-128))+rsi]
- add r12,rbx
-
- mov rax,r12
- imul eax,r8d
- and eax,0x1fffffff
-
- vpmuludq ymm0,ymm0,ymm10
- vmovd xmm11,eax
- vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi]
- vpaddq ymm1,ymm1,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vpbroadcastq ymm11,xmm11
- vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi]
- vpaddq ymm2,ymm2,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi]
- vpaddq ymm3,ymm3,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-24+192-128))+rsi]
- vpaddq ymm4,ymm4,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi]
- vpaddq ymm5,ymm5,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi]
- vpaddq ymm6,ymm6,ymm13
- vpmuludq ymm0,ymm0,ymm10
- vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi]
- vpaddq ymm7,ymm7,ymm0
- vpmuludq ymm12,ymm12,ymm10
- vpaddq ymm8,ymm8,ymm12
- vpmuludq ymm13,ymm13,ymm10
- vpbroadcastq ymm10,QWORD[32+r13]
- vpaddq ymm9,ymm9,ymm13
- add r13,32
-
- vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx]
- imul rax,QWORD[((-128))+rcx]
- add r12,rax
- shr r12,29
-
- vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx]
- vpmuludq ymm0,ymm0,ymm11
- vmovq rbx,xmm10
- vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx]
- vpaddq ymm0,ymm1,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu YMMWORD[rsp],ymm0
- vpaddq ymm1,ymm2,ymm12
- vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx]
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx]
- vpaddq ymm2,ymm3,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx]
- vpaddq ymm3,ymm4,ymm0
- vpmuludq ymm12,ymm12,ymm11
- vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx]
- vpaddq ymm4,ymm5,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx]
- vpaddq ymm5,ymm6,ymm13
- vpmuludq ymm0,ymm0,ymm11
- vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx]
- mov r9,r12
- vpaddq ymm6,ymm7,ymm0
- vpmuludq ymm12,ymm12,ymm11
- add r9,QWORD[rsp]
- vpaddq ymm7,ymm8,ymm12
- vpmuludq ymm13,ymm13,ymm11
- vmovq xmm12,r12
- vpaddq ymm8,ymm9,ymm13
-
- dec r14d
- jnz NEAR $L$oop_mul_1024
- vpaddq ymm0,ymm12,YMMWORD[rsp]
-
- vpsrlq ymm12,ymm0,29
- vpand ymm0,ymm0,ymm15
- vpsrlq ymm13,ymm1,29
- vpand ymm1,ymm1,ymm15
- vpsrlq ymm10,ymm2,29
- vpermq ymm12,ymm12,0x93
- vpand ymm2,ymm2,ymm15
- vpsrlq ymm11,ymm3,29
- vpermq ymm13,ymm13,0x93
- vpand ymm3,ymm3,ymm15
-
- vpblendd ymm9,ymm12,ymm14,3
- vpermq ymm10,ymm10,0x93
- vpblendd ymm12,ymm13,ymm12,3
- vpermq ymm11,ymm11,0x93
- vpaddq ymm0,ymm0,ymm9
- vpblendd ymm13,ymm10,ymm13,3
- vpaddq ymm1,ymm1,ymm12
- vpblendd ymm10,ymm11,ymm10,3
- vpaddq ymm2,ymm2,ymm13
- vpblendd ymm11,ymm14,ymm11,3
- vpaddq ymm3,ymm3,ymm10
- vpaddq ymm4,ymm4,ymm11
-
- vpsrlq ymm12,ymm0,29
- vpand ymm0,ymm0,ymm15
- vpsrlq ymm13,ymm1,29
- vpand ymm1,ymm1,ymm15
- vpsrlq ymm10,ymm2,29
- vpermq ymm12,ymm12,0x93
- vpand ymm2,ymm2,ymm15
- vpsrlq ymm11,ymm3,29
- vpermq ymm13,ymm13,0x93
- vpand ymm3,ymm3,ymm15
- vpermq ymm10,ymm10,0x93
-
- vpblendd ymm9,ymm12,ymm14,3
- vpermq ymm11,ymm11,0x93
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm0,ymm0,ymm9
- vpblendd ymm13,ymm10,ymm13,3
- vpaddq ymm1,ymm1,ymm12
- vpblendd ymm10,ymm11,ymm10,3
- vpaddq ymm2,ymm2,ymm13
- vpblendd ymm11,ymm14,ymm11,3
- vpaddq ymm3,ymm3,ymm10
- vpaddq ymm4,ymm4,ymm11
-
- vmovdqu YMMWORD[(0-128)+rdi],ymm0
- vmovdqu YMMWORD[(32-128)+rdi],ymm1
- vmovdqu YMMWORD[(64-128)+rdi],ymm2
- vmovdqu YMMWORD[(96-128)+rdi],ymm3
- vpsrlq ymm12,ymm4,29
- vpand ymm4,ymm4,ymm15
- vpsrlq ymm13,ymm5,29
- vpand ymm5,ymm5,ymm15
- vpsrlq ymm10,ymm6,29
- vpermq ymm12,ymm12,0x93
- vpand ymm6,ymm6,ymm15
- vpsrlq ymm11,ymm7,29
- vpermq ymm13,ymm13,0x93
- vpand ymm7,ymm7,ymm15
- vpsrlq ymm0,ymm8,29
- vpermq ymm10,ymm10,0x93
- vpand ymm8,ymm8,ymm15
- vpermq ymm11,ymm11,0x93
-
- vpblendd ymm9,ymm12,ymm14,3
- vpermq ymm0,ymm0,0x93
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm4,ymm4,ymm9
- vpblendd ymm13,ymm10,ymm13,3
- vpaddq ymm5,ymm5,ymm12
- vpblendd ymm10,ymm11,ymm10,3
- vpaddq ymm6,ymm6,ymm13
- vpblendd ymm11,ymm0,ymm11,3
- vpaddq ymm7,ymm7,ymm10
- vpaddq ymm8,ymm8,ymm11
-
- vpsrlq ymm12,ymm4,29
- vpand ymm4,ymm4,ymm15
- vpsrlq ymm13,ymm5,29
- vpand ymm5,ymm5,ymm15
- vpsrlq ymm10,ymm6,29
- vpermq ymm12,ymm12,0x93
- vpand ymm6,ymm6,ymm15
- vpsrlq ymm11,ymm7,29
- vpermq ymm13,ymm13,0x93
- vpand ymm7,ymm7,ymm15
- vpsrlq ymm0,ymm8,29
- vpermq ymm10,ymm10,0x93
- vpand ymm8,ymm8,ymm15
- vpermq ymm11,ymm11,0x93
-
- vpblendd ymm9,ymm12,ymm14,3
- vpermq ymm0,ymm0,0x93
- vpblendd ymm12,ymm13,ymm12,3
- vpaddq ymm4,ymm4,ymm9
- vpblendd ymm13,ymm10,ymm13,3
- vpaddq ymm5,ymm5,ymm12
- vpblendd ymm10,ymm11,ymm10,3
- vpaddq ymm6,ymm6,ymm13
- vpblendd ymm11,ymm0,ymm11,3
- vpaddq ymm7,ymm7,ymm10
- vpaddq ymm8,ymm8,ymm11
-
- vmovdqu YMMWORD[(128-128)+rdi],ymm4
- vmovdqu YMMWORD[(160-128)+rdi],ymm5
- vmovdqu YMMWORD[(192-128)+rdi],ymm6
- vmovdqu YMMWORD[(224-128)+rdi],ymm7
- vmovdqu YMMWORD[(256-128)+rdi],ymm8
- vzeroupper
-
- mov rax,rbp
-
-$L$mul_1024_in_tail:
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$mul_1024_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_rsaz_1024_mul_avx2:
-global rsaz_1024_red2norm_avx2
-
-ALIGN 32
-rsaz_1024_red2norm_avx2:
+global rsaz_avx2_eligible
- sub rdx,-128
- xor rax,rax
- mov r8,QWORD[((-128))+rdx]
- mov r9,QWORD[((-120))+rdx]
- mov r10,QWORD[((-112))+rdx]
- shl r8,0
- shl r9,29
- mov r11,r10
- shl r10,58
- shr r11,6
- add rax,r8
- add rax,r9
- add rax,r10
- adc r11,0
- mov QWORD[rcx],rax
- mov rax,r11
- mov r8,QWORD[((-104))+rdx]
- mov r9,QWORD[((-96))+rdx]
- shl r8,23
- mov r10,r9
- shl r9,52
- shr r10,12
- add rax,r8
- add rax,r9
- adc r10,0
- mov QWORD[8+rcx],rax
- mov rax,r10
- mov r11,QWORD[((-88))+rdx]
- mov r8,QWORD[((-80))+rdx]
- shl r11,17
- mov r9,r8
- shl r8,46
- shr r9,18
- add rax,r11
- add rax,r8
- adc r9,0
- mov QWORD[16+rcx],rax
- mov rax,r9
- mov r10,QWORD[((-72))+rdx]
- mov r11,QWORD[((-64))+rdx]
- shl r10,11
- mov r8,r11
- shl r11,40
- shr r8,24
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[24+rcx],rax
- mov rax,r8
- mov r9,QWORD[((-56))+rdx]
- mov r10,QWORD[((-48))+rdx]
- mov r11,QWORD[((-40))+rdx]
- shl r9,5
- shl r10,34
- mov r8,r11
- shl r11,63
- shr r8,1
- add rax,r9
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[32+rcx],rax
- mov rax,r8
- mov r9,QWORD[((-32))+rdx]
- mov r10,QWORD[((-24))+rdx]
- shl r9,28
- mov r11,r10
- shl r10,57
- shr r11,7
- add rax,r9
- add rax,r10
- adc r11,0
- mov QWORD[40+rcx],rax
- mov rax,r11
- mov r8,QWORD[((-16))+rdx]
- mov r9,QWORD[((-8))+rdx]
- shl r8,22
- mov r10,r9
- shl r9,51
- shr r10,13
- add rax,r8
- add rax,r9
- adc r10,0
- mov QWORD[48+rcx],rax
- mov rax,r10
- mov r11,QWORD[rdx]
- mov r8,QWORD[8+rdx]
- shl r11,16
- mov r9,r8
- shl r8,45
- shr r9,19
- add rax,r11
- add rax,r8
- adc r9,0
- mov QWORD[56+rcx],rax
- mov rax,r9
- mov r10,QWORD[16+rdx]
- mov r11,QWORD[24+rdx]
- shl r10,10
- mov r8,r11
- shl r11,39
- shr r8,25
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[64+rcx],rax
- mov rax,r8
- mov r9,QWORD[32+rdx]
- mov r10,QWORD[40+rdx]
- mov r11,QWORD[48+rdx]
- shl r9,4
- shl r10,33
- mov r8,r11
- shl r11,62
- shr r8,2
- add rax,r9
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[72+rcx],rax
- mov rax,r8
- mov r9,QWORD[56+rdx]
- mov r10,QWORD[64+rdx]
- shl r9,27
- mov r11,r10
- shl r10,56
- shr r11,8
- add rax,r9
- add rax,r10
- adc r11,0
- mov QWORD[80+rcx],rax
- mov rax,r11
- mov r8,QWORD[72+rdx]
- mov r9,QWORD[80+rdx]
- shl r8,21
- mov r10,r9
- shl r9,50
- shr r10,14
- add rax,r8
- add rax,r9
- adc r10,0
- mov QWORD[88+rcx],rax
- mov rax,r10
- mov r11,QWORD[88+rdx]
- mov r8,QWORD[96+rdx]
- shl r11,15
- mov r9,r8
- shl r8,44
- shr r9,20
- add rax,r11
- add rax,r8
- adc r9,0
- mov QWORD[96+rcx],rax
- mov rax,r9
- mov r10,QWORD[104+rdx]
- mov r11,QWORD[112+rdx]
- shl r10,9
- mov r8,r11
- shl r11,38
- shr r8,26
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[104+rcx],rax
- mov rax,r8
- mov r9,QWORD[120+rdx]
- mov r10,QWORD[128+rdx]
- mov r11,QWORD[136+rdx]
- shl r9,3
- shl r10,32
- mov r8,r11
- shl r11,61
- shr r8,3
- add rax,r9
- add rax,r10
- add rax,r11
- adc r8,0
- mov QWORD[112+rcx],rax
- mov rax,r8
- mov r9,QWORD[144+rdx]
- mov r10,QWORD[152+rdx]
- shl r9,26
- mov r11,r10
- shl r10,55
- shr r11,9
- add rax,r9
- add rax,r10
- adc r11,0
- mov QWORD[120+rcx],rax
- mov rax,r11
+rsaz_avx2_eligible:
+ xor eax,eax
DB 0F3h,0C3h ;repret
-
+global rsaz_1024_sqr_avx2
+global rsaz_1024_mul_avx2
global rsaz_1024_norm2red_avx2
-
-ALIGN 32
-rsaz_1024_norm2red_avx2:
-
- sub rcx,-128
- mov r8,QWORD[rdx]
- mov eax,0x1fffffff
- mov r9,QWORD[8+rdx]
- mov r11,r8
- shr r11,0
- and r11,rax
- mov QWORD[((-128))+rcx],r11
- mov r10,r8
- shr r10,29
- and r10,rax
- mov QWORD[((-120))+rcx],r10
- shrd r8,r9,58
- and r8,rax
- mov QWORD[((-112))+rcx],r8
- mov r10,QWORD[16+rdx]
- mov r8,r9
- shr r8,23
- and r8,rax
- mov QWORD[((-104))+rcx],r8
- shrd r9,r10,52
- and r9,rax
- mov QWORD[((-96))+rcx],r9
- mov r11,QWORD[24+rdx]
- mov r9,r10
- shr r9,17
- and r9,rax
- mov QWORD[((-88))+rcx],r9
- shrd r10,r11,46
- and r10,rax
- mov QWORD[((-80))+rcx],r10
- mov r8,QWORD[32+rdx]
- mov r10,r11
- shr r10,11
- and r10,rax
- mov QWORD[((-72))+rcx],r10
- shrd r11,r8,40
- and r11,rax
- mov QWORD[((-64))+rcx],r11
- mov r9,QWORD[40+rdx]
- mov r11,r8
- shr r11,5
- and r11,rax
- mov QWORD[((-56))+rcx],r11
- mov r10,r8
- shr r10,34
- and r10,rax
- mov QWORD[((-48))+rcx],r10
- shrd r8,r9,63
- and r8,rax
- mov QWORD[((-40))+rcx],r8
- mov r10,QWORD[48+rdx]
- mov r8,r9
- shr r8,28
- and r8,rax
- mov QWORD[((-32))+rcx],r8
- shrd r9,r10,57
- and r9,rax
- mov QWORD[((-24))+rcx],r9
- mov r11,QWORD[56+rdx]
- mov r9,r10
- shr r9,22
- and r9,rax
- mov QWORD[((-16))+rcx],r9
- shrd r10,r11,51
- and r10,rax
- mov QWORD[((-8))+rcx],r10
- mov r8,QWORD[64+rdx]
- mov r10,r11
- shr r10,16
- and r10,rax
- mov QWORD[rcx],r10
- shrd r11,r8,45
- and r11,rax
- mov QWORD[8+rcx],r11
- mov r9,QWORD[72+rdx]
- mov r11,r8
- shr r11,10
- and r11,rax
- mov QWORD[16+rcx],r11
- shrd r8,r9,39
- and r8,rax
- mov QWORD[24+rcx],r8
- mov r10,QWORD[80+rdx]
- mov r8,r9
- shr r8,4
- and r8,rax
- mov QWORD[32+rcx],r8
- mov r11,r9
- shr r11,33
- and r11,rax
- mov QWORD[40+rcx],r11
- shrd r9,r10,62
- and r9,rax
- mov QWORD[48+rcx],r9
- mov r11,QWORD[88+rdx]
- mov r9,r10
- shr r9,27
- and r9,rax
- mov QWORD[56+rcx],r9
- shrd r10,r11,56
- and r10,rax
- mov QWORD[64+rcx],r10
- mov r8,QWORD[96+rdx]
- mov r10,r11
- shr r10,21
- and r10,rax
- mov QWORD[72+rcx],r10
- shrd r11,r8,50
- and r11,rax
- mov QWORD[80+rcx],r11
- mov r9,QWORD[104+rdx]
- mov r11,r8
- shr r11,15
- and r11,rax
- mov QWORD[88+rcx],r11
- shrd r8,r9,44
- and r8,rax
- mov QWORD[96+rcx],r8
- mov r10,QWORD[112+rdx]
- mov r8,r9
- shr r8,9
- and r8,rax
- mov QWORD[104+rcx],r8
- shrd r9,r10,38
- and r9,rax
- mov QWORD[112+rcx],r9
- mov r11,QWORD[120+rdx]
- mov r9,r10
- shr r9,3
- and r9,rax
- mov QWORD[120+rcx],r9
- mov r8,r10
- shr r8,32
- and r8,rax
- mov QWORD[128+rcx],r8
- shrd r10,r11,61
- and r10,rax
- mov QWORD[136+rcx],r10
- xor r8,r8
- mov r10,r11
- shr r10,26
- and r10,rax
- mov QWORD[144+rcx],r10
- shrd r11,r8,55
- and r11,rax
- mov QWORD[152+rcx],r11
- mov QWORD[160+rcx],r8
- mov QWORD[168+rcx],r8
- mov QWORD[176+rcx],r8
- mov QWORD[184+rcx],r8
- DB 0F3h,0C3h ;repret
-
-
+global rsaz_1024_red2norm_avx2
global rsaz_1024_scatter5_avx2
-
-ALIGN 32
-rsaz_1024_scatter5_avx2:
-
- vzeroupper
- vmovdqu ymm5,YMMWORD[$L$scatter_permd]
- shl r8d,4
- lea rcx,[r8*1+rcx]
- mov eax,9
- jmp NEAR $L$oop_scatter_1024
-
-ALIGN 32
-$L$oop_scatter_1024:
- vmovdqu ymm0,YMMWORD[rdx]
- lea rdx,[32+rdx]
- vpermd ymm0,ymm5,ymm0
- vmovdqu XMMWORD[rcx],xmm0
- lea rcx,[512+rcx]
- dec eax
- jnz NEAR $L$oop_scatter_1024
-
- vzeroupper
- DB 0F3h,0C3h ;repret
-
-
-
global rsaz_1024_gather5_avx2
-ALIGN 32
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
-
- vzeroupper
- mov r11,rsp
-
- lea rax,[((-136))+rsp]
-$L$SEH_begin_rsaz_1024_gather5:
-
-DB 0x48,0x8d,0x60,0xe0
-DB 0xc5,0xf8,0x29,0x70,0xe0
-DB 0xc5,0xf8,0x29,0x78,0xf0
-DB 0xc5,0x78,0x29,0x40,0x00
-DB 0xc5,0x78,0x29,0x48,0x10
-DB 0xc5,0x78,0x29,0x50,0x20
-DB 0xc5,0x78,0x29,0x58,0x30
-DB 0xc5,0x78,0x29,0x60,0x40
-DB 0xc5,0x78,0x29,0x68,0x50
-DB 0xc5,0x78,0x29,0x70,0x60
-DB 0xc5,0x78,0x29,0x78,0x70
- lea rsp,[((-256))+rsp]
- and rsp,-32
- lea r10,[$L$inc]
- lea rax,[((-128))+rsp]
-
- vmovd xmm4,r8d
- vmovdqa ymm0,YMMWORD[r10]
- vmovdqa ymm1,YMMWORD[32+r10]
- vmovdqa ymm5,YMMWORD[64+r10]
- vpbroadcastd ymm4,xmm4
-
- vpaddd ymm2,ymm0,ymm5
- vpcmpeqd ymm0,ymm0,ymm4
- vpaddd ymm3,ymm1,ymm5
- vpcmpeqd ymm1,ymm1,ymm4
- vmovdqa YMMWORD[(0+128)+rax],ymm0
- vpaddd ymm0,ymm2,ymm5
- vpcmpeqd ymm2,ymm2,ymm4
- vmovdqa YMMWORD[(32+128)+rax],ymm1
- vpaddd ymm1,ymm3,ymm5
- vpcmpeqd ymm3,ymm3,ymm4
- vmovdqa YMMWORD[(64+128)+rax],ymm2
- vpaddd ymm2,ymm0,ymm5
- vpcmpeqd ymm0,ymm0,ymm4
- vmovdqa YMMWORD[(96+128)+rax],ymm3
- vpaddd ymm3,ymm1,ymm5
- vpcmpeqd ymm1,ymm1,ymm4
- vmovdqa YMMWORD[(128+128)+rax],ymm0
- vpaddd ymm8,ymm2,ymm5
- vpcmpeqd ymm2,ymm2,ymm4
- vmovdqa YMMWORD[(160+128)+rax],ymm1
- vpaddd ymm9,ymm3,ymm5
- vpcmpeqd ymm3,ymm3,ymm4
- vmovdqa YMMWORD[(192+128)+rax],ymm2
- vpaddd ymm10,ymm8,ymm5
- vpcmpeqd ymm8,ymm8,ymm4
- vmovdqa YMMWORD[(224+128)+rax],ymm3
- vpaddd ymm11,ymm9,ymm5
- vpcmpeqd ymm9,ymm9,ymm4
- vpaddd ymm12,ymm10,ymm5
- vpcmpeqd ymm10,ymm10,ymm4
- vpaddd ymm13,ymm11,ymm5
- vpcmpeqd ymm11,ymm11,ymm4
- vpaddd ymm14,ymm12,ymm5
- vpcmpeqd ymm12,ymm12,ymm4
- vpaddd ymm15,ymm13,ymm5
- vpcmpeqd ymm13,ymm13,ymm4
- vpcmpeqd ymm14,ymm14,ymm4
- vpcmpeqd ymm15,ymm15,ymm4
-
- vmovdqa ymm7,YMMWORD[((-32))+r10]
- lea rdx,[128+rdx]
- mov r8d,9
-
-$L$oop_gather_1024:
- vmovdqa ymm0,YMMWORD[((0-128))+rdx]
- vmovdqa ymm1,YMMWORD[((32-128))+rdx]
- vmovdqa ymm2,YMMWORD[((64-128))+rdx]
- vmovdqa ymm3,YMMWORD[((96-128))+rdx]
- vpand ymm0,ymm0,YMMWORD[((0+128))+rax]
- vpand ymm1,ymm1,YMMWORD[((32+128))+rax]
- vpand ymm2,ymm2,YMMWORD[((64+128))+rax]
- vpor ymm4,ymm1,ymm0
- vpand ymm3,ymm3,YMMWORD[((96+128))+rax]
- vmovdqa ymm0,YMMWORD[((128-128))+rdx]
- vmovdqa ymm1,YMMWORD[((160-128))+rdx]
- vpor ymm5,ymm3,ymm2
- vmovdqa ymm2,YMMWORD[((192-128))+rdx]
- vmovdqa ymm3,YMMWORD[((224-128))+rdx]
- vpand ymm0,ymm0,YMMWORD[((128+128))+rax]
- vpand ymm1,ymm1,YMMWORD[((160+128))+rax]
- vpand ymm2,ymm2,YMMWORD[((192+128))+rax]
- vpor ymm4,ymm4,ymm0
- vpand ymm3,ymm3,YMMWORD[((224+128))+rax]
- vpand ymm0,ymm8,YMMWORD[((256-128))+rdx]
- vpor ymm5,ymm5,ymm1
- vpand ymm1,ymm9,YMMWORD[((288-128))+rdx]
- vpor ymm4,ymm4,ymm2
- vpand ymm2,ymm10,YMMWORD[((320-128))+rdx]
- vpor ymm5,ymm5,ymm3
- vpand ymm3,ymm11,YMMWORD[((352-128))+rdx]
- vpor ymm4,ymm4,ymm0
- vpand ymm0,ymm12,YMMWORD[((384-128))+rdx]
- vpor ymm5,ymm5,ymm1
- vpand ymm1,ymm13,YMMWORD[((416-128))+rdx]
- vpor ymm4,ymm4,ymm2
- vpand ymm2,ymm14,YMMWORD[((448-128))+rdx]
- vpor ymm5,ymm5,ymm3
- vpand ymm3,ymm15,YMMWORD[((480-128))+rdx]
- lea rdx,[512+rdx]
- vpor ymm4,ymm4,ymm0
- vpor ymm5,ymm5,ymm1
- vpor ymm4,ymm4,ymm2
- vpor ymm5,ymm5,ymm3
-
- vpor ymm4,ymm4,ymm5
- vextracti128 xmm5,ymm4,1
- vpor xmm5,xmm5,xmm4
- vpermd ymm5,ymm7,ymm5
- vmovdqu YMMWORD[rcx],ymm5
- lea rcx,[32+rcx]
- dec r8d
- jnz NEAR $L$oop_gather_1024
-
- vpxor ymm0,ymm0,ymm0
- vmovdqu YMMWORD[rcx],ymm0
- vzeroupper
- movaps xmm6,XMMWORD[((-168))+r11]
- movaps xmm7,XMMWORD[((-152))+r11]
- movaps xmm8,XMMWORD[((-136))+r11]
- movaps xmm9,XMMWORD[((-120))+r11]
- movaps xmm10,XMMWORD[((-104))+r11]
- movaps xmm11,XMMWORD[((-88))+r11]
- movaps xmm12,XMMWORD[((-72))+r11]
- movaps xmm13,XMMWORD[((-56))+r11]
- movaps xmm14,XMMWORD[((-40))+r11]
- movaps xmm15,XMMWORD[((-24))+r11]
- lea rsp,[r11]
-
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_rsaz_1024_gather5:
-
-EXTERN OPENSSL_ia32cap_P
-global rsaz_avx2_eligible
-
-ALIGN 32
-rsaz_avx2_eligible:
- mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
- mov ecx,524544
- mov edx,0
- and ecx,eax
- cmp ecx,524544
- cmove eax,edx
- and eax,32
- shr eax,5
+DB 0x0f,0x0b
DB 0F3h,0C3h ;repret
-
-ALIGN 64
-$L$and_mask:
- DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
-$L$scatter_permd:
- DD 0,2,4,6,7,7,7,7
-$L$gather_permd:
- DD 0,7,1,7,2,7,3,7
-$L$inc:
- DD 0,0,0,0,1,1,1,1
- DD 2,2,2,2,3,3,3,3
- DD 4,4,4,4,4,4,4,4
-ALIGN 64
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-rsaz_se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$common_seh_tail
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$common_seh_tail
-
- mov rbp,QWORD[160+r8]
-
- mov r10d,DWORD[8+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- cmovc rax,rbp
-
- mov r15,QWORD[((-48))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r12,QWORD[((-24))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov rbx,QWORD[((-8))+rax]
- mov QWORD[240+r8],r15
- mov QWORD[232+r8],r14
- mov QWORD[224+r8],r13
- mov QWORD[216+r8],r12
- mov QWORD[160+r8],rbp
- mov QWORD[144+r8],rbx
-
- lea rsi,[((-216))+rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
-
-$L$common_seh_tail:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- DB 0F3h,0C3h ;repret
-
-
-section .pdata rdata align=4
-ALIGN 4
- DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase
- DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase
- DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase
-
- DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase
- DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase
- DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase
-
- DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase
- DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase
- DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase
-section .xdata rdata align=8
-ALIGN 8
-$L$SEH_info_rsaz_1024_sqr_avx2:
-DB 9,0,0,0
- DD rsaz_se_handler wrt ..imagebase
- DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase
- DD 0
-$L$SEH_info_rsaz_1024_mul_avx2:
-DB 9,0,0,0
- DD rsaz_se_handler wrt ..imagebase
- DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase
- DD 0
-$L$SEH_info_rsaz_1024_gather5:
-DB 0x01,0x36,0x17,0x0b
-DB 0x36,0xf8,0x09,0x00
-DB 0x31,0xe8,0x08,0x00
-DB 0x2c,0xd8,0x07,0x00
-DB 0x27,0xc8,0x06,0x00
-DB 0x22,0xb8,0x05,0x00
-DB 0x1d,0xa8,0x04,0x00
-DB 0x18,0x98,0x03,0x00
-DB 0x13,0x88,0x02,0x00
-DB 0x0e,0x78,0x01,0x00
-DB 0x09,0x68,0x00,0x00
-DB 0x04,0x01,0x15,0x00
-DB 0x00,0xb3,0x00,0x00
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm
index fc15281fa46..f8e4aa1c20e 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-avx512.nasm
@@ -2,1030 +2,23 @@ default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
-EXTERN OPENSSL_ia32cap_P
+section .text code align=64
+
+
global ossl_rsaz_avx512ifma_eligible
-ALIGN 32
ossl_rsaz_avx512ifma_eligible:
- mov ecx,DWORD[((OPENSSL_ia32cap_P+8))]
xor eax,eax
- and ecx,2149777408
- cmp ecx,2149777408
- cmove eax,ecx
DB 0F3h,0C3h ;repret
-section .text code align=64
-
global ossl_rsaz_amm52x20_x1_256
-
-ALIGN 32
-ossl_rsaz_amm52x20_x1_256:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
-
-
-
-DB 243,15,30,250
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$rsaz_amm52x20_x1_256_body:
-
-
- vpxord ymm0,ymm0,ymm0
- vmovdqa64 ymm1,ymm0
- vmovdqa64 ymm16,ymm0
- vmovdqa64 ymm17,ymm0
- vmovdqa64 ymm18,ymm0
- vmovdqa64 ymm19,ymm0
-
- xor r9d,r9d
-
- mov r11,rdx
- mov rax,0xfffffffffffff
-
-
- mov ebx,5
-
-ALIGN 32
-$L$loop5:
- mov r13,QWORD[r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[rsi]
- mulx r12,r13,r13
- add r9,r13
- mov r10,r12
- adc r10,0
-
- mov r13,r8
- imul r13,r9
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[rcx]
- mulx r12,r13,r13
- add r9,r13
- adc r10,r12
-
- shr r9,52
- sal r10,12
- or r9,r10
-
- vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
-
-
- valignq ymm1,ymm16,ymm1,1
- valignq ymm16,ymm17,ymm16,1
- valignq ymm17,ymm18,ymm17,1
- valignq ymm18,ymm19,ymm18,1
- valignq ymm19,ymm0,ymm19,1
-
- vmovq r13,xmm1
- add r9,r13
-
- vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
- mov r13,QWORD[8+r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[rsi]
- mulx r12,r13,r13
- add r9,r13
- mov r10,r12
- adc r10,0
-
- mov r13,r8
- imul r13,r9
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[rcx]
- mulx r12,r13,r13
- add r9,r13
- adc r10,r12
-
- shr r9,52
- sal r10,12
- or r9,r10
-
- vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
-
-
- valignq ymm1,ymm16,ymm1,1
- valignq ymm16,ymm17,ymm16,1
- valignq ymm17,ymm18,ymm17,1
- valignq ymm18,ymm19,ymm18,1
- valignq ymm19,ymm0,ymm19,1
-
- vmovq r13,xmm1
- add r9,r13
-
- vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
- mov r13,QWORD[16+r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[rsi]
- mulx r12,r13,r13
- add r9,r13
- mov r10,r12
- adc r10,0
-
- mov r13,r8
- imul r13,r9
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[rcx]
- mulx r12,r13,r13
- add r9,r13
- adc r10,r12
-
- shr r9,52
- sal r10,12
- or r9,r10
-
- vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
-
-
- valignq ymm1,ymm16,ymm1,1
- valignq ymm16,ymm17,ymm16,1
- valignq ymm17,ymm18,ymm17,1
- valignq ymm18,ymm19,ymm18,1
- valignq ymm19,ymm0,ymm19,1
-
- vmovq r13,xmm1
- add r9,r13
-
- vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
- mov r13,QWORD[24+r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[rsi]
- mulx r12,r13,r13
- add r9,r13
- mov r10,r12
- adc r10,0
-
- mov r13,r8
- imul r13,r9
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[rcx]
- mulx r12,r13,r13
- add r9,r13
- adc r10,r12
-
- shr r9,52
- sal r10,12
- or r9,r10
-
- vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
-
-
- valignq ymm1,ymm16,ymm1,1
- valignq ymm16,ymm17,ymm16,1
- valignq ymm17,ymm18,ymm17,1
- valignq ymm18,ymm19,ymm18,1
- valignq ymm19,ymm0,ymm19,1
-
- vmovq r13,xmm1
- add r9,r13
-
- vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
- lea r11,[32+r11]
- dec ebx
- jne NEAR $L$loop5
-
- vmovdqa64 ymm4,YMMWORD[$L$mask52x4]
-
- vpbroadcastq ymm3,r9
- vpblendd ymm1,ymm1,ymm3,3
-
-
-
- vpsrlq ymm24,ymm1,52
- vpsrlq ymm25,ymm16,52
- vpsrlq ymm26,ymm17,52
- vpsrlq ymm27,ymm18,52
- vpsrlq ymm28,ymm19,52
-
-
- valignq ymm28,ymm28,ymm27,3
- valignq ymm27,ymm27,ymm26,3
- valignq ymm26,ymm26,ymm25,3
- valignq ymm25,ymm25,ymm24,3
- valignq ymm24,ymm24,ymm0,3
-
-
- vpandq ymm1,ymm1,ymm4
- vpandq ymm16,ymm16,ymm4
- vpandq ymm17,ymm17,ymm4
- vpandq ymm18,ymm18,ymm4
- vpandq ymm19,ymm19,ymm4
-
-
- vpaddq ymm1,ymm1,ymm24
- vpaddq ymm16,ymm16,ymm25
- vpaddq ymm17,ymm17,ymm26
- vpaddq ymm18,ymm18,ymm27
- vpaddq ymm19,ymm19,ymm28
-
-
-
- vpcmpuq k1,ymm4,ymm1,1
- vpcmpuq k2,ymm4,ymm16,1
- vpcmpuq k3,ymm4,ymm17,1
- vpcmpuq k4,ymm4,ymm18,1
- vpcmpuq k5,ymm4,ymm19,1
- kmovb r14d,k1
- kmovb r13d,k2
- kmovb r12d,k3
- kmovb r11d,k4
- kmovb r10d,k5
-
-
- vpcmpuq k1,ymm4,ymm1,0
- vpcmpuq k2,ymm4,ymm16,0
- vpcmpuq k3,ymm4,ymm17,0
- vpcmpuq k4,ymm4,ymm18,0
- vpcmpuq k5,ymm4,ymm19,0
- kmovb r9d,k1
- kmovb r8d,k2
- kmovb ebx,k3
- kmovb ecx,k4
- kmovb edx,k5
-
-
-
- shl r13b,4
- or r14b,r13b
- shl r11b,4
- or r12b,r11b
-
- add r14b,r14b
- adc r12b,r12b
- adc r10b,r10b
-
- shl r8b,4
- or r9b,r8b
- shl cl,4
- or bl,cl
-
- add r14b,r9b
- adc r12b,bl
- adc r10b,dl
-
- xor r14b,r9b
- xor r12b,bl
- xor r10b,dl
-
- kmovb k1,r14d
- shr r14b,4
- kmovb k2,r14d
- kmovb k3,r12d
- shr r12b,4
- kmovb k4,r12d
- kmovb k5,r10d
-
-
- vpsubq ymm1{k1},ymm1,ymm4
- vpsubq ymm16{k2},ymm16,ymm4
- vpsubq ymm17{k3},ymm17,ymm4
- vpsubq ymm18{k4},ymm18,ymm4
- vpsubq ymm19{k5},ymm19,ymm4
-
- vpandq ymm1,ymm1,ymm4
- vpandq ymm16,ymm16,ymm4
- vpandq ymm17,ymm17,ymm4
- vpandq ymm18,ymm18,ymm4
- vpandq ymm19,ymm19,ymm4
-
- vmovdqu64 YMMWORD[rdi],ymm1
- vmovdqu64 YMMWORD[32+rdi],ymm16
- vmovdqu64 YMMWORD[64+rdi],ymm17
- vmovdqu64 YMMWORD[96+rdi],ymm18
- vmovdqu64 YMMWORD[128+rdi],ymm19
-
- vzeroupper
- mov r15,QWORD[rsp]
-
- mov r14,QWORD[8+rsp]
-
- mov r13,QWORD[16+rsp]
-
- mov r12,QWORD[24+rsp]
-
- mov rbp,QWORD[32+rsp]
-
- mov rbx,QWORD[40+rsp]
-
- lea rsp,[48+rsp]
-
-$L$rsaz_amm52x20_x1_256_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ossl_rsaz_amm52x20_x1_256:
-section .data data align=8
-
-ALIGN 32
-$L$mask52x4:
- DQ 0xfffffffffffff
- DQ 0xfffffffffffff
- DQ 0xfffffffffffff
- DQ 0xfffffffffffff
-section .text code align=64
-
-
global ossl_rsaz_amm52x20_x2_256
-
-ALIGN 32
-ossl_rsaz_amm52x20_x2_256:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
-
-
-
-DB 243,15,30,250
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$rsaz_amm52x20_x2_256_body:
-
-
- vpxord ymm0,ymm0,ymm0
- vmovdqa64 ymm1,ymm0
- vmovdqa64 ymm16,ymm0
- vmovdqa64 ymm17,ymm0
- vmovdqa64 ymm18,ymm0
- vmovdqa64 ymm19,ymm0
- vmovdqa64 ymm2,ymm0
- vmovdqa64 ymm20,ymm0
- vmovdqa64 ymm21,ymm0
- vmovdqa64 ymm22,ymm0
- vmovdqa64 ymm23,ymm0
-
- xor r9d,r9d
- xor r15d,r15d
-
- mov r11,rdx
- mov rax,0xfffffffffffff
-
- mov ebx,20
-
-ALIGN 32
-$L$loop20:
- mov r13,QWORD[r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[rsi]
- mulx r12,r13,r13
- add r9,r13
- mov r10,r12
- adc r10,0
-
- mov r13,QWORD[r8]
- imul r13,r9
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[rcx]
- mulx r12,r13,r13
- add r9,r13
- adc r10,r12
-
- shr r9,52
- sal r10,12
- or r9,r10
-
- vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]
-
-
- valignq ymm1,ymm16,ymm1,1
- valignq ymm16,ymm17,ymm16,1
- valignq ymm17,ymm18,ymm17,1
- valignq ymm18,ymm19,ymm18,1
- valignq ymm19,ymm0,ymm19,1
-
- vmovq r13,xmm1
- add r9,r13
-
- vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
- vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
- vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
- vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
- vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]
-
- vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
- vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
- vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
- vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
- vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
- mov r13,QWORD[160+r11]
-
- vpbroadcastq ymm3,r13
- mov rdx,QWORD[160+rsi]
- mulx r12,r13,r13
- add r15,r13
- mov r10,r12
- adc r10,0
-
- mov r13,QWORD[8+r8]
- imul r13,r15
- and r13,rax
-
- vpbroadcastq ymm4,r13
- mov rdx,QWORD[160+rcx]
- mulx r12,r13,r13
- add r15,r13
- adc r10,r12
-
- shr r15,52
- sal r10,12
- or r15,r10
-
- vpmadd52luq ymm2,ymm3,YMMWORD[160+rsi]
- vpmadd52luq ymm20,ymm3,YMMWORD[192+rsi]
- vpmadd52luq ymm21,ymm3,YMMWORD[224+rsi]
- vpmadd52luq ymm22,ymm3,YMMWORD[256+rsi]
- vpmadd52luq ymm23,ymm3,YMMWORD[288+rsi]
-
- vpmadd52luq ymm2,ymm4,YMMWORD[160+rcx]
- vpmadd52luq ymm20,ymm4,YMMWORD[192+rcx]
- vpmadd52luq ymm21,ymm4,YMMWORD[224+rcx]
- vpmadd52luq ymm22,ymm4,YMMWORD[256+rcx]
- vpmadd52luq ymm23,ymm4,YMMWORD[288+rcx]
-
-
- valignq ymm2,ymm20,ymm2,1
- valignq ymm20,ymm21,ymm20,1
- valignq ymm21,ymm22,ymm21,1
- valignq ymm22,ymm23,ymm22,1
- valignq ymm23,ymm0,ymm23,1
-
- vmovq r13,xmm2
- add r15,r13
-
- vpmadd52huq ymm2,ymm3,YMMWORD[160+rsi]
- vpmadd52huq ymm20,ymm3,YMMWORD[192+rsi]
- vpmadd52huq ymm21,ymm3,YMMWORD[224+rsi]
- vpmadd52huq ymm22,ymm3,YMMWORD[256+rsi]
- vpmadd52huq ymm23,ymm3,YMMWORD[288+rsi]
-
- vpmadd52huq ymm2,ymm4,YMMWORD[160+rcx]
- vpmadd52huq ymm20,ymm4,YMMWORD[192+rcx]
- vpmadd52huq ymm21,ymm4,YMMWORD[224+rcx]
- vpmadd52huq ymm22,ymm4,YMMWORD[256+rcx]
- vpmadd52huq ymm23,ymm4,YMMWORD[288+rcx]
- lea r11,[8+r11]
- dec ebx
- jne NEAR $L$loop20
-
- vmovdqa64 ymm4,YMMWORD[$L$mask52x4]
-
- vpbroadcastq ymm3,r9
- vpblendd ymm1,ymm1,ymm3,3
-
-
-
- vpsrlq ymm24,ymm1,52
- vpsrlq ymm25,ymm16,52
- vpsrlq ymm26,ymm17,52
- vpsrlq ymm27,ymm18,52
- vpsrlq ymm28,ymm19,52
-
-
- valignq ymm28,ymm28,ymm27,3
- valignq ymm27,ymm27,ymm26,3
- valignq ymm26,ymm26,ymm25,3
- valignq ymm25,ymm25,ymm24,3
- valignq ymm24,ymm24,ymm0,3
-
-
- vpandq ymm1,ymm1,ymm4
- vpandq ymm16,ymm16,ymm4
- vpandq ymm17,ymm17,ymm4
- vpandq ymm18,ymm18,ymm4
- vpandq ymm19,ymm19,ymm4
-
-
- vpaddq ymm1,ymm1,ymm24
- vpaddq ymm16,ymm16,ymm25
- vpaddq ymm17,ymm17,ymm26
- vpaddq ymm18,ymm18,ymm27
- vpaddq ymm19,ymm19,ymm28
-
-
-
- vpcmpuq k1,ymm4,ymm1,1
- vpcmpuq k2,ymm4,ymm16,1
- vpcmpuq k3,ymm4,ymm17,1
- vpcmpuq k4,ymm4,ymm18,1
- vpcmpuq k5,ymm4,ymm19,1
- kmovb r14d,k1
- kmovb r13d,k2
- kmovb r12d,k3
- kmovb r11d,k4
- kmovb r10d,k5
-
-
- vpcmpuq k1,ymm4,ymm1,0
- vpcmpuq k2,ymm4,ymm16,0
- vpcmpuq k3,ymm4,ymm17,0
- vpcmpuq k4,ymm4,ymm18,0
- vpcmpuq k5,ymm4,ymm19,0
- kmovb r9d,k1
- kmovb r8d,k2
- kmovb ebx,k3
- kmovb ecx,k4
- kmovb edx,k5
-
-
-
- shl r13b,4
- or r14b,r13b
- shl r11b,4
- or r12b,r11b
-
- add r14b,r14b
- adc r12b,r12b
- adc r10b,r10b
-
- shl r8b,4
- or r9b,r8b
- shl cl,4
- or bl,cl
-
- add r14b,r9b
- adc r12b,bl
- adc r10b,dl
-
- xor r14b,r9b
- xor r12b,bl
- xor r10b,dl
-
- kmovb k1,r14d
- shr r14b,4
- kmovb k2,r14d
- kmovb k3,r12d
- shr r12b,4
- kmovb k4,r12d
- kmovb k5,r10d
-
-
- vpsubq ymm1{k1},ymm1,ymm4
- vpsubq ymm16{k2},ymm16,ymm4
- vpsubq ymm17{k3},ymm17,ymm4
- vpsubq ymm18{k4},ymm18,ymm4
- vpsubq ymm19{k5},ymm19,ymm4
-
- vpandq ymm1,ymm1,ymm4
- vpandq ymm16,ymm16,ymm4
- vpandq ymm17,ymm17,ymm4
- vpandq ymm18,ymm18,ymm4
- vpandq ymm19,ymm19,ymm4
-
- vpbroadcastq ymm3,r15
- vpblendd ymm2,ymm2,ymm3,3
-
-
-
- vpsrlq ymm24,ymm2,52
- vpsrlq ymm25,ymm20,52
- vpsrlq ymm26,ymm21,52
- vpsrlq ymm27,ymm22,52
- vpsrlq ymm28,ymm23,52
-
-
- valignq ymm28,ymm28,ymm27,3
- valignq ymm27,ymm27,ymm26,3
- valignq ymm26,ymm26,ymm25,3
- valignq ymm25,ymm25,ymm24,3
- valignq ymm24,ymm24,ymm0,3
-
-
- vpandq ymm2,ymm2,ymm4
- vpandq ymm20,ymm20,ymm4
- vpandq ymm21,ymm21,ymm4
- vpandq ymm22,ymm22,ymm4
- vpandq ymm23,ymm23,ymm4
-
-
- vpaddq ymm2,ymm2,ymm24
- vpaddq ymm20,ymm20,ymm25
- vpaddq ymm21,ymm21,ymm26
- vpaddq ymm22,ymm22,ymm27
- vpaddq ymm23,ymm23,ymm28
-
-
-
- vpcmpuq k1,ymm4,ymm2,1
- vpcmpuq k2,ymm4,ymm20,1
- vpcmpuq k3,ymm4,ymm21,1
- vpcmpuq k4,ymm4,ymm22,1
- vpcmpuq k5,ymm4,ymm23,1
- kmovb r14d,k1
- kmovb r13d,k2
- kmovb r12d,k3
- kmovb r11d,k4
- kmovb r10d,k5
-
-
- vpcmpuq k1,ymm4,ymm2,0
- vpcmpuq k2,ymm4,ymm20,0
- vpcmpuq k3,ymm4,ymm21,0
- vpcmpuq k4,ymm4,ymm22,0
- vpcmpuq k5,ymm4,ymm23,0
- kmovb r9d,k1
- kmovb r8d,k2
- kmovb ebx,k3
- kmovb ecx,k4
- kmovb edx,k5
-
-
-
- shl r13b,4
- or r14b,r13b
- shl r11b,4
- or r12b,r11b
-
- add r14b,r14b
- adc r12b,r12b
- adc r10b,r10b
-
- shl r8b,4
- or r9b,r8b
- shl cl,4
- or bl,cl
-
- add r14b,r9b
- adc r12b,bl
- adc r10b,dl
-
- xor r14b,r9b
- xor r12b,bl
- xor r10b,dl
-
- kmovb k1,r14d
- shr r14b,4
- kmovb k2,r14d
- kmovb k3,r12d
- shr r12b,4
- kmovb k4,r12d
- kmovb k5,r10d
-
-
- vpsubq ymm2{k1},ymm2,ymm4
- vpsubq ymm20{k2},ymm20,ymm4
- vpsubq ymm21{k3},ymm21,ymm4
- vpsubq ymm22{k4},ymm22,ymm4
- vpsubq ymm23{k5},ymm23,ymm4
-
- vpandq ymm2,ymm2,ymm4
- vpandq ymm20,ymm20,ymm4
- vpandq ymm21,ymm21,ymm4
- vpandq ymm22,ymm22,ymm4
- vpandq ymm23,ymm23,ymm4
-
- vmovdqu64 YMMWORD[rdi],ymm1
- vmovdqu64 YMMWORD[32+rdi],ymm16
- vmovdqu64 YMMWORD[64+rdi],ymm17
- vmovdqu64 YMMWORD[96+rdi],ymm18
- vmovdqu64 YMMWORD[128+rdi],ymm19
-
- vmovdqu64 YMMWORD[160+rdi],ymm2
- vmovdqu64 YMMWORD[192+rdi],ymm20
- vmovdqu64 YMMWORD[224+rdi],ymm21
- vmovdqu64 YMMWORD[256+rdi],ymm22
- vmovdqu64 YMMWORD[288+rdi],ymm23
-
- vzeroupper
- mov r15,QWORD[rsp]
-
- mov r14,QWORD[8+rsp]
-
- mov r13,QWORD[16+rsp]
-
- mov r12,QWORD[24+rsp]
-
- mov rbp,QWORD[32+rsp]
-
- mov rbx,QWORD[40+rsp]
-
- lea rsp,[48+rsp]
-
-$L$rsaz_amm52x20_x2_256_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
-section .text code align=64
-
-
-ALIGN 32
global ossl_extract_multiplier_2x20_win5
+ossl_rsaz_amm52x20_x1_256:
+ossl_rsaz_amm52x20_x2_256:
ossl_extract_multiplier_2x20_win5:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
-
-
-
-DB 243,15,30,250
- lea rax,[rcx*4+rcx]
- sal rax,5
- add rsi,rax
-
- vmovdqa64 ymm23,YMMWORD[$L$ones]
- vpbroadcastq ymm22,rdx
- lea rax,[10240+rsi]
-
- vpxor xmm4,xmm4,xmm4
- vmovdqa64 ymm3,ymm4
- vmovdqa64 ymm2,ymm4
- vmovdqa64 ymm1,ymm4
- vmovdqa64 ymm0,ymm4
- vmovdqa64 ymm21,ymm4
-
-ALIGN 32
-$L$loop:
- vpcmpq k1,ymm22,ymm21,0
- add rsi,320
- vpaddq ymm21,ymm21,ymm23
- vmovdqu64 ymm16,YMMWORD[((-320))+rsi]
- vmovdqu64 ymm17,YMMWORD[((-288))+rsi]
- vmovdqu64 ymm18,YMMWORD[((-256))+rsi]
- vmovdqu64 ymm19,YMMWORD[((-224))+rsi]
- vmovdqu64 ymm20,YMMWORD[((-192))+rsi]
- vpblendmq ymm0{k1},ymm0,ymm16
- vpblendmq ymm1{k1},ymm1,ymm17
- vpblendmq ymm2{k1},ymm2,ymm18
- vpblendmq ymm3{k1},ymm3,ymm19
- vpblendmq ymm4{k1},ymm4,ymm20
- cmp rax,rsi
- jne NEAR $L$loop
-
- vmovdqu64 YMMWORD[rdi],ymm0
- vmovdqu64 YMMWORD[32+rdi],ymm1
- vmovdqu64 YMMWORD[64+rdi],ymm2
- vmovdqu64 YMMWORD[96+rdi],ymm3
- vmovdqu64 YMMWORD[128+rdi],ymm4
-
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ossl_extract_multiplier_2x20_win5:
-section .data data align=8
-
-ALIGN 32
-$L$ones:
- DQ 1,1,1,1
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-rsaz_def_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$common_seh_tail
-
- mov rax,QWORD[152+r8]
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$common_seh_tail
-
- lea rax,[48+rax]
-
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
-$L$common_seh_tail:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
+DB 0x0f,0x0b
DB 0F3h,0C3h ;repret
-
-section .pdata rdata align=4
-ALIGN 4
- DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
- DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
- DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
-
- DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
- DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
- DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
-
- DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
- DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
- DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
-
-section .xdata rdata align=8
-ALIGN 8
-$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
-DB 9,0,0,0
- DD rsaz_def_handler wrt ..imagebase
- DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
-$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
-DB 9,0,0,0
- DD rsaz_def_handler wrt ..imagebase
- DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
-$L$SEH_info_ossl_extract_multiplier_2x20_win5:
-DB 9,0,0,0
- DD rsaz_def_handler wrt ..imagebase
- DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm
index f407312e950..9f1c3f9b250 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/rsaz-x86_64.nasm
@@ -43,10 +43,6 @@ DB 102,72,15,110,202
mov rdx,QWORD[rsi]
mov rax,QWORD[8+rsi]
mov QWORD[128+rsp],rcx
- mov r11d,0x80100
- and r11d,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp r11d,0x80100
- je NEAR $L$oop_sqrx
jmp NEAR $L$oop_sqr
ALIGN 32
@@ -417,282 +413,6 @@ DB 102,72,15,126,205
dec r8d
jnz NEAR $L$oop_sqr
- jmp NEAR $L$sqr_tail
-
-ALIGN 32
-$L$oop_sqrx:
- mov DWORD[((128+8))+rsp],r8d
-DB 102,72,15,110,199
-
- mulx r9,r8,rax
- mov rbx,rax
-
- mulx r10,rcx,QWORD[16+rsi]
- xor rbp,rbp
-
- mulx r11,rax,QWORD[24+rsi]
- adcx r9,rcx
-
-DB 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcx r10,rax
-
-DB 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
- adcx r11,rcx
-
- mulx r14,rcx,QWORD[48+rsi]
- adcx r12,rax
- adcx r13,rcx
-
- mulx r15,rax,QWORD[56+rsi]
- adcx r14,rax
- adcx r15,rbp
-
- mulx rdi,rax,rdx
- mov rdx,rbx
- xor rcx,rcx
- adox r8,r8
- adcx r8,rdi
- adox rcx,rbp
- adcx rcx,rbp
-
- mov QWORD[rsp],rax
- mov QWORD[8+rsp],r8
-
-
-DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
- adox r10,rax
- adcx r11,rbx
-
- mulx r8,rdi,QWORD[24+rsi]
- adox r11,rdi
-DB 0x66
- adcx r12,r8
-
- mulx rbx,rax,QWORD[32+rsi]
- adox r12,rax
- adcx r13,rbx
-
- mulx r8,rdi,QWORD[40+rsi]
- adox r13,rdi
- adcx r14,r8
-
-DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adox r14,rax
- adcx r15,rbx
-
-DB 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
- adox r15,rdi
- adcx r8,rbp
- mulx rdi,rax,rdx
- adox r8,rbp
-DB 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
-
- xor rbx,rbx
- adox r9,r9
-
- adcx rax,rcx
- adox r10,r10
- adcx r9,rax
- adox rbx,rbp
- adcx r10,rdi
- adcx rbx,rbp
-
- mov QWORD[16+rsp],r9
-DB 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
-
-
- mulx r9,rdi,QWORD[24+rsi]
- adox r12,rdi
- adcx r13,r9
-
- mulx rcx,rax,QWORD[32+rsi]
- adox r13,rax
- adcx r14,rcx
-
-DB 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
- adox r14,rdi
- adcx r15,r9
-
-DB 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
- adox r15,rax
- adcx r8,rcx
-
- mulx r9,rdi,QWORD[56+rsi]
- adox r8,rdi
- adcx r9,rbp
- mulx rdi,rax,rdx
- adox r9,rbp
- mov rdx,QWORD[24+rsi]
-
- xor rcx,rcx
- adox r11,r11
-
- adcx rax,rbx
- adox r12,r12
- adcx r11,rax
- adox rcx,rbp
- adcx r12,rdi
- adcx rcx,rbp
-
- mov QWORD[32+rsp],r11
- mov QWORD[40+rsp],r12
-
-
- mulx rbx,rax,QWORD[32+rsi]
- adox r14,rax
- adcx r15,rbx
-
- mulx r10,rdi,QWORD[40+rsi]
- adox r15,rdi
- adcx r8,r10
-
- mulx rbx,rax,QWORD[48+rsi]
- adox r8,rax
- adcx r9,rbx
-
- mulx r10,rdi,QWORD[56+rsi]
- adox r9,rdi
- adcx r10,rbp
- mulx rdi,rax,rdx
- adox r10,rbp
- mov rdx,QWORD[32+rsi]
-
- xor rbx,rbx
- adox r13,r13
-
- adcx rax,rcx
- adox r14,r14
- adcx r13,rax
- adox rbx,rbp
- adcx r14,rdi
- adcx rbx,rbp
-
- mov QWORD[48+rsp],r13
- mov QWORD[56+rsp],r14
-
-
- mulx r11,rdi,QWORD[40+rsi]
- adox r8,rdi
- adcx r9,r11
-
- mulx rcx,rax,QWORD[48+rsi]
- adox r9,rax
- adcx r10,rcx
-
- mulx r11,rdi,QWORD[56+rsi]
- adox r10,rdi
- adcx r11,rbp
- mulx rdi,rax,rdx
- mov rdx,QWORD[40+rsi]
- adox r11,rbp
-
- xor rcx,rcx
- adox r15,r15
-
- adcx rax,rbx
- adox r8,r8
- adcx r15,rax
- adox rcx,rbp
- adcx r8,rdi
- adcx rcx,rbp
-
- mov QWORD[64+rsp],r15
- mov QWORD[72+rsp],r8
-
-
-DB 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adox r10,rax
- adcx r11,rbx
-
-DB 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
- adox r11,rdi
- adcx r12,rbp
- mulx rdi,rax,rdx
- adox r12,rbp
- mov rdx,QWORD[48+rsi]
-
- xor rbx,rbx
- adox r9,r9
-
- adcx rax,rcx
- adox r10,r10
- adcx r9,rax
- adcx r10,rdi
- adox rbx,rbp
- adcx rbx,rbp
-
- mov QWORD[80+rsp],r9
- mov QWORD[88+rsp],r10
-
-
-DB 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
- adox r12,rax
- adox r13,rbp
-
- mulx rdi,rax,rdx
- xor rcx,rcx
- mov rdx,QWORD[56+rsi]
- adox r11,r11
-
- adcx rax,rbx
- adox r12,r12
- adcx r11,rax
- adox rcx,rbp
- adcx r12,rdi
- adcx rcx,rbp
-
-DB 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
-DB 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
-
-
- mulx rdx,rax,rdx
- xor rbx,rbx
- adox r13,r13
-
- adcx rax,rcx
- adox rbx,rbp
- adcx rax,r13
- adcx rbx,rdx
-
-DB 102,72,15,126,199
-DB 102,72,15,126,205
-
- mov rdx,QWORD[128+rsp]
- mov r8,QWORD[rsp]
- mov r9,QWORD[8+rsp]
- mov r10,QWORD[16+rsp]
- mov r11,QWORD[24+rsp]
- mov r12,QWORD[32+rsp]
- mov r13,QWORD[40+rsp]
- mov r14,QWORD[48+rsp]
- mov r15,QWORD[56+rsp]
-
- mov QWORD[112+rsp],rax
- mov QWORD[120+rsp],rbx
-
- call __rsaz_512_reducex
-
- add r8,QWORD[64+rsp]
- adc r9,QWORD[72+rsp]
- adc r10,QWORD[80+rsp]
- adc r11,QWORD[88+rsp]
- adc r12,QWORD[96+rsp]
- adc r13,QWORD[104+rsp]
- adc r14,QWORD[112+rsp]
- adc r15,QWORD[120+rsp]
- sbb rcx,rcx
-
- call __rsaz_512_subtract
-
- mov rdx,r8
- mov rax,r9
- mov r8d,DWORD[((128+8))+rsp]
- mov rsi,rdi
-
- dec r8d
- jnz NEAR $L$oop_sqrx
-
-$L$sqr_tail:
lea rax,[((128+24+48))+rsp]
@@ -751,10 +471,6 @@ $L$mul_body:
DB 102,72,15,110,199
DB 102,72,15,110,201
mov QWORD[128+rsp],r8
- mov r11d,0x80100
- and r11d,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp r11d,0x80100
- je NEAR $L$mulx
mov rbx,QWORD[rdx]
mov rbp,rdx
call __rsaz_512_mul
@@ -772,29 +488,6 @@ DB 102,72,15,126,205
mov r15,QWORD[56+rsp]
call __rsaz_512_reduce
- jmp NEAR $L$mul_tail
-
-ALIGN 32
-$L$mulx:
- mov rbp,rdx
- mov rdx,QWORD[rdx]
- call __rsaz_512_mulx
-
-DB 102,72,15,126,199
-DB 102,72,15,126,205
-
- mov rdx,QWORD[128+rsp]
- mov r8,QWORD[rsp]
- mov r9,QWORD[8+rsp]
- mov r10,QWORD[16+rsp]
- mov r11,QWORD[24+rsp]
- mov r12,QWORD[32+rsp]
- mov r13,QWORD[40+rsp]
- mov r14,QWORD[48+rsp]
- mov r15,QWORD[56+rsp]
-
- call __rsaz_512_reducex
-$L$mul_tail:
add r8,QWORD[64+rsp]
adc r9,QWORD[72+rsp]
adc r10,QWORD[80+rsp]
@@ -926,10 +619,6 @@ $L$mul_gather4_body:
por xmm8,xmm9
pshufd xmm9,xmm8,0x4e
por xmm8,xmm9
- mov r11d,0x80100
- and r11d,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp r11d,0x80100
- je NEAR $L$mulx_gather
DB 102,76,15,126,195
mov QWORD[128+rsp],r8
@@ -1110,142 +799,6 @@ DB 102,76,15,126,195
mov r15,QWORD[56+rsp]
call __rsaz_512_reduce
- jmp NEAR $L$mul_gather_tail
-
-ALIGN 32
-$L$mulx_gather:
-DB 102,76,15,126,194
-
- mov QWORD[128+rsp],r8
- mov QWORD[((128+8))+rsp],rdi
- mov QWORD[((128+16))+rsp],rcx
-
- mulx r8,rbx,QWORD[rsi]
- mov QWORD[rsp],rbx
- xor edi,edi
-
- mulx r9,rax,QWORD[8+rsi]
-
- mulx r10,rbx,QWORD[16+rsi]
- adcx r8,rax
-
- mulx r11,rax,QWORD[24+rsi]
- adcx r9,rbx
-
- mulx r12,rbx,QWORD[32+rsi]
- adcx r10,rax
-
- mulx r13,rax,QWORD[40+rsi]
- adcx r11,rbx
-
- mulx r14,rbx,QWORD[48+rsi]
- adcx r12,rax
-
- mulx r15,rax,QWORD[56+rsi]
- adcx r13,rbx
- adcx r14,rax
-DB 0x67
- mov rbx,r8
- adcx r15,rdi
-
- mov rcx,-7
- jmp NEAR $L$oop_mulx_gather
-
-ALIGN 32
-$L$oop_mulx_gather:
- movdqa xmm8,XMMWORD[rbp]
- movdqa xmm9,XMMWORD[16+rbp]
- movdqa xmm10,XMMWORD[32+rbp]
- movdqa xmm11,XMMWORD[48+rbp]
- pand xmm8,xmm0
- movdqa xmm12,XMMWORD[64+rbp]
- pand xmm9,xmm1
- movdqa xmm13,XMMWORD[80+rbp]
- pand xmm10,xmm2
- movdqa xmm14,XMMWORD[96+rbp]
- pand xmm11,xmm3
- movdqa xmm15,XMMWORD[112+rbp]
- lea rbp,[128+rbp]
- pand xmm12,xmm4
- pand xmm13,xmm5
- pand xmm14,xmm6
- pand xmm15,xmm7
- por xmm8,xmm10
- por xmm9,xmm11
- por xmm8,xmm12
- por xmm9,xmm13
- por xmm8,xmm14
- por xmm9,xmm15
-
- por xmm8,xmm9
- pshufd xmm9,xmm8,0x4e
- por xmm8,xmm9
-DB 102,76,15,126,194
-
-DB 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
- adcx rbx,rax
- adox r8,r9
-
- mulx r9,rax,QWORD[8+rsi]
- adcx r8,rax
- adox r9,r10
-
- mulx r10,rax,QWORD[16+rsi]
- adcx r9,rax
- adox r10,r11
-
-DB 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
- adcx r10,rax
- adox r11,r12
-
- mulx r12,rax,QWORD[32+rsi]
- adcx r11,rax
- adox r12,r13
-
- mulx r13,rax,QWORD[40+rsi]
- adcx r12,rax
- adox r13,r14
-
-DB 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcx r13,rax
-DB 0x67
- adox r14,r15
-
- mulx r15,rax,QWORD[56+rsi]
- mov QWORD[64+rcx*8+rsp],rbx
- adcx r14,rax
- adox r15,rdi
- mov rbx,r8
- adcx r15,rdi
-
- inc rcx
- jnz NEAR $L$oop_mulx_gather
-
- mov QWORD[64+rsp],r8
- mov QWORD[((64+8))+rsp],r9
- mov QWORD[((64+16))+rsp],r10
- mov QWORD[((64+24))+rsp],r11
- mov QWORD[((64+32))+rsp],r12
- mov QWORD[((64+40))+rsp],r13
- mov QWORD[((64+48))+rsp],r14
- mov QWORD[((64+56))+rsp],r15
-
- mov rdx,QWORD[128+rsp]
- mov rdi,QWORD[((128+8))+rsp]
- mov rbp,QWORD[((128+16))+rsp]
-
- mov r8,QWORD[rsp]
- mov r9,QWORD[8+rsp]
- mov r10,QWORD[16+rsp]
- mov r11,QWORD[24+rsp]
- mov r12,QWORD[32+rsp]
- mov r13,QWORD[40+rsp]
- mov r14,QWORD[48+rsp]
- mov r15,QWORD[56+rsp]
-
- call __rsaz_512_reducex
-
-$L$mul_gather_tail:
add r8,QWORD[64+rsp]
adc r9,QWORD[72+rsp]
adc r10,QWORD[80+rsp]
@@ -1332,10 +885,6 @@ DB 102,73,15,110,208
mov QWORD[128+rsp],rcx
mov rbp,rdi
- mov r11d,0x80100
- and r11d,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp r11d,0x80100
- je NEAR $L$mulx_scatter
mov rbx,QWORD[rdi]
call __rsaz_512_mul
@@ -1352,29 +901,6 @@ DB 102,72,15,126,205
mov r15,QWORD[56+rsp]
call __rsaz_512_reduce
- jmp NEAR $L$mul_scatter_tail
-
-ALIGN 32
-$L$mulx_scatter:
- mov rdx,QWORD[rdi]
- call __rsaz_512_mulx
-
-DB 102,72,15,126,199
-DB 102,72,15,126,205
-
- mov rdx,QWORD[128+rsp]
- mov r8,QWORD[rsp]
- mov r9,QWORD[8+rsp]
- mov r10,QWORD[16+rsp]
- mov r11,QWORD[24+rsp]
- mov r12,QWORD[32+rsp]
- mov r13,QWORD[40+rsp]
- mov r14,QWORD[48+rsp]
- mov r15,QWORD[56+rsp]
-
- call __rsaz_512_reducex
-
-$L$mul_scatter_tail:
add r8,QWORD[64+rsp]
adc r9,QWORD[72+rsp]
adc r10,QWORD[80+rsp]
@@ -1450,7 +976,6 @@ $L$SEH_begin_rsaz_512_mul_by_one:
sub rsp,128+24
$L$mul_by_one_body:
- mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
mov rbp,rdx
mov QWORD[128+rsp],rcx
@@ -1471,16 +996,7 @@ $L$mul_by_one_body:
movdqa XMMWORD[64+rsp],xmm0
movdqa XMMWORD[80+rsp],xmm0
movdqa XMMWORD[96+rsp],xmm0
- and eax,0x80100
- cmp eax,0x80100
- je NEAR $L$by_one_callx
call __rsaz_512_reduce
- jmp NEAR $L$by_one_tail
-ALIGN 32
-$L$by_one_callx:
- mov rdx,QWORD[128+rsp]
- call __rsaz_512_reducex
-$L$by_one_tail:
mov QWORD[rdi],r8
mov QWORD[8+rdi],r9
mov QWORD[16+rdi],r10
@@ -1598,64 +1114,6 @@ $L$reduction_loop:
-ALIGN 32
-__rsaz_512_reducex:
-
-
- imul rdx,r8
- xor rsi,rsi
- mov ecx,8
- jmp NEAR $L$reduction_loopx
-
-ALIGN 32
-$L$reduction_loopx:
- mov rbx,r8
- mulx r8,rax,QWORD[rbp]
- adcx rax,rbx
- adox r8,r9
-
- mulx r9,rax,QWORD[8+rbp]
- adcx r8,rax
- adox r9,r10
-
- mulx r10,rbx,QWORD[16+rbp]
- adcx r9,rbx
- adox r10,r11
-
- mulx r11,rbx,QWORD[24+rbp]
- adcx r10,rbx
- adox r11,r12
-
-DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- mov rax,rdx
- mov rdx,r8
- adcx r11,rbx
- adox r12,r13
-
- mulx rdx,rbx,QWORD[((128+8))+rsp]
- mov rdx,rax
-
- mulx r13,rax,QWORD[40+rbp]
- adcx r12,rax
- adox r13,r14
-
-DB 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
- adcx r13,rax
- adox r14,r15
-
- mulx r15,rax,QWORD[56+rbp]
- mov rdx,rbx
- adcx r14,rax
- adox r15,rsi
- adcx r15,rsi
-
- dec ecx
- jne NEAR $L$reduction_loopx
-
- DB 0F3h,0C3h ;repret
-
-
-
ALIGN 32
__rsaz_512_subtract:
@@ -1858,128 +1316,6 @@ $L$oop_mul:
DB 0F3h,0C3h ;repret
-
-ALIGN 32
-__rsaz_512_mulx:
-
- mulx r8,rbx,QWORD[rsi]
- mov rcx,-6
-
- mulx r9,rax,QWORD[8+rsi]
- mov QWORD[8+rsp],rbx
-
- mulx r10,rbx,QWORD[16+rsi]
- adc r8,rax
-
- mulx r11,rax,QWORD[24+rsi]
- adc r9,rbx
-
- mulx r12,rbx,QWORD[32+rsi]
- adc r10,rax
-
- mulx r13,rax,QWORD[40+rsi]
- adc r11,rbx
-
- mulx r14,rbx,QWORD[48+rsi]
- adc r12,rax
-
- mulx r15,rax,QWORD[56+rsi]
- mov rdx,QWORD[8+rbp]
- adc r13,rbx
- adc r14,rax
- adc r15,0
-
- xor rdi,rdi
- jmp NEAR $L$oop_mulx
-
-ALIGN 32
-$L$oop_mulx:
- mov rbx,r8
- mulx r8,rax,QWORD[rsi]
- adcx rbx,rax
- adox r8,r9
-
- mulx r9,rax,QWORD[8+rsi]
- adcx r8,rax
- adox r9,r10
-
- mulx r10,rax,QWORD[16+rsi]
- adcx r9,rax
- adox r10,r11
-
- mulx r11,rax,QWORD[24+rsi]
- adcx r10,rax
- adox r11,r12
-
-DB 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcx r11,rax
- adox r12,r13
-
- mulx r13,rax,QWORD[40+rsi]
- adcx r12,rax
- adox r13,r14
-
- mulx r14,rax,QWORD[48+rsi]
- adcx r13,rax
- adox r14,r15
-
- mulx r15,rax,QWORD[56+rsi]
- mov rdx,QWORD[64+rcx*8+rbp]
- mov QWORD[((8+64-8))+rcx*8+rsp],rbx
- adcx r14,rax
- adox r15,rdi
- adcx r15,rdi
-
- inc rcx
- jnz NEAR $L$oop_mulx
-
- mov rbx,r8
- mulx r8,rax,QWORD[rsi]
- adcx rbx,rax
- adox r8,r9
-
-DB 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
- adcx r8,rax
- adox r9,r10
-
-DB 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
- adcx r9,rax
- adox r10,r11
-
- mulx r11,rax,QWORD[24+rsi]
- adcx r10,rax
- adox r11,r12
-
- mulx r12,rax,QWORD[32+rsi]
- adcx r11,rax
- adox r12,r13
-
- mulx r13,rax,QWORD[40+rsi]
- adcx r12,rax
- adox r13,r14
-
-DB 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcx r13,rax
- adox r14,r15
-
-DB 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcx r14,rax
- adox r15,rdi
- adcx r15,rdi
-
- mov QWORD[((8+64-8))+rsp],rbx
- mov QWORD[((8+64))+rsp],r8
- mov QWORD[((8+64+8))+rsp],r9
- mov QWORD[((8+64+16))+rsp],r10
- mov QWORD[((8+64+24))+rsp],r11
- mov QWORD[((8+64+32))+rsp],r12
- mov QWORD[((8+64+40))+rsp],r13
- mov QWORD[((8+64+48))+rsp],r14
- mov QWORD[((8+64+56))+rsp],r15
-
- DB 0F3h,0C3h ;repret
-
-
global rsaz_512_scatter4
ALIGN 16
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm
index b4f755d63e6..80de3a35016 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont.nasm
@@ -31,7 +31,6 @@ $L$SEH_begin_bn_mul_mont:
jnz NEAR $L$mul_enter
cmp r9d,8
jb NEAR $L$mul_enter
- mov r11d,DWORD[((OPENSSL_ia32cap_P+8))]
cmp rdx,rsi
jne NEAR $L$mul4x_enter
test r9d,7
@@ -294,9 +293,6 @@ $L$SEH_begin_bn_mul4x_mont:
mov rax,rsp
$L$mul4x_enter:
- and r11d,0x80100
- cmp r11d,0x80100
- je NEAR $L$mulx4x_enter
push rbx
push rbp
@@ -722,7 +718,6 @@ $L$mul4x_epilogue:
DB 0F3h,0C3h ;repret
$L$SEH_end_bn_mul4x_mont:
-EXTERN bn_sqrx8x_internal
EXTERN bn_sqr8x_internal
@@ -818,25 +813,6 @@ DB 102,72,15,110,209
pxor xmm0,xmm0
DB 102,72,15,110,207
DB 102,73,15,110,218
- mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
- and eax,0x80100
- cmp eax,0x80100
- jne NEAR $L$sqr8x_nox
-
- call bn_sqrx8x_internal
-
-
-
-
- lea rbx,[rcx*1+r8]
- mov r9,rcx
- mov rdx,rcx
-DB 102,72,15,126,207
- sar rcx,3+2
- jmp NEAR $L$sqr8x_sub
-
-ALIGN 32
-$L$sqr8x_nox:
call bn_sqr8x_internal
@@ -926,376 +902,6 @@ $L$sqr8x_epilogue:
DB 0F3h,0C3h ;repret
$L$SEH_end_bn_sqr8x_mont:
-
-ALIGN 32
-bn_mulx4x_mont:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_bn_mulx4x_mont:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- mov rax,rsp
-
-$L$mulx4x_enter:
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$mulx4x_prologue:
-
- shl r9d,3
- xor r10,r10
- sub r10,r9
- mov r8,QWORD[r8]
- lea rbp,[((-72))+r10*1+rsp]
- and rbp,-128
- mov r11,rsp
- sub r11,rbp
- and r11,-4096
- lea rsp,[rbp*1+r11]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$mulx4x_page_walk
- jmp NEAR $L$mulx4x_page_walk_done
-
-ALIGN 16
-$L$mulx4x_page_walk:
- lea rsp,[((-4096))+rsp]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$mulx4x_page_walk
-$L$mulx4x_page_walk_done:
-
- lea r10,[r9*1+rdx]
-
-
-
-
-
-
-
-
-
-
-
-
- mov QWORD[rsp],r9
- shr r9,5
- mov QWORD[16+rsp],r10
- sub r9,1
- mov QWORD[24+rsp],r8
- mov QWORD[32+rsp],rdi
- mov QWORD[40+rsp],rax
-
- mov QWORD[48+rsp],r9
- jmp NEAR $L$mulx4x_body
-
-ALIGN 32
-$L$mulx4x_body:
- lea rdi,[8+rdx]
- mov rdx,QWORD[rdx]
- lea rbx,[((64+32))+rsp]
- mov r9,rdx
-
- mulx rax,r8,QWORD[rsi]
- mulx r14,r11,QWORD[8+rsi]
- add r11,rax
- mov QWORD[8+rsp],rdi
- mulx r13,r12,QWORD[16+rsi]
- adc r12,r14
- adc r13,0
-
- mov rdi,r8
- imul r8,QWORD[24+rsp]
- xor rbp,rbp
-
- mulx r14,rax,QWORD[24+rsi]
- mov rdx,r8
- lea rsi,[32+rsi]
- adcx r13,rax
- adcx r14,rbp
-
- mulx r10,rax,QWORD[rcx]
- adcx rdi,rax
- adox r10,r11
- mulx r11,rax,QWORD[8+rcx]
- adcx r10,rax
- adox r11,r12
-DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
- mov rdi,QWORD[48+rsp]
- mov QWORD[((-32))+rbx],r10
- adcx r11,rax
- adox r12,r13
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-24))+rbx],r11
- adcx r12,rax
- adox r15,rbp
- lea rcx,[32+rcx]
- mov QWORD[((-16))+rbx],r12
-
- jmp NEAR $L$mulx4x_1st
-
-ALIGN 32
-$L$mulx4x_1st:
- adcx r15,rbp
- mulx rax,r10,QWORD[rsi]
- adcx r10,r14
- mulx r14,r11,QWORD[8+rsi]
- adcx r11,rax
- mulx rax,r12,QWORD[16+rsi]
- adcx r12,r14
- mulx r14,r13,QWORD[24+rsi]
-DB 0x67,0x67
- mov rdx,r8
- adcx r13,rax
- adcx r14,rbp
- lea rsi,[32+rsi]
- lea rbx,[32+rbx]
-
- adox r10,r15
- mulx r15,rax,QWORD[rcx]
- adcx r10,rax
- adox r11,r15
- mulx r15,rax,QWORD[8+rcx]
- adcx r11,rax
- adox r12,r15
- mulx r15,rax,QWORD[16+rcx]
- mov QWORD[((-40))+rbx],r10
- adcx r12,rax
- mov QWORD[((-32))+rbx],r11
- adox r13,r15
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-24))+rbx],r12
- adcx r13,rax
- adox r15,rbp
- lea rcx,[32+rcx]
- mov QWORD[((-16))+rbx],r13
-
- dec rdi
- jnz NEAR $L$mulx4x_1st
-
- mov rax,QWORD[rsp]
- mov rdi,QWORD[8+rsp]
- adc r15,rbp
- add r14,r15
- sbb r15,r15
- mov QWORD[((-8))+rbx],r14
- jmp NEAR $L$mulx4x_outer
-
-ALIGN 32
-$L$mulx4x_outer:
- mov rdx,QWORD[rdi]
- lea rdi,[8+rdi]
- sub rsi,rax
- mov QWORD[rbx],r15
- lea rbx,[((64+32))+rsp]
- sub rcx,rax
-
- mulx r11,r8,QWORD[rsi]
- xor ebp,ebp
- mov r9,rdx
- mulx r12,r14,QWORD[8+rsi]
- adox r8,QWORD[((-32))+rbx]
- adcx r11,r14
- mulx r13,r15,QWORD[16+rsi]
- adox r11,QWORD[((-24))+rbx]
- adcx r12,r15
- adox r12,QWORD[((-16))+rbx]
- adcx r13,rbp
- adox r13,rbp
-
- mov QWORD[8+rsp],rdi
- mov r15,r8
- imul r8,QWORD[24+rsp]
- xor ebp,ebp
-
- mulx r14,rax,QWORD[24+rsi]
- mov rdx,r8
- adcx r13,rax
- adox r13,QWORD[((-8))+rbx]
- adcx r14,rbp
- lea rsi,[32+rsi]
- adox r14,rbp
-
- mulx r10,rax,QWORD[rcx]
- adcx r15,rax
- adox r10,r11
- mulx r11,rax,QWORD[8+rcx]
- adcx r10,rax
- adox r11,r12
- mulx r12,rax,QWORD[16+rcx]
- mov QWORD[((-32))+rbx],r10
- adcx r11,rax
- adox r12,r13
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-24))+rbx],r11
- lea rcx,[32+rcx]
- adcx r12,rax
- adox r15,rbp
- mov rdi,QWORD[48+rsp]
- mov QWORD[((-16))+rbx],r12
-
- jmp NEAR $L$mulx4x_inner
-
-ALIGN 32
-$L$mulx4x_inner:
- mulx rax,r10,QWORD[rsi]
- adcx r15,rbp
- adox r10,r14
- mulx r14,r11,QWORD[8+rsi]
- adcx r10,QWORD[rbx]
- adox r11,rax
- mulx rax,r12,QWORD[16+rsi]
- adcx r11,QWORD[8+rbx]
- adox r12,r14
- mulx r14,r13,QWORD[24+rsi]
- mov rdx,r8
- adcx r12,QWORD[16+rbx]
- adox r13,rax
- adcx r13,QWORD[24+rbx]
- adox r14,rbp
- lea rsi,[32+rsi]
- lea rbx,[32+rbx]
- adcx r14,rbp
-
- adox r10,r15
- mulx r15,rax,QWORD[rcx]
- adcx r10,rax
- adox r11,r15
- mulx r15,rax,QWORD[8+rcx]
- adcx r11,rax
- adox r12,r15
- mulx r15,rax,QWORD[16+rcx]
- mov QWORD[((-40))+rbx],r10
- adcx r12,rax
- adox r13,r15
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-32))+rbx],r11
- mov QWORD[((-24))+rbx],r12
- adcx r13,rax
- adox r15,rbp
- lea rcx,[32+rcx]
- mov QWORD[((-16))+rbx],r13
-
- dec rdi
- jnz NEAR $L$mulx4x_inner
-
- mov rax,QWORD[rsp]
- mov rdi,QWORD[8+rsp]
- adc r15,rbp
- sub rbp,QWORD[rbx]
- adc r14,r15
- sbb r15,r15
- mov QWORD[((-8))+rbx],r14
-
- cmp rdi,QWORD[16+rsp]
- jne NEAR $L$mulx4x_outer
-
- lea rbx,[64+rsp]
- sub rcx,rax
- neg r15
- mov rdx,rax
- shr rax,3+2
- mov rdi,QWORD[32+rsp]
- jmp NEAR $L$mulx4x_sub
-
-ALIGN 32
-$L$mulx4x_sub:
- mov r11,QWORD[rbx]
- mov r12,QWORD[8+rbx]
- mov r13,QWORD[16+rbx]
- mov r14,QWORD[24+rbx]
- lea rbx,[32+rbx]
- sbb r11,QWORD[rcx]
- sbb r12,QWORD[8+rcx]
- sbb r13,QWORD[16+rcx]
- sbb r14,QWORD[24+rcx]
- lea rcx,[32+rcx]
- mov QWORD[rdi],r11
- mov QWORD[8+rdi],r12
- mov QWORD[16+rdi],r13
- mov QWORD[24+rdi],r14
- lea rdi,[32+rdi]
- dec rax
- jnz NEAR $L$mulx4x_sub
-
- sbb r15,0
- lea rbx,[64+rsp]
- sub rdi,rdx
-
-DB 102,73,15,110,207
- pxor xmm0,xmm0
- pshufd xmm1,xmm1,0
- mov rsi,QWORD[40+rsp]
-
- jmp NEAR $L$mulx4x_cond_copy
-
-ALIGN 32
-$L$mulx4x_cond_copy:
- movdqa xmm2,XMMWORD[rbx]
- movdqa xmm3,XMMWORD[16+rbx]
- lea rbx,[32+rbx]
- movdqu xmm4,XMMWORD[rdi]
- movdqu xmm5,XMMWORD[16+rdi]
- lea rdi,[32+rdi]
- movdqa XMMWORD[(-32)+rbx],xmm0
- movdqa XMMWORD[(-16)+rbx],xmm0
- pcmpeqd xmm0,xmm1
- pand xmm2,xmm1
- pand xmm3,xmm1
- pand xmm4,xmm0
- pand xmm5,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm2
- por xmm5,xmm3
- movdqu XMMWORD[(-32)+rdi],xmm4
- movdqu XMMWORD[(-16)+rdi],xmm5
- sub rdx,32
- jnz NEAR $L$mulx4x_cond_copy
-
- mov QWORD[rbx],rdx
-
- mov rax,1
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$mulx4x_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_bn_mulx4x_mont:
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
@@ -1447,9 +1053,6 @@ ALIGN 4
DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase
DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase
- DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
- DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase
- DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_bn_mul_mont:
@@ -1465,8 +1068,3 @@ DB 9,0,0,0
DD sqr_handler wrt ..imagebase
DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
ALIGN 8
-$L$SEH_info_bn_mulx4x_mont:
-DB 9,0,0,0
- DD sqr_handler wrt ..imagebase
- DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
-ALIGN 8
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm
index 260113b0176..15715aa9239 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/bn/x86_64-mont5.nasm
@@ -29,7 +29,6 @@ $L$SEH_begin_bn_mul_mont_gather5:
test r9d,7
jnz NEAR $L$mul_enter
- mov r11d,DWORD[((OPENSSL_ia32cap_P+8))]
jmp NEAR $L$mul4x_enter
ALIGN 16
@@ -480,9 +479,6 @@ DB 0x67
mov rax,rsp
$L$mul4x_enter:
- and r11d,0x80108
- cmp r11d,0x80108
- je NEAR $L$mulx4x_enter
push rbx
push rbp
@@ -1126,10 +1122,6 @@ $L$SEH_begin_bn_power5:
mov rax,rsp
- mov r11d,DWORD[((OPENSSL_ia32cap_P+8))]
- and r11d,0x80108
- cmp r11d,0x80108
- je NEAR $L$powerx5_enter
push rbx
push rbp
@@ -2095,1376 +2087,6 @@ $L$sqr4x_sub_entry:
DB 0F3h,0C3h ;repret
-
-ALIGN 32
-bn_mulx4x_mont_gather5:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_bn_mulx4x_mont_gather5:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- mov rax,rsp
-
-$L$mulx4x_enter:
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$mulx4x_prologue:
-
- shl r9d,3
- lea r10,[r9*2+r9]
- neg r9
- mov r8,QWORD[r8]
-
-
-
-
-
-
-
-
-
-
- lea r11,[((-320))+r9*2+rsp]
- mov rbp,rsp
- sub r11,rdi
- and r11,4095
- cmp r10,r11
- jb NEAR $L$mulx4xsp_alt
- sub rbp,r11
- lea rbp,[((-320))+r9*2+rbp]
- jmp NEAR $L$mulx4xsp_done
-
-$L$mulx4xsp_alt:
- lea r10,[((4096-320))+r9*2]
- lea rbp,[((-320))+r9*2+rbp]
- sub r11,r10
- mov r10,0
- cmovc r11,r10
- sub rbp,r11
-$L$mulx4xsp_done:
- and rbp,-64
- mov r11,rsp
- sub r11,rbp
- and r11,-4096
- lea rsp,[rbp*1+r11]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$mulx4x_page_walk
- jmp NEAR $L$mulx4x_page_walk_done
-
-$L$mulx4x_page_walk:
- lea rsp,[((-4096))+rsp]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$mulx4x_page_walk
-$L$mulx4x_page_walk_done:
-
-
-
-
-
-
-
-
-
-
-
-
-
- mov QWORD[32+rsp],r8
- mov QWORD[40+rsp],rax
-
-$L$mulx4x_body:
- call mulx4x_internal
-
- mov rsi,QWORD[40+rsp]
-
- mov rax,1
-
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$mulx4x_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_bn_mulx4x_mont_gather5:
-
-
-ALIGN 32
-mulx4x_internal:
-
- mov QWORD[8+rsp],r9
- mov r10,r9
- neg r9
- shl r9,5
- neg r10
- lea r13,[128+r9*1+rdx]
- shr r9,5+5
- movd xmm5,DWORD[56+rax]
- sub r9,1
- lea rax,[$L$inc]
- mov QWORD[((16+8))+rsp],r13
- mov QWORD[((24+8))+rsp],r9
- mov QWORD[((56+8))+rsp],rdi
- movdqa xmm0,XMMWORD[rax]
- movdqa xmm1,XMMWORD[16+rax]
- lea r10,[((88-112))+r10*1+rsp]
- lea rdi,[128+rdx]
-
- pshufd xmm5,xmm5,0
- movdqa xmm4,xmm1
-DB 0x67
- movdqa xmm2,xmm1
-DB 0x67
- paddd xmm1,xmm0
- pcmpeqd xmm0,xmm5
- movdqa xmm3,xmm4
- paddd xmm2,xmm1
- pcmpeqd xmm1,xmm5
- movdqa XMMWORD[112+r10],xmm0
- movdqa xmm0,xmm4
-
- paddd xmm3,xmm2
- pcmpeqd xmm2,xmm5
- movdqa XMMWORD[128+r10],xmm1
- movdqa xmm1,xmm4
-
- paddd xmm0,xmm3
- pcmpeqd xmm3,xmm5
- movdqa XMMWORD[144+r10],xmm2
- movdqa xmm2,xmm4
-
- paddd xmm1,xmm0
- pcmpeqd xmm0,xmm5
- movdqa XMMWORD[160+r10],xmm3
- movdqa xmm3,xmm4
- paddd xmm2,xmm1
- pcmpeqd xmm1,xmm5
- movdqa XMMWORD[176+r10],xmm0
- movdqa xmm0,xmm4
-
- paddd xmm3,xmm2
- pcmpeqd xmm2,xmm5
- movdqa XMMWORD[192+r10],xmm1
- movdqa xmm1,xmm4
-
- paddd xmm0,xmm3
- pcmpeqd xmm3,xmm5
- movdqa XMMWORD[208+r10],xmm2
- movdqa xmm2,xmm4
-
- paddd xmm1,xmm0
- pcmpeqd xmm0,xmm5
- movdqa XMMWORD[224+r10],xmm3
- movdqa xmm3,xmm4
- paddd xmm2,xmm1
- pcmpeqd xmm1,xmm5
- movdqa XMMWORD[240+r10],xmm0
- movdqa xmm0,xmm4
-
- paddd xmm3,xmm2
- pcmpeqd xmm2,xmm5
- movdqa XMMWORD[256+r10],xmm1
- movdqa xmm1,xmm4
-
- paddd xmm0,xmm3
- pcmpeqd xmm3,xmm5
- movdqa XMMWORD[272+r10],xmm2
- movdqa xmm2,xmm4
-
- paddd xmm1,xmm0
- pcmpeqd xmm0,xmm5
- movdqa XMMWORD[288+r10],xmm3
- movdqa xmm3,xmm4
-DB 0x67
- paddd xmm2,xmm1
- pcmpeqd xmm1,xmm5
- movdqa XMMWORD[304+r10],xmm0
-
- paddd xmm3,xmm2
- pcmpeqd xmm2,xmm5
- movdqa XMMWORD[320+r10],xmm1
-
- pcmpeqd xmm3,xmm5
- movdqa XMMWORD[336+r10],xmm2
-
- pand xmm0,XMMWORD[64+rdi]
- pand xmm1,XMMWORD[80+rdi]
- pand xmm2,XMMWORD[96+rdi]
- movdqa XMMWORD[352+r10],xmm3
- pand xmm3,XMMWORD[112+rdi]
- por xmm0,xmm2
- por xmm1,xmm3
- movdqa xmm4,XMMWORD[((-128))+rdi]
- movdqa xmm5,XMMWORD[((-112))+rdi]
- movdqa xmm2,XMMWORD[((-96))+rdi]
- pand xmm4,XMMWORD[112+r10]
- movdqa xmm3,XMMWORD[((-80))+rdi]
- pand xmm5,XMMWORD[128+r10]
- por xmm0,xmm4
- pand xmm2,XMMWORD[144+r10]
- por xmm1,xmm5
- pand xmm3,XMMWORD[160+r10]
- por xmm0,xmm2
- por xmm1,xmm3
- movdqa xmm4,XMMWORD[((-64))+rdi]
- movdqa xmm5,XMMWORD[((-48))+rdi]
- movdqa xmm2,XMMWORD[((-32))+rdi]
- pand xmm4,XMMWORD[176+r10]
- movdqa xmm3,XMMWORD[((-16))+rdi]
- pand xmm5,XMMWORD[192+r10]
- por xmm0,xmm4
- pand xmm2,XMMWORD[208+r10]
- por xmm1,xmm5
- pand xmm3,XMMWORD[224+r10]
- por xmm0,xmm2
- por xmm1,xmm3
- movdqa xmm4,XMMWORD[rdi]
- movdqa xmm5,XMMWORD[16+rdi]
- movdqa xmm2,XMMWORD[32+rdi]
- pand xmm4,XMMWORD[240+r10]
- movdqa xmm3,XMMWORD[48+rdi]
- pand xmm5,XMMWORD[256+r10]
- por xmm0,xmm4
- pand xmm2,XMMWORD[272+r10]
- por xmm1,xmm5
- pand xmm3,XMMWORD[288+r10]
- por xmm0,xmm2
- por xmm1,xmm3
- pxor xmm0,xmm1
- pshufd xmm1,xmm0,0x4e
- por xmm0,xmm1
- lea rdi,[256+rdi]
-DB 102,72,15,126,194
- lea rbx,[((64+32+8))+rsp]
-
- mov r9,rdx
- mulx rax,r8,QWORD[rsi]
- mulx r12,r11,QWORD[8+rsi]
- add r11,rax
- mulx r13,rax,QWORD[16+rsi]
- adc r12,rax
- adc r13,0
- mulx r14,rax,QWORD[24+rsi]
-
- mov r15,r8
- imul r8,QWORD[((32+8))+rsp]
- xor rbp,rbp
- mov rdx,r8
-
- mov QWORD[((8+8))+rsp],rdi
-
- lea rsi,[32+rsi]
- adcx r13,rax
- adcx r14,rbp
-
- mulx r10,rax,QWORD[rcx]
- adcx r15,rax
- adox r10,r11
- mulx r11,rax,QWORD[8+rcx]
- adcx r10,rax
- adox r11,r12
- mulx r12,rax,QWORD[16+rcx]
- mov rdi,QWORD[((24+8))+rsp]
- mov QWORD[((-32))+rbx],r10
- adcx r11,rax
- adox r12,r13
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-24))+rbx],r11
- adcx r12,rax
- adox r15,rbp
- lea rcx,[32+rcx]
- mov QWORD[((-16))+rbx],r12
- jmp NEAR $L$mulx4x_1st
-
-ALIGN 32
-$L$mulx4x_1st:
- adcx r15,rbp
- mulx rax,r10,QWORD[rsi]
- adcx r10,r14
- mulx r14,r11,QWORD[8+rsi]
- adcx r11,rax
- mulx rax,r12,QWORD[16+rsi]
- adcx r12,r14
- mulx r14,r13,QWORD[24+rsi]
-DB 0x67,0x67
- mov rdx,r8
- adcx r13,rax
- adcx r14,rbp
- lea rsi,[32+rsi]
- lea rbx,[32+rbx]
-
- adox r10,r15
- mulx r15,rax,QWORD[rcx]
- adcx r10,rax
- adox r11,r15
- mulx r15,rax,QWORD[8+rcx]
- adcx r11,rax
- adox r12,r15
- mulx r15,rax,QWORD[16+rcx]
- mov QWORD[((-40))+rbx],r10
- adcx r12,rax
- mov QWORD[((-32))+rbx],r11
- adox r13,r15
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov QWORD[((-24))+rbx],r12
- adcx r13,rax
- adox r15,rbp
- lea rcx,[32+rcx]
- mov QWORD[((-16))+rbx],r13
-
- dec rdi
- jnz NEAR $L$mulx4x_1st
-
- mov rax,QWORD[8+rsp]
- adc r15,rbp
- lea rsi,[rax*1+rsi]
- add r14,r15
- mov rdi,QWORD[((8+8))+rsp]
- adc rbp,rbp
- mov QWORD[((-8))+rbx],r14
- jmp NEAR $L$mulx4x_outer
-
-ALIGN 32
-$L$mulx4x_outer:
- lea r10,[((16-256))+rbx]
- pxor xmm4,xmm4
-DB 0x67,0x67
- pxor xmm5,xmm5
- movdqa xmm0,XMMWORD[((-128))+rdi]
- movdqa xmm1,XMMWORD[((-112))+rdi]
- movdqa xmm2,XMMWORD[((-96))+rdi]
- pand xmm0,XMMWORD[256+r10]
- movdqa xmm3,XMMWORD[((-80))+rdi]
- pand xmm1,XMMWORD[272+r10]
- por xmm4,xmm0
- pand xmm2,XMMWORD[288+r10]
- por xmm5,xmm1
- pand xmm3,XMMWORD[304+r10]
- por xmm4,xmm2
- por xmm5,xmm3
- movdqa xmm0,XMMWORD[((-64))+rdi]
- movdqa xmm1,XMMWORD[((-48))+rdi]
- movdqa xmm2,XMMWORD[((-32))+rdi]
- pand xmm0,XMMWORD[320+r10]
- movdqa xmm3,XMMWORD[((-16))+rdi]
- pand xmm1,XMMWORD[336+r10]
- por xmm4,xmm0
- pand xmm2,XMMWORD[352+r10]
- por xmm5,xmm1
- pand xmm3,XMMWORD[368+r10]
- por xmm4,xmm2
- por xmm5,xmm3
- movdqa xmm0,XMMWORD[rdi]
- movdqa xmm1,XMMWORD[16+rdi]
- movdqa xmm2,XMMWORD[32+rdi]
- pand xmm0,XMMWORD[384+r10]
- movdqa xmm3,XMMWORD[48+rdi]
- pand xmm1,XMMWORD[400+r10]
- por xmm4,xmm0
- pand xmm2,XMMWORD[416+r10]
- por xmm5,xmm1
- pand xmm3,XMMWORD[432+r10]
- por xmm4,xmm2
- por xmm5,xmm3
- movdqa xmm0,XMMWORD[64+rdi]
- movdqa xmm1,XMMWORD[80+rdi]
- movdqa xmm2,XMMWORD[96+rdi]
- pand xmm0,XMMWORD[448+r10]
- movdqa xmm3,XMMWORD[112+rdi]
- pand xmm1,XMMWORD[464+r10]
- por xmm4,xmm0
- pand xmm2,XMMWORD[480+r10]
- por xmm5,xmm1
- pand xmm3,XMMWORD[496+r10]
- por xmm4,xmm2
- por xmm5,xmm3
- por xmm4,xmm5
- pshufd xmm0,xmm4,0x4e
- por xmm0,xmm4
- lea rdi,[256+rdi]
-DB 102,72,15,126,194
-
- mov QWORD[rbx],rbp
- lea rbx,[32+rax*1+rbx]
- mulx r11,r8,QWORD[rsi]
- xor rbp,rbp
- mov r9,rdx
- mulx r12,r14,QWORD[8+rsi]
- adox r8,QWORD[((-32))+rbx]
- adcx r11,r14
- mulx r13,r15,QWORD[16+rsi]
- adox r11,QWORD[((-24))+rbx]
- adcx r12,r15
- mulx r14,rdx,QWORD[24+rsi]
- adox r12,QWORD[((-16))+rbx]
- adcx r13,rdx
- lea rcx,[rax*1+rcx]
- lea rsi,[32+rsi]
- adox r13,QWORD[((-8))+rbx]
- adcx r14,rbp
- adox r14,rbp
-
- mov r15,r8
- imul r8,QWORD[((32+8))+rsp]
-
- mov rdx,r8
- xor rbp,rbp
- mov QWORD[((8+8))+rsp],rdi
-
- mulx r10,rax,QWORD[rcx]
- adcx r15,rax
- adox r10,r11
- mulx r11,rax,QWORD[8+rcx]
- adcx r10,rax
- adox r11,r12
- mulx r12,rax,QWORD[16+rcx]
- adcx r11,rax
- adox r12,r13
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- mov rdi,QWORD[((24+8))+rsp]
- mov QWORD[((-32))+rbx],r10
- adcx r12,rax
- mov QWORD[((-24))+rbx],r11
- adox r15,rbp
- mov QWORD[((-16))+rbx],r12
- lea rcx,[32+rcx]
- jmp NEAR $L$mulx4x_inner
-
-ALIGN 32
-$L$mulx4x_inner:
- mulx rax,r10,QWORD[rsi]
- adcx r15,rbp
- adox r10,r14
- mulx r14,r11,QWORD[8+rsi]
- adcx r10,QWORD[rbx]
- adox r11,rax
- mulx rax,r12,QWORD[16+rsi]
- adcx r11,QWORD[8+rbx]
- adox r12,r14
- mulx r14,r13,QWORD[24+rsi]
- mov rdx,r8
- adcx r12,QWORD[16+rbx]
- adox r13,rax
- adcx r13,QWORD[24+rbx]
- adox r14,rbp
- lea rsi,[32+rsi]
- lea rbx,[32+rbx]
- adcx r14,rbp
-
- adox r10,r15
- mulx r15,rax,QWORD[rcx]
- adcx r10,rax
- adox r11,r15
- mulx r15,rax,QWORD[8+rcx]
- adcx r11,rax
- adox r12,r15
- mulx r15,rax,QWORD[16+rcx]
- mov QWORD[((-40))+rbx],r10
- adcx r12,rax
- adox r13,r15
- mov QWORD[((-32))+rbx],r11
- mulx r15,rax,QWORD[24+rcx]
- mov rdx,r9
- lea rcx,[32+rcx]
- mov QWORD[((-24))+rbx],r12
- adcx r13,rax
- adox r15,rbp
- mov QWORD[((-16))+rbx],r13
-
- dec rdi
- jnz NEAR $L$mulx4x_inner
-
- mov rax,QWORD[((0+8))+rsp]
- adc r15,rbp
- sub rdi,QWORD[rbx]
- mov rdi,QWORD[((8+8))+rsp]
- mov r10,QWORD[((16+8))+rsp]
- adc r14,r15
- lea rsi,[rax*1+rsi]
- adc rbp,rbp
- mov QWORD[((-8))+rbx],r14
-
- cmp rdi,r10
- jb NEAR $L$mulx4x_outer
-
- mov r10,QWORD[((-8))+rcx]
- mov r8,rbp
- mov r12,QWORD[rax*1+rcx]
- lea rbp,[rax*1+rcx]
- mov rcx,rax
- lea rdi,[rax*1+rbx]
- xor eax,eax
- xor r15,r15
- sub r10,r14
- adc r15,r15
- or r8,r15
- sar rcx,3+2
- sub rax,r8
- mov rdx,QWORD[((56+8))+rsp]
- dec r12
- mov r13,QWORD[8+rbp]
- xor r8,r8
- mov r14,QWORD[16+rbp]
- mov r15,QWORD[24+rbp]
- jmp NEAR $L$sqrx4x_sub_entry
-
-
-
-ALIGN 32
-bn_powerx5:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_bn_powerx5:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- mov rax,rsp
-
-$L$powerx5_enter:
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$powerx5_prologue:
-
- shl r9d,3
- lea r10,[r9*2+r9]
- neg r9
- mov r8,QWORD[r8]
-
-
-
-
-
-
-
-
- lea r11,[((-320))+r9*2+rsp]
- mov rbp,rsp
- sub r11,rdi
- and r11,4095
- cmp r10,r11
- jb NEAR $L$pwrx_sp_alt
- sub rbp,r11
- lea rbp,[((-320))+r9*2+rbp]
- jmp NEAR $L$pwrx_sp_done
-
-ALIGN 32
-$L$pwrx_sp_alt:
- lea r10,[((4096-320))+r9*2]
- lea rbp,[((-320))+r9*2+rbp]
- sub r11,r10
- mov r10,0
- cmovc r11,r10
- sub rbp,r11
-$L$pwrx_sp_done:
- and rbp,-64
- mov r11,rsp
- sub r11,rbp
- and r11,-4096
- lea rsp,[rbp*1+r11]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$pwrx_page_walk
- jmp NEAR $L$pwrx_page_walk_done
-
-$L$pwrx_page_walk:
- lea rsp,[((-4096))+rsp]
- mov r10,QWORD[rsp]
- cmp rsp,rbp
- ja NEAR $L$pwrx_page_walk
-$L$pwrx_page_walk_done:
-
- mov r10,r9
- neg r9
-
-
-
-
-
-
-
-
-
-
-
-
- pxor xmm0,xmm0
-DB 102,72,15,110,207
-DB 102,72,15,110,209
-DB 102,73,15,110,218
-DB 102,72,15,110,226
- mov QWORD[32+rsp],r8
- mov QWORD[40+rsp],rax
-
-$L$powerx5_body:
-
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
- call __bn_sqrx8x_internal
- call __bn_postx4x_internal
-
- mov r9,r10
- mov rdi,rsi
-DB 102,72,15,126,209
-DB 102,72,15,126,226
- mov rax,QWORD[40+rsp]
-
- call mulx4x_internal
-
- mov rsi,QWORD[40+rsp]
-
- mov rax,1
-
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$powerx5_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_bn_powerx5:
-
-global bn_sqrx8x_internal
-
-
-ALIGN 32
-bn_sqrx8x_internal:
-__bn_sqrx8x_internal:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- lea rdi,[((48+8))+rsp]
- lea rbp,[r9*1+rsi]
- mov QWORD[((0+8))+rsp],r9
- mov QWORD[((8+8))+rsp],rbp
- jmp NEAR $L$sqr8x_zero_start
-
-ALIGN 32
-DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-$L$sqrx8x_zero:
-DB 0x3e
- movdqa XMMWORD[rdi],xmm0
- movdqa XMMWORD[16+rdi],xmm0
- movdqa XMMWORD[32+rdi],xmm0
- movdqa XMMWORD[48+rdi],xmm0
-$L$sqr8x_zero_start:
- movdqa XMMWORD[64+rdi],xmm0
- movdqa XMMWORD[80+rdi],xmm0
- movdqa XMMWORD[96+rdi],xmm0
- movdqa XMMWORD[112+rdi],xmm0
- lea rdi,[128+rdi]
- sub r9,64
- jnz NEAR $L$sqrx8x_zero
-
- mov rdx,QWORD[rsi]
-
- xor r10,r10
- xor r11,r11
- xor r12,r12
- xor r13,r13
- xor r14,r14
- xor r15,r15
- lea rdi,[((48+8))+rsp]
- xor rbp,rbp
- jmp NEAR $L$sqrx8x_outer_loop
-
-ALIGN 32
-$L$sqrx8x_outer_loop:
- mulx rax,r8,QWORD[8+rsi]
- adcx r8,r9
- adox r10,rax
- mulx rax,r9,QWORD[16+rsi]
- adcx r9,r10
- adox r11,rax
-DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
- adcx r10,r11
- adox r12,rax
-DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
- adcx r11,r12
- adox r13,rax
- mulx rax,r12,QWORD[40+rsi]
- adcx r12,r13
- adox r14,rax
- mulx rax,r13,QWORD[48+rsi]
- adcx r13,r14
- adox rax,r15
- mulx r15,r14,QWORD[56+rsi]
- mov rdx,QWORD[8+rsi]
- adcx r14,rax
- adox r15,rbp
- adc r15,QWORD[64+rdi]
- mov QWORD[8+rdi],r8
- mov QWORD[16+rdi],r9
- sbb rcx,rcx
- xor rbp,rbp
-
-
- mulx rbx,r8,QWORD[16+rsi]
- mulx rax,r9,QWORD[24+rsi]
- adcx r8,r10
- adox r9,rbx
- mulx rbx,r10,QWORD[32+rsi]
- adcx r9,r11
- adox r10,rax
-DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
- adcx r10,r12
- adox r11,rbx
-DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
- adcx r11,r13
- adox r12,r14
-DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
- mov rdx,QWORD[16+rsi]
- adcx r12,rax
- adox r13,rbx
- adcx r13,r15
- adox r14,rbp
- adcx r14,rbp
-
- mov QWORD[24+rdi],r8
- mov QWORD[32+rdi],r9
-
- mulx rbx,r8,QWORD[24+rsi]
- mulx rax,r9,QWORD[32+rsi]
- adcx r8,r10
- adox r9,rbx
- mulx rbx,r10,QWORD[40+rsi]
- adcx r9,r11
- adox r10,rax
-DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
- adcx r10,r12
- adox r11,r13
-DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
-DB 0x3e
- mov rdx,QWORD[24+rsi]
- adcx r11,rbx
- adox r12,rax
- adcx r12,r14
- mov QWORD[40+rdi],r8
- mov QWORD[48+rdi],r9
- mulx rax,r8,QWORD[32+rsi]
- adox r13,rbp
- adcx r13,rbp
-
- mulx rbx,r9,QWORD[40+rsi]
- adcx r8,r10
- adox r9,rax
- mulx rax,r10,QWORD[48+rsi]
- adcx r9,r11
- adox r10,r12
- mulx r12,r11,QWORD[56+rsi]
- mov rdx,QWORD[32+rsi]
- mov r14,QWORD[40+rsi]
- adcx r10,rbx
- adox r11,rax
- mov r15,QWORD[48+rsi]
- adcx r11,r13
- adox r12,rbp
- adcx r12,rbp
-
- mov QWORD[56+rdi],r8
- mov QWORD[64+rdi],r9
-
- mulx rax,r9,r14
- mov r8,QWORD[56+rsi]
- adcx r9,r10
- mulx rbx,r10,r15
- adox r10,rax
- adcx r10,r11
- mulx rax,r11,r8
- mov rdx,r14
- adox r11,rbx
- adcx r11,r12
-
- adcx rax,rbp
-
- mulx rbx,r14,r15
- mulx r13,r12,r8
- mov rdx,r15
- lea rsi,[64+rsi]
- adcx r11,r14
- adox r12,rbx
- adcx r12,rax
- adox r13,rbp
-
-DB 0x67,0x67
- mulx r14,r8,r8
- adcx r13,r8
- adcx r14,rbp
-
- cmp rsi,QWORD[((8+8))+rsp]
- je NEAR $L$sqrx8x_outer_break
-
- neg rcx
- mov rcx,-8
- mov r15,rbp
- mov r8,QWORD[64+rdi]
- adcx r9,QWORD[72+rdi]
- adcx r10,QWORD[80+rdi]
- adcx r11,QWORD[88+rdi]
- adc r12,QWORD[96+rdi]
- adc r13,QWORD[104+rdi]
- adc r14,QWORD[112+rdi]
- adc r15,QWORD[120+rdi]
- lea rbp,[rsi]
- lea rdi,[128+rdi]
- sbb rax,rax
-
- mov rdx,QWORD[((-64))+rsi]
- mov QWORD[((16+8))+rsp],rax
- mov QWORD[((24+8))+rsp],rdi
-
-
- xor eax,eax
- jmp NEAR $L$sqrx8x_loop
-
-ALIGN 32
-$L$sqrx8x_loop:
- mov rbx,r8
- mulx r8,rax,QWORD[rbp]
- adcx rbx,rax
- adox r8,r9
-
- mulx r9,rax,QWORD[8+rbp]
- adcx r8,rax
- adox r9,r10
-
- mulx r10,rax,QWORD[16+rbp]
- adcx r9,rax
- adox r10,r11
-
- mulx r11,rax,QWORD[24+rbp]
- adcx r10,rax
- adox r11,r12
-
-DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcx r11,rax
- adox r12,r13
-
- mulx r13,rax,QWORD[40+rbp]
- adcx r12,rax
- adox r13,r14
-
- mulx r14,rax,QWORD[48+rbp]
- mov QWORD[rcx*8+rdi],rbx
- mov ebx,0
- adcx r13,rax
- adox r14,r15
-
-DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
- mov rdx,QWORD[8+rcx*8+rsi]
- adcx r14,rax
- adox r15,rbx
- adcx r15,rbx
-
-DB 0x67
- inc rcx
- jnz NEAR $L$sqrx8x_loop
-
- lea rbp,[64+rbp]
- mov rcx,-8
- cmp rbp,QWORD[((8+8))+rsp]
- je NEAR $L$sqrx8x_break
-
- sub rbx,QWORD[((16+8))+rsp]
-DB 0x66
- mov rdx,QWORD[((-64))+rsi]
- adcx r8,QWORD[rdi]
- adcx r9,QWORD[8+rdi]
- adc r10,QWORD[16+rdi]
- adc r11,QWORD[24+rdi]
- adc r12,QWORD[32+rdi]
- adc r13,QWORD[40+rdi]
- adc r14,QWORD[48+rdi]
- adc r15,QWORD[56+rdi]
- lea rdi,[64+rdi]
-DB 0x67
- sbb rax,rax
- xor ebx,ebx
- mov QWORD[((16+8))+rsp],rax
- jmp NEAR $L$sqrx8x_loop
-
-ALIGN 32
-$L$sqrx8x_break:
- xor rbp,rbp
- sub rbx,QWORD[((16+8))+rsp]
- adcx r8,rbp
- mov rcx,QWORD[((24+8))+rsp]
- adcx r9,rbp
- mov rdx,QWORD[rsi]
- adc r10,0
- mov QWORD[rdi],r8
- adc r11,0
- adc r12,0
- adc r13,0
- adc r14,0
- adc r15,0
- cmp rdi,rcx
- je NEAR $L$sqrx8x_outer_loop
-
- mov QWORD[8+rdi],r9
- mov r9,QWORD[8+rcx]
- mov QWORD[16+rdi],r10
- mov r10,QWORD[16+rcx]
- mov QWORD[24+rdi],r11
- mov r11,QWORD[24+rcx]
- mov QWORD[32+rdi],r12
- mov r12,QWORD[32+rcx]
- mov QWORD[40+rdi],r13
- mov r13,QWORD[40+rcx]
- mov QWORD[48+rdi],r14
- mov r14,QWORD[48+rcx]
- mov QWORD[56+rdi],r15
- mov r15,QWORD[56+rcx]
- mov rdi,rcx
- jmp NEAR $L$sqrx8x_outer_loop
-
-ALIGN 32
-$L$sqrx8x_outer_break:
- mov QWORD[72+rdi],r9
-DB 102,72,15,126,217
- mov QWORD[80+rdi],r10
- mov QWORD[88+rdi],r11
- mov QWORD[96+rdi],r12
- mov QWORD[104+rdi],r13
- mov QWORD[112+rdi],r14
- lea rdi,[((48+8))+rsp]
- mov rdx,QWORD[rcx*1+rsi]
-
- mov r11,QWORD[8+rdi]
- xor r10,r10
- mov r9,QWORD[((0+8))+rsp]
- adox r11,r11
- mov r12,QWORD[16+rdi]
- mov r13,QWORD[24+rdi]
-
-
-ALIGN 32
-$L$sqrx4x_shift_n_add:
- mulx rbx,rax,rdx
- adox r12,r12
- adcx rax,r10
-DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
-DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
- adox r13,r13
- adcx rbx,r11
- mov r11,QWORD[40+rdi]
- mov QWORD[rdi],rax
- mov QWORD[8+rdi],rbx
-
- mulx rbx,rax,rdx
- adox r10,r10
- adcx rax,r12
- mov rdx,QWORD[16+rcx*1+rsi]
- mov r12,QWORD[48+rdi]
- adox r11,r11
- adcx rbx,r13
- mov r13,QWORD[56+rdi]
- mov QWORD[16+rdi],rax
- mov QWORD[24+rdi],rbx
-
- mulx rbx,rax,rdx
- adox r12,r12
- adcx rax,r10
- mov rdx,QWORD[24+rcx*1+rsi]
- lea rcx,[32+rcx]
- mov r10,QWORD[64+rdi]
- adox r13,r13
- adcx rbx,r11
- mov r11,QWORD[72+rdi]
- mov QWORD[32+rdi],rax
- mov QWORD[40+rdi],rbx
-
- mulx rbx,rax,rdx
- adox r10,r10
- adcx rax,r12
- jrcxz $L$sqrx4x_shift_n_add_break
-DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
- adox r11,r11
- adcx rbx,r13
- mov r12,QWORD[80+rdi]
- mov r13,QWORD[88+rdi]
- mov QWORD[48+rdi],rax
- mov QWORD[56+rdi],rbx
- lea rdi,[64+rdi]
- nop
- jmp NEAR $L$sqrx4x_shift_n_add
-
-ALIGN 32
-$L$sqrx4x_shift_n_add_break:
- adcx rbx,r13
- mov QWORD[48+rdi],rax
- mov QWORD[56+rdi],rbx
- lea rdi,[64+rdi]
-DB 102,72,15,126,213
-__bn_sqrx8x_reduction:
- xor eax,eax
- mov rbx,QWORD[((32+8))+rsp]
- mov rdx,QWORD[((48+8))+rsp]
- lea rcx,[((-64))+r9*1+rbp]
-
- mov QWORD[((0+8))+rsp],rcx
- mov QWORD[((8+8))+rsp],rdi
-
- lea rdi,[((48+8))+rsp]
- jmp NEAR $L$sqrx8x_reduction_loop
-
-ALIGN 32
-$L$sqrx8x_reduction_loop:
- mov r9,QWORD[8+rdi]
- mov r10,QWORD[16+rdi]
- mov r11,QWORD[24+rdi]
- mov r12,QWORD[32+rdi]
- mov r8,rdx
- imul rdx,rbx
- mov r13,QWORD[40+rdi]
- mov r14,QWORD[48+rdi]
- mov r15,QWORD[56+rdi]
- mov QWORD[((24+8))+rsp],rax
-
- lea rdi,[64+rdi]
- xor rsi,rsi
- mov rcx,-8
- jmp NEAR $L$sqrx8x_reduce
-
-ALIGN 32
-$L$sqrx8x_reduce:
- mov rbx,r8
- mulx r8,rax,QWORD[rbp]
- adcx rax,rbx
- adox r8,r9
-
- mulx r9,rbx,QWORD[8+rbp]
- adcx r8,rbx
- adox r9,r10
-
- mulx r10,rbx,QWORD[16+rbp]
- adcx r9,rbx
- adox r10,r11
-
- mulx r11,rbx,QWORD[24+rbp]
- adcx r10,rbx
- adox r11,r12
-
-DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- mov rax,rdx
- mov rdx,r8
- adcx r11,rbx
- adox r12,r13
-
- mulx rdx,rbx,QWORD[((32+8))+rsp]
- mov rdx,rax
- mov QWORD[((64+48+8))+rcx*8+rsp],rax
-
- mulx r13,rax,QWORD[40+rbp]
- adcx r12,rax
- adox r13,r14
-
- mulx r14,rax,QWORD[48+rbp]
- adcx r13,rax
- adox r14,r15
-
- mulx r15,rax,QWORD[56+rbp]
- mov rdx,rbx
- adcx r14,rax
- adox r15,rsi
- adcx r15,rsi
-
-DB 0x67,0x67,0x67
- inc rcx
- jnz NEAR $L$sqrx8x_reduce
-
- mov rax,rsi
- cmp rbp,QWORD[((0+8))+rsp]
- jae NEAR $L$sqrx8x_no_tail
-
- mov rdx,QWORD[((48+8))+rsp]
- add r8,QWORD[rdi]
- lea rbp,[64+rbp]
- mov rcx,-8
- adcx r9,QWORD[8+rdi]
- adcx r10,QWORD[16+rdi]
- adc r11,QWORD[24+rdi]
- adc r12,QWORD[32+rdi]
- adc r13,QWORD[40+rdi]
- adc r14,QWORD[48+rdi]
- adc r15,QWORD[56+rdi]
- lea rdi,[64+rdi]
- sbb rax,rax
-
- xor rsi,rsi
- mov QWORD[((16+8))+rsp],rax
- jmp NEAR $L$sqrx8x_tail
-
-ALIGN 32
-$L$sqrx8x_tail:
- mov rbx,r8
- mulx r8,rax,QWORD[rbp]
- adcx rbx,rax
- adox r8,r9
-
- mulx r9,rax,QWORD[8+rbp]
- adcx r8,rax
- adox r9,r10
-
- mulx r10,rax,QWORD[16+rbp]
- adcx r9,rax
- adox r10,r11
-
- mulx r11,rax,QWORD[24+rbp]
- adcx r10,rax
- adox r11,r12
-
-DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
- adcx r11,rax
- adox r12,r13
-
- mulx r13,rax,QWORD[40+rbp]
- adcx r12,rax
- adox r13,r14
-
- mulx r14,rax,QWORD[48+rbp]
- adcx r13,rax
- adox r14,r15
-
- mulx r15,rax,QWORD[56+rbp]
- mov rdx,QWORD[((72+48+8))+rcx*8+rsp]
- adcx r14,rax
- adox r15,rsi
- mov QWORD[rcx*8+rdi],rbx
- mov rbx,r8
- adcx r15,rsi
-
- inc rcx
- jnz NEAR $L$sqrx8x_tail
-
- cmp rbp,QWORD[((0+8))+rsp]
- jae NEAR $L$sqrx8x_tail_done
-
- sub rsi,QWORD[((16+8))+rsp]
- mov rdx,QWORD[((48+8))+rsp]
- lea rbp,[64+rbp]
- adc r8,QWORD[rdi]
- adc r9,QWORD[8+rdi]
- adc r10,QWORD[16+rdi]
- adc r11,QWORD[24+rdi]
- adc r12,QWORD[32+rdi]
- adc r13,QWORD[40+rdi]
- adc r14,QWORD[48+rdi]
- adc r15,QWORD[56+rdi]
- lea rdi,[64+rdi]
- sbb rax,rax
- sub rcx,8
-
- xor rsi,rsi
- mov QWORD[((16+8))+rsp],rax
- jmp NEAR $L$sqrx8x_tail
-
-ALIGN 32
-$L$sqrx8x_tail_done:
- xor rax,rax
- add r8,QWORD[((24+8))+rsp]
- adc r9,0
- adc r10,0
- adc r11,0
- adc r12,0
- adc r13,0
- adc r14,0
- adc r15,0
- adc rax,0
-
- sub rsi,QWORD[((16+8))+rsp]
-$L$sqrx8x_no_tail:
- adc r8,QWORD[rdi]
-DB 102,72,15,126,217
- adc r9,QWORD[8+rdi]
- mov rsi,QWORD[56+rbp]
-DB 102,72,15,126,213
- adc r10,QWORD[16+rdi]
- adc r11,QWORD[24+rdi]
- adc r12,QWORD[32+rdi]
- adc r13,QWORD[40+rdi]
- adc r14,QWORD[48+rdi]
- adc r15,QWORD[56+rdi]
- adc rax,0
-
- mov rbx,QWORD[((32+8))+rsp]
- mov rdx,QWORD[64+rcx*1+rdi]
-
- mov QWORD[rdi],r8
- lea r8,[64+rdi]
- mov QWORD[8+rdi],r9
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
- mov QWORD[32+rdi],r12
- mov QWORD[40+rdi],r13
- mov QWORD[48+rdi],r14
- mov QWORD[56+rdi],r15
-
- lea rdi,[64+rcx*1+rdi]
- cmp r8,QWORD[((8+8))+rsp]
- jb NEAR $L$sqrx8x_reduction_loop
- DB 0F3h,0C3h ;repret
-
-
-ALIGN 32
-__bn_postx4x_internal:
-
- mov r12,QWORD[rbp]
- mov r10,rcx
- mov r9,rcx
- neg rax
- sar rcx,3+2
-
-DB 102,72,15,126,202
-DB 102,72,15,126,206
- dec r12
- mov r13,QWORD[8+rbp]
- xor r8,r8
- mov r14,QWORD[16+rbp]
- mov r15,QWORD[24+rbp]
- jmp NEAR $L$sqrx4x_sub_entry
-
-ALIGN 16
-$L$sqrx4x_sub:
- mov r12,QWORD[rbp]
- mov r13,QWORD[8+rbp]
- mov r14,QWORD[16+rbp]
- mov r15,QWORD[24+rbp]
-$L$sqrx4x_sub_entry:
- andn r12,r12,rax
- lea rbp,[32+rbp]
- andn r13,r13,rax
- andn r14,r14,rax
- andn r15,r15,rax
-
- neg r8
- adc r12,QWORD[rdi]
- adc r13,QWORD[8+rdi]
- adc r14,QWORD[16+rdi]
- adc r15,QWORD[24+rdi]
- mov QWORD[rdx],r12
- lea rdi,[32+rdi]
- mov QWORD[8+rdx],r13
- sbb r8,r8
- mov QWORD[16+rdx],r14
- mov QWORD[24+rdx],r15
- lea rdx,[32+rdx]
-
- inc rcx
- jnz NEAR $L$sqrx4x_sub
-
- neg r9
-
- DB 0F3h,0C3h ;repret
-
-
global bn_get_bits5
ALIGN 16
@@ -3797,13 +2419,6 @@ ALIGN 4
DD $L$SEH_begin_bn_power5 wrt ..imagebase
DD $L$SEH_end_bn_power5 wrt ..imagebase
DD $L$SEH_info_bn_power5 wrt ..imagebase
- DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
- DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
- DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
-
- DD $L$SEH_begin_bn_powerx5 wrt ..imagebase
- DD $L$SEH_end_bn_powerx5 wrt ..imagebase
- DD $L$SEH_info_bn_powerx5 wrt ..imagebase
DD $L$SEH_begin_bn_gather5 wrt ..imagebase
DD $L$SEH_end_bn_gather5 wrt ..imagebase
DD $L$SEH_info_bn_gather5 wrt ..imagebase
@@ -3825,16 +2440,6 @@ DB 9,0,0,0
DD mul_handler wrt ..imagebase
DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
ALIGN 8
-$L$SEH_info_bn_mulx4x_mont_gather5:
-DB 9,0,0,0
- DD mul_handler wrt ..imagebase
- DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
-ALIGN 8
-$L$SEH_info_bn_powerx5:
-DB 9,0,0,0
- DD mul_handler wrt ..imagebase
- DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
-ALIGN 8
$L$SEH_info_bn_gather5:
DB 0x01,0x0b,0x03,0x0a
DB 0x0b,0x01,0x21,0x00
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm
index b35e99bc90b..e5b9c13dbcc 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/ecp_nistz256-x86_64.nasm
@@ -2853,10 +2853,6 @@ $L$SEH_begin_ecp_nistz256_ord_mul_mont:
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp ecx,0x80100
- je NEAR $L$ecp_nistz256_ord_mul_montx
push rbp
push rbx
@@ -3190,10 +3186,6 @@ $L$SEH_begin_ecp_nistz256_ord_sqr_mont:
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp ecx,0x80100
- je NEAR $L$ecp_nistz256_ord_sqr_montx
push rbp
push rbx
@@ -3478,472 +3470,6 @@ $L$ord_sqr_epilogue:
$L$SEH_end_ecp_nistz256_ord_sqr_mont:
-ALIGN 32
-ecp_nistz256_ord_mul_montx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_ord_mul_montx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$ecp_nistz256_ord_mul_montx:
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$ord_mulx_body:
-
- mov rbx,rdx
- mov rdx,QWORD[rdx]
- mov r9,QWORD[rsi]
- mov r10,QWORD[8+rsi]
- mov r11,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- lea rsi,[((-128))+rsi]
- lea r14,[(($L$ord-128))]
- mov r15,QWORD[$L$ordK]
-
-
- mulx r9,r8,r9
- mulx r10,rcx,r10
- mulx r11,rbp,r11
- add r9,rcx
- mulx r12,rcx,r12
- mov rdx,r8
- mulx rax,rdx,r15
- adc r10,rbp
- adc r11,rcx
- adc r12,0
-
-
- xor r13,r13
- mulx rbp,rcx,QWORD[((0+128))+r14]
- adcx r8,rcx
- adox r9,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+r14]
- adcx r9,rcx
- adox r10,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+r14]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+r14]
- mov rdx,QWORD[8+rbx]
- adcx r11,rcx
- adox r12,rbp
- adcx r12,r8
- adox r13,r8
- adc r13,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r9,rcx
- adox r10,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r9
- mulx rax,rdx,r15
- adcx r12,rcx
- adox r13,rbp
-
- adcx r13,r8
- adox r8,r8
- adc r8,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+r14]
- adcx r9,rcx
- adox r10,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+r14]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+r14]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+r14]
- mov rdx,QWORD[16+rbx]
- adcx r12,rcx
- adox r13,rbp
- adcx r13,r9
- adox r8,r9
- adc r8,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r10
- mulx rax,rdx,r15
- adcx r13,rcx
- adox r8,rbp
-
- adcx r8,r9
- adox r9,r9
- adc r9,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+r14]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+r14]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+r14]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+r14]
- mov rdx,QWORD[24+rbx]
- adcx r13,rcx
- adox r8,rbp
- adcx r8,r10
- adox r9,r10
- adc r9,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r13,rcx
- adox r8,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r11
- mulx rax,rdx,r15
- adcx r8,rcx
- adox r9,rbp
-
- adcx r9,r10
- adox r10,r10
- adc r10,0
-
-
- mulx rbp,rcx,QWORD[((0+128))+r14]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+r14]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+r14]
- adcx r13,rcx
- adox r8,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+r14]
- lea r14,[128+r14]
- mov rbx,r12
- adcx r8,rcx
- adox r9,rbp
- mov rdx,r13
- adcx r9,r11
- adox r10,r11
- adc r10,0
-
-
-
- mov rcx,r8
- sub r12,QWORD[r14]
- sbb r13,QWORD[8+r14]
- sbb r8,QWORD[16+r14]
- mov rbp,r9
- sbb r9,QWORD[24+r14]
- sbb r10,0
-
- cmovc r12,rbx
- cmovc r13,rdx
- cmovc r8,rcx
- cmovc r9,rbp
-
- mov QWORD[rdi],r12
- mov QWORD[8+rdi],r13
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
-
- mov r15,QWORD[rsp]
-
- mov r14,QWORD[8+rsp]
-
- mov r13,QWORD[16+rsp]
-
- mov r12,QWORD[24+rsp]
-
- mov rbx,QWORD[32+rsp]
-
- mov rbp,QWORD[40+rsp]
-
- lea rsp,[48+rsp]
-
-$L$ord_mulx_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_ord_mul_montx:
-
-
-ALIGN 32
-ecp_nistz256_ord_sqr_montx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_ord_sqr_montx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$ecp_nistz256_ord_sqr_montx:
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
-$L$ord_sqrx_body:
-
- mov rbx,rdx
- mov rdx,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r15,QWORD[16+rsi]
- mov r8,QWORD[24+rsi]
- lea rsi,[$L$ord]
- jmp NEAR $L$oop_ord_sqrx
-
-ALIGN 32
-$L$oop_ord_sqrx:
- mulx r10,r9,r14
- mulx r11,rcx,r15
- mov rax,rdx
-DB 102,73,15,110,206
- mulx r12,rbp,r8
- mov rdx,r14
- add r10,rcx
-DB 102,73,15,110,215
- adc r11,rbp
- adc r12,0
- xor r13,r13
-
- mulx rbp,rcx,r15
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,r8
- mov rdx,r15
- adcx r12,rcx
- adox r13,rbp
- adc r13,0
-
- mulx r14,rcx,r8
- mov rdx,rax
-DB 102,73,15,110,216
- xor r15,r15
- adcx r9,r9
- adox r13,rcx
- adcx r10,r10
- adox r14,r15
-
-
- mulx rbp,r8,rdx
-DB 102,72,15,126,202
- adcx r11,r11
- adox r9,rbp
- adcx r12,r12
- mulx rax,rcx,rdx
-DB 102,72,15,126,210
- adcx r13,r13
- adox r10,rcx
- adcx r14,r14
- mulx rbp,rcx,rdx
-DB 0x67
-DB 102,72,15,126,218
- adox r11,rax
- adcx r15,r15
- adox r12,rcx
- adox r13,rbp
- mulx rax,rcx,rdx
- adox r14,rcx
- adox r15,rax
-
-
- mov rdx,r8
- mulx rcx,rdx,QWORD[32+rsi]
-
- xor rax,rax
- mulx rbp,rcx,QWORD[rsi]
- adcx r8,rcx
- adox r9,rbp
- mulx rbp,rcx,QWORD[8+rsi]
- adcx r9,rcx
- adox r10,rbp
- mulx rbp,rcx,QWORD[16+rsi]
- adcx r10,rcx
- adox r11,rbp
- mulx rbp,rcx,QWORD[24+rsi]
- adcx r11,rcx
- adox r8,rbp
- adcx r8,rax
-
-
- mov rdx,r9
- mulx rcx,rdx,QWORD[32+rsi]
-
- mulx rbp,rcx,QWORD[rsi]
- adox r9,rcx
- adcx r10,rbp
- mulx rbp,rcx,QWORD[8+rsi]
- adox r10,rcx
- adcx r11,rbp
- mulx rbp,rcx,QWORD[16+rsi]
- adox r11,rcx
- adcx r8,rbp
- mulx rbp,rcx,QWORD[24+rsi]
- adox r8,rcx
- adcx r9,rbp
- adox r9,rax
-
-
- mov rdx,r10
- mulx rcx,rdx,QWORD[32+rsi]
-
- mulx rbp,rcx,QWORD[rsi]
- adcx r10,rcx
- adox r11,rbp
- mulx rbp,rcx,QWORD[8+rsi]
- adcx r11,rcx
- adox r8,rbp
- mulx rbp,rcx,QWORD[16+rsi]
- adcx r8,rcx
- adox r9,rbp
- mulx rbp,rcx,QWORD[24+rsi]
- adcx r9,rcx
- adox r10,rbp
- adcx r10,rax
-
-
- mov rdx,r11
- mulx rcx,rdx,QWORD[32+rsi]
-
- mulx rbp,rcx,QWORD[rsi]
- adox r11,rcx
- adcx r8,rbp
- mulx rbp,rcx,QWORD[8+rsi]
- adox r8,rcx
- adcx r9,rbp
- mulx rbp,rcx,QWORD[16+rsi]
- adox r9,rcx
- adcx r10,rbp
- mulx rbp,rcx,QWORD[24+rsi]
- adox r10,rcx
- adcx r11,rbp
- adox r11,rax
-
-
- add r12,r8
- adc r9,r13
- mov rdx,r12
- adc r10,r14
- adc r11,r15
- mov r14,r9
- adc rax,0
-
-
- sub r12,QWORD[rsi]
- mov r15,r10
- sbb r9,QWORD[8+rsi]
- sbb r10,QWORD[16+rsi]
- mov r8,r11
- sbb r11,QWORD[24+rsi]
- sbb rax,0
-
- cmovnc rdx,r12
- cmovnc r14,r9
- cmovnc r15,r10
- cmovnc r8,r11
-
- dec rbx
- jnz NEAR $L$oop_ord_sqrx
-
- mov QWORD[rdi],rdx
- mov QWORD[8+rdi],r14
- pxor xmm1,xmm1
- mov QWORD[16+rdi],r15
- pxor xmm2,xmm2
- mov QWORD[24+rdi],r8
- pxor xmm3,xmm3
-
- mov r15,QWORD[rsp]
-
- mov r14,QWORD[8+rsp]
-
- mov r13,QWORD[16+rsp]
-
- mov r12,QWORD[24+rsp]
-
- mov rbx,QWORD[32+rsp]
-
- mov rbp,QWORD[40+rsp]
-
- lea rsp,[48+rsp]
-
-$L$ord_sqrx_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_ord_sqr_montx:
-
-
global ecp_nistz256_to_mont
@@ -3959,8 +3485,6 @@ $L$SEH_begin_ecp_nistz256_to_mont:
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
lea rdx,[$L$RR]
jmp NEAR $L$mul_mont
@@ -3986,8 +3510,6 @@ $L$SEH_begin_ecp_nistz256_mul_mont:
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
$L$mul_mont:
push rbp
@@ -4002,8 +3524,6 @@ $L$mul_mont:
push r15
$L$mul_body:
- cmp ecx,0x80100
- je NEAR $L$mul_montx
mov rbx,rdx
mov rax,QWORD[rdx]
mov r9,QWORD[rsi]
@@ -4012,19 +3532,6 @@ $L$mul_body:
mov r12,QWORD[24+rsi]
call __ecp_nistz256_mul_montq
- jmp NEAR $L$mul_mont_done
-
-ALIGN 32
-$L$mul_montx:
- mov rbx,rdx
- mov rdx,QWORD[rdx]
- mov r9,QWORD[rsi]
- mov r10,QWORD[8+rsi]
- mov r11,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- lea rsi,[((-128))+rsi]
-
- call __ecp_nistz256_mul_montx
$L$mul_mont_done:
mov r15,QWORD[rsp]
@@ -4285,8 +3792,6 @@ $L$SEH_begin_ecp_nistz256_sqr_mont:
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
push rbp
push rbx
@@ -4300,25 +3805,12 @@ $L$SEH_begin_ecp_nistz256_sqr_mont:
push r15
$L$sqr_body:
- cmp ecx,0x80100
- je NEAR $L$sqr_montx
mov rax,QWORD[rsi]
mov r14,QWORD[8+rsi]
mov r15,QWORD[16+rsi]
mov r8,QWORD[24+rsi]
call __ecp_nistz256_sqr_montq
- jmp NEAR $L$sqr_mont_done
-
-ALIGN 32
-$L$sqr_montx:
- mov rdx,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r15,QWORD[16+rsi]
- mov r8,QWORD[24+rsi]
- lea rsi,[((-128))+rsi]
-
- call __ecp_nistz256_sqr_montx
$L$sqr_mont_done:
mov r15,QWORD[rsp]
@@ -4505,335 +3997,37 @@ __ecp_nistz256_sqr_montq:
-ALIGN 32
-__ecp_nistz256_mul_montx:
- mulx r9,r8,r9
- mulx r10,rcx,r10
- mov r14,32
- xor r13,r13
- mulx r11,rbp,r11
- mov r15,QWORD[(($L$poly+24))]
- adc r9,rcx
- mulx r12,rcx,r12
- mov rdx,r8
- adc r10,rbp
- shlx rbp,r8,r14
- adc r11,rcx
- shrx rcx,r8,r14
- adc r12,0
+global ecp_nistz256_from_mont
- add r9,rbp
- adc r10,rcx
+ALIGN 32
+ecp_nistz256_from_mont:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_from_mont:
+ mov rdi,rcx
+ mov rsi,rdx
- mulx rbp,rcx,r15
- mov rdx,QWORD[8+rbx]
- adc r11,rcx
- adc r12,rbp
- adc r13,0
- xor r8,r8
+ push r12
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r9,rcx
- adox r10,rbp
+ push r13
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r10,rcx
- adox r11,rbp
+$L$from_body:
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r9
- adcx r12,rcx
- shlx rcx,r9,r14
- adox r13,rbp
- shrx rbp,r9,r14
-
- adcx r13,r8
- adox r8,r8
- adc r8,0
-
-
-
- add r10,rcx
- adc r11,rbp
-
- mulx rbp,rcx,r15
- mov rdx,QWORD[16+rbx]
- adc r12,rcx
- adc r13,rbp
- adc r8,0
- xor r9,r9
-
-
-
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r10,rcx
- adox r11,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r10
- adcx r13,rcx
- shlx rcx,r10,r14
- adox r8,rbp
- shrx rbp,r10,r14
-
- adcx r8,r9
- adox r9,r9
- adc r9,0
-
-
-
- add r11,rcx
- adc r12,rbp
-
- mulx rbp,rcx,r15
- mov rdx,QWORD[24+rbx]
- adc r13,rcx
- adc r8,rbp
- adc r9,0
- xor r10,r10
-
-
-
- mulx rbp,rcx,QWORD[((0+128))+rsi]
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,QWORD[((8+128))+rsi]
- adcx r12,rcx
- adox r13,rbp
-
- mulx rbp,rcx,QWORD[((16+128))+rsi]
- adcx r13,rcx
- adox r8,rbp
-
- mulx rbp,rcx,QWORD[((24+128))+rsi]
- mov rdx,r11
- adcx r8,rcx
- shlx rcx,r11,r14
- adox r9,rbp
- shrx rbp,r11,r14
-
- adcx r9,r10
- adox r10,r10
- adc r10,0
-
-
-
- add r12,rcx
- adc r13,rbp
-
- mulx rbp,rcx,r15
- mov rbx,r12
- mov r14,QWORD[(($L$poly+8))]
- adc r8,rcx
- mov rdx,r13
- adc r9,rbp
- adc r10,0
-
-
-
- xor eax,eax
- mov rcx,r8
- sbb r12,-1
- sbb r13,r14
- sbb r8,0
- mov rbp,r9
- sbb r9,r15
- sbb r10,0
-
- cmovc r12,rbx
- cmovc r13,rdx
- mov QWORD[rdi],r12
- cmovc r8,rcx
- mov QWORD[8+rdi],r13
- cmovc r9,rbp
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
-
- DB 0F3h,0C3h ;repret
-
-
-
-
-ALIGN 32
-__ecp_nistz256_sqr_montx:
-
- mulx r10,r9,r14
- mulx r11,rcx,r15
- xor eax,eax
- adc r10,rcx
- mulx r12,rbp,r8
- mov rdx,r14
- adc r11,rbp
- adc r12,0
- xor r13,r13
-
-
- mulx rbp,rcx,r15
- adcx r11,rcx
- adox r12,rbp
-
- mulx rbp,rcx,r8
- mov rdx,r15
- adcx r12,rcx
- adox r13,rbp
- adc r13,0
-
-
- mulx r14,rcx,r8
- mov rdx,QWORD[((0+128))+rsi]
- xor r15,r15
- adcx r9,r9
- adox r13,rcx
- adcx r10,r10
- adox r14,r15
-
- mulx rbp,r8,rdx
- mov rdx,QWORD[((8+128))+rsi]
- adcx r11,r11
- adox r9,rbp
- adcx r12,r12
- mulx rax,rcx,rdx
- mov rdx,QWORD[((16+128))+rsi]
- adcx r13,r13
- adox r10,rcx
- adcx r14,r14
-DB 0x67
- mulx rbp,rcx,rdx
- mov rdx,QWORD[((24+128))+rsi]
- adox r11,rax
- adcx r15,r15
- adox r12,rcx
- mov rsi,32
- adox r13,rbp
-DB 0x67,0x67
- mulx rax,rcx,rdx
- mov rdx,QWORD[(($L$poly+24))]
- adox r14,rcx
- shlx rcx,r8,rsi
- adox r15,rax
- shrx rax,r8,rsi
- mov rbp,rdx
-
-
- add r9,rcx
- adc r10,rax
-
- mulx r8,rcx,r8
- adc r11,rcx
- shlx rcx,r9,rsi
- adc r8,0
- shrx rax,r9,rsi
-
-
- add r10,rcx
- adc r11,rax
-
- mulx r9,rcx,r9
- adc r8,rcx
- shlx rcx,r10,rsi
- adc r9,0
- shrx rax,r10,rsi
-
-
- add r11,rcx
- adc r8,rax
-
- mulx r10,rcx,r10
- adc r9,rcx
- shlx rcx,r11,rsi
- adc r10,0
- shrx rax,r11,rsi
-
-
- add r8,rcx
- adc r9,rax
-
- mulx r11,rcx,r11
- adc r10,rcx
- adc r11,0
-
- xor rdx,rdx
- add r12,r8
- mov rsi,QWORD[(($L$poly+8))]
- adc r13,r9
- mov r8,r12
- adc r14,r10
- adc r15,r11
- mov r9,r13
- adc rdx,0
-
- sub r12,-1
- mov r10,r14
- sbb r13,rsi
- sbb r14,0
- mov r11,r15
- sbb r15,rbp
- sbb rdx,0
-
- cmovc r12,r8
- cmovc r13,r9
- mov QWORD[rdi],r12
- cmovc r14,r10
- mov QWORD[8+rdi],r13
- cmovc r15,r11
- mov QWORD[16+rdi],r14
- mov QWORD[24+rdi],r15
-
- DB 0F3h,0C3h ;repret
-
-
-
-
-
-
-
-
-global ecp_nistz256_from_mont
-
-ALIGN 32
-ecp_nistz256_from_mont:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_from_mont:
- mov rdi,rcx
- mov rsi,rdx
-
-
-
- push r12
-
- push r13
-
-$L$from_body:
-
- mov rax,QWORD[rsi]
- mov r13,QWORD[(($L$poly+24))]
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
- mov r8,rax
- mov r12,QWORD[(($L$poly+8))]
+ mov rax,QWORD[rsi]
+ mov r13,QWORD[(($L$poly+24))]
+ mov r9,QWORD[8+rsi]
+ mov r10,QWORD[16+rsi]
+ mov r11,QWORD[24+rsi]
+ mov r8,rax
+ mov r12,QWORD[(($L$poly+8))]
@@ -4951,9 +4145,6 @@ global ecp_nistz256_gather_w5
ALIGN 32
ecp_nistz256_gather_w5:
- mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
- test eax,32
- jnz NEAR $L$avx2_gather_w5
lea rax,[((-136))+rsp]
$L$SEH_begin_ecp_nistz256_gather_w5:
DB 0x48,0x8d,0x60,0xe0
@@ -5061,9 +4252,6 @@ global ecp_nistz256_gather_w7
ALIGN 32
ecp_nistz256_gather_w7:
- mov eax,DWORD[((OPENSSL_ia32cap_P+8))]
- test eax,32
- jnz NEAR $L$avx2_gather_w7
lea rax,[((-136))+rsp]
$L$SEH_begin_ecp_nistz256_gather_w7:
DB 0x48,0x8d,0x60,0xe0
@@ -5102,1359 +4290,61 @@ $L$select_loop_sse_w7:
pand xmm9,xmm15
pand xmm10,xmm15
por xmm2,xmm9
- pand xmm11,xmm15
- por xmm3,xmm10
- pand xmm12,xmm15
- por xmm4,xmm11
- prefetcht0 [255+rdx]
- por xmm5,xmm12
-
- dec rax
- jnz NEAR $L$select_loop_sse_w7
-
- movdqu XMMWORD[rcx],xmm2
- movdqu XMMWORD[16+rcx],xmm3
- movdqu XMMWORD[32+rcx],xmm4
- movdqu XMMWORD[48+rcx],xmm5
- movaps xmm6,XMMWORD[rsp]
- movaps xmm7,XMMWORD[16+rsp]
- movaps xmm8,XMMWORD[32+rsp]
- movaps xmm9,XMMWORD[48+rsp]
- movaps xmm10,XMMWORD[64+rsp]
- movaps xmm11,XMMWORD[80+rsp]
- movaps xmm12,XMMWORD[96+rsp]
- movaps xmm13,XMMWORD[112+rsp]
- movaps xmm14,XMMWORD[128+rsp]
- movaps xmm15,XMMWORD[144+rsp]
- lea rsp,[168+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_gather_w7:
-
-
-
-
-ALIGN 32
-ecp_nistz256_avx2_gather_w5:
-
-$L$avx2_gather_w5:
- vzeroupper
- lea rax,[((-136))+rsp]
- mov r11,rsp
-$L$SEH_begin_ecp_nistz256_avx2_gather_w5:
-DB 0x48,0x8d,0x60,0xe0
-DB 0xc5,0xf8,0x29,0x70,0xe0
-DB 0xc5,0xf8,0x29,0x78,0xf0
-DB 0xc5,0x78,0x29,0x40,0x00
-DB 0xc5,0x78,0x29,0x48,0x10
-DB 0xc5,0x78,0x29,0x50,0x20
-DB 0xc5,0x78,0x29,0x58,0x30
-DB 0xc5,0x78,0x29,0x60,0x40
-DB 0xc5,0x78,0x29,0x68,0x50
-DB 0xc5,0x78,0x29,0x70,0x60
-DB 0xc5,0x78,0x29,0x78,0x70
- vmovdqa ymm0,YMMWORD[$L$Two]
-
- vpxor ymm2,ymm2,ymm2
- vpxor ymm3,ymm3,ymm3
- vpxor ymm4,ymm4,ymm4
-
- vmovdqa ymm5,YMMWORD[$L$One]
- vmovdqa ymm10,YMMWORD[$L$Two]
-
- vmovd xmm1,r8d
- vpermd ymm1,ymm2,ymm1
-
- mov rax,8
-$L$select_loop_avx2_w5:
-
- vmovdqa ymm6,YMMWORD[rdx]
- vmovdqa ymm7,YMMWORD[32+rdx]
- vmovdqa ymm8,YMMWORD[64+rdx]
-
- vmovdqa ymm11,YMMWORD[96+rdx]
- vmovdqa ymm12,YMMWORD[128+rdx]
- vmovdqa ymm13,YMMWORD[160+rdx]
-
- vpcmpeqd ymm9,ymm5,ymm1
- vpcmpeqd ymm14,ymm10,ymm1
-
- vpaddd ymm5,ymm5,ymm0
- vpaddd ymm10,ymm10,ymm0
- lea rdx,[192+rdx]
-
- vpand ymm6,ymm6,ymm9
- vpand ymm7,ymm7,ymm9
- vpand ymm8,ymm8,ymm9
- vpand ymm11,ymm11,ymm14
- vpand ymm12,ymm12,ymm14
- vpand ymm13,ymm13,ymm14
-
- vpxor ymm2,ymm2,ymm6
- vpxor ymm3,ymm3,ymm7
- vpxor ymm4,ymm4,ymm8
- vpxor ymm2,ymm2,ymm11
- vpxor ymm3,ymm3,ymm12
- vpxor ymm4,ymm4,ymm13
-
- dec rax
- jnz NEAR $L$select_loop_avx2_w5
-
- vmovdqu YMMWORD[rcx],ymm2
- vmovdqu YMMWORD[32+rcx],ymm3
- vmovdqu YMMWORD[64+rcx],ymm4
- vzeroupper
- movaps xmm6,XMMWORD[rsp]
- movaps xmm7,XMMWORD[16+rsp]
- movaps xmm8,XMMWORD[32+rsp]
- movaps xmm9,XMMWORD[48+rsp]
- movaps xmm10,XMMWORD[64+rsp]
- movaps xmm11,XMMWORD[80+rsp]
- movaps xmm12,XMMWORD[96+rsp]
- movaps xmm13,XMMWORD[112+rsp]
- movaps xmm14,XMMWORD[128+rsp]
- movaps xmm15,XMMWORD[144+rsp]
- lea rsp,[r11]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_avx2_gather_w5:
-
-
-
-
-global ecp_nistz256_avx2_gather_w7
-
-ALIGN 32
-ecp_nistz256_avx2_gather_w7:
-
-$L$avx2_gather_w7:
- vzeroupper
- mov r11,rsp
- lea rax,[((-136))+rsp]
-$L$SEH_begin_ecp_nistz256_avx2_gather_w7:
-DB 0x48,0x8d,0x60,0xe0
-DB 0xc5,0xf8,0x29,0x70,0xe0
-DB 0xc5,0xf8,0x29,0x78,0xf0
-DB 0xc5,0x78,0x29,0x40,0x00
-DB 0xc5,0x78,0x29,0x48,0x10
-DB 0xc5,0x78,0x29,0x50,0x20
-DB 0xc5,0x78,0x29,0x58,0x30
-DB 0xc5,0x78,0x29,0x60,0x40
-DB 0xc5,0x78,0x29,0x68,0x50
-DB 0xc5,0x78,0x29,0x70,0x60
-DB 0xc5,0x78,0x29,0x78,0x70
- vmovdqa ymm0,YMMWORD[$L$Three]
-
- vpxor ymm2,ymm2,ymm2
- vpxor ymm3,ymm3,ymm3
-
- vmovdqa ymm4,YMMWORD[$L$One]
- vmovdqa ymm8,YMMWORD[$L$Two]
- vmovdqa ymm12,YMMWORD[$L$Three]
-
- vmovd xmm1,r8d
- vpermd ymm1,ymm2,ymm1
-
-
- mov rax,21
-$L$select_loop_avx2_w7:
-
- vmovdqa ymm5,YMMWORD[rdx]
- vmovdqa ymm6,YMMWORD[32+rdx]
-
- vmovdqa ymm9,YMMWORD[64+rdx]
- vmovdqa ymm10,YMMWORD[96+rdx]
-
- vmovdqa ymm13,YMMWORD[128+rdx]
- vmovdqa ymm14,YMMWORD[160+rdx]
-
- vpcmpeqd ymm7,ymm4,ymm1
- vpcmpeqd ymm11,ymm8,ymm1
- vpcmpeqd ymm15,ymm12,ymm1
-
- vpaddd ymm4,ymm4,ymm0
- vpaddd ymm8,ymm8,ymm0
- vpaddd ymm12,ymm12,ymm0
- lea rdx,[192+rdx]
-
- vpand ymm5,ymm5,ymm7
- vpand ymm6,ymm6,ymm7
- vpand ymm9,ymm9,ymm11
- vpand ymm10,ymm10,ymm11
- vpand ymm13,ymm13,ymm15
- vpand ymm14,ymm14,ymm15
-
- vpxor ymm2,ymm2,ymm5
- vpxor ymm3,ymm3,ymm6
- vpxor ymm2,ymm2,ymm9
- vpxor ymm3,ymm3,ymm10
- vpxor ymm2,ymm2,ymm13
- vpxor ymm3,ymm3,ymm14
-
- dec rax
- jnz NEAR $L$select_loop_avx2_w7
-
-
- vmovdqa ymm5,YMMWORD[rdx]
- vmovdqa ymm6,YMMWORD[32+rdx]
-
- vpcmpeqd ymm7,ymm4,ymm1
-
- vpand ymm5,ymm5,ymm7
- vpand ymm6,ymm6,ymm7
-
- vpxor ymm2,ymm2,ymm5
- vpxor ymm3,ymm3,ymm6
-
- vmovdqu YMMWORD[rcx],ymm2
- vmovdqu YMMWORD[32+rcx],ymm3
- vzeroupper
- movaps xmm6,XMMWORD[rsp]
- movaps xmm7,XMMWORD[16+rsp]
- movaps xmm8,XMMWORD[32+rsp]
- movaps xmm9,XMMWORD[48+rsp]
- movaps xmm10,XMMWORD[64+rsp]
- movaps xmm11,XMMWORD[80+rsp]
- movaps xmm12,XMMWORD[96+rsp]
- movaps xmm13,XMMWORD[112+rsp]
- movaps xmm14,XMMWORD[128+rsp]
- movaps xmm15,XMMWORD[144+rsp]
- lea rsp,[r11]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_avx2_gather_w7:
-
-
-ALIGN 32
-__ecp_nistz256_add_toq:
-
- xor r11,r11
- add r12,QWORD[rbx]
- adc r13,QWORD[8+rbx]
- mov rax,r12
- adc r8,QWORD[16+rbx]
- adc r9,QWORD[24+rbx]
- mov rbp,r13
- adc r11,0
-
- sub r12,-1
- mov rcx,r8
- sbb r13,r14
- sbb r8,0
- mov r10,r9
- sbb r9,r15
- sbb r11,0
-
- cmovc r12,rax
- cmovc r13,rbp
- mov QWORD[rdi],r12
- cmovc r8,rcx
- mov QWORD[8+rdi],r13
- cmovc r9,r10
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
-
- DB 0F3h,0C3h ;repret
-
-
-
-
-ALIGN 32
-__ecp_nistz256_sub_fromq:
-
- sub r12,QWORD[rbx]
- sbb r13,QWORD[8+rbx]
- mov rax,r12
- sbb r8,QWORD[16+rbx]
- sbb r9,QWORD[24+rbx]
- mov rbp,r13
- sbb r11,r11
-
- add r12,-1
- mov rcx,r8
- adc r13,r14
- adc r8,0
- mov r10,r9
- adc r9,r15
- test r11,r11
-
- cmovz r12,rax
- cmovz r13,rbp
- mov QWORD[rdi],r12
- cmovz r8,rcx
- mov QWORD[8+rdi],r13
- cmovz r9,r10
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
-
- DB 0F3h,0C3h ;repret
-
-
-
-
-ALIGN 32
-__ecp_nistz256_subq:
-
- sub rax,r12
- sbb rbp,r13
- mov r12,rax
- sbb rcx,r8
- sbb r10,r9
- mov r13,rbp
- sbb r11,r11
-
- add rax,-1
- mov r8,rcx
- adc rbp,r14
- adc rcx,0
- mov r9,r10
- adc r10,r15
- test r11,r11
-
- cmovnz r12,rax
- cmovnz r13,rbp
- cmovnz r8,rcx
- cmovnz r9,r10
-
- DB 0F3h,0C3h ;repret
-
-
-
-
-ALIGN 32
-__ecp_nistz256_mul_by_2q:
-
- xor r11,r11
- add r12,r12
- adc r13,r13
- mov rax,r12
- adc r8,r8
- adc r9,r9
- mov rbp,r13
- adc r11,0
-
- sub r12,-1
- mov rcx,r8
- sbb r13,r14
- sbb r8,0
- mov r10,r9
- sbb r9,r15
- sbb r11,0
-
- cmovc r12,rax
- cmovc r13,rbp
- mov QWORD[rdi],r12
- cmovc r8,rcx
- mov QWORD[8+rdi],r13
- cmovc r9,r10
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
-
- DB 0F3h,0C3h ;repret
-
-
-global ecp_nistz256_point_double
-
-ALIGN 32
-ecp_nistz256_point_double:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_double:
- mov rdi,rcx
- mov rsi,rdx
-
-
-
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp ecx,0x80100
- je NEAR $L$point_doublex
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,32*5+8
-
-$L$point_doubleq_body:
-
-$L$point_double_shortcutq:
- movdqu xmm0,XMMWORD[rsi]
- mov rbx,rsi
- movdqu xmm1,XMMWORD[16+rsi]
- mov r12,QWORD[((32+0))+rsi]
- mov r13,QWORD[((32+8))+rsi]
- mov r8,QWORD[((32+16))+rsi]
- mov r9,QWORD[((32+24))+rsi]
- mov r14,QWORD[(($L$poly+8))]
- mov r15,QWORD[(($L$poly+24))]
- movdqa XMMWORD[96+rsp],xmm0
- movdqa XMMWORD[(96+16)+rsp],xmm1
- lea r10,[32+rdi]
- lea r11,[64+rdi]
-DB 102,72,15,110,199
-DB 102,73,15,110,202
-DB 102,73,15,110,211
-
- lea rdi,[rsp]
- call __ecp_nistz256_mul_by_2q
-
- mov rax,QWORD[((64+0))+rsi]
- mov r14,QWORD[((64+8))+rsi]
- mov r15,QWORD[((64+16))+rsi]
- mov r8,QWORD[((64+24))+rsi]
- lea rsi,[((64-0))+rsi]
- lea rdi,[64+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[((0+0))+rsp]
- mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((0+0))+rsp]
- mov r15,QWORD[((16+0))+rsp]
- mov r8,QWORD[((24+0))+rsp]
- lea rdi,[rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[32+rbx]
- mov r9,QWORD[((64+0))+rbx]
- mov r10,QWORD[((64+8))+rbx]
- mov r11,QWORD[((64+16))+rbx]
- mov r12,QWORD[((64+24))+rbx]
- lea rsi,[((64-0))+rbx]
- lea rbx,[32+rbx]
-DB 102,72,15,126,215
- call __ecp_nistz256_mul_montq
- call __ecp_nistz256_mul_by_2q
-
- mov r12,QWORD[((96+0))+rsp]
- mov r13,QWORD[((96+8))+rsp]
- lea rbx,[64+rsp]
- mov r8,QWORD[((96+16))+rsp]
- mov r9,QWORD[((96+24))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_add_toq
-
- mov r12,QWORD[((96+0))+rsp]
- mov r13,QWORD[((96+8))+rsp]
- lea rbx,[64+rsp]
- mov r8,QWORD[((96+16))+rsp]
- mov r9,QWORD[((96+24))+rsp]
- lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[((0+0))+rsp]
- mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((0+0))+rsp]
- mov r15,QWORD[((16+0))+rsp]
- mov r8,QWORD[((24+0))+rsp]
-DB 102,72,15,126,207
- call __ecp_nistz256_sqr_montq
- xor r9,r9
- mov rax,r12
- add r12,-1
- mov r10,r13
- adc r13,rsi
- mov rcx,r14
- adc r14,0
- mov r8,r15
- adc r15,rbp
- adc r9,0
- xor rsi,rsi
- test rax,1
-
- cmovz r12,rax
- cmovz r13,r10
- cmovz r14,rcx
- cmovz r15,r8
- cmovz r9,rsi
-
- mov rax,r13
- shr r12,1
- shl rax,63
- mov r10,r14
- shr r13,1
- or r12,rax
- shl r10,63
- mov rcx,r15
- shr r14,1
- or r13,r10
- shl rcx,63
- mov QWORD[rdi],r12
- shr r15,1
- mov QWORD[8+rdi],r13
- shl r9,63
- or r14,rcx
- or r15,r9
- mov QWORD[16+rdi],r14
- mov QWORD[24+rdi],r15
- mov rax,QWORD[64+rsp]
- lea rbx,[64+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rdi,[128+rsp]
- call __ecp_nistz256_mul_by_2q
-
- lea rbx,[32+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_add_toq
-
- mov rax,QWORD[96+rsp]
- lea rbx,[96+rsp]
- mov r9,QWORD[((0+0))+rsp]
- mov r10,QWORD[((8+0))+rsp]
- lea rsi,[((0+0))+rsp]
- mov r11,QWORD[((16+0))+rsp]
- mov r12,QWORD[((24+0))+rsp]
- lea rdi,[rsp]
- call __ecp_nistz256_mul_montq
-
- lea rdi,[128+rsp]
- call __ecp_nistz256_mul_by_2q
-
- mov rax,QWORD[((0+32))+rsp]
- mov r14,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r15,QWORD[((16+32))+rsp]
- mov r8,QWORD[((24+32))+rsp]
-DB 102,72,15,126,199
- call __ecp_nistz256_sqr_montq
-
- lea rbx,[128+rsp]
- mov r8,r14
- mov r9,r15
- mov r14,rsi
- mov r15,rbp
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[((0+0))+rsp]
- mov rbp,QWORD[((0+8))+rsp]
- mov rcx,QWORD[((0+16))+rsp]
- mov r10,QWORD[((0+24))+rsp]
- lea rdi,[rsp]
- call __ecp_nistz256_subq
-
- mov rax,QWORD[32+rsp]
- lea rbx,[32+rsp]
- mov r14,r12
- xor ecx,ecx
- mov QWORD[((0+0))+rsp],r12
- mov r10,r13
- mov QWORD[((0+8))+rsp],r13
- cmovz r11,r8
- mov QWORD[((0+16))+rsp],r8
- lea rsi,[((0-0))+rsp]
- cmovz r12,r9
- mov QWORD[((0+24))+rsp],r9
- mov r9,r14
- lea rdi,[rsp]
- call __ecp_nistz256_mul_montq
-
-DB 102,72,15,126,203
-DB 102,72,15,126,207
- call __ecp_nistz256_sub_fromq
-
- lea rsi,[((160+56))+rsp]
-
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbx,QWORD[((-16))+rsi]
-
- mov rbp,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$point_doubleq_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_point_double:
-global ecp_nistz256_point_add
-
-ALIGN 32
-ecp_nistz256_point_add:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_add:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp ecx,0x80100
- je NEAR $L$point_addx
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,32*18+8
-
-$L$point_addq_body:
-
- movdqu xmm0,XMMWORD[rsi]
- movdqu xmm1,XMMWORD[16+rsi]
- movdqu xmm2,XMMWORD[32+rsi]
- movdqu xmm3,XMMWORD[48+rsi]
- movdqu xmm4,XMMWORD[64+rsi]
- movdqu xmm5,XMMWORD[80+rsi]
- mov rbx,rsi
- mov rsi,rdx
- movdqa XMMWORD[384+rsp],xmm0
- movdqa XMMWORD[(384+16)+rsp],xmm1
- movdqa XMMWORD[416+rsp],xmm2
- movdqa XMMWORD[(416+16)+rsp],xmm3
- movdqa XMMWORD[448+rsp],xmm4
- movdqa XMMWORD[(448+16)+rsp],xmm5
- por xmm5,xmm4
-
- movdqu xmm0,XMMWORD[rsi]
- pshufd xmm3,xmm5,0xb1
- movdqu xmm1,XMMWORD[16+rsi]
- movdqu xmm2,XMMWORD[32+rsi]
- por xmm5,xmm3
- movdqu xmm3,XMMWORD[48+rsi]
- mov rax,QWORD[((64+0))+rsi]
- mov r14,QWORD[((64+8))+rsi]
- mov r15,QWORD[((64+16))+rsi]
- mov r8,QWORD[((64+24))+rsi]
- movdqa XMMWORD[480+rsp],xmm0
- pshufd xmm4,xmm5,0x1e
- movdqa XMMWORD[(480+16)+rsp],xmm1
- movdqu xmm0,XMMWORD[64+rsi]
- movdqu xmm1,XMMWORD[80+rsi]
- movdqa XMMWORD[512+rsp],xmm2
- movdqa XMMWORD[(512+16)+rsp],xmm3
- por xmm5,xmm4
- pxor xmm4,xmm4
- por xmm1,xmm0
-DB 102,72,15,110,199
-
- lea rsi,[((64-0))+rsi]
- mov QWORD[((544+0))+rsp],rax
- mov QWORD[((544+8))+rsp],r14
- mov QWORD[((544+16))+rsp],r15
- mov QWORD[((544+24))+rsp],r8
- lea rdi,[96+rsp]
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd xmm5,xmm4
- pshufd xmm4,xmm1,0xb1
- por xmm4,xmm1
- pshufd xmm5,xmm5,0
- pshufd xmm3,xmm4,0x1e
- por xmm4,xmm3
- pxor xmm3,xmm3
- pcmpeqd xmm4,xmm3
- pshufd xmm4,xmm4,0
- mov rax,QWORD[((64+0))+rbx]
- mov r14,QWORD[((64+8))+rbx]
- mov r15,QWORD[((64+16))+rbx]
- mov r8,QWORD[((64+24))+rbx]
-DB 102,72,15,110,203
-
- lea rsi,[((64-0))+rbx]
- lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[544+rsp]
- lea rbx,[544+rsp]
- mov r9,QWORD[((0+96))+rsp]
- mov r10,QWORD[((8+96))+rsp]
- lea rsi,[((0+96))+rsp]
- mov r11,QWORD[((16+96))+rsp]
- mov r12,QWORD[((24+96))+rsp]
- lea rdi,[224+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[448+rsp]
- lea rbx,[448+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[416+rsp]
- lea rbx,[416+rsp]
- mov r9,QWORD[((0+224))+rsp]
- mov r10,QWORD[((8+224))+rsp]
- lea rsi,[((0+224))+rsp]
- mov r11,QWORD[((16+224))+rsp]
- mov r12,QWORD[((24+224))+rsp]
- lea rdi,[224+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[512+rsp]
- lea rbx,[512+rsp]
- mov r9,QWORD[((0+256))+rsp]
- mov r10,QWORD[((8+256))+rsp]
- lea rsi,[((0+256))+rsp]
- mov r11,QWORD[((16+256))+rsp]
- mov r12,QWORD[((24+256))+rsp]
- lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[224+rsp]
- lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromq
-
- or r12,r13
- movdqa xmm2,xmm4
- or r12,r8
- or r12,r9
- por xmm2,xmm5
-DB 102,73,15,110,220
-
- mov rax,QWORD[384+rsp]
- lea rbx,[384+rsp]
- mov r9,QWORD[((0+96))+rsp]
- mov r10,QWORD[((8+96))+rsp]
- lea rsi,[((0+96))+rsp]
- mov r11,QWORD[((16+96))+rsp]
- mov r12,QWORD[((24+96))+rsp]
- lea rdi,[160+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[480+rsp]
- lea rbx,[480+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[192+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[160+rsp]
- lea rdi,[rsp]
- call __ecp_nistz256_sub_fromq
-
- or r12,r13
- or r12,r8
- or r12,r9
-
-DB 102,73,15,126,208
-DB 102,73,15,126,217
-
- or r12,r8
- or r12,r9
-
-
-DB 0x3e
- jnz NEAR $L$add_proceedq
-
-$L$add_doubleq:
-DB 102,72,15,126,206
-DB 102,72,15,126,199
- add rsp,416
-
- jmp NEAR $L$point_double_shortcutq
-
-
-ALIGN 32
-$L$add_proceedq:
- mov rax,QWORD[((0+64))+rsp]
- mov r14,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r15,QWORD[((16+64))+rsp]
- mov r8,QWORD[((24+64))+rsp]
- lea rdi,[96+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[448+rsp]
- lea rbx,[448+rsp]
- mov r9,QWORD[((0+0))+rsp]
- mov r10,QWORD[((8+0))+rsp]
- lea rsi,[((0+0))+rsp]
- mov r11,QWORD[((16+0))+rsp]
- mov r12,QWORD[((24+0))+rsp]
- lea rdi,[352+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[((0+0))+rsp]
- mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((0+0))+rsp]
- mov r15,QWORD[((16+0))+rsp]
- mov r8,QWORD[((24+0))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[544+rsp]
- lea rbx,[544+rsp]
- mov r9,QWORD[((0+352))+rsp]
- mov r10,QWORD[((8+352))+rsp]
- lea rsi,[((0+352))+rsp]
- mov r11,QWORD[((16+352))+rsp]
- mov r12,QWORD[((24+352))+rsp]
- lea rdi,[352+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[rsp]
- lea rbx,[rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[128+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[160+rsp]
- lea rbx,[160+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[192+rsp]
- call __ecp_nistz256_mul_montq
-
-
-
-
- xor r11,r11
- add r12,r12
- lea rsi,[96+rsp]
- adc r13,r13
- mov rax,r12
- adc r8,r8
- adc r9,r9
- mov rbp,r13
- adc r11,0
-
- sub r12,-1
- mov rcx,r8
- sbb r13,r14
- sbb r8,0
- mov r10,r9
- sbb r9,r15
- sbb r11,0
-
- cmovc r12,rax
- mov rax,QWORD[rsi]
- cmovc r13,rbp
- mov rbp,QWORD[8+rsi]
- cmovc r8,rcx
- mov rcx,QWORD[16+rsi]
- cmovc r9,r10
- mov r10,QWORD[24+rsi]
-
- call __ecp_nistz256_subq
-
- lea rbx,[128+rsp]
- lea rdi,[288+rsp]
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[((192+0))+rsp]
- mov rbp,QWORD[((192+8))+rsp]
- mov rcx,QWORD[((192+16))+rsp]
- mov r10,QWORD[((192+24))+rsp]
- lea rdi,[320+rsp]
-
- call __ecp_nistz256_subq
-
- mov QWORD[rdi],r12
- mov QWORD[8+rdi],r13
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
- mov rax,QWORD[128+rsp]
- lea rbx,[128+rsp]
- mov r9,QWORD[((0+224))+rsp]
- mov r10,QWORD[((8+224))+rsp]
- lea rsi,[((0+224))+rsp]
- mov r11,QWORD[((16+224))+rsp]
- mov r12,QWORD[((24+224))+rsp]
- lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[320+rsp]
- lea rbx,[320+rsp]
- mov r9,QWORD[((0+64))+rsp]
- mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r11,QWORD[((16+64))+rsp]
- mov r12,QWORD[((24+64))+rsp]
- lea rdi,[320+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[256+rsp]
- lea rdi,[320+rsp]
- call __ecp_nistz256_sub_fromq
-
-DB 102,72,15,126,199
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[352+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((352+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[544+rsp]
- pand xmm3,XMMWORD[((544+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[448+rsp]
- pand xmm3,XMMWORD[((448+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[64+rdi],xmm2
- movdqu XMMWORD[80+rdi],xmm3
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[288+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((288+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[480+rsp]
- pand xmm3,XMMWORD[((480+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[384+rsp]
- pand xmm3,XMMWORD[((384+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[rdi],xmm2
- movdqu XMMWORD[16+rdi],xmm3
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[320+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((320+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[512+rsp]
- pand xmm3,XMMWORD[((512+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[416+rsp]
- pand xmm3,XMMWORD[((416+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[32+rdi],xmm2
- movdqu XMMWORD[48+rdi],xmm3
-
-$L$add_doneq:
- lea rsi,[((576+56))+rsp]
-
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbx,QWORD[((-16))+rsi]
-
- mov rbp,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$point_addq_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_ecp_nistz256_point_add:
-global ecp_nistz256_point_add_affine
-
-ALIGN 32
-ecp_nistz256_point_add_affine:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_add_affine:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
- mov ecx,0x80100
- and ecx,DWORD[((OPENSSL_ia32cap_P+8))]
- cmp ecx,0x80100
- je NEAR $L$point_add_affinex
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,32*15+8
-
-$L$add_affineq_body:
-
- movdqu xmm0,XMMWORD[rsi]
- mov rbx,rdx
- movdqu xmm1,XMMWORD[16+rsi]
- movdqu xmm2,XMMWORD[32+rsi]
- movdqu xmm3,XMMWORD[48+rsi]
- movdqu xmm4,XMMWORD[64+rsi]
- movdqu xmm5,XMMWORD[80+rsi]
- mov rax,QWORD[((64+0))+rsi]
- mov r14,QWORD[((64+8))+rsi]
- mov r15,QWORD[((64+16))+rsi]
- mov r8,QWORD[((64+24))+rsi]
- movdqa XMMWORD[320+rsp],xmm0
- movdqa XMMWORD[(320+16)+rsp],xmm1
- movdqa XMMWORD[352+rsp],xmm2
- movdqa XMMWORD[(352+16)+rsp],xmm3
- movdqa XMMWORD[384+rsp],xmm4
- movdqa XMMWORD[(384+16)+rsp],xmm5
- por xmm5,xmm4
-
- movdqu xmm0,XMMWORD[rbx]
- pshufd xmm3,xmm5,0xb1
- movdqu xmm1,XMMWORD[16+rbx]
- movdqu xmm2,XMMWORD[32+rbx]
- por xmm5,xmm3
- movdqu xmm3,XMMWORD[48+rbx]
- movdqa XMMWORD[416+rsp],xmm0
- pshufd xmm4,xmm5,0x1e
- movdqa XMMWORD[(416+16)+rsp],xmm1
- por xmm1,xmm0
-DB 102,72,15,110,199
- movdqa XMMWORD[448+rsp],xmm2
- movdqa XMMWORD[(448+16)+rsp],xmm3
- por xmm3,xmm2
- por xmm5,xmm4
- pxor xmm4,xmm4
- por xmm3,xmm1
-
- lea rsi,[((64-0))+rsi]
- lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montq
-
- pcmpeqd xmm5,xmm4
- pshufd xmm4,xmm3,0xb1
- mov rax,QWORD[rbx]
-
- mov r9,r12
- por xmm4,xmm3
- pshufd xmm5,xmm5,0
- pshufd xmm3,xmm4,0x1e
- mov r10,r13
- por xmm4,xmm3
- pxor xmm3,xmm3
- mov r11,r14
- pcmpeqd xmm4,xmm3
- pshufd xmm4,xmm4,0
-
- lea rsi,[((32-0))+rsp]
- mov r12,r15
- lea rdi,[rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[320+rsp]
- lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[384+rsp]
- lea rbx,[384+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[384+rsp]
- lea rbx,[384+rsp]
- mov r9,QWORD[((0+64))+rsp]
- mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r11,QWORD[((16+64))+rsp]
- mov r12,QWORD[((24+64))+rsp]
- lea rdi,[288+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[448+rsp]
- lea rbx,[448+rsp]
- mov r9,QWORD[((0+32))+rsp]
- mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((0+32))+rsp]
- mov r11,QWORD[((16+32))+rsp]
- mov r12,QWORD[((24+32))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[352+rsp]
- lea rdi,[96+rsp]
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[((0+64))+rsp]
- mov r14,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r15,QWORD[((16+64))+rsp]
- mov r8,QWORD[((24+64))+rsp]
- lea rdi,[128+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[((0+96))+rsp]
- mov r14,QWORD[((8+96))+rsp]
- lea rsi,[((0+96))+rsp]
- mov r15,QWORD[((16+96))+rsp]
- mov r8,QWORD[((24+96))+rsp]
- lea rdi,[192+rsp]
- call __ecp_nistz256_sqr_montq
-
- mov rax,QWORD[128+rsp]
- lea rbx,[128+rsp]
- mov r9,QWORD[((0+64))+rsp]
- mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r11,QWORD[((16+64))+rsp]
- mov r12,QWORD[((24+64))+rsp]
- lea rdi,[160+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[320+rsp]
- lea rbx,[320+rsp]
- mov r9,QWORD[((0+128))+rsp]
- mov r10,QWORD[((8+128))+rsp]
- lea rsi,[((0+128))+rsp]
- mov r11,QWORD[((16+128))+rsp]
- mov r12,QWORD[((24+128))+rsp]
- lea rdi,[rsp]
- call __ecp_nistz256_mul_montq
-
-
-
-
- xor r11,r11
- add r12,r12
- lea rsi,[192+rsp]
- adc r13,r13
- mov rax,r12
- adc r8,r8
- adc r9,r9
- mov rbp,r13
- adc r11,0
-
- sub r12,-1
- mov rcx,r8
- sbb r13,r14
- sbb r8,0
- mov r10,r9
- sbb r9,r15
- sbb r11,0
-
- cmovc r12,rax
- mov rax,QWORD[rsi]
- cmovc r13,rbp
- mov rbp,QWORD[8+rsi]
- cmovc r8,rcx
- mov rcx,QWORD[16+rsi]
- cmovc r9,r10
- mov r10,QWORD[24+rsi]
-
- call __ecp_nistz256_subq
-
- lea rbx,[160+rsp]
- lea rdi,[224+rsp]
- call __ecp_nistz256_sub_fromq
-
- mov rax,QWORD[((0+0))+rsp]
- mov rbp,QWORD[((0+8))+rsp]
- mov rcx,QWORD[((0+16))+rsp]
- mov r10,QWORD[((0+24))+rsp]
- lea rdi,[64+rsp]
-
- call __ecp_nistz256_subq
-
- mov QWORD[rdi],r12
- mov QWORD[8+rdi],r13
- mov QWORD[16+rdi],r8
- mov QWORD[24+rdi],r9
- mov rax,QWORD[352+rsp]
- lea rbx,[352+rsp]
- mov r9,QWORD[((0+160))+rsp]
- mov r10,QWORD[((8+160))+rsp]
- lea rsi,[((0+160))+rsp]
- mov r11,QWORD[((16+160))+rsp]
- mov r12,QWORD[((24+160))+rsp]
- lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montq
-
- mov rax,QWORD[96+rsp]
- lea rbx,[96+rsp]
- mov r9,QWORD[((0+64))+rsp]
- mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((0+64))+rsp]
- mov r11,QWORD[((16+64))+rsp]
- mov r12,QWORD[((24+64))+rsp]
- lea rdi,[64+rsp]
- call __ecp_nistz256_mul_montq
-
- lea rbx,[32+rsp]
- lea rdi,[256+rsp]
- call __ecp_nistz256_sub_fromq
-
-DB 102,72,15,126,199
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[288+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((288+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[$L$ONE_mont]
- pand xmm3,XMMWORD[(($L$ONE_mont+16))]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[384+rsp]
- pand xmm3,XMMWORD[((384+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[64+rdi],xmm2
- movdqu XMMWORD[80+rdi],xmm3
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[224+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((224+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[416+rsp]
- pand xmm3,XMMWORD[((416+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[320+rsp]
- pand xmm3,XMMWORD[((320+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[rdi],xmm2
- movdqu XMMWORD[16+rdi],xmm3
-
- movdqa xmm0,xmm5
- movdqa xmm1,xmm5
- pandn xmm0,XMMWORD[256+rsp]
- movdqa xmm2,xmm5
- pandn xmm1,XMMWORD[((256+16))+rsp]
- movdqa xmm3,xmm5
- pand xmm2,XMMWORD[448+rsp]
- pand xmm3,XMMWORD[((448+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
-
- movdqa xmm0,xmm4
- movdqa xmm1,xmm4
- pandn xmm0,xmm2
- movdqa xmm2,xmm4
- pandn xmm1,xmm3
- movdqa xmm3,xmm4
- pand xmm2,XMMWORD[352+rsp]
- pand xmm3,XMMWORD[((352+16))+rsp]
- por xmm2,xmm0
- por xmm3,xmm1
- movdqu XMMWORD[32+rdi],xmm2
- movdqu XMMWORD[48+rdi],xmm3
-
- lea rsi,[((480+56))+rsp]
+ pand xmm11,xmm15
+ por xmm3,xmm10
+ pand xmm12,xmm15
+ por xmm4,xmm11
+ prefetcht0 [255+rdx]
+ por xmm5,xmm12
- mov r15,QWORD[((-48))+rsi]
+ dec rax
+ jnz NEAR $L$select_loop_sse_w7
- mov r14,QWORD[((-40))+rsi]
+ movdqu XMMWORD[rcx],xmm2
+ movdqu XMMWORD[16+rcx],xmm3
+ movdqu XMMWORD[32+rcx],xmm4
+ movdqu XMMWORD[48+rcx],xmm5
+ movaps xmm6,XMMWORD[rsp]
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps xmm15,XMMWORD[144+rsp]
+ lea rsp,[168+rsp]
+ DB 0F3h,0C3h ;repret
- mov r13,QWORD[((-32))+rsi]
+$L$SEH_end_ecp_nistz256_gather_w7:
- mov r12,QWORD[((-24))+rsi]
+global ecp_nistz256_avx2_gather_w7
- mov rbx,QWORD[((-16))+rsi]
+ALIGN 32
+ecp_nistz256_avx2_gather_w7:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_ecp_nistz256_avx2_gather_w7:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
- mov rbp,QWORD[((-8))+rsi]
- lea rsp,[rsi]
-$L$add_affineq_epilogue:
+DB 0x0f,0x0b
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_point_add_affine:
+$L$SEH_end_ecp_nistz256_avx2_gather_w7:
ALIGN 32
-__ecp_nistz256_add_tox:
+__ecp_nistz256_add_toq:
xor r11,r11
- adc r12,QWORD[rbx]
+ add r12,QWORD[rbx]
adc r13,QWORD[8+rbx]
mov rax,r12
adc r8,QWORD[16+rbx]
@@ -6462,8 +4352,7 @@ __ecp_nistz256_add_tox:
mov rbp,r13
adc r11,0
- xor r10,r10
- sbb r12,-1
+ sub r12,-1
mov rcx,r8
sbb r13,r14
sbb r8,0
@@ -6486,32 +4375,30 @@ __ecp_nistz256_add_tox:
ALIGN 32
-__ecp_nistz256_sub_fromx:
+__ecp_nistz256_sub_fromq:
- xor r11,r11
- sbb r12,QWORD[rbx]
+ sub r12,QWORD[rbx]
sbb r13,QWORD[8+rbx]
mov rax,r12
sbb r8,QWORD[16+rbx]
sbb r9,QWORD[24+rbx]
mov rbp,r13
- sbb r11,0
+ sbb r11,r11
- xor r10,r10
- adc r12,-1
+ add r12,-1
mov rcx,r8
adc r13,r14
adc r8,0
mov r10,r9
adc r9,r15
+ test r11,r11
- bt r11,0
- cmovnc r12,rax
- cmovnc r13,rbp
+ cmovz r12,rax
+ cmovz r13,rbp
mov QWORD[rdi],r12
- cmovnc r8,rcx
+ cmovz r8,rcx
mov QWORD[8+rdi],r13
- cmovnc r9,r10
+ cmovz r9,r10
mov QWORD[16+rdi],r8
mov QWORD[24+rdi],r9
@@ -6521,30 +4408,28 @@ __ecp_nistz256_sub_fromx:
ALIGN 32
-__ecp_nistz256_subx:
+__ecp_nistz256_subq:
- xor r11,r11
- sbb rax,r12
+ sub rax,r12
sbb rbp,r13
mov r12,rax
sbb rcx,r8
sbb r10,r9
mov r13,rbp
- sbb r11,0
+ sbb r11,r11
- xor r9,r9
- adc rax,-1
+ add rax,-1
mov r8,rcx
adc rbp,r14
adc rcx,0
mov r9,r10
adc r10,r15
+ test r11,r11
- bt r11,0
- cmovc r12,rax
- cmovc r13,rbp
- cmovc r8,rcx
- cmovc r9,r10
+ cmovnz r12,rax
+ cmovnz r13,rbp
+ cmovnz r8,rcx
+ cmovnz r9,r10
DB 0F3h,0C3h ;repret
@@ -6552,10 +4437,10 @@ __ecp_nistz256_subx:
ALIGN 32
-__ecp_nistz256_mul_by_2x:
+__ecp_nistz256_mul_by_2q:
xor r11,r11
- adc r12,r12
+ add r12,r12
adc r13,r13
mov rax,r12
adc r8,r8
@@ -6563,8 +4448,7 @@ __ecp_nistz256_mul_by_2x:
mov rbp,r13
adc r11,0
- xor r10,r10
- sbb r12,-1
+ sub r12,-1
mov rcx,r8
sbb r13,r14
sbb r8,0
@@ -6584,19 +4468,19 @@ __ecp_nistz256_mul_by_2x:
DB 0F3h,0C3h ;repret
+global ecp_nistz256_point_double
ALIGN 32
-ecp_nistz256_point_doublex:
+ecp_nistz256_point_double:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_doublex:
+$L$SEH_begin_ecp_nistz256_point_double:
mov rdi,rcx
mov rsi,rdx
-$L$point_doublex:
push rbp
push rbx
@@ -6611,9 +4495,9 @@ $L$point_doublex:
sub rsp,32*5+8
-$L$point_doublex_body:
+$L$point_doubleq_body:
-$L$point_double_shortcutx:
+$L$point_double_shortcutq:
movdqu xmm0,XMMWORD[rsi]
mov rbx,rsi
movdqu xmm1,XMMWORD[16+rsi]
@@ -6632,34 +4516,34 @@ DB 102,73,15,110,202
DB 102,73,15,110,211
lea rdi,[rsp]
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
- mov rdx,QWORD[((64+0))+rsi]
+ mov rax,QWORD[((64+0))+rsi]
mov r14,QWORD[((64+8))+rsi]
mov r15,QWORD[((64+16))+rsi]
mov r8,QWORD[((64+24))+rsi]
- lea rsi,[((64-128))+rsi]
+ lea rsi,[((64-0))+rsi]
lea rdi,[64+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[((0+0))+rsp]
+ mov rax,QWORD[((0+0))+rsp]
mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((-128+0))+rsp]
+ lea rsi,[((0+0))+rsp]
mov r15,QWORD[((16+0))+rsp]
mov r8,QWORD[((24+0))+rsp]
lea rdi,[rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[32+rbx]
+ mov rax,QWORD[32+rbx]
mov r9,QWORD[((64+0))+rbx]
mov r10,QWORD[((64+8))+rbx]
mov r11,QWORD[((64+16))+rbx]
mov r12,QWORD[((64+24))+rbx]
- lea rsi,[((64-128))+rbx]
+ lea rsi,[((64-0))+rbx]
lea rbx,[32+rbx]
DB 102,72,15,126,215
- call __ecp_nistz256_mul_montx
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
mov r12,QWORD[((96+0))+rsp]
mov r13,QWORD[((96+8))+rsp]
@@ -6667,7 +4551,7 @@ DB 102,72,15,126,215
mov r8,QWORD[((96+16))+rsp]
mov r9,QWORD[((96+24))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_add_tox
+ call __ecp_nistz256_add_toq
mov r12,QWORD[((96+0))+rsp]
mov r13,QWORD[((96+8))+rsp]
@@ -6675,15 +4559,15 @@ DB 102,72,15,126,215
mov r8,QWORD[((96+16))+rsp]
mov r9,QWORD[((96+24))+rsp]
lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- mov rdx,QWORD[((0+0))+rsp]
+ mov rax,QWORD[((0+0))+rsp]
mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((-128+0))+rsp]
+ lea rsi,[((0+0))+rsp]
mov r15,QWORD[((16+0))+rsp]
mov r8,QWORD[((24+0))+rsp]
DB 102,72,15,126,207
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
xor r9,r9
mov rax,r12
add r12,-1
@@ -6722,59 +4606,59 @@ DB 102,72,15,126,207
or r15,r9
mov QWORD[16+rdi],r14
mov QWORD[24+rdi],r15
- mov rdx,QWORD[64+rsp]
+ mov rax,QWORD[64+rsp]
lea rbx,[64+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rdi,[128+rsp]
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
lea rbx,[32+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_add_tox
+ call __ecp_nistz256_add_toq
- mov rdx,QWORD[96+rsp]
+ mov rax,QWORD[96+rsp]
lea rbx,[96+rsp]
mov r9,QWORD[((0+0))+rsp]
mov r10,QWORD[((8+0))+rsp]
- lea rsi,[((-128+0))+rsp]
+ lea rsi,[((0+0))+rsp]
mov r11,QWORD[((16+0))+rsp]
mov r12,QWORD[((24+0))+rsp]
lea rdi,[rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rdi,[128+rsp]
- call __ecp_nistz256_mul_by_2x
+ call __ecp_nistz256_mul_by_2q
- mov rdx,QWORD[((0+32))+rsp]
+ mov rax,QWORD[((0+32))+rsp]
mov r14,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r15,QWORD[((16+32))+rsp]
mov r8,QWORD[((24+32))+rsp]
DB 102,72,15,126,199
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
lea rbx,[128+rsp]
mov r8,r14
mov r9,r15
mov r14,rsi
mov r15,rbp
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
mov rax,QWORD[((0+0))+rsp]
mov rbp,QWORD[((0+8))+rsp]
mov rcx,QWORD[((0+16))+rsp]
mov r10,QWORD[((0+24))+rsp]
lea rdi,[rsp]
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
- mov rdx,QWORD[32+rsp]
+ mov rax,QWORD[32+rsp]
lea rbx,[32+rsp]
mov r14,r12
xor ecx,ecx
@@ -6783,16 +4667,16 @@ DB 102,72,15,126,199
mov QWORD[((0+8))+rsp],r13
cmovz r11,r8
mov QWORD[((0+16))+rsp],r8
- lea rsi,[((0-128))+rsp]
+ lea rsi,[((0-0))+rsp]
cmovz r12,r9
mov QWORD[((0+24))+rsp],r9
mov r9,r14
lea rdi,[rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
DB 102,72,15,126,203
DB 102,72,15,126,207
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
lea rsi,[((160+56))+rsp]
@@ -6810,26 +4694,26 @@ DB 102,72,15,126,207
lea rsp,[rsi]
-$L$point_doublex_epilogue:
+$L$point_doubleq_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_point_doublex:
+$L$SEH_end_ecp_nistz256_point_double:
+global ecp_nistz256_point_add
ALIGN 32
-ecp_nistz256_point_addx:
+ecp_nistz256_point_add:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_addx:
+$L$SEH_begin_ecp_nistz256_point_add:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
-$L$point_addx:
push rbp
push rbx
@@ -6844,7 +4728,7 @@ $L$point_addx:
sub rsp,32*18+8
-$L$point_addx_body:
+$L$point_addq_body:
movdqu xmm0,XMMWORD[rsi]
movdqu xmm1,XMMWORD[16+rsi]
@@ -6868,7 +4752,7 @@ $L$point_addx_body:
movdqu xmm2,XMMWORD[32+rsi]
por xmm5,xmm3
movdqu xmm3,XMMWORD[48+rsi]
- mov rdx,QWORD[((64+0))+rsi]
+ mov rax,QWORD[((64+0))+rsi]
mov r14,QWORD[((64+8))+rsi]
mov r15,QWORD[((64+16))+rsi]
mov r8,QWORD[((64+24))+rsi]
@@ -6884,13 +4768,13 @@ $L$point_addx_body:
por xmm1,xmm0
DB 102,72,15,110,199
- lea rsi,[((64-128))+rsi]
- mov QWORD[((544+0))+rsp],rdx
+ lea rsi,[((64-0))+rsi]
+ mov QWORD[((544+0))+rsp],rax
mov QWORD[((544+8))+rsp],r14
mov QWORD[((544+16))+rsp],r15
mov QWORD[((544+24))+rsp],r8
lea rdi,[96+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
pcmpeqd xmm5,xmm4
pshufd xmm4,xmm1,0xb1
@@ -6901,59 +4785,59 @@ DB 102,72,15,110,199
pxor xmm3,xmm3
pcmpeqd xmm4,xmm3
pshufd xmm4,xmm4,0
- mov rdx,QWORD[((64+0))+rbx]
+ mov rax,QWORD[((64+0))+rbx]
mov r14,QWORD[((64+8))+rbx]
mov r15,QWORD[((64+16))+rbx]
mov r8,QWORD[((64+24))+rbx]
DB 102,72,15,110,203
- lea rsi,[((64-128))+rbx]
+ lea rsi,[((64-0))+rbx]
lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[544+rsp]
+ mov rax,QWORD[544+rsp]
lea rbx,[544+rsp]
mov r9,QWORD[((0+96))+rsp]
mov r10,QWORD[((8+96))+rsp]
- lea rsi,[((-128+96))+rsp]
+ lea rsi,[((0+96))+rsp]
mov r11,QWORD[((16+96))+rsp]
mov r12,QWORD[((24+96))+rsp]
lea rdi,[224+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[448+rsp]
+ mov rax,QWORD[448+rsp]
lea rbx,[448+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[416+rsp]
+ mov rax,QWORD[416+rsp]
lea rbx,[416+rsp]
mov r9,QWORD[((0+224))+rsp]
mov r10,QWORD[((8+224))+rsp]
- lea rsi,[((-128+224))+rsp]
+ lea rsi,[((0+224))+rsp]
mov r11,QWORD[((16+224))+rsp]
mov r12,QWORD[((24+224))+rsp]
lea rdi,[224+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[512+rsp]
+ mov rax,QWORD[512+rsp]
lea rbx,[512+rsp]
mov r9,QWORD[((0+256))+rsp]
mov r10,QWORD[((8+256))+rsp]
- lea rsi,[((-128+256))+rsp]
+ lea rsi,[((0+256))+rsp]
mov r11,QWORD[((16+256))+rsp]
mov r12,QWORD[((24+256))+rsp]
lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[224+rsp]
lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
or r12,r13
movdqa xmm2,xmm4
@@ -6962,29 +4846,29 @@ DB 102,72,15,110,203
por xmm2,xmm5
DB 102,73,15,110,220
- mov rdx,QWORD[384+rsp]
+ mov rax,QWORD[384+rsp]
lea rbx,[384+rsp]
mov r9,QWORD[((0+96))+rsp]
mov r10,QWORD[((8+96))+rsp]
- lea rsi,[((-128+96))+rsp]
+ lea rsi,[((0+96))+rsp]
mov r11,QWORD[((16+96))+rsp]
mov r12,QWORD[((24+96))+rsp]
lea rdi,[160+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[480+rsp]
+ mov rax,QWORD[480+rsp]
lea rbx,[480+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[192+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[160+rsp]
lea rdi,[rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
or r12,r13
or r12,r8
@@ -6998,73 +4882,73 @@ DB 102,73,15,126,217
DB 0x3e
- jnz NEAR $L$add_proceedx
+ jnz NEAR $L$add_proceedq
-$L$add_doublex:
+$L$add_doubleq:
DB 102,72,15,126,206
DB 102,72,15,126,199
add rsp,416
- jmp NEAR $L$point_double_shortcutx
+ jmp NEAR $L$point_double_shortcutq
ALIGN 32
-$L$add_proceedx:
- mov rdx,QWORD[((0+64))+rsp]
+$L$add_proceedq:
+ mov rax,QWORD[((0+64))+rsp]
mov r14,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r15,QWORD[((16+64))+rsp]
mov r8,QWORD[((24+64))+rsp]
lea rdi,[96+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[448+rsp]
+ mov rax,QWORD[448+rsp]
lea rbx,[448+rsp]
mov r9,QWORD[((0+0))+rsp]
mov r10,QWORD[((8+0))+rsp]
- lea rsi,[((-128+0))+rsp]
+ lea rsi,[((0+0))+rsp]
mov r11,QWORD[((16+0))+rsp]
mov r12,QWORD[((24+0))+rsp]
lea rdi,[352+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[((0+0))+rsp]
+ mov rax,QWORD[((0+0))+rsp]
mov r14,QWORD[((8+0))+rsp]
- lea rsi,[((-128+0))+rsp]
+ lea rsi,[((0+0))+rsp]
mov r15,QWORD[((16+0))+rsp]
mov r8,QWORD[((24+0))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[544+rsp]
+ mov rax,QWORD[544+rsp]
lea rbx,[544+rsp]
mov r9,QWORD[((0+352))+rsp]
mov r10,QWORD[((8+352))+rsp]
- lea rsi,[((-128+352))+rsp]
+ lea rsi,[((0+352))+rsp]
mov r11,QWORD[((16+352))+rsp]
mov r12,QWORD[((24+352))+rsp]
lea rdi,[352+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[rsp]
+ mov rax,QWORD[rsp]
lea rbx,[rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[128+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[160+rsp]
+ mov rax,QWORD[160+rsp]
lea rbx,[160+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[192+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
@@ -7096,11 +4980,11 @@ $L$add_proceedx:
cmovc r9,r10
mov r10,QWORD[24+rsi]
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
lea rbx,[128+rsp]
lea rdi,[288+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
mov rax,QWORD[((192+0))+rsp]
mov rbp,QWORD[((192+8))+rsp]
@@ -7108,35 +4992,35 @@ $L$add_proceedx:
mov r10,QWORD[((192+24))+rsp]
lea rdi,[320+rsp]
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
mov QWORD[rdi],r12
mov QWORD[8+rdi],r13
mov QWORD[16+rdi],r8
mov QWORD[24+rdi],r9
- mov rdx,QWORD[128+rsp]
+ mov rax,QWORD[128+rsp]
lea rbx,[128+rsp]
mov r9,QWORD[((0+224))+rsp]
mov r10,QWORD[((8+224))+rsp]
- lea rsi,[((-128+224))+rsp]
+ lea rsi,[((0+224))+rsp]
mov r11,QWORD[((16+224))+rsp]
mov r12,QWORD[((24+224))+rsp]
lea rdi,[256+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[320+rsp]
+ mov rax,QWORD[320+rsp]
lea rbx,[320+rsp]
mov r9,QWORD[((0+64))+rsp]
mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r11,QWORD[((16+64))+rsp]
mov r12,QWORD[((24+64))+rsp]
lea rdi,[320+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[256+rsp]
lea rdi,[320+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
DB 102,72,15,126,199
@@ -7212,7 +5096,7 @@ DB 102,72,15,126,199
movdqu XMMWORD[32+rdi],xmm2
movdqu XMMWORD[48+rdi],xmm3
-$L$add_donex:
+$L$add_doneq:
lea rsi,[((576+56))+rsp]
mov r15,QWORD[((-48))+rsi]
@@ -7229,26 +5113,26 @@ $L$add_donex:
lea rsp,[rsi]
-$L$point_addx_epilogue:
+$L$point_addq_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_point_addx:
+$L$SEH_end_ecp_nistz256_point_add:
+global ecp_nistz256_point_add_affine
ALIGN 32
-ecp_nistz256_point_add_affinex:
+ecp_nistz256_point_add_affine:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
-$L$SEH_begin_ecp_nistz256_point_add_affinex:
+$L$SEH_begin_ecp_nistz256_point_add_affine:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
-$L$point_add_affinex:
push rbp
push rbx
@@ -7263,7 +5147,7 @@ $L$point_add_affinex:
sub rsp,32*15+8
-$L$add_affinex_body:
+$L$add_affineq_body:
movdqu xmm0,XMMWORD[rsi]
mov rbx,rdx
@@ -7272,7 +5156,7 @@ $L$add_affinex_body:
movdqu xmm3,XMMWORD[48+rsi]
movdqu xmm4,XMMWORD[64+rsi]
movdqu xmm5,XMMWORD[80+rsi]
- mov rdx,QWORD[((64+0))+rsi]
+ mov rax,QWORD[((64+0))+rsi]
mov r14,QWORD[((64+8))+rsi]
mov r15,QWORD[((64+16))+rsi]
mov r8,QWORD[((64+24))+rsi]
@@ -7302,13 +5186,13 @@ DB 102,72,15,110,199
pxor xmm4,xmm4
por xmm3,xmm1
- lea rsi,[((64-128))+rsi]
+ lea rsi,[((64-0))+rsi]
lea rdi,[32+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
pcmpeqd xmm5,xmm4
pshufd xmm4,xmm3,0xb1
- mov rdx,QWORD[rbx]
+ mov rax,QWORD[rbx]
mov r9,r12
por xmm4,xmm3
@@ -7321,84 +5205,84 @@ DB 102,72,15,110,199
pcmpeqd xmm4,xmm3
pshufd xmm4,xmm4,0
- lea rsi,[((32-128))+rsp]
+ lea rsi,[((32-0))+rsp]
mov r12,r15
lea rdi,[rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[320+rsp]
lea rdi,[64+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- mov rdx,QWORD[384+rsp]
+ mov rax,QWORD[384+rsp]
lea rbx,[384+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[384+rsp]
+ mov rax,QWORD[384+rsp]
lea rbx,[384+rsp]
mov r9,QWORD[((0+64))+rsp]
mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r11,QWORD[((16+64))+rsp]
mov r12,QWORD[((24+64))+rsp]
lea rdi,[288+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[448+rsp]
+ mov rax,QWORD[448+rsp]
lea rbx,[448+rsp]
mov r9,QWORD[((0+32))+rsp]
mov r10,QWORD[((8+32))+rsp]
- lea rsi,[((-128+32))+rsp]
+ lea rsi,[((0+32))+rsp]
mov r11,QWORD[((16+32))+rsp]
mov r12,QWORD[((24+32))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[352+rsp]
lea rdi,[96+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
- mov rdx,QWORD[((0+64))+rsp]
+ mov rax,QWORD[((0+64))+rsp]
mov r14,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r15,QWORD[((16+64))+rsp]
mov r8,QWORD[((24+64))+rsp]
lea rdi,[128+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[((0+96))+rsp]
+ mov rax,QWORD[((0+96))+rsp]
mov r14,QWORD[((8+96))+rsp]
- lea rsi,[((-128+96))+rsp]
+ lea rsi,[((0+96))+rsp]
mov r15,QWORD[((16+96))+rsp]
mov r8,QWORD[((24+96))+rsp]
lea rdi,[192+rsp]
- call __ecp_nistz256_sqr_montx
+ call __ecp_nistz256_sqr_montq
- mov rdx,QWORD[128+rsp]
+ mov rax,QWORD[128+rsp]
lea rbx,[128+rsp]
mov r9,QWORD[((0+64))+rsp]
mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r11,QWORD[((16+64))+rsp]
mov r12,QWORD[((24+64))+rsp]
lea rdi,[160+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[320+rsp]
+ mov rax,QWORD[320+rsp]
lea rbx,[320+rsp]
mov r9,QWORD[((0+128))+rsp]
mov r10,QWORD[((8+128))+rsp]
- lea rsi,[((-128+128))+rsp]
+ lea rsi,[((0+128))+rsp]
mov r11,QWORD[((16+128))+rsp]
mov r12,QWORD[((24+128))+rsp]
lea rdi,[rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
@@ -7430,11 +5314,11 @@ DB 102,72,15,110,199
cmovc r9,r10
mov r10,QWORD[24+rsi]
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
lea rbx,[160+rsp]
lea rdi,[224+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
mov rax,QWORD[((0+0))+rsp]
mov rbp,QWORD[((0+8))+rsp]
@@ -7442,35 +5326,35 @@ DB 102,72,15,110,199
mov r10,QWORD[((0+24))+rsp]
lea rdi,[64+rsp]
- call __ecp_nistz256_subx
+ call __ecp_nistz256_subq
mov QWORD[rdi],r12
mov QWORD[8+rdi],r13
mov QWORD[16+rdi],r8
mov QWORD[24+rdi],r9
- mov rdx,QWORD[352+rsp]
+ mov rax,QWORD[352+rsp]
lea rbx,[352+rsp]
mov r9,QWORD[((0+160))+rsp]
mov r10,QWORD[((8+160))+rsp]
- lea rsi,[((-128+160))+rsp]
+ lea rsi,[((0+160))+rsp]
mov r11,QWORD[((16+160))+rsp]
mov r12,QWORD[((24+160))+rsp]
lea rdi,[32+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
- mov rdx,QWORD[96+rsp]
+ mov rax,QWORD[96+rsp]
lea rbx,[96+rsp]
mov r9,QWORD[((0+64))+rsp]
mov r10,QWORD[((8+64))+rsp]
- lea rsi,[((-128+64))+rsp]
+ lea rsi,[((0+64))+rsp]
mov r11,QWORD[((16+64))+rsp]
mov r12,QWORD[((24+64))+rsp]
lea rdi,[64+rsp]
- call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_montq
lea rbx,[32+rsp]
lea rdi,[256+rsp]
- call __ecp_nistz256_sub_fromx
+ call __ecp_nistz256_sub_fromq
DB 102,72,15,126,199
@@ -7562,12 +5446,12 @@ DB 102,72,15,126,199
lea rsp,[rsi]
-$L$add_affinex_epilogue:
+$L$add_affineq_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_point_add_affinex:
+$L$SEH_end_ecp_nistz256_point_add_affine:
EXTERN __imp_RtlVirtualUnwind
@@ -7733,13 +5617,6 @@ ALIGN 4
DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont wrt ..imagebase
DD $L$SEH_end_ecp_nistz256_ord_sqr_mont wrt ..imagebase
DD $L$SEH_info_ecp_nistz256_ord_sqr_mont wrt ..imagebase
- DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase
-
- DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase
DD $L$SEH_begin_ecp_nistz256_to_mont wrt ..imagebase
DD $L$SEH_end_ecp_nistz256_to_mont wrt ..imagebase
DD $L$SEH_info_ecp_nistz256_to_mont wrt ..imagebase
@@ -7763,13 +5640,6 @@ ALIGN 4
DD $L$SEH_begin_ecp_nistz256_gather_w7 wrt ..imagebase
DD $L$SEH_end_ecp_nistz256_gather_w7 wrt ..imagebase
DD $L$SEH_info_ecp_nistz256_gather_wX wrt ..imagebase
- DD $L$SEH_begin_ecp_nistz256_avx2_gather_w5 wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_avx2_gather_w5 wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase
-
- DD $L$SEH_begin_ecp_nistz256_avx2_gather_w7 wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_avx2_gather_w7 wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_avx2_gather_wX wrt ..imagebase
DD $L$SEH_begin_ecp_nistz256_point_double wrt ..imagebase
DD $L$SEH_end_ecp_nistz256_point_double wrt ..imagebase
DD $L$SEH_info_ecp_nistz256_point_double wrt ..imagebase
@@ -7781,17 +5651,6 @@ ALIGN 4
DD $L$SEH_begin_ecp_nistz256_point_add_affine wrt ..imagebase
DD $L$SEH_end_ecp_nistz256_point_add_affine wrt ..imagebase
DD $L$SEH_info_ecp_nistz256_point_add_affine wrt ..imagebase
- DD $L$SEH_begin_ecp_nistz256_point_doublex wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_point_doublex wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_point_doublex wrt ..imagebase
-
- DD $L$SEH_begin_ecp_nistz256_point_addx wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_point_addx wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_point_addx wrt ..imagebase
-
- DD $L$SEH_begin_ecp_nistz256_point_add_affinex wrt ..imagebase
- DD $L$SEH_end_ecp_nistz256_point_add_affinex wrt ..imagebase
- DD $L$SEH_info_ecp_nistz256_point_add_affinex wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
@@ -7829,16 +5688,6 @@ DB 9,0,0,0
DD full_handler wrt ..imagebase
DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase
DD 48,0
-$L$SEH_info_ecp_nistz256_ord_mul_montx:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase
- DD 48,0
-$L$SEH_info_ecp_nistz256_ord_sqr_montx:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase
- DD 48,0
$L$SEH_info_ecp_nistz256_to_mont:
DB 9,0,0,0
DD full_handler wrt ..imagebase
@@ -7872,21 +5721,6 @@ DB 0x0c,0x78,0x01,0x00
DB 0x08,0x68,0x00,0x00
DB 0x04,0x01,0x15,0x00
ALIGN 8
-$L$SEH_info_ecp_nistz256_avx2_gather_wX:
-DB 0x01,0x36,0x17,0x0b
-DB 0x36,0xf8,0x09,0x00
-DB 0x31,0xe8,0x08,0x00
-DB 0x2c,0xd8,0x07,0x00
-DB 0x27,0xc8,0x06,0x00
-DB 0x22,0xb8,0x05,0x00
-DB 0x1d,0xa8,0x04,0x00
-DB 0x18,0x98,0x03,0x00
-DB 0x13,0x88,0x02,0x00
-DB 0x0e,0x78,0x01,0x00
-DB 0x09,0x68,0x00,0x00
-DB 0x04,0x01,0x15,0x00
-DB 0x00,0xb3,0x00,0x00
-ALIGN 8
$L$SEH_info_ecp_nistz256_point_double:
DB 9,0,0,0
DD full_handler wrt ..imagebase
@@ -7902,19 +5736,3 @@ DB 9,0,0,0
DD full_handler wrt ..imagebase
DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase
DD 32*15+56,0
-ALIGN 8
-$L$SEH_info_ecp_nistz256_point_doublex:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase
- DD 32*5+56,0
-$L$SEH_info_ecp_nistz256_point_addx:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase
- DD 32*18+56,0
-$L$SEH_info_ecp_nistz256_point_add_affinex:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase
- DD 32*15+56,0
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm
index d5dc6fbc47f..b3f4b8a434d 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/ec/x25519-x86_64.nasm
@@ -409,457 +409,34 @@ $L$fe51_mul121666_body:
$L$fe51_mul121666_epilogue:
$L$SEH_end_x25519_fe51_mul121666:
-EXTERN OPENSSL_ia32cap_P
global x25519_fe64_eligible
ALIGN 32
x25519_fe64_eligible:
- mov ecx,DWORD[((OPENSSL_ia32cap_P+8))]
xor eax,eax
- and ecx,0x80100
- cmp ecx,0x80100
- cmove eax,ecx
DB 0F3h,0C3h ;repret
global x25519_fe64_mul
-ALIGN 32
-x25519_fe64_mul:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_mul:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- push rdi
-
- lea rsp,[((-16))+rsp]
-
-$L$fe64_mul_body:
-
- mov rax,rdx
- mov rbp,QWORD[rdx]
- mov rdx,QWORD[rsi]
- mov rcx,QWORD[8+rax]
- mov r14,QWORD[16+rax]
- mov r15,QWORD[24+rax]
-
- mulx rax,r8,rbp
- xor edi,edi
- mulx rbx,r9,rcx
- adcx r9,rax
- mulx rax,r10,r14
- adcx r10,rbx
- mulx r12,r11,r15
- mov rdx,QWORD[8+rsi]
- adcx r11,rax
- mov QWORD[rsp],r14
- adcx r12,rdi
-
- mulx rbx,rax,rbp
- adox r9,rax
- adcx r10,rbx
- mulx rbx,rax,rcx
- adox r10,rax
- adcx r11,rbx
- mulx rbx,rax,r14
- adox r11,rax
- adcx r12,rbx
- mulx r13,rax,r15
- mov rdx,QWORD[16+rsi]
- adox r12,rax
- adcx r13,rdi
- adox r13,rdi
-
- mulx rbx,rax,rbp
- adcx r10,rax
- adox r11,rbx
- mulx rbx,rax,rcx
- adcx r11,rax
- adox r12,rbx
- mulx rbx,rax,r14
- adcx r12,rax
- adox r13,rbx
- mulx r14,rax,r15
- mov rdx,QWORD[24+rsi]
- adcx r13,rax
- adox r14,rdi
- adcx r14,rdi
-
- mulx rbx,rax,rbp
- adox r11,rax
- adcx r12,rbx
- mulx rbx,rax,rcx
- adox r12,rax
- adcx r13,rbx
- mulx rbx,rax,QWORD[rsp]
- adox r13,rax
- adcx r14,rbx
- mulx r15,rax,r15
- mov edx,38
- adox r14,rax
- adcx r15,rdi
- adox r15,rdi
-
- jmp NEAR $L$reduce64
-$L$fe64_mul_epilogue:
-
-$L$SEH_end_x25519_fe64_mul:
-
global x25519_fe64_sqr
-
-ALIGN 32
-x25519_fe64_sqr:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_sqr:
- mov rdi,rcx
- mov rsi,rdx
-
-
-
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- push rdi
-
- lea rsp,[((-16))+rsp]
-
-$L$fe64_sqr_body:
-
- mov rdx,QWORD[rsi]
- mov rcx,QWORD[8+rsi]
- mov rbp,QWORD[16+rsi]
- mov rsi,QWORD[24+rsi]
-
-
- mulx r15,r8,rdx
- mulx rax,r9,rcx
- xor edi,edi
- mulx rbx,r10,rbp
- adcx r10,rax
- mulx r12,r11,rsi
- mov rdx,rcx
- adcx r11,rbx
- adcx r12,rdi
-
-
- mulx rbx,rax,rbp
- adox r11,rax
- adcx r12,rbx
- mulx r13,rax,rsi
- mov rdx,rbp
- adox r12,rax
- adcx r13,rdi
-
-
- mulx r14,rax,rsi
- mov rdx,rcx
- adox r13,rax
- adcx r14,rdi
- adox r14,rdi
-
- adcx r9,r9
- adox r9,r15
- adcx r10,r10
- mulx rbx,rax,rdx
- mov rdx,rbp
- adcx r11,r11
- adox r10,rax
- adcx r12,r12
- adox r11,rbx
- mulx rbx,rax,rdx
- mov rdx,rsi
- adcx r13,r13
- adox r12,rax
- adcx r14,r14
- adox r13,rbx
- mulx r15,rax,rdx
- mov edx,38
- adox r14,rax
- adcx r15,rdi
- adox r15,rdi
- jmp NEAR $L$reduce64
-
-ALIGN 32
-$L$reduce64:
- mulx rbx,rax,r12
- adcx r8,rax
- adox r9,rbx
- mulx rbx,rax,r13
- adcx r9,rax
- adox r10,rbx
- mulx rbx,rax,r14
- adcx r10,rax
- adox r11,rbx
- mulx r12,rax,r15
- adcx r11,rax
- adox r12,rdi
- adcx r12,rdi
-
- mov rdi,QWORD[16+rsp]
- imul r12,rdx
-
- add r8,r12
- adc r9,0
- adc r10,0
- adc r11,0
-
- sbb rax,rax
- and rax,38
-
- add r8,rax
- mov QWORD[8+rdi],r9
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
- mov QWORD[rdi],r8
-
- mov r15,QWORD[24+rsp]
-
- mov r14,QWORD[32+rsp]
-
- mov r13,QWORD[40+rsp]
-
- mov r12,QWORD[48+rsp]
-
- mov rbx,QWORD[56+rsp]
-
- mov rbp,QWORD[64+rsp]
-
- lea rsp,[72+rsp]
-
-$L$fe64_sqr_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_x25519_fe64_sqr:
-
global x25519_fe64_mul121666
-
-ALIGN 32
-x25519_fe64_mul121666:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_mul121666:
- mov rdi,rcx
- mov rsi,rdx
-
-
-$L$fe64_mul121666_body:
-
- mov edx,121666
- mulx rcx,r8,QWORD[rsi]
- mulx rax,r9,QWORD[8+rsi]
- add r9,rcx
- mulx rcx,r10,QWORD[16+rsi]
- adc r10,rax
- mulx rax,r11,QWORD[24+rsi]
- adc r11,rcx
- adc rax,0
-
- imul rax,rax,38
-
- add r8,rax
- adc r9,0
- adc r10,0
- adc r11,0
-
- sbb rax,rax
- and rax,38
-
- add r8,rax
- mov QWORD[8+rdi],r9
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
- mov QWORD[rdi],r8
-
-$L$fe64_mul121666_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_x25519_fe64_mul121666:
-
global x25519_fe64_add
-
-ALIGN 32
-x25519_fe64_add:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_add:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-$L$fe64_add_body:
-
- mov r8,QWORD[rsi]
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
-
- add r8,QWORD[rdx]
- adc r9,QWORD[8+rdx]
- adc r10,QWORD[16+rdx]
- adc r11,QWORD[24+rdx]
-
- sbb rax,rax
- and rax,38
-
- add r8,rax
- adc r9,0
- adc r10,0
- mov QWORD[8+rdi],r9
- adc r11,0
- mov QWORD[16+rdi],r10
- sbb rax,rax
- mov QWORD[24+rdi],r11
- and rax,38
-
- add r8,rax
- mov QWORD[rdi],r8
-
-$L$fe64_add_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_x25519_fe64_add:
-
global x25519_fe64_sub
-
-ALIGN 32
-x25519_fe64_sub:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_sub:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-$L$fe64_sub_body:
-
- mov r8,QWORD[rsi]
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
-
- sub r8,QWORD[rdx]
- sbb r9,QWORD[8+rdx]
- sbb r10,QWORD[16+rdx]
- sbb r11,QWORD[24+rdx]
-
- sbb rax,rax
- and rax,38
-
- sub r8,rax
- sbb r9,0
- sbb r10,0
- mov QWORD[8+rdi],r9
- sbb r11,0
- mov QWORD[16+rdi],r10
- sbb rax,rax
- mov QWORD[24+rdi],r11
- and rax,38
-
- sub r8,rax
- mov QWORD[rdi],r8
-
-$L$fe64_sub_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_x25519_fe64_sub:
-
global x25519_fe64_tobytes
-
-ALIGN 32
+x25519_fe64_mul:
+x25519_fe64_sqr:
+x25519_fe64_mul121666:
+x25519_fe64_add:
+x25519_fe64_sub:
x25519_fe64_tobytes:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_x25519_fe64_tobytes:
- mov rdi,rcx
- mov rsi,rdx
-
-$L$fe64_to_body:
-
- mov r8,QWORD[rsi]
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
-
-
- lea rax,[r11*1+r11]
- sar r11,63
- shr rax,1
- and r11,19
- add r11,19
-
- add r8,r11
- adc r9,0
- adc r10,0
- adc rax,0
-
- lea r11,[rax*1+rax]
- sar rax,63
- shr r11,1
- not rax
- and rax,19
-
- sub r8,rax
- sbb r9,0
- sbb r10,0
- sbb r11,0
-
- mov QWORD[rdi],r8
- mov QWORD[8+rdi],r9
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
-
-$L$fe64_to_epilogue:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
+DB 0x0f,0x0b
DB 0F3h,0C3h ;repret
-$L$SEH_end_x25519_fe64_tobytes:
+
DB 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101
DB 115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
@@ -996,29 +573,6 @@ ALIGN 4
DD $L$SEH_begin_x25519_fe51_mul121666 wrt ..imagebase
DD $L$SEH_end_x25519_fe51_mul121666 wrt ..imagebase
DD $L$SEH_info_x25519_fe51_mul121666 wrt ..imagebase
- DD $L$SEH_begin_x25519_fe64_mul wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_mul wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_mul wrt ..imagebase
-
- DD $L$SEH_begin_x25519_fe64_sqr wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_sqr wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_sqr wrt ..imagebase
-
- DD $L$SEH_begin_x25519_fe64_mul121666 wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_mul121666 wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_mul121666 wrt ..imagebase
-
- DD $L$SEH_begin_x25519_fe64_add wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_add wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_add wrt ..imagebase
-
- DD $L$SEH_begin_x25519_fe64_sub wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_sub wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_sub wrt ..imagebase
-
- DD $L$SEH_begin_x25519_fe64_tobytes wrt ..imagebase
- DD $L$SEH_end_x25519_fe64_tobytes wrt ..imagebase
- DD $L$SEH_info_x25519_fe64_tobytes wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_x25519_fe51_mul:
@@ -1036,29 +590,3 @@ DB 9,0,0,0
DD full_handler wrt ..imagebase
DD $L$fe51_mul121666_body wrt ..imagebase,$L$fe51_mul121666_epilogue wrt ..imagebase
DD 88,0
-$L$SEH_info_x25519_fe64_mul:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$fe64_mul_body wrt ..imagebase,$L$fe64_mul_epilogue wrt ..imagebase
- DD 72,0
-$L$SEH_info_x25519_fe64_sqr:
-DB 9,0,0,0
- DD full_handler wrt ..imagebase
- DD $L$fe64_sqr_body wrt ..imagebase,$L$fe64_sqr_epilogue wrt ..imagebase
- DD 72,0
-$L$SEH_info_x25519_fe64_mul121666:
-DB 9,0,0,0
- DD short_handler wrt ..imagebase
- DD $L$fe64_mul121666_body wrt ..imagebase,$L$fe64_mul121666_epilogue wrt ..imagebase
-$L$SEH_info_x25519_fe64_add:
-DB 9,0,0,0
- DD short_handler wrt ..imagebase
- DD $L$fe64_add_body wrt ..imagebase,$L$fe64_add_epilogue wrt ..imagebase
-$L$SEH_info_x25519_fe64_sub:
-DB 9,0,0,0
- DD short_handler wrt ..imagebase
- DD $L$fe64_sub_body wrt ..imagebase,$L$fe64_sub_epilogue wrt ..imagebase
-$L$SEH_info_x25519_fe64_tobytes:
-DB 9,0,0,0
- DD short_handler wrt ..imagebase
- DD $L$fe64_to_body wrt ..imagebase,$L$fe64_to_epilogue wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm
index cbc06ca5fae..d2812b08d12 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm
@@ -5,977 +5,20 @@ default rel
section .text code align=64
-
-ALIGN 32
-_aesni_ctr32_ghash_6x:
-
- vmovdqu xmm2,XMMWORD[32+r11]
- sub rdx,6
- vpxor xmm4,xmm4,xmm4
- vmovdqu xmm15,XMMWORD[((0-128))+rcx]
- vpaddb xmm10,xmm1,xmm2
- vpaddb xmm11,xmm10,xmm2
- vpaddb xmm12,xmm11,xmm2
- vpaddb xmm13,xmm12,xmm2
- vpaddb xmm14,xmm13,xmm2
- vpxor xmm9,xmm1,xmm15
- vmovdqu XMMWORD[(16+8)+rsp],xmm4
- jmp NEAR $L$oop6x
-
-ALIGN 32
-$L$oop6x:
- add ebx,100663296
- jc NEAR $L$handle_ctr32
- vmovdqu xmm3,XMMWORD[((0-32))+r9]
- vpaddb xmm1,xmm14,xmm2
- vpxor xmm10,xmm10,xmm15
- vpxor xmm11,xmm11,xmm15
-
-$L$resume_ctr32:
- vmovdqu XMMWORD[r8],xmm1
- vpclmulqdq xmm5,xmm7,xmm3,0x10
- vpxor xmm12,xmm12,xmm15
- vmovups xmm2,XMMWORD[((16-128))+rcx]
- vpclmulqdq xmm6,xmm7,xmm3,0x01
- xor r12,r12
- cmp r15,r14
-
- vaesenc xmm9,xmm9,xmm2
- vmovdqu xmm0,XMMWORD[((48+8))+rsp]
- vpxor xmm13,xmm13,xmm15
- vpclmulqdq xmm1,xmm7,xmm3,0x00
- vaesenc xmm10,xmm10,xmm2
- vpxor xmm14,xmm14,xmm15
- setnc r12b
- vpclmulqdq xmm7,xmm7,xmm3,0x11
- vaesenc xmm11,xmm11,xmm2
- vmovdqu xmm3,XMMWORD[((16-32))+r9]
- neg r12
- vaesenc xmm12,xmm12,xmm2
- vpxor xmm6,xmm6,xmm5
- vpclmulqdq xmm5,xmm0,xmm3,0x00
- vpxor xmm8,xmm8,xmm4
- vaesenc xmm13,xmm13,xmm2
- vpxor xmm4,xmm1,xmm5
- and r12,0x60
- vmovups xmm15,XMMWORD[((32-128))+rcx]
- vpclmulqdq xmm1,xmm0,xmm3,0x10
- vaesenc xmm14,xmm14,xmm2
-
- vpclmulqdq xmm2,xmm0,xmm3,0x01
- lea r14,[r12*1+r14]
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp]
- vpclmulqdq xmm3,xmm0,xmm3,0x11
- vmovdqu xmm0,XMMWORD[((64+8))+rsp]
- vaesenc xmm10,xmm10,xmm15
- movbe r13,QWORD[88+r14]
- vaesenc xmm11,xmm11,xmm15
- movbe r12,QWORD[80+r14]
- vaesenc xmm12,xmm12,xmm15
- mov QWORD[((32+8))+rsp],r13
- vaesenc xmm13,xmm13,xmm15
- mov QWORD[((40+8))+rsp],r12
- vmovdqu xmm5,XMMWORD[((48-32))+r9]
- vaesenc xmm14,xmm14,xmm15
-
- vmovups xmm15,XMMWORD[((48-128))+rcx]
- vpxor xmm6,xmm6,xmm1
- vpclmulqdq xmm1,xmm0,xmm5,0x00
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm6,xmm6,xmm2
- vpclmulqdq xmm2,xmm0,xmm5,0x10
- vaesenc xmm10,xmm10,xmm15
- vpxor xmm7,xmm7,xmm3
- vpclmulqdq xmm3,xmm0,xmm5,0x01
- vaesenc xmm11,xmm11,xmm15
- vpclmulqdq xmm5,xmm0,xmm5,0x11
- vmovdqu xmm0,XMMWORD[((80+8))+rsp]
- vaesenc xmm12,xmm12,xmm15
- vaesenc xmm13,xmm13,xmm15
- vpxor xmm4,xmm4,xmm1
- vmovdqu xmm1,XMMWORD[((64-32))+r9]
- vaesenc xmm14,xmm14,xmm15
-
- vmovups xmm15,XMMWORD[((64-128))+rcx]
- vpxor xmm6,xmm6,xmm2
- vpclmulqdq xmm2,xmm0,xmm1,0x00
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm6,xmm6,xmm3
- vpclmulqdq xmm3,xmm0,xmm1,0x10
- vaesenc xmm10,xmm10,xmm15
- movbe r13,QWORD[72+r14]
- vpxor xmm7,xmm7,xmm5
- vpclmulqdq xmm5,xmm0,xmm1,0x01
- vaesenc xmm11,xmm11,xmm15
- movbe r12,QWORD[64+r14]
- vpclmulqdq xmm1,xmm0,xmm1,0x11
- vmovdqu xmm0,XMMWORD[((96+8))+rsp]
- vaesenc xmm12,xmm12,xmm15
- mov QWORD[((48+8))+rsp],r13
- vaesenc xmm13,xmm13,xmm15
- mov QWORD[((56+8))+rsp],r12
- vpxor xmm4,xmm4,xmm2
- vmovdqu xmm2,XMMWORD[((96-32))+r9]
- vaesenc xmm14,xmm14,xmm15
-
- vmovups xmm15,XMMWORD[((80-128))+rcx]
- vpxor xmm6,xmm6,xmm3
- vpclmulqdq xmm3,xmm0,xmm2,0x00
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm6,xmm6,xmm5
- vpclmulqdq xmm5,xmm0,xmm2,0x10
- vaesenc xmm10,xmm10,xmm15
- movbe r13,QWORD[56+r14]
- vpxor xmm7,xmm7,xmm1
- vpclmulqdq xmm1,xmm0,xmm2,0x01
- vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp]
- vaesenc xmm11,xmm11,xmm15
- movbe r12,QWORD[48+r14]
- vpclmulqdq xmm2,xmm0,xmm2,0x11
- vaesenc xmm12,xmm12,xmm15
- mov QWORD[((64+8))+rsp],r13
- vaesenc xmm13,xmm13,xmm15
- mov QWORD[((72+8))+rsp],r12
- vpxor xmm4,xmm4,xmm3
- vmovdqu xmm3,XMMWORD[((112-32))+r9]
- vaesenc xmm14,xmm14,xmm15
-
- vmovups xmm15,XMMWORD[((96-128))+rcx]
- vpxor xmm6,xmm6,xmm5
- vpclmulqdq xmm5,xmm8,xmm3,0x10
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm6,xmm6,xmm1
- vpclmulqdq xmm1,xmm8,xmm3,0x01
- vaesenc xmm10,xmm10,xmm15
- movbe r13,QWORD[40+r14]
- vpxor xmm7,xmm7,xmm2
- vpclmulqdq xmm2,xmm8,xmm3,0x00
- vaesenc xmm11,xmm11,xmm15
- movbe r12,QWORD[32+r14]
- vpclmulqdq xmm8,xmm8,xmm3,0x11
- vaesenc xmm12,xmm12,xmm15
- mov QWORD[((80+8))+rsp],r13
- vaesenc xmm13,xmm13,xmm15
- mov QWORD[((88+8))+rsp],r12
- vpxor xmm6,xmm6,xmm5
- vaesenc xmm14,xmm14,xmm15
- vpxor xmm6,xmm6,xmm1
-
- vmovups xmm15,XMMWORD[((112-128))+rcx]
- vpslldq xmm5,xmm6,8
- vpxor xmm4,xmm4,xmm2
- vmovdqu xmm3,XMMWORD[16+r11]
-
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm7,xmm7,xmm8
- vaesenc xmm10,xmm10,xmm15
- vpxor xmm4,xmm4,xmm5
- movbe r13,QWORD[24+r14]
- vaesenc xmm11,xmm11,xmm15
- movbe r12,QWORD[16+r14]
- vpalignr xmm0,xmm4,xmm4,8
- vpclmulqdq xmm4,xmm4,xmm3,0x10
- mov QWORD[((96+8))+rsp],r13
- vaesenc xmm12,xmm12,xmm15
- mov QWORD[((104+8))+rsp],r12
- vaesenc xmm13,xmm13,xmm15
- vmovups xmm1,XMMWORD[((128-128))+rcx]
- vaesenc xmm14,xmm14,xmm15
-
- vaesenc xmm9,xmm9,xmm1
- vmovups xmm15,XMMWORD[((144-128))+rcx]
- vaesenc xmm10,xmm10,xmm1
- vpsrldq xmm6,xmm6,8
- vaesenc xmm11,xmm11,xmm1
- vpxor xmm7,xmm7,xmm6
- vaesenc xmm12,xmm12,xmm1
- vpxor xmm4,xmm4,xmm0
- movbe r13,QWORD[8+r14]
- vaesenc xmm13,xmm13,xmm1
- movbe r12,QWORD[r14]
- vaesenc xmm14,xmm14,xmm1
- vmovups xmm1,XMMWORD[((160-128))+rcx]
- cmp ebp,11
- jb NEAR $L$enc_tail
-
- vaesenc xmm9,xmm9,xmm15
- vaesenc xmm10,xmm10,xmm15
- vaesenc xmm11,xmm11,xmm15
- vaesenc xmm12,xmm12,xmm15
- vaesenc xmm13,xmm13,xmm15
- vaesenc xmm14,xmm14,xmm15
-
- vaesenc xmm9,xmm9,xmm1
- vaesenc xmm10,xmm10,xmm1
- vaesenc xmm11,xmm11,xmm1
- vaesenc xmm12,xmm12,xmm1
- vaesenc xmm13,xmm13,xmm1
- vmovups xmm15,XMMWORD[((176-128))+rcx]
- vaesenc xmm14,xmm14,xmm1
- vmovups xmm1,XMMWORD[((192-128))+rcx]
- je NEAR $L$enc_tail
-
- vaesenc xmm9,xmm9,xmm15
- vaesenc xmm10,xmm10,xmm15
- vaesenc xmm11,xmm11,xmm15
- vaesenc xmm12,xmm12,xmm15
- vaesenc xmm13,xmm13,xmm15
- vaesenc xmm14,xmm14,xmm15
-
- vaesenc xmm9,xmm9,xmm1
- vaesenc xmm10,xmm10,xmm1
- vaesenc xmm11,xmm11,xmm1
- vaesenc xmm12,xmm12,xmm1
- vaesenc xmm13,xmm13,xmm1
- vmovups xmm15,XMMWORD[((208-128))+rcx]
- vaesenc xmm14,xmm14,xmm1
- vmovups xmm1,XMMWORD[((224-128))+rcx]
- jmp NEAR $L$enc_tail
-
-ALIGN 32
-$L$handle_ctr32:
- vmovdqu xmm0,XMMWORD[r11]
- vpshufb xmm6,xmm1,xmm0
- vmovdqu xmm5,XMMWORD[48+r11]
- vpaddd xmm10,xmm6,XMMWORD[64+r11]
- vpaddd xmm11,xmm6,xmm5
- vmovdqu xmm3,XMMWORD[((0-32))+r9]
- vpaddd xmm12,xmm10,xmm5
- vpshufb xmm10,xmm10,xmm0
- vpaddd xmm13,xmm11,xmm5
- vpshufb xmm11,xmm11,xmm0
- vpxor xmm10,xmm10,xmm15
- vpaddd xmm14,xmm12,xmm5
- vpshufb xmm12,xmm12,xmm0
- vpxor xmm11,xmm11,xmm15
- vpaddd xmm1,xmm13,xmm5
- vpshufb xmm13,xmm13,xmm0
- vpshufb xmm14,xmm14,xmm0
- vpshufb xmm1,xmm1,xmm0
- jmp NEAR $L$resume_ctr32
-
-ALIGN 32
-$L$enc_tail:
- vaesenc xmm9,xmm9,xmm15
- vmovdqu XMMWORD[(16+8)+rsp],xmm7
- vpalignr xmm8,xmm4,xmm4,8
- vaesenc xmm10,xmm10,xmm15
- vpclmulqdq xmm4,xmm4,xmm3,0x10
- vpxor xmm2,xmm1,XMMWORD[rdi]
- vaesenc xmm11,xmm11,xmm15
- vpxor xmm0,xmm1,XMMWORD[16+rdi]
- vaesenc xmm12,xmm12,xmm15
- vpxor xmm5,xmm1,XMMWORD[32+rdi]
- vaesenc xmm13,xmm13,xmm15
- vpxor xmm6,xmm1,XMMWORD[48+rdi]
- vaesenc xmm14,xmm14,xmm15
- vpxor xmm7,xmm1,XMMWORD[64+rdi]
- vpxor xmm3,xmm1,XMMWORD[80+rdi]
- vmovdqu xmm1,XMMWORD[r8]
-
- vaesenclast xmm9,xmm9,xmm2
- vmovdqu xmm2,XMMWORD[32+r11]
- vaesenclast xmm10,xmm10,xmm0
- vpaddb xmm0,xmm1,xmm2
- mov QWORD[((112+8))+rsp],r13
- lea rdi,[96+rdi]
- vaesenclast xmm11,xmm11,xmm5
- vpaddb xmm5,xmm0,xmm2
- mov QWORD[((120+8))+rsp],r12
- lea rsi,[96+rsi]
- vmovdqu xmm15,XMMWORD[((0-128))+rcx]
- vaesenclast xmm12,xmm12,xmm6
- vpaddb xmm6,xmm5,xmm2
- vaesenclast xmm13,xmm13,xmm7
- vpaddb xmm7,xmm6,xmm2
- vaesenclast xmm14,xmm14,xmm3
- vpaddb xmm3,xmm7,xmm2
-
- add r10,0x60
- sub rdx,0x6
- jc NEAR $L$6x_done
-
- vmovups XMMWORD[(-96)+rsi],xmm9
- vpxor xmm9,xmm1,xmm15
- vmovups XMMWORD[(-80)+rsi],xmm10
- vmovdqa xmm10,xmm0
- vmovups XMMWORD[(-64)+rsi],xmm11
- vmovdqa xmm11,xmm5
- vmovups XMMWORD[(-48)+rsi],xmm12
- vmovdqa xmm12,xmm6
- vmovups XMMWORD[(-32)+rsi],xmm13
- vmovdqa xmm13,xmm7
- vmovups XMMWORD[(-16)+rsi],xmm14
- vmovdqa xmm14,xmm3
- vmovdqu xmm7,XMMWORD[((32+8))+rsp]
- jmp NEAR $L$oop6x
-
-$L$6x_done:
- vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp]
- vpxor xmm8,xmm8,xmm4
-
- DB 0F3h,0C3h ;repret
-
-
-global aesni_gcm_decrypt
-
-ALIGN 32
-aesni_gcm_decrypt:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_gcm_decrypt:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- xor r10,r10
- cmp rdx,0x60
- jb NEAR $L$gcm_dec_abort
-
- lea rax,[rsp]
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[(-216)+rax],xmm6
- movaps XMMWORD[(-200)+rax],xmm7
- movaps XMMWORD[(-184)+rax],xmm8
- movaps XMMWORD[(-168)+rax],xmm9
- movaps XMMWORD[(-152)+rax],xmm10
- movaps XMMWORD[(-136)+rax],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
-$L$gcm_dec_body:
- vzeroupper
-
- vmovdqu xmm1,XMMWORD[r8]
- add rsp,-128
- mov ebx,DWORD[12+r8]
- lea r11,[$L$bswap_mask]
- lea r14,[((-128))+rcx]
- mov r15,0xf80
- vmovdqu xmm8,XMMWORD[r9]
- and rsp,-128
- vmovdqu xmm0,XMMWORD[r11]
- lea rcx,[128+rcx]
- lea r9,[((32+32))+r9]
- mov ebp,DWORD[((240-128))+rcx]
- vpshufb xmm8,xmm8,xmm0
-
- and r14,r15
- and r15,rsp
- sub r15,r14
- jc NEAR $L$dec_no_key_aliasing
- cmp r15,768
- jnc NEAR $L$dec_no_key_aliasing
- sub rsp,r15
-$L$dec_no_key_aliasing:
-
- vmovdqu xmm7,XMMWORD[80+rdi]
- lea r14,[rdi]
- vmovdqu xmm4,XMMWORD[64+rdi]
- lea r15,[((-192))+rdx*1+rdi]
- vmovdqu xmm5,XMMWORD[48+rdi]
- shr rdx,4
- xor r10,r10
- vmovdqu xmm6,XMMWORD[32+rdi]
- vpshufb xmm7,xmm7,xmm0
- vmovdqu xmm2,XMMWORD[16+rdi]
- vpshufb xmm4,xmm4,xmm0
- vmovdqu xmm3,XMMWORD[rdi]
- vpshufb xmm5,xmm5,xmm0
- vmovdqu XMMWORD[48+rsp],xmm4
- vpshufb xmm6,xmm6,xmm0
- vmovdqu XMMWORD[64+rsp],xmm5
- vpshufb xmm2,xmm2,xmm0
- vmovdqu XMMWORD[80+rsp],xmm6
- vpshufb xmm3,xmm3,xmm0
- vmovdqu XMMWORD[96+rsp],xmm2
- vmovdqu XMMWORD[112+rsp],xmm3
-
- call _aesni_ctr32_ghash_6x
-
- vmovups XMMWORD[(-96)+rsi],xmm9
- vmovups XMMWORD[(-80)+rsi],xmm10
- vmovups XMMWORD[(-64)+rsi],xmm11
- vmovups XMMWORD[(-48)+rsi],xmm12
- vmovups XMMWORD[(-32)+rsi],xmm13
- vmovups XMMWORD[(-16)+rsi],xmm14
-
- vpshufb xmm8,xmm8,XMMWORD[r11]
- vmovdqu XMMWORD[(-64)+r9],xmm8
-
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$gcm_dec_abort:
- mov rax,r10
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_aesni_gcm_decrypt:
-
-ALIGN 32
-_aesni_ctr32_6x:
-
- vmovdqu xmm4,XMMWORD[((0-128))+rcx]
- vmovdqu xmm2,XMMWORD[32+r11]
- lea r13,[((-1))+rbp]
- vmovups xmm15,XMMWORD[((16-128))+rcx]
- lea r12,[((32-128))+rcx]
- vpxor xmm9,xmm1,xmm4
- add ebx,100663296
- jc NEAR $L$handle_ctr32_2
- vpaddb xmm10,xmm1,xmm2
- vpaddb xmm11,xmm10,xmm2
- vpxor xmm10,xmm10,xmm4
- vpaddb xmm12,xmm11,xmm2
- vpxor xmm11,xmm11,xmm4
- vpaddb xmm13,xmm12,xmm2
- vpxor xmm12,xmm12,xmm4
- vpaddb xmm14,xmm13,xmm2
- vpxor xmm13,xmm13,xmm4
- vpaddb xmm1,xmm14,xmm2
- vpxor xmm14,xmm14,xmm4
- jmp NEAR $L$oop_ctr32
-
-ALIGN 16
-$L$oop_ctr32:
- vaesenc xmm9,xmm9,xmm15
- vaesenc xmm10,xmm10,xmm15
- vaesenc xmm11,xmm11,xmm15
- vaesenc xmm12,xmm12,xmm15
- vaesenc xmm13,xmm13,xmm15
- vaesenc xmm14,xmm14,xmm15
- vmovups xmm15,XMMWORD[r12]
- lea r12,[16+r12]
- dec r13d
- jnz NEAR $L$oop_ctr32
-
- vmovdqu xmm3,XMMWORD[r12]
- vaesenc xmm9,xmm9,xmm15
- vpxor xmm4,xmm3,XMMWORD[rdi]
- vaesenc xmm10,xmm10,xmm15
- vpxor xmm5,xmm3,XMMWORD[16+rdi]
- vaesenc xmm11,xmm11,xmm15
- vpxor xmm6,xmm3,XMMWORD[32+rdi]
- vaesenc xmm12,xmm12,xmm15
- vpxor xmm8,xmm3,XMMWORD[48+rdi]
- vaesenc xmm13,xmm13,xmm15
- vpxor xmm2,xmm3,XMMWORD[64+rdi]
- vaesenc xmm14,xmm14,xmm15
- vpxor xmm3,xmm3,XMMWORD[80+rdi]
- lea rdi,[96+rdi]
-
- vaesenclast xmm9,xmm9,xmm4
- vaesenclast xmm10,xmm10,xmm5
- vaesenclast xmm11,xmm11,xmm6
- vaesenclast xmm12,xmm12,xmm8
- vaesenclast xmm13,xmm13,xmm2
- vaesenclast xmm14,xmm14,xmm3
- vmovups XMMWORD[rsi],xmm9
- vmovups XMMWORD[16+rsi],xmm10
- vmovups XMMWORD[32+rsi],xmm11
- vmovups XMMWORD[48+rsi],xmm12
- vmovups XMMWORD[64+rsi],xmm13
- vmovups XMMWORD[80+rsi],xmm14
- lea rsi,[96+rsi]
-
- DB 0F3h,0C3h ;repret
-ALIGN 32
-$L$handle_ctr32_2:
- vpshufb xmm6,xmm1,xmm0
- vmovdqu xmm5,XMMWORD[48+r11]
- vpaddd xmm10,xmm6,XMMWORD[64+r11]
- vpaddd xmm11,xmm6,xmm5
- vpaddd xmm12,xmm10,xmm5
- vpshufb xmm10,xmm10,xmm0
- vpaddd xmm13,xmm11,xmm5
- vpshufb xmm11,xmm11,xmm0
- vpxor xmm10,xmm10,xmm4
- vpaddd xmm14,xmm12,xmm5
- vpshufb xmm12,xmm12,xmm0
- vpxor xmm11,xmm11,xmm4
- vpaddd xmm1,xmm13,xmm5
- vpshufb xmm13,xmm13,xmm0
- vpxor xmm12,xmm12,xmm4
- vpshufb xmm14,xmm14,xmm0
- vpxor xmm13,xmm13,xmm4
- vpshufb xmm1,xmm1,xmm0
- vpxor xmm14,xmm14,xmm4
- jmp NEAR $L$oop_ctr32
-
-
-
global aesni_gcm_encrypt
-ALIGN 32
aesni_gcm_encrypt:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_aesni_gcm_encrypt:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
-
- xor r10,r10
- cmp rdx,0x60*3
- jb NEAR $L$gcm_enc_abort
-
- lea rax,[rsp]
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[(-216)+rax],xmm6
- movaps XMMWORD[(-200)+rax],xmm7
- movaps XMMWORD[(-184)+rax],xmm8
- movaps XMMWORD[(-168)+rax],xmm9
- movaps XMMWORD[(-152)+rax],xmm10
- movaps XMMWORD[(-136)+rax],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
-$L$gcm_enc_body:
- vzeroupper
-
- vmovdqu xmm1,XMMWORD[r8]
- add rsp,-128
- mov ebx,DWORD[12+r8]
- lea r11,[$L$bswap_mask]
- lea r14,[((-128))+rcx]
- mov r15,0xf80
- lea rcx,[128+rcx]
- vmovdqu xmm0,XMMWORD[r11]
- and rsp,-128
- mov ebp,DWORD[((240-128))+rcx]
-
- and r14,r15
- and r15,rsp
- sub r15,r14
- jc NEAR $L$enc_no_key_aliasing
- cmp r15,768
- jnc NEAR $L$enc_no_key_aliasing
- sub rsp,r15
-$L$enc_no_key_aliasing:
-
- lea r14,[rsi]
- lea r15,[((-192))+rdx*1+rsi]
- shr rdx,4
-
- call _aesni_ctr32_6x
- vpshufb xmm8,xmm9,xmm0
- vpshufb xmm2,xmm10,xmm0
- vmovdqu XMMWORD[112+rsp],xmm8
- vpshufb xmm4,xmm11,xmm0
- vmovdqu XMMWORD[96+rsp],xmm2
- vpshufb xmm5,xmm12,xmm0
- vmovdqu XMMWORD[80+rsp],xmm4
- vpshufb xmm6,xmm13,xmm0
- vmovdqu XMMWORD[64+rsp],xmm5
- vpshufb xmm7,xmm14,xmm0
- vmovdqu XMMWORD[48+rsp],xmm6
-
- call _aesni_ctr32_6x
-
- vmovdqu xmm8,XMMWORD[r9]
- lea r9,[((32+32))+r9]
- sub rdx,12
- mov r10,0x60*2
- vpshufb xmm8,xmm8,xmm0
-
- call _aesni_ctr32_ghash_6x
- vmovdqu xmm7,XMMWORD[32+rsp]
- vmovdqu xmm0,XMMWORD[r11]
- vmovdqu xmm3,XMMWORD[((0-32))+r9]
- vpunpckhqdq xmm1,xmm7,xmm7
- vmovdqu xmm15,XMMWORD[((32-32))+r9]
- vmovups XMMWORD[(-96)+rsi],xmm9
- vpshufb xmm9,xmm9,xmm0
- vpxor xmm1,xmm1,xmm7
- vmovups XMMWORD[(-80)+rsi],xmm10
- vpshufb xmm10,xmm10,xmm0
- vmovups XMMWORD[(-64)+rsi],xmm11
- vpshufb xmm11,xmm11,xmm0
- vmovups XMMWORD[(-48)+rsi],xmm12
- vpshufb xmm12,xmm12,xmm0
- vmovups XMMWORD[(-32)+rsi],xmm13
- vpshufb xmm13,xmm13,xmm0
- vmovups XMMWORD[(-16)+rsi],xmm14
- vpshufb xmm14,xmm14,xmm0
- vmovdqu XMMWORD[16+rsp],xmm9
- vmovdqu xmm6,XMMWORD[48+rsp]
- vmovdqu xmm0,XMMWORD[((16-32))+r9]
- vpunpckhqdq xmm2,xmm6,xmm6
- vpclmulqdq xmm5,xmm7,xmm3,0x00
- vpxor xmm2,xmm2,xmm6
- vpclmulqdq xmm7,xmm7,xmm3,0x11
- vpclmulqdq xmm1,xmm1,xmm15,0x00
-
- vmovdqu xmm9,XMMWORD[64+rsp]
- vpclmulqdq xmm4,xmm6,xmm0,0x00
- vmovdqu xmm3,XMMWORD[((48-32))+r9]
- vpxor xmm4,xmm4,xmm5
- vpunpckhqdq xmm5,xmm9,xmm9
- vpclmulqdq xmm6,xmm6,xmm0,0x11
- vpxor xmm5,xmm5,xmm9
- vpxor xmm6,xmm6,xmm7
- vpclmulqdq xmm2,xmm2,xmm15,0x10
- vmovdqu xmm15,XMMWORD[((80-32))+r9]
- vpxor xmm2,xmm2,xmm1
-
- vmovdqu xmm1,XMMWORD[80+rsp]
- vpclmulqdq xmm7,xmm9,xmm3,0x00
- vmovdqu xmm0,XMMWORD[((64-32))+r9]
- vpxor xmm7,xmm7,xmm4
- vpunpckhqdq xmm4,xmm1,xmm1
- vpclmulqdq xmm9,xmm9,xmm3,0x11
- vpxor xmm4,xmm4,xmm1
- vpxor xmm9,xmm9,xmm6
- vpclmulqdq xmm5,xmm5,xmm15,0x00
- vpxor xmm5,xmm5,xmm2
-
- vmovdqu xmm2,XMMWORD[96+rsp]
- vpclmulqdq xmm6,xmm1,xmm0,0x00
- vmovdqu xmm3,XMMWORD[((96-32))+r9]
- vpxor xmm6,xmm6,xmm7
- vpunpckhqdq xmm7,xmm2,xmm2
- vpclmulqdq xmm1,xmm1,xmm0,0x11
- vpxor xmm7,xmm7,xmm2
- vpxor xmm1,xmm1,xmm9
- vpclmulqdq xmm4,xmm4,xmm15,0x10
- vmovdqu xmm15,XMMWORD[((128-32))+r9]
- vpxor xmm4,xmm4,xmm5
-
- vpxor xmm8,xmm8,XMMWORD[112+rsp]
- vpclmulqdq xmm5,xmm2,xmm3,0x00
- vmovdqu xmm0,XMMWORD[((112-32))+r9]
- vpunpckhqdq xmm9,xmm8,xmm8
- vpxor xmm5,xmm5,xmm6
- vpclmulqdq xmm2,xmm2,xmm3,0x11
- vpxor xmm9,xmm9,xmm8
- vpxor xmm2,xmm2,xmm1
- vpclmulqdq xmm7,xmm7,xmm15,0x00
- vpxor xmm4,xmm7,xmm4
-
- vpclmulqdq xmm6,xmm8,xmm0,0x00
- vmovdqu xmm3,XMMWORD[((0-32))+r9]
- vpunpckhqdq xmm1,xmm14,xmm14
- vpclmulqdq xmm8,xmm8,xmm0,0x11
- vpxor xmm1,xmm1,xmm14
- vpxor xmm5,xmm6,xmm5
- vpclmulqdq xmm9,xmm9,xmm15,0x10
- vmovdqu xmm15,XMMWORD[((32-32))+r9]
- vpxor xmm7,xmm8,xmm2
- vpxor xmm6,xmm9,xmm4
- vmovdqu xmm0,XMMWORD[((16-32))+r9]
- vpxor xmm9,xmm7,xmm5
- vpclmulqdq xmm4,xmm14,xmm3,0x00
- vpxor xmm6,xmm6,xmm9
- vpunpckhqdq xmm2,xmm13,xmm13
- vpclmulqdq xmm14,xmm14,xmm3,0x11
- vpxor xmm2,xmm2,xmm13
- vpslldq xmm9,xmm6,8
- vpclmulqdq xmm1,xmm1,xmm15,0x00
- vpxor xmm8,xmm5,xmm9
- vpsrldq xmm6,xmm6,8
- vpxor xmm7,xmm7,xmm6
-
- vpclmulqdq xmm5,xmm13,xmm0,0x00
- vmovdqu xmm3,XMMWORD[((48-32))+r9]
- vpxor xmm5,xmm5,xmm4
- vpunpckhqdq xmm9,xmm12,xmm12
- vpclmulqdq xmm13,xmm13,xmm0,0x11
- vpxor xmm9,xmm9,xmm12
- vpxor xmm13,xmm13,xmm14
- vpalignr xmm14,xmm8,xmm8,8
- vpclmulqdq xmm2,xmm2,xmm15,0x10
- vmovdqu xmm15,XMMWORD[((80-32))+r9]
- vpxor xmm2,xmm2,xmm1
-
- vpclmulqdq xmm4,xmm12,xmm3,0x00
- vmovdqu xmm0,XMMWORD[((64-32))+r9]
- vpxor xmm4,xmm4,xmm5
- vpunpckhqdq xmm1,xmm11,xmm11
- vpclmulqdq xmm12,xmm12,xmm3,0x11
- vpxor xmm1,xmm1,xmm11
- vpxor xmm12,xmm12,xmm13
- vxorps xmm7,xmm7,XMMWORD[16+rsp]
- vpclmulqdq xmm9,xmm9,xmm15,0x00
- vpxor xmm9,xmm9,xmm2
-
- vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10
- vxorps xmm8,xmm8,xmm14
-
- vpclmulqdq xmm5,xmm11,xmm0,0x00
- vmovdqu xmm3,XMMWORD[((96-32))+r9]
- vpxor xmm5,xmm5,xmm4
- vpunpckhqdq xmm2,xmm10,xmm10
- vpclmulqdq xmm11,xmm11,xmm0,0x11
- vpxor xmm2,xmm2,xmm10
- vpalignr xmm14,xmm8,xmm8,8
- vpxor xmm11,xmm11,xmm12
- vpclmulqdq xmm1,xmm1,xmm15,0x10
- vmovdqu xmm15,XMMWORD[((128-32))+r9]
- vpxor xmm1,xmm1,xmm9
-
- vxorps xmm14,xmm14,xmm7
- vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10
- vxorps xmm8,xmm8,xmm14
-
- vpclmulqdq xmm4,xmm10,xmm3,0x00
- vmovdqu xmm0,XMMWORD[((112-32))+r9]
- vpxor xmm4,xmm4,xmm5
- vpunpckhqdq xmm9,xmm8,xmm8
- vpclmulqdq xmm10,xmm10,xmm3,0x11
- vpxor xmm9,xmm9,xmm8
- vpxor xmm10,xmm10,xmm11
- vpclmulqdq xmm2,xmm2,xmm15,0x00
- vpxor xmm2,xmm2,xmm1
-
- vpclmulqdq xmm5,xmm8,xmm0,0x00
- vpclmulqdq xmm7,xmm8,xmm0,0x11
- vpxor xmm5,xmm5,xmm4
- vpclmulqdq xmm6,xmm9,xmm15,0x10
- vpxor xmm7,xmm7,xmm10
- vpxor xmm6,xmm6,xmm2
-
- vpxor xmm4,xmm7,xmm5
- vpxor xmm6,xmm6,xmm4
- vpslldq xmm1,xmm6,8
- vmovdqu xmm3,XMMWORD[16+r11]
- vpsrldq xmm6,xmm6,8
- vpxor xmm8,xmm5,xmm1
- vpxor xmm7,xmm7,xmm6
-
- vpalignr xmm2,xmm8,xmm8,8
- vpclmulqdq xmm8,xmm8,xmm3,0x10
- vpxor xmm8,xmm8,xmm2
-
- vpalignr xmm2,xmm8,xmm8,8
- vpclmulqdq xmm8,xmm8,xmm3,0x10
- vpxor xmm2,xmm2,xmm7
- vpxor xmm8,xmm8,xmm2
- vpshufb xmm8,xmm8,XMMWORD[r11]
- vmovdqu XMMWORD[(-64)+r9],xmm8
-
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$gcm_enc_abort:
- mov rax,r10
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
+ xor eax,eax
DB 0F3h,0C3h ;repret
-$L$SEH_end_aesni_gcm_encrypt:
-ALIGN 64
-$L$bswap_mask:
-DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-$L$poly:
-DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-$L$one_msb:
-DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
-$L$two_lsb:
-DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-$L$one_lsb:
-DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
-DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
-DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
-DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-ALIGN 64
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-gcm_se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$common_seh_tail
-
- mov rax,QWORD[152+r8]
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$common_seh_tail
- mov rax,QWORD[120+r8]
-
- mov r15,QWORD[((-48))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r12,QWORD[((-24))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov rbx,QWORD[((-8))+rax]
- mov QWORD[240+r8],r15
- mov QWORD[232+r8],r14
- mov QWORD[224+r8],r13
- mov QWORD[216+r8],r12
- mov QWORD[160+r8],rbp
- mov QWORD[144+r8],rbx
-
- lea rsi,[((-216))+rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
-
-$L$common_seh_tail:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
+global aesni_gcm_decrypt
+aesni_gcm_decrypt:
+ xor eax,eax
DB 0F3h,0C3h ;repret
-section .pdata rdata align=4
-ALIGN 4
- DD $L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase
- DD $L$SEH_end_aesni_gcm_decrypt wrt ..imagebase
- DD $L$SEH_gcm_dec_info wrt ..imagebase
-
- DD $L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase
- DD $L$SEH_end_aesni_gcm_encrypt wrt ..imagebase
- DD $L$SEH_gcm_enc_info wrt ..imagebase
-section .xdata rdata align=8
-ALIGN 8
-$L$SEH_gcm_dec_info:
-DB 9,0,0,0
- DD gcm_se_handler wrt ..imagebase
- DD $L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
-$L$SEH_gcm_enc_info:
-DB 9,0,0,0
- DD gcm_se_handler wrt ..imagebase
- DD $L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm
index e70f90841bc..47f3b1fbead 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/modes/ghash-x86_64.nasm
@@ -1354,115 +1354,7 @@ global gcm_init_avx
ALIGN 32
gcm_init_avx:
-$L$SEH_begin_gcm_init_avx:
-
-DB 0x48,0x83,0xec,0x18
-DB 0x0f,0x29,0x34,0x24
- vzeroupper
-
- vmovdqu xmm2,XMMWORD[rdx]
- vpshufd xmm2,xmm2,78
-
-
- vpshufd xmm4,xmm2,255
- vpsrlq xmm3,xmm2,63
- vpsllq xmm2,xmm2,1
- vpxor xmm5,xmm5,xmm5
- vpcmpgtd xmm5,xmm5,xmm4
- vpslldq xmm3,xmm3,8
- vpor xmm2,xmm2,xmm3
-
-
- vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
- vpxor xmm2,xmm2,xmm5
-
- vpunpckhqdq xmm6,xmm2,xmm2
- vmovdqa xmm0,xmm2
- vpxor xmm6,xmm6,xmm2
- mov r10,4
- jmp NEAR $L$init_start_avx
-ALIGN 32
-$L$init_loop_avx:
- vpalignr xmm5,xmm4,xmm3,8
- vmovdqu XMMWORD[(-16)+rcx],xmm5
- vpunpckhqdq xmm3,xmm0,xmm0
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm1,xmm0,xmm2,0x11
- vpclmulqdq xmm0,xmm0,xmm2,0x00
- vpclmulqdq xmm3,xmm3,xmm6,0x00
- vpxor xmm4,xmm1,xmm0
- vpxor xmm3,xmm3,xmm4
-
- vpslldq xmm4,xmm3,8
- vpsrldq xmm3,xmm3,8
- vpxor xmm0,xmm0,xmm4
- vpxor xmm1,xmm1,xmm3
- vpsllq xmm3,xmm0,57
- vpsllq xmm4,xmm0,62
- vpxor xmm4,xmm4,xmm3
- vpsllq xmm3,xmm0,63
- vpxor xmm4,xmm4,xmm3
- vpslldq xmm3,xmm4,8
- vpsrldq xmm4,xmm4,8
- vpxor xmm0,xmm0,xmm3
- vpxor xmm1,xmm1,xmm4
-
- vpsrlq xmm4,xmm0,1
- vpxor xmm1,xmm1,xmm0
- vpxor xmm0,xmm0,xmm4
- vpsrlq xmm4,xmm4,5
- vpxor xmm0,xmm0,xmm4
- vpsrlq xmm0,xmm0,1
- vpxor xmm0,xmm0,xmm1
-$L$init_start_avx:
- vmovdqa xmm5,xmm0
- vpunpckhqdq xmm3,xmm0,xmm0
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm1,xmm0,xmm2,0x11
- vpclmulqdq xmm0,xmm0,xmm2,0x00
- vpclmulqdq xmm3,xmm3,xmm6,0x00
- vpxor xmm4,xmm1,xmm0
- vpxor xmm3,xmm3,xmm4
-
- vpslldq xmm4,xmm3,8
- vpsrldq xmm3,xmm3,8
- vpxor xmm0,xmm0,xmm4
- vpxor xmm1,xmm1,xmm3
- vpsllq xmm3,xmm0,57
- vpsllq xmm4,xmm0,62
- vpxor xmm4,xmm4,xmm3
- vpsllq xmm3,xmm0,63
- vpxor xmm4,xmm4,xmm3
- vpslldq xmm3,xmm4,8
- vpsrldq xmm4,xmm4,8
- vpxor xmm0,xmm0,xmm3
- vpxor xmm1,xmm1,xmm4
-
- vpsrlq xmm4,xmm0,1
- vpxor xmm1,xmm1,xmm0
- vpxor xmm0,xmm0,xmm4
- vpsrlq xmm4,xmm4,5
- vpxor xmm0,xmm0,xmm4
- vpsrlq xmm0,xmm0,1
- vpxor xmm0,xmm0,xmm1
- vpshufd xmm3,xmm5,78
- vpshufd xmm4,xmm0,78
- vpxor xmm3,xmm3,xmm5
- vmovdqu XMMWORD[rcx],xmm5
- vpxor xmm4,xmm4,xmm0
- vmovdqu XMMWORD[16+rcx],xmm0
- lea rcx,[48+rcx]
- sub r10,1
- jnz NEAR $L$init_loop_avx
-
- vpalignr xmm5,xmm3,xmm4,8
- vmovdqu XMMWORD[(-16)+rcx],xmm5
-
- vzeroupper
- movaps xmm6,XMMWORD[rsp]
- lea rsp,[24+rsp]
-$L$SEH_end_gcm_init_avx:
- DB 0F3h,0C3h ;repret
+ jmp NEAR $L$_init_clmul
global gcm_gmult_avx
@@ -1480,403 +1372,7 @@ ALIGN 32
gcm_ghash_avx:
DB 243,15,30,250
- lea rax,[((-136))+rsp]
-$L$SEH_begin_gcm_ghash_avx:
-
-DB 0x48,0x8d,0x60,0xe0
-DB 0x0f,0x29,0x70,0xe0
-DB 0x0f,0x29,0x78,0xf0
-DB 0x44,0x0f,0x29,0x00
-DB 0x44,0x0f,0x29,0x48,0x10
-DB 0x44,0x0f,0x29,0x50,0x20
-DB 0x44,0x0f,0x29,0x58,0x30
-DB 0x44,0x0f,0x29,0x60,0x40
-DB 0x44,0x0f,0x29,0x68,0x50
-DB 0x44,0x0f,0x29,0x70,0x60
-DB 0x44,0x0f,0x29,0x78,0x70
- vzeroupper
-
- vmovdqu xmm10,XMMWORD[rcx]
- lea r10,[$L$0x1c2_polynomial]
- lea rdx,[64+rdx]
- vmovdqu xmm13,XMMWORD[$L$bswap_mask]
- vpshufb xmm10,xmm10,xmm13
- cmp r9,0x80
- jb NEAR $L$short_avx
- sub r9,0x80
-
- vmovdqu xmm14,XMMWORD[112+r8]
- vmovdqu xmm6,XMMWORD[((0-64))+rdx]
- vpshufb xmm14,xmm14,xmm13
- vmovdqu xmm7,XMMWORD[((32-64))+rdx]
-
- vpunpckhqdq xmm9,xmm14,xmm14
- vmovdqu xmm15,XMMWORD[96+r8]
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpxor xmm9,xmm9,xmm14
- vpshufb xmm15,xmm15,xmm13
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((16-64))+rdx]
- vpunpckhqdq xmm8,xmm15,xmm15
- vmovdqu xmm14,XMMWORD[80+r8]
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vpxor xmm8,xmm8,xmm15
-
- vpshufb xmm14,xmm14,xmm13
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpunpckhqdq xmm9,xmm14,xmm14
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((48-64))+rdx]
- vpxor xmm9,xmm9,xmm14
- vmovdqu xmm15,XMMWORD[64+r8]
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((80-64))+rdx]
-
- vpshufb xmm15,xmm15,xmm13
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpxor xmm4,xmm4,xmm1
- vpunpckhqdq xmm8,xmm15,xmm15
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((64-64))+rdx]
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vpxor xmm8,xmm8,xmm15
-
- vmovdqu xmm14,XMMWORD[48+r8]
- vpxor xmm0,xmm0,xmm3
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpxor xmm1,xmm1,xmm4
- vpshufb xmm14,xmm14,xmm13
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((96-64))+rdx]
- vpxor xmm2,xmm2,xmm5
- vpunpckhqdq xmm9,xmm14,xmm14
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((128-64))+rdx]
- vpxor xmm9,xmm9,xmm14
-
- vmovdqu xmm15,XMMWORD[32+r8]
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpxor xmm4,xmm4,xmm1
- vpshufb xmm15,xmm15,xmm13
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((112-64))+rdx]
- vpxor xmm5,xmm5,xmm2
- vpunpckhqdq xmm8,xmm15,xmm15
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vpxor xmm8,xmm8,xmm15
-
- vmovdqu xmm14,XMMWORD[16+r8]
- vpxor xmm0,xmm0,xmm3
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpxor xmm1,xmm1,xmm4
- vpshufb xmm14,xmm14,xmm13
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((144-64))+rdx]
- vpxor xmm2,xmm2,xmm5
- vpunpckhqdq xmm9,xmm14,xmm14
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((176-64))+rdx]
- vpxor xmm9,xmm9,xmm14
-
- vmovdqu xmm15,XMMWORD[r8]
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpxor xmm4,xmm4,xmm1
- vpshufb xmm15,xmm15,xmm13
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((160-64))+rdx]
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm9,xmm7,0x10
-
- lea r8,[128+r8]
- cmp r9,0x80
- jb NEAR $L$tail_avx
-
- vpxor xmm15,xmm15,xmm10
- sub r9,0x80
- jmp NEAR $L$oop8x_avx
-
-ALIGN 32
-$L$oop8x_avx:
- vpunpckhqdq xmm8,xmm15,xmm15
- vmovdqu xmm14,XMMWORD[112+r8]
- vpxor xmm3,xmm3,xmm0
- vpxor xmm8,xmm8,xmm15
- vpclmulqdq xmm10,xmm15,xmm6,0x00
- vpshufb xmm14,xmm14,xmm13
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm11,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((0-64))+rdx]
- vpunpckhqdq xmm9,xmm14,xmm14
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm12,xmm8,xmm7,0x00
- vmovdqu xmm7,XMMWORD[((32-64))+rdx]
- vpxor xmm9,xmm9,xmm14
-
- vmovdqu xmm15,XMMWORD[96+r8]
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpxor xmm10,xmm10,xmm3
- vpshufb xmm15,xmm15,xmm13
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vxorps xmm11,xmm11,xmm4
- vmovdqu xmm6,XMMWORD[((16-64))+rdx]
- vpunpckhqdq xmm8,xmm15,xmm15
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vpxor xmm12,xmm12,xmm5
- vxorps xmm8,xmm8,xmm15
-
- vmovdqu xmm14,XMMWORD[80+r8]
- vpxor xmm12,xmm12,xmm10
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpxor xmm12,xmm12,xmm11
- vpslldq xmm9,xmm12,8
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vpsrldq xmm12,xmm12,8
- vpxor xmm10,xmm10,xmm9
- vmovdqu xmm6,XMMWORD[((48-64))+rdx]
- vpshufb xmm14,xmm14,xmm13
- vxorps xmm11,xmm11,xmm12
- vpxor xmm4,xmm4,xmm1
- vpunpckhqdq xmm9,xmm14,xmm14
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((80-64))+rdx]
- vpxor xmm9,xmm9,xmm14
- vpxor xmm5,xmm5,xmm2
-
- vmovdqu xmm15,XMMWORD[64+r8]
- vpalignr xmm12,xmm10,xmm10,8
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpshufb xmm15,xmm15,xmm13
- vpxor xmm0,xmm0,xmm3
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((64-64))+rdx]
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm1,xmm1,xmm4
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vxorps xmm8,xmm8,xmm15
- vpxor xmm2,xmm2,xmm5
-
- vmovdqu xmm14,XMMWORD[48+r8]
- vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpshufb xmm14,xmm14,xmm13
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((96-64))+rdx]
- vpunpckhqdq xmm9,xmm14,xmm14
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((128-64))+rdx]
- vpxor xmm9,xmm9,xmm14
- vpxor xmm5,xmm5,xmm2
-
- vmovdqu xmm15,XMMWORD[32+r8]
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpshufb xmm15,xmm15,xmm13
- vpxor xmm0,xmm0,xmm3
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((112-64))+rdx]
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm1,xmm1,xmm4
- vpclmulqdq xmm2,xmm9,xmm7,0x00
- vpxor xmm8,xmm8,xmm15
- vpxor xmm2,xmm2,xmm5
- vxorps xmm10,xmm10,xmm12
-
- vmovdqu xmm14,XMMWORD[16+r8]
- vpalignr xmm12,xmm10,xmm10,8
- vpclmulqdq xmm3,xmm15,xmm6,0x00
- vpshufb xmm14,xmm14,xmm13
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm4,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((144-64))+rdx]
- vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10
- vxorps xmm12,xmm12,xmm11
- vpunpckhqdq xmm9,xmm14,xmm14
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm5,xmm8,xmm7,0x10
- vmovdqu xmm7,XMMWORD[((176-64))+rdx]
- vpxor xmm9,xmm9,xmm14
- vpxor xmm5,xmm5,xmm2
-
- vmovdqu xmm15,XMMWORD[r8]
- vpclmulqdq xmm0,xmm14,xmm6,0x00
- vpshufb xmm15,xmm15,xmm13
- vpclmulqdq xmm1,xmm14,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((160-64))+rdx]
- vpxor xmm15,xmm15,xmm12
- vpclmulqdq xmm2,xmm9,xmm7,0x10
- vpxor xmm15,xmm15,xmm10
-
- lea r8,[128+r8]
- sub r9,0x80
- jnc NEAR $L$oop8x_avx
-
- add r9,0x80
- jmp NEAR $L$tail_no_xor_avx
-
-ALIGN 32
-$L$short_avx:
- vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8]
- lea r8,[r9*1+r8]
- vmovdqu xmm6,XMMWORD[((0-64))+rdx]
- vmovdqu xmm7,XMMWORD[((32-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
-
- vmovdqa xmm3,xmm0
- vmovdqa xmm4,xmm1
- vmovdqa xmm5,xmm2
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-32))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((16-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vpsrldq xmm7,xmm7,8
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-48))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((48-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vmovdqu xmm7,XMMWORD[((80-64))+rdx]
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-64))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((64-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vpsrldq xmm7,xmm7,8
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-80))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((96-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vmovdqu xmm7,XMMWORD[((128-64))+rdx]
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-96))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((112-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vpsrldq xmm7,xmm7,8
- sub r9,0x10
- jz NEAR $L$tail_avx
-
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vmovdqu xmm14,XMMWORD[((-112))+r8]
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vmovdqu xmm6,XMMWORD[((144-64))+rdx]
- vpshufb xmm15,xmm14,xmm13
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
- vmovq xmm7,QWORD[((184-64))+rdx]
- sub r9,0x10
- jmp NEAR $L$tail_avx
-
-ALIGN 32
-$L$tail_avx:
- vpxor xmm15,xmm15,xmm10
-$L$tail_no_xor_avx:
- vpunpckhqdq xmm8,xmm15,xmm15
- vpxor xmm3,xmm3,xmm0
- vpclmulqdq xmm0,xmm15,xmm6,0x00
- vpxor xmm8,xmm8,xmm15
- vpxor xmm4,xmm4,xmm1
- vpclmulqdq xmm1,xmm15,xmm6,0x11
- vpxor xmm5,xmm5,xmm2
- vpclmulqdq xmm2,xmm8,xmm7,0x00
-
- vmovdqu xmm12,XMMWORD[r10]
-
- vpxor xmm10,xmm3,xmm0
- vpxor xmm11,xmm4,xmm1
- vpxor xmm5,xmm5,xmm2
-
- vpxor xmm5,xmm5,xmm10
- vpxor xmm5,xmm5,xmm11
- vpslldq xmm9,xmm5,8
- vpsrldq xmm5,xmm5,8
- vpxor xmm10,xmm10,xmm9
- vpxor xmm11,xmm11,xmm5
-
- vpclmulqdq xmm9,xmm10,xmm12,0x10
- vpalignr xmm10,xmm10,xmm10,8
- vpxor xmm10,xmm10,xmm9
-
- vpclmulqdq xmm9,xmm10,xmm12,0x10
- vpalignr xmm10,xmm10,xmm10,8
- vpxor xmm10,xmm10,xmm11
- vpxor xmm10,xmm10,xmm9
-
- cmp r9,0
- jne NEAR $L$short_avx
-
- vpshufb xmm10,xmm10,xmm13
- vmovdqu XMMWORD[rcx],xmm10
- vzeroupper
- movaps xmm6,XMMWORD[rsp]
- movaps xmm7,XMMWORD[16+rsp]
- movaps xmm8,XMMWORD[32+rsp]
- movaps xmm9,XMMWORD[48+rsp]
- movaps xmm10,XMMWORD[64+rsp]
- movaps xmm11,XMMWORD[80+rsp]
- movaps xmm12,XMMWORD[96+rsp]
- movaps xmm13,XMMWORD[112+rsp]
- movaps xmm14,XMMWORD[128+rsp]
- movaps xmm15,XMMWORD[144+rsp]
- lea rsp,[168+rsp]
-$L$SEH_end_gcm_ghash_avx:
- DB 0F3h,0C3h ;repret
+ jmp NEAR $L$_ghash_clmul
ALIGN 64
@@ -2040,13 +1536,6 @@ ALIGN 4
DD $L$SEH_begin_gcm_ghash_clmul wrt ..imagebase
DD $L$SEH_end_gcm_ghash_clmul wrt ..imagebase
DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase
- DD $L$SEH_begin_gcm_init_avx wrt ..imagebase
- DD $L$SEH_end_gcm_init_avx wrt ..imagebase
- DD $L$SEH_info_gcm_init_clmul wrt ..imagebase
-
- DD $L$SEH_begin_gcm_ghash_avx wrt ..imagebase
- DD $L$SEH_end_gcm_ghash_avx wrt ..imagebase
- DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_gcm_gmult_4bit:
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm
index 9018065f8dd..003b9229a98 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm
@@ -24,8 +24,6 @@ $L$SEH_begin_sha1_multi_block:
mov rcx,QWORD[((OPENSSL_ia32cap_P+4))]
bt rcx,61
jc NEAR _shaext_shortcut
- test ecx,268435456
- jnz NEAR _avx_shortcut
mov rax,rsp
push rbx
@@ -3019,4407 +3017,6 @@ $L$epilogue_shaext:
$L$SEH_end_sha1_multi_block_shaext:
-ALIGN 32
-sha1_multi_block_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha1_multi_block_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx_shortcut:
- shr rcx,32
- cmp edx,2
- jb NEAR $L$avx
- test ecx,32
- jnz NEAR _avx2_shortcut
- jmp NEAR $L$avx
-ALIGN 32
-$L$avx:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[(-120)+rax],xmm10
- movaps XMMWORD[(-104)+rax],xmm11
- movaps XMMWORD[(-88)+rax],xmm12
- movaps XMMWORD[(-72)+rax],xmm13
- movaps XMMWORD[(-56)+rax],xmm14
- movaps XMMWORD[(-40)+rax],xmm15
- sub rsp,288
- and rsp,-256
- mov QWORD[272+rsp],rax
-
-$L$body_avx:
- lea rbp,[K_XX_XX]
- lea rbx,[256+rsp]
-
- vzeroupper
-$L$oop_grande_avx:
- mov DWORD[280+rsp],edx
- xor edx,edx
-
- mov r8,QWORD[rsi]
-
- mov ecx,DWORD[8+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[rbx],ecx
- cmovle r8,rbp
-
- mov r9,QWORD[16+rsi]
-
- mov ecx,DWORD[24+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[4+rbx],ecx
- cmovle r9,rbp
-
- mov r10,QWORD[32+rsi]
-
- mov ecx,DWORD[40+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[8+rbx],ecx
- cmovle r10,rbp
-
- mov r11,QWORD[48+rsi]
-
- mov ecx,DWORD[56+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[12+rbx],ecx
- cmovle r11,rbp
- test edx,edx
- jz NEAR $L$done_avx
-
- vmovdqu xmm10,XMMWORD[rdi]
- lea rax,[128+rsp]
- vmovdqu xmm11,XMMWORD[32+rdi]
- vmovdqu xmm12,XMMWORD[64+rdi]
- vmovdqu xmm13,XMMWORD[96+rdi]
- vmovdqu xmm14,XMMWORD[128+rdi]
- vmovdqu xmm5,XMMWORD[96+rbp]
- jmp NEAR $L$oop_avx
-
-ALIGN 32
-$L$oop_avx:
- vmovdqa xmm15,XMMWORD[((-32))+rbp]
- vmovd xmm0,DWORD[r8]
- lea r8,[64+r8]
- vmovd xmm2,DWORD[r9]
- lea r9,[64+r9]
- vpinsrd xmm0,xmm0,DWORD[r10],1
- lea r10,[64+r10]
- vpinsrd xmm2,xmm2,DWORD[r11],1
- lea r11,[64+r11]
- vmovd xmm1,DWORD[((-60))+r8]
- vpunpckldq xmm0,xmm0,xmm2
- vmovd xmm9,DWORD[((-60))+r9]
- vpshufb xmm0,xmm0,xmm5
- vpinsrd xmm1,xmm1,DWORD[((-60))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-60))+r11],1
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpandn xmm7,xmm11,xmm13
- vpand xmm6,xmm11,xmm12
-
- vmovdqa XMMWORD[(0-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpunpckldq xmm1,xmm1,xmm9
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm2,DWORD[((-56))+r8]
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-56))+r9]
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpshufb xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpinsrd xmm2,xmm2,DWORD[((-56))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-56))+r11],1
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpandn xmm7,xmm10,xmm12
- vpand xmm6,xmm10,xmm11
-
- vmovdqa XMMWORD[(16-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpunpckldq xmm2,xmm2,xmm9
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm3,DWORD[((-52))+r8]
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-52))+r9]
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpshufb xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpinsrd xmm3,xmm3,DWORD[((-52))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-52))+r11],1
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpandn xmm7,xmm14,xmm11
- vpand xmm6,xmm14,xmm10
-
- vmovdqa XMMWORD[(32-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpunpckldq xmm3,xmm3,xmm9
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm4,DWORD[((-48))+r8]
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-48))+r9]
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpshufb xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpinsrd xmm4,xmm4,DWORD[((-48))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-48))+r11],1
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpandn xmm7,xmm13,xmm10
- vpand xmm6,xmm13,xmm14
-
- vmovdqa XMMWORD[(48-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpunpckldq xmm4,xmm4,xmm9
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm0,DWORD[((-44))+r8]
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-44))+r9]
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpshufb xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpinsrd xmm0,xmm0,DWORD[((-44))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-44))+r11],1
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpandn xmm7,xmm12,xmm14
- vpand xmm6,xmm12,xmm13
-
- vmovdqa XMMWORD[(64-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpunpckldq xmm0,xmm0,xmm9
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm1,DWORD[((-40))+r8]
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-40))+r9]
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpshufb xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpinsrd xmm1,xmm1,DWORD[((-40))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-40))+r11],1
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpandn xmm7,xmm11,xmm13
- vpand xmm6,xmm11,xmm12
-
- vmovdqa XMMWORD[(80-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpunpckldq xmm1,xmm1,xmm9
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm2,DWORD[((-36))+r8]
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-36))+r9]
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpshufb xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpinsrd xmm2,xmm2,DWORD[((-36))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-36))+r11],1
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpandn xmm7,xmm10,xmm12
- vpand xmm6,xmm10,xmm11
-
- vmovdqa XMMWORD[(96-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpunpckldq xmm2,xmm2,xmm9
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm3,DWORD[((-32))+r8]
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-32))+r9]
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpshufb xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpinsrd xmm3,xmm3,DWORD[((-32))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-32))+r11],1
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpandn xmm7,xmm14,xmm11
- vpand xmm6,xmm14,xmm10
-
- vmovdqa XMMWORD[(112-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpunpckldq xmm3,xmm3,xmm9
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm4,DWORD[((-28))+r8]
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-28))+r9]
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpshufb xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpinsrd xmm4,xmm4,DWORD[((-28))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-28))+r11],1
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpandn xmm7,xmm13,xmm10
- vpand xmm6,xmm13,xmm14
-
- vmovdqa XMMWORD[(128-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpunpckldq xmm4,xmm4,xmm9
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm0,DWORD[((-24))+r8]
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-24))+r9]
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpshufb xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpinsrd xmm0,xmm0,DWORD[((-24))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-24))+r11],1
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpandn xmm7,xmm12,xmm14
- vpand xmm6,xmm12,xmm13
-
- vmovdqa XMMWORD[(144-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpunpckldq xmm0,xmm0,xmm9
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm1,DWORD[((-20))+r8]
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-20))+r9]
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpshufb xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpinsrd xmm1,xmm1,DWORD[((-20))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-20))+r11],1
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpandn xmm7,xmm11,xmm13
- vpand xmm6,xmm11,xmm12
-
- vmovdqa XMMWORD[(160-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpunpckldq xmm1,xmm1,xmm9
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm2,DWORD[((-16))+r8]
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-16))+r9]
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpshufb xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpinsrd xmm2,xmm2,DWORD[((-16))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-16))+r11],1
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpandn xmm7,xmm10,xmm12
- vpand xmm6,xmm10,xmm11
-
- vmovdqa XMMWORD[(176-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpunpckldq xmm2,xmm2,xmm9
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm3,DWORD[((-12))+r8]
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-12))+r9]
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpshufb xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpinsrd xmm3,xmm3,DWORD[((-12))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-12))+r11],1
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpandn xmm7,xmm14,xmm11
- vpand xmm6,xmm14,xmm10
-
- vmovdqa XMMWORD[(192-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpunpckldq xmm3,xmm3,xmm9
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm4,DWORD[((-8))+r8]
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-8))+r9]
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpshufb xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpinsrd xmm4,xmm4,DWORD[((-8))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-8))+r11],1
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpandn xmm7,xmm13,xmm10
- vpand xmm6,xmm13,xmm14
-
- vmovdqa XMMWORD[(208-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpunpckldq xmm4,xmm4,xmm9
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm7
- vmovd xmm0,DWORD[((-4))+r8]
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vmovd xmm9,DWORD[((-4))+r9]
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpshufb xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vmovdqa xmm1,XMMWORD[((0-128))+rax]
- vpinsrd xmm0,xmm0,DWORD[((-4))+r10],1
- vpinsrd xmm9,xmm9,DWORD[((-4))+r11],1
- vpaddd xmm10,xmm10,xmm15
- prefetcht0 [63+r8]
- vpslld xmm8,xmm11,5
- vpandn xmm7,xmm12,xmm14
- vpand xmm6,xmm12,xmm13
-
- vmovdqa XMMWORD[(224-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpunpckldq xmm0,xmm0,xmm9
- vpsrld xmm9,xmm11,27
- prefetcht0 [63+r9]
- vpxor xmm6,xmm6,xmm7
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- prefetcht0 [63+r10]
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- prefetcht0 [63+r11]
- vpshufb xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vmovdqa xmm2,XMMWORD[((16-128))+rax]
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((32-128))+rax]
-
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpandn xmm7,xmm11,xmm13
-
- vpand xmm6,xmm11,xmm12
-
- vmovdqa XMMWORD[(240-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((128-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm7
- vpxor xmm1,xmm1,xmm3
-
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
-
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((48-128))+rax]
-
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpandn xmm7,xmm10,xmm12
-
- vpand xmm6,xmm10,xmm11
-
- vmovdqa XMMWORD[(0-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((144-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm7
- vpxor xmm2,xmm2,xmm4
-
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
-
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((64-128))+rax]
-
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpandn xmm7,xmm14,xmm11
-
- vpand xmm6,xmm14,xmm10
-
- vmovdqa XMMWORD[(16-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((160-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm7
- vpxor xmm3,xmm3,xmm0
-
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
-
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((80-128))+rax]
-
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpandn xmm7,xmm13,xmm10
-
- vpand xmm6,xmm13,xmm14
-
- vmovdqa XMMWORD[(32-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((176-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm7
- vpxor xmm4,xmm4,xmm1
-
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
-
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((96-128))+rax]
-
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpandn xmm7,xmm12,xmm14
-
- vpand xmm6,xmm12,xmm13
-
- vmovdqa XMMWORD[(48-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((192-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm7
- vpxor xmm0,xmm0,xmm2
-
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
-
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vmovdqa xmm15,XMMWORD[rbp]
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((112-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(64-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((208-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((128-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(80-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((224-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((144-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(96-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((240-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((160-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(112-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((0-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((176-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(128-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((16-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((192-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(144-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((32-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((208-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(160-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((48-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((224-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(176-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((64-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((240-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(192-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((80-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((0-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(208-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((96-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((16-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(224-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((112-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((32-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(240-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((128-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((48-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(0-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((144-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((64-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(16-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((160-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((80-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(32-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((176-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((96-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(48-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((192-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((112-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(64-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((208-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((128-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(80-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((224-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((144-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(96-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((240-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((160-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(112-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((0-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vmovdqa xmm15,XMMWORD[32+rbp]
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((176-128))+rax]
-
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpand xmm7,xmm13,xmm12
- vpxor xmm1,xmm1,XMMWORD[((16-128))+rax]
-
- vpaddd xmm14,xmm14,xmm7
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm13,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vmovdqu XMMWORD[(128-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm1,31
- vpand xmm6,xmm6,xmm11
- vpaddd xmm1,xmm1,xmm1
-
- vpslld xmm7,xmm11,30
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((192-128))+rax]
-
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpand xmm7,xmm12,xmm11
- vpxor xmm2,xmm2,XMMWORD[((32-128))+rax]
-
- vpaddd xmm13,xmm13,xmm7
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm12,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vmovdqu XMMWORD[(144-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm2,31
- vpand xmm6,xmm6,xmm10
- vpaddd xmm2,xmm2,xmm2
-
- vpslld xmm7,xmm10,30
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((208-128))+rax]
-
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpand xmm7,xmm11,xmm10
- vpxor xmm3,xmm3,XMMWORD[((48-128))+rax]
-
- vpaddd xmm12,xmm12,xmm7
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm11,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vmovdqu XMMWORD[(160-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm3,31
- vpand xmm6,xmm6,xmm14
- vpaddd xmm3,xmm3,xmm3
-
- vpslld xmm7,xmm14,30
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((224-128))+rax]
-
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpand xmm7,xmm10,xmm14
- vpxor xmm4,xmm4,XMMWORD[((64-128))+rax]
-
- vpaddd xmm11,xmm11,xmm7
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm10,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vmovdqu XMMWORD[(176-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm4,31
- vpand xmm6,xmm6,xmm13
- vpaddd xmm4,xmm4,xmm4
-
- vpslld xmm7,xmm13,30
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((240-128))+rax]
-
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpand xmm7,xmm14,xmm13
- vpxor xmm0,xmm0,XMMWORD[((80-128))+rax]
-
- vpaddd xmm10,xmm10,xmm7
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm14,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vmovdqu XMMWORD[(192-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm0,31
- vpand xmm6,xmm6,xmm12
- vpaddd xmm0,xmm0,xmm0
-
- vpslld xmm7,xmm12,30
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((0-128))+rax]
-
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpand xmm7,xmm13,xmm12
- vpxor xmm1,xmm1,XMMWORD[((96-128))+rax]
-
- vpaddd xmm14,xmm14,xmm7
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm13,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vmovdqu XMMWORD[(208-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm1,31
- vpand xmm6,xmm6,xmm11
- vpaddd xmm1,xmm1,xmm1
-
- vpslld xmm7,xmm11,30
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((16-128))+rax]
-
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpand xmm7,xmm12,xmm11
- vpxor xmm2,xmm2,XMMWORD[((112-128))+rax]
-
- vpaddd xmm13,xmm13,xmm7
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm12,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vmovdqu XMMWORD[(224-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm2,31
- vpand xmm6,xmm6,xmm10
- vpaddd xmm2,xmm2,xmm2
-
- vpslld xmm7,xmm10,30
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((32-128))+rax]
-
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpand xmm7,xmm11,xmm10
- vpxor xmm3,xmm3,XMMWORD[((128-128))+rax]
-
- vpaddd xmm12,xmm12,xmm7
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm11,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vmovdqu XMMWORD[(240-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm3,31
- vpand xmm6,xmm6,xmm14
- vpaddd xmm3,xmm3,xmm3
-
- vpslld xmm7,xmm14,30
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((48-128))+rax]
-
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpand xmm7,xmm10,xmm14
- vpxor xmm4,xmm4,XMMWORD[((144-128))+rax]
-
- vpaddd xmm11,xmm11,xmm7
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm10,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vmovdqu XMMWORD[(0-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm4,31
- vpand xmm6,xmm6,xmm13
- vpaddd xmm4,xmm4,xmm4
-
- vpslld xmm7,xmm13,30
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((64-128))+rax]
-
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpand xmm7,xmm14,xmm13
- vpxor xmm0,xmm0,XMMWORD[((160-128))+rax]
-
- vpaddd xmm10,xmm10,xmm7
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm14,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vmovdqu XMMWORD[(16-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm0,31
- vpand xmm6,xmm6,xmm12
- vpaddd xmm0,xmm0,xmm0
-
- vpslld xmm7,xmm12,30
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((80-128))+rax]
-
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpand xmm7,xmm13,xmm12
- vpxor xmm1,xmm1,XMMWORD[((176-128))+rax]
-
- vpaddd xmm14,xmm14,xmm7
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm13,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vmovdqu XMMWORD[(32-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm1,31
- vpand xmm6,xmm6,xmm11
- vpaddd xmm1,xmm1,xmm1
-
- vpslld xmm7,xmm11,30
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((96-128))+rax]
-
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpand xmm7,xmm12,xmm11
- vpxor xmm2,xmm2,XMMWORD[((192-128))+rax]
-
- vpaddd xmm13,xmm13,xmm7
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm12,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vmovdqu XMMWORD[(48-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm2,31
- vpand xmm6,xmm6,xmm10
- vpaddd xmm2,xmm2,xmm2
-
- vpslld xmm7,xmm10,30
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((112-128))+rax]
-
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpand xmm7,xmm11,xmm10
- vpxor xmm3,xmm3,XMMWORD[((208-128))+rax]
-
- vpaddd xmm12,xmm12,xmm7
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm11,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vmovdqu XMMWORD[(64-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm3,31
- vpand xmm6,xmm6,xmm14
- vpaddd xmm3,xmm3,xmm3
-
- vpslld xmm7,xmm14,30
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((128-128))+rax]
-
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpand xmm7,xmm10,xmm14
- vpxor xmm4,xmm4,XMMWORD[((224-128))+rax]
-
- vpaddd xmm11,xmm11,xmm7
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm10,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vmovdqu XMMWORD[(80-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm4,31
- vpand xmm6,xmm6,xmm13
- vpaddd xmm4,xmm4,xmm4
-
- vpslld xmm7,xmm13,30
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((144-128))+rax]
-
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpand xmm7,xmm14,xmm13
- vpxor xmm0,xmm0,XMMWORD[((240-128))+rax]
-
- vpaddd xmm10,xmm10,xmm7
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm14,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vmovdqu XMMWORD[(96-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm0,31
- vpand xmm6,xmm6,xmm12
- vpaddd xmm0,xmm0,xmm0
-
- vpslld xmm7,xmm12,30
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((160-128))+rax]
-
- vpaddd xmm14,xmm14,xmm15
- vpslld xmm8,xmm10,5
- vpand xmm7,xmm13,xmm12
- vpxor xmm1,xmm1,XMMWORD[((0-128))+rax]
-
- vpaddd xmm14,xmm14,xmm7
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm13,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vmovdqu XMMWORD[(112-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm1,31
- vpand xmm6,xmm6,xmm11
- vpaddd xmm1,xmm1,xmm1
-
- vpslld xmm7,xmm11,30
- vpaddd xmm14,xmm14,xmm6
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((176-128))+rax]
-
- vpaddd xmm13,xmm13,xmm15
- vpslld xmm8,xmm14,5
- vpand xmm7,xmm12,xmm11
- vpxor xmm2,xmm2,XMMWORD[((16-128))+rax]
-
- vpaddd xmm13,xmm13,xmm7
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm12,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vmovdqu XMMWORD[(128-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm2,31
- vpand xmm6,xmm6,xmm10
- vpaddd xmm2,xmm2,xmm2
-
- vpslld xmm7,xmm10,30
- vpaddd xmm13,xmm13,xmm6
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((192-128))+rax]
-
- vpaddd xmm12,xmm12,xmm15
- vpslld xmm8,xmm13,5
- vpand xmm7,xmm11,xmm10
- vpxor xmm3,xmm3,XMMWORD[((32-128))+rax]
-
- vpaddd xmm12,xmm12,xmm7
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm11,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vmovdqu XMMWORD[(144-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm3,31
- vpand xmm6,xmm6,xmm14
- vpaddd xmm3,xmm3,xmm3
-
- vpslld xmm7,xmm14,30
- vpaddd xmm12,xmm12,xmm6
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((208-128))+rax]
-
- vpaddd xmm11,xmm11,xmm15
- vpslld xmm8,xmm12,5
- vpand xmm7,xmm10,xmm14
- vpxor xmm4,xmm4,XMMWORD[((48-128))+rax]
-
- vpaddd xmm11,xmm11,xmm7
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm10,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vmovdqu XMMWORD[(160-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm4,31
- vpand xmm6,xmm6,xmm13
- vpaddd xmm4,xmm4,xmm4
-
- vpslld xmm7,xmm13,30
- vpaddd xmm11,xmm11,xmm6
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((224-128))+rax]
-
- vpaddd xmm10,xmm10,xmm15
- vpslld xmm8,xmm11,5
- vpand xmm7,xmm14,xmm13
- vpxor xmm0,xmm0,XMMWORD[((64-128))+rax]
-
- vpaddd xmm10,xmm10,xmm7
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm14,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vmovdqu XMMWORD[(176-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpor xmm8,xmm8,xmm9
- vpsrld xmm5,xmm0,31
- vpand xmm6,xmm6,xmm12
- vpaddd xmm0,xmm0,xmm0
-
- vpslld xmm7,xmm12,30
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vmovdqa xmm15,XMMWORD[64+rbp]
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((240-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(192-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((80-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((0-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(208-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((96-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((16-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(224-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((112-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((32-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(240-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((128-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((48-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(0-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((144-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((64-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(16-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((160-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((80-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(32-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((176-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((96-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vmovdqa XMMWORD[(48-128)+rax],xmm2
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((192-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((112-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vmovdqa XMMWORD[(64-128)+rax],xmm3
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((208-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((128-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vmovdqa XMMWORD[(80-128)+rax],xmm4
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((224-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((144-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vmovdqa XMMWORD[(96-128)+rax],xmm0
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((240-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((160-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vmovdqa XMMWORD[(112-128)+rax],xmm1
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((0-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((176-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((16-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((192-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((32-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpxor xmm0,xmm0,xmm2
- vmovdqa xmm2,XMMWORD[((208-128))+rax]
-
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm0,xmm0,XMMWORD[((48-128))+rax]
- vpsrld xmm9,xmm11,27
- vpxor xmm6,xmm6,xmm13
- vpxor xmm0,xmm0,xmm2
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
- vpsrld xmm5,xmm0,31
- vpaddd xmm0,xmm0,xmm0
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm0,xmm0,xmm5
- vpor xmm12,xmm12,xmm7
- vpxor xmm1,xmm1,xmm3
- vmovdqa xmm3,XMMWORD[((224-128))+rax]
-
- vpslld xmm8,xmm10,5
- vpaddd xmm14,xmm14,xmm15
- vpxor xmm6,xmm13,xmm11
- vpaddd xmm14,xmm14,xmm0
- vpxor xmm1,xmm1,XMMWORD[((64-128))+rax]
- vpsrld xmm9,xmm10,27
- vpxor xmm6,xmm6,xmm12
- vpxor xmm1,xmm1,xmm3
-
- vpslld xmm7,xmm11,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm14,xmm14,xmm6
- vpsrld xmm5,xmm1,31
- vpaddd xmm1,xmm1,xmm1
-
- vpsrld xmm11,xmm11,2
- vpaddd xmm14,xmm14,xmm8
- vpor xmm1,xmm1,xmm5
- vpor xmm11,xmm11,xmm7
- vpxor xmm2,xmm2,xmm4
- vmovdqa xmm4,XMMWORD[((240-128))+rax]
-
- vpslld xmm8,xmm14,5
- vpaddd xmm13,xmm13,xmm15
- vpxor xmm6,xmm12,xmm10
- vpaddd xmm13,xmm13,xmm1
- vpxor xmm2,xmm2,XMMWORD[((80-128))+rax]
- vpsrld xmm9,xmm14,27
- vpxor xmm6,xmm6,xmm11
- vpxor xmm2,xmm2,xmm4
-
- vpslld xmm7,xmm10,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm13,xmm13,xmm6
- vpsrld xmm5,xmm2,31
- vpaddd xmm2,xmm2,xmm2
-
- vpsrld xmm10,xmm10,2
- vpaddd xmm13,xmm13,xmm8
- vpor xmm2,xmm2,xmm5
- vpor xmm10,xmm10,xmm7
- vpxor xmm3,xmm3,xmm0
- vmovdqa xmm0,XMMWORD[((0-128))+rax]
-
- vpslld xmm8,xmm13,5
- vpaddd xmm12,xmm12,xmm15
- vpxor xmm6,xmm11,xmm14
- vpaddd xmm12,xmm12,xmm2
- vpxor xmm3,xmm3,XMMWORD[((96-128))+rax]
- vpsrld xmm9,xmm13,27
- vpxor xmm6,xmm6,xmm10
- vpxor xmm3,xmm3,xmm0
-
- vpslld xmm7,xmm14,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm12,xmm12,xmm6
- vpsrld xmm5,xmm3,31
- vpaddd xmm3,xmm3,xmm3
-
- vpsrld xmm14,xmm14,2
- vpaddd xmm12,xmm12,xmm8
- vpor xmm3,xmm3,xmm5
- vpor xmm14,xmm14,xmm7
- vpxor xmm4,xmm4,xmm1
- vmovdqa xmm1,XMMWORD[((16-128))+rax]
-
- vpslld xmm8,xmm12,5
- vpaddd xmm11,xmm11,xmm15
- vpxor xmm6,xmm10,xmm13
- vpaddd xmm11,xmm11,xmm3
- vpxor xmm4,xmm4,XMMWORD[((112-128))+rax]
- vpsrld xmm9,xmm12,27
- vpxor xmm6,xmm6,xmm14
- vpxor xmm4,xmm4,xmm1
-
- vpslld xmm7,xmm13,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm11,xmm11,xmm6
- vpsrld xmm5,xmm4,31
- vpaddd xmm4,xmm4,xmm4
-
- vpsrld xmm13,xmm13,2
- vpaddd xmm11,xmm11,xmm8
- vpor xmm4,xmm4,xmm5
- vpor xmm13,xmm13,xmm7
- vpslld xmm8,xmm11,5
- vpaddd xmm10,xmm10,xmm15
- vpxor xmm6,xmm14,xmm12
-
- vpsrld xmm9,xmm11,27
- vpaddd xmm10,xmm10,xmm4
- vpxor xmm6,xmm6,xmm13
-
- vpslld xmm7,xmm12,30
- vpor xmm8,xmm8,xmm9
- vpaddd xmm10,xmm10,xmm6
-
- vpsrld xmm12,xmm12,2
- vpaddd xmm10,xmm10,xmm8
- vpor xmm12,xmm12,xmm7
- mov ecx,1
- cmp ecx,DWORD[rbx]
- cmovge r8,rbp
- cmp ecx,DWORD[4+rbx]
- cmovge r9,rbp
- cmp ecx,DWORD[8+rbx]
- cmovge r10,rbp
- cmp ecx,DWORD[12+rbx]
- cmovge r11,rbp
- vmovdqu xmm6,XMMWORD[rbx]
- vpxor xmm8,xmm8,xmm8
- vmovdqa xmm7,xmm6
- vpcmpgtd xmm7,xmm7,xmm8
- vpaddd xmm6,xmm6,xmm7
-
- vpand xmm10,xmm10,xmm7
- vpand xmm11,xmm11,xmm7
- vpaddd xmm10,xmm10,XMMWORD[rdi]
- vpand xmm12,xmm12,xmm7
- vpaddd xmm11,xmm11,XMMWORD[32+rdi]
- vpand xmm13,xmm13,xmm7
- vpaddd xmm12,xmm12,XMMWORD[64+rdi]
- vpand xmm14,xmm14,xmm7
- vpaddd xmm13,xmm13,XMMWORD[96+rdi]
- vpaddd xmm14,xmm14,XMMWORD[128+rdi]
- vmovdqu XMMWORD[rdi],xmm10
- vmovdqu XMMWORD[32+rdi],xmm11
- vmovdqu XMMWORD[64+rdi],xmm12
- vmovdqu XMMWORD[96+rdi],xmm13
- vmovdqu XMMWORD[128+rdi],xmm14
-
- vmovdqu XMMWORD[rbx],xmm6
- vmovdqu xmm5,XMMWORD[96+rbp]
- dec edx
- jnz NEAR $L$oop_avx
-
- mov edx,DWORD[280+rsp]
- lea rdi,[16+rdi]
- lea rsi,[64+rsi]
- dec edx
- jnz NEAR $L$oop_grande_avx
-
-$L$done_avx:
- mov rax,QWORD[272+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((-184))+rax]
- movaps xmm7,XMMWORD[((-168))+rax]
- movaps xmm8,XMMWORD[((-152))+rax]
- movaps xmm9,XMMWORD[((-136))+rax]
- movaps xmm10,XMMWORD[((-120))+rax]
- movaps xmm11,XMMWORD[((-104))+rax]
- movaps xmm12,XMMWORD[((-88))+rax]
- movaps xmm13,XMMWORD[((-72))+rax]
- movaps xmm14,XMMWORD[((-56))+rax]
- movaps xmm15,XMMWORD[((-40))+rax]
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha1_multi_block_avx:
-
-ALIGN 32
-sha1_multi_block_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha1_multi_block_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx2_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[64+rsp],xmm10
- movaps XMMWORD[80+rsp],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
- sub rsp,576
- and rsp,-256
- mov QWORD[544+rsp],rax
-
-$L$body_avx2:
- lea rbp,[K_XX_XX]
- shr edx,1
-
- vzeroupper
-$L$oop_grande_avx2:
- mov DWORD[552+rsp],edx
- xor edx,edx
- lea rbx,[512+rsp]
-
- mov r12,QWORD[rsi]
-
- mov ecx,DWORD[8+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[rbx],ecx
- cmovle r12,rbp
-
- mov r13,QWORD[16+rsi]
-
- mov ecx,DWORD[24+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[4+rbx],ecx
- cmovle r13,rbp
-
- mov r14,QWORD[32+rsi]
-
- mov ecx,DWORD[40+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[8+rbx],ecx
- cmovle r14,rbp
-
- mov r15,QWORD[48+rsi]
-
- mov ecx,DWORD[56+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[12+rbx],ecx
- cmovle r15,rbp
-
- mov r8,QWORD[64+rsi]
-
- mov ecx,DWORD[72+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[16+rbx],ecx
- cmovle r8,rbp
-
- mov r9,QWORD[80+rsi]
-
- mov ecx,DWORD[88+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[20+rbx],ecx
- cmovle r9,rbp
-
- mov r10,QWORD[96+rsi]
-
- mov ecx,DWORD[104+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[24+rbx],ecx
- cmovle r10,rbp
-
- mov r11,QWORD[112+rsi]
-
- mov ecx,DWORD[120+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[28+rbx],ecx
- cmovle r11,rbp
- vmovdqu ymm0,YMMWORD[rdi]
- lea rax,[128+rsp]
- vmovdqu ymm1,YMMWORD[32+rdi]
- lea rbx,[((256+128))+rsp]
- vmovdqu ymm2,YMMWORD[64+rdi]
- vmovdqu ymm3,YMMWORD[96+rdi]
- vmovdqu ymm4,YMMWORD[128+rdi]
- vmovdqu ymm9,YMMWORD[96+rbp]
- jmp NEAR $L$oop_avx2
-
-ALIGN 32
-$L$oop_avx2:
- vmovdqa ymm15,YMMWORD[((-32))+rbp]
- vmovd xmm10,DWORD[r12]
- lea r12,[64+r12]
- vmovd xmm12,DWORD[r8]
- lea r8,[64+r8]
- vmovd xmm7,DWORD[r13]
- lea r13,[64+r13]
- vmovd xmm6,DWORD[r9]
- lea r9,[64+r9]
- vpinsrd xmm10,xmm10,DWORD[r14],1
- lea r14,[64+r14]
- vpinsrd xmm12,xmm12,DWORD[r10],1
- lea r10,[64+r10]
- vpinsrd xmm7,xmm7,DWORD[r15],1
- lea r15,[64+r15]
- vpunpckldq ymm10,ymm10,ymm7
- vpinsrd xmm6,xmm6,DWORD[r11],1
- lea r11,[64+r11]
- vpunpckldq ymm12,ymm12,ymm6
- vmovd xmm11,DWORD[((-60))+r12]
- vinserti128 ymm10,ymm10,xmm12,1
- vmovd xmm8,DWORD[((-60))+r8]
- vpshufb ymm10,ymm10,ymm9
- vmovd xmm7,DWORD[((-60))+r13]
- vmovd xmm6,DWORD[((-60))+r9]
- vpinsrd xmm11,xmm11,DWORD[((-60))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-60))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-60))+r15],1
- vpunpckldq ymm11,ymm11,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-60))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpandn ymm6,ymm1,ymm3
- vpand ymm5,ymm1,ymm2
-
- vmovdqa YMMWORD[(0-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vinserti128 ymm11,ymm11,xmm8,1
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm12,DWORD[((-56))+r12]
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-56))+r8]
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpshufb ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vmovd xmm7,DWORD[((-56))+r13]
- vmovd xmm6,DWORD[((-56))+r9]
- vpinsrd xmm12,xmm12,DWORD[((-56))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-56))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-56))+r15],1
- vpunpckldq ymm12,ymm12,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-56))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpandn ymm6,ymm0,ymm2
- vpand ymm5,ymm0,ymm1
-
- vmovdqa YMMWORD[(32-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vinserti128 ymm12,ymm12,xmm8,1
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm13,DWORD[((-52))+r12]
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-52))+r8]
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpshufb ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vmovd xmm7,DWORD[((-52))+r13]
- vmovd xmm6,DWORD[((-52))+r9]
- vpinsrd xmm13,xmm13,DWORD[((-52))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-52))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-52))+r15],1
- vpunpckldq ymm13,ymm13,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-52))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpandn ymm6,ymm4,ymm1
- vpand ymm5,ymm4,ymm0
-
- vmovdqa YMMWORD[(64-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vinserti128 ymm13,ymm13,xmm8,1
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm14,DWORD[((-48))+r12]
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-48))+r8]
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpshufb ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vmovd xmm7,DWORD[((-48))+r13]
- vmovd xmm6,DWORD[((-48))+r9]
- vpinsrd xmm14,xmm14,DWORD[((-48))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-48))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-48))+r15],1
- vpunpckldq ymm14,ymm14,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-48))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpandn ymm6,ymm3,ymm0
- vpand ymm5,ymm3,ymm4
-
- vmovdqa YMMWORD[(96-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vinserti128 ymm14,ymm14,xmm8,1
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm10,DWORD[((-44))+r12]
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-44))+r8]
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpshufb ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vmovd xmm7,DWORD[((-44))+r13]
- vmovd xmm6,DWORD[((-44))+r9]
- vpinsrd xmm10,xmm10,DWORD[((-44))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-44))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-44))+r15],1
- vpunpckldq ymm10,ymm10,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-44))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpandn ymm6,ymm2,ymm4
- vpand ymm5,ymm2,ymm3
-
- vmovdqa YMMWORD[(128-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vinserti128 ymm10,ymm10,xmm8,1
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm11,DWORD[((-40))+r12]
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-40))+r8]
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpshufb ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovd xmm7,DWORD[((-40))+r13]
- vmovd xmm6,DWORD[((-40))+r9]
- vpinsrd xmm11,xmm11,DWORD[((-40))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-40))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-40))+r15],1
- vpunpckldq ymm11,ymm11,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-40))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpandn ymm6,ymm1,ymm3
- vpand ymm5,ymm1,ymm2
-
- vmovdqa YMMWORD[(160-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vinserti128 ymm11,ymm11,xmm8,1
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm12,DWORD[((-36))+r12]
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-36))+r8]
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpshufb ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vmovd xmm7,DWORD[((-36))+r13]
- vmovd xmm6,DWORD[((-36))+r9]
- vpinsrd xmm12,xmm12,DWORD[((-36))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-36))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-36))+r15],1
- vpunpckldq ymm12,ymm12,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-36))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpandn ymm6,ymm0,ymm2
- vpand ymm5,ymm0,ymm1
-
- vmovdqa YMMWORD[(192-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vinserti128 ymm12,ymm12,xmm8,1
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm13,DWORD[((-32))+r12]
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-32))+r8]
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpshufb ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vmovd xmm7,DWORD[((-32))+r13]
- vmovd xmm6,DWORD[((-32))+r9]
- vpinsrd xmm13,xmm13,DWORD[((-32))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-32))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-32))+r15],1
- vpunpckldq ymm13,ymm13,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-32))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpandn ymm6,ymm4,ymm1
- vpand ymm5,ymm4,ymm0
-
- vmovdqa YMMWORD[(224-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vinserti128 ymm13,ymm13,xmm8,1
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm14,DWORD[((-28))+r12]
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-28))+r8]
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpshufb ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vmovd xmm7,DWORD[((-28))+r13]
- vmovd xmm6,DWORD[((-28))+r9]
- vpinsrd xmm14,xmm14,DWORD[((-28))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-28))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-28))+r15],1
- vpunpckldq ymm14,ymm14,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-28))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpandn ymm6,ymm3,ymm0
- vpand ymm5,ymm3,ymm4
-
- vmovdqa YMMWORD[(256-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vinserti128 ymm14,ymm14,xmm8,1
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm10,DWORD[((-24))+r12]
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-24))+r8]
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpshufb ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vmovd xmm7,DWORD[((-24))+r13]
- vmovd xmm6,DWORD[((-24))+r9]
- vpinsrd xmm10,xmm10,DWORD[((-24))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-24))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-24))+r15],1
- vpunpckldq ymm10,ymm10,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-24))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpandn ymm6,ymm2,ymm4
- vpand ymm5,ymm2,ymm3
-
- vmovdqa YMMWORD[(288-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vinserti128 ymm10,ymm10,xmm8,1
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm11,DWORD[((-20))+r12]
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-20))+r8]
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpshufb ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovd xmm7,DWORD[((-20))+r13]
- vmovd xmm6,DWORD[((-20))+r9]
- vpinsrd xmm11,xmm11,DWORD[((-20))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-20))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-20))+r15],1
- vpunpckldq ymm11,ymm11,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-20))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpandn ymm6,ymm1,ymm3
- vpand ymm5,ymm1,ymm2
-
- vmovdqa YMMWORD[(320-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vinserti128 ymm11,ymm11,xmm8,1
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm12,DWORD[((-16))+r12]
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-16))+r8]
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpshufb ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vmovd xmm7,DWORD[((-16))+r13]
- vmovd xmm6,DWORD[((-16))+r9]
- vpinsrd xmm12,xmm12,DWORD[((-16))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-16))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-16))+r15],1
- vpunpckldq ymm12,ymm12,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-16))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpandn ymm6,ymm0,ymm2
- vpand ymm5,ymm0,ymm1
-
- vmovdqa YMMWORD[(352-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vinserti128 ymm12,ymm12,xmm8,1
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm13,DWORD[((-12))+r12]
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-12))+r8]
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpshufb ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vmovd xmm7,DWORD[((-12))+r13]
- vmovd xmm6,DWORD[((-12))+r9]
- vpinsrd xmm13,xmm13,DWORD[((-12))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-12))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-12))+r15],1
- vpunpckldq ymm13,ymm13,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-12))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpandn ymm6,ymm4,ymm1
- vpand ymm5,ymm4,ymm0
-
- vmovdqa YMMWORD[(384-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vinserti128 ymm13,ymm13,xmm8,1
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm14,DWORD[((-8))+r12]
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-8))+r8]
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpshufb ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vmovd xmm7,DWORD[((-8))+r13]
- vmovd xmm6,DWORD[((-8))+r9]
- vpinsrd xmm14,xmm14,DWORD[((-8))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-8))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-8))+r15],1
- vpunpckldq ymm14,ymm14,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-8))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpandn ymm6,ymm3,ymm0
- vpand ymm5,ymm3,ymm4
-
- vmovdqa YMMWORD[(416-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vinserti128 ymm14,ymm14,xmm8,1
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm6
- vmovd xmm10,DWORD[((-4))+r12]
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vmovd xmm8,DWORD[((-4))+r8]
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpshufb ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vmovdqa ymm11,YMMWORD[((0-128))+rax]
- vmovd xmm7,DWORD[((-4))+r13]
- vmovd xmm6,DWORD[((-4))+r9]
- vpinsrd xmm10,xmm10,DWORD[((-4))+r14],1
- vpinsrd xmm8,xmm8,DWORD[((-4))+r10],1
- vpinsrd xmm7,xmm7,DWORD[((-4))+r15],1
- vpunpckldq ymm10,ymm10,ymm7
- vpinsrd xmm6,xmm6,DWORD[((-4))+r11],1
- vpunpckldq ymm8,ymm8,ymm6
- vpaddd ymm0,ymm0,ymm15
- prefetcht0 [63+r12]
- vpslld ymm7,ymm1,5
- vpandn ymm6,ymm2,ymm4
- vpand ymm5,ymm2,ymm3
-
- vmovdqa YMMWORD[(448-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vinserti128 ymm10,ymm10,xmm8,1
- vpsrld ymm8,ymm1,27
- prefetcht0 [63+r13]
- vpxor ymm5,ymm5,ymm6
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- prefetcht0 [63+r14]
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- prefetcht0 [63+r15]
- vpshufb ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovdqa ymm12,YMMWORD[((32-128))+rax]
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((64-128))+rax]
-
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpandn ymm6,ymm1,ymm3
- prefetcht0 [63+r8]
- vpand ymm5,ymm1,ymm2
-
- vmovdqa YMMWORD[(480-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((256-256-128))+rbx]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm6
- vpxor ymm11,ymm11,ymm13
- prefetcht0 [63+r9]
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- prefetcht0 [63+r10]
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- prefetcht0 [63+r11]
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((96-128))+rax]
-
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpandn ymm6,ymm0,ymm2
-
- vpand ymm5,ymm0,ymm1
-
- vmovdqa YMMWORD[(0-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((288-256-128))+rbx]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm6
- vpxor ymm12,ymm12,ymm14
-
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
-
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((128-128))+rax]
-
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpandn ymm6,ymm4,ymm1
-
- vpand ymm5,ymm4,ymm0
-
- vmovdqa YMMWORD[(32-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((320-256-128))+rbx]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm6
- vpxor ymm13,ymm13,ymm10
-
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
-
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((160-128))+rax]
-
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpandn ymm6,ymm3,ymm0
-
- vpand ymm5,ymm3,ymm4
-
- vmovdqa YMMWORD[(64-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((352-256-128))+rbx]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm6
- vpxor ymm14,ymm14,ymm11
-
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
-
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((192-128))+rax]
-
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpandn ymm6,ymm2,ymm4
-
- vpand ymm5,ymm2,ymm3
-
- vmovdqa YMMWORD[(96-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((384-256-128))+rbx]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm6
- vpxor ymm10,ymm10,ymm12
-
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
-
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovdqa ymm15,YMMWORD[rbp]
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((224-128))+rax]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(128-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((416-256-128))+rbx]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((256-256-128))+rbx]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(160-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((448-256-128))+rbx]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((288-256-128))+rbx]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(192-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((480-256-128))+rbx]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((320-256-128))+rbx]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(224-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((0-128))+rax]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((352-256-128))+rbx]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(256-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((32-128))+rax]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((384-256-128))+rbx]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(288-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((64-128))+rax]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((416-256-128))+rbx]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(320-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((96-128))+rax]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((448-256-128))+rbx]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(352-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((128-128))+rax]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((480-256-128))+rbx]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(384-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((160-128))+rax]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((0-128))+rax]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(416-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((192-128))+rax]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((32-128))+rax]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(448-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((224-128))+rax]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((64-128))+rax]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(480-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((256-256-128))+rbx]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((96-128))+rax]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(0-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((288-256-128))+rbx]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((128-128))+rax]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(32-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((320-256-128))+rbx]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((160-128))+rax]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(64-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((352-256-128))+rbx]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((192-128))+rax]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(96-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((384-256-128))+rbx]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((224-128))+rax]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(128-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((416-256-128))+rbx]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((256-256-128))+rbx]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(160-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((448-256-128))+rbx]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((288-256-128))+rbx]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(192-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((480-256-128))+rbx]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((320-256-128))+rbx]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(224-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((0-128))+rax]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovdqa ymm15,YMMWORD[32+rbp]
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((352-256-128))+rbx]
-
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpand ymm6,ymm3,ymm2
- vpxor ymm11,ymm11,YMMWORD[((32-128))+rax]
-
- vpaddd ymm4,ymm4,ymm6
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm3,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vmovdqu YMMWORD[(256-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm11,31
- vpand ymm5,ymm5,ymm1
- vpaddd ymm11,ymm11,ymm11
-
- vpslld ymm6,ymm1,30
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((384-256-128))+rbx]
-
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpand ymm6,ymm2,ymm1
- vpxor ymm12,ymm12,YMMWORD[((64-128))+rax]
-
- vpaddd ymm3,ymm3,ymm6
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm2,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vmovdqu YMMWORD[(288-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm12,31
- vpand ymm5,ymm5,ymm0
- vpaddd ymm12,ymm12,ymm12
-
- vpslld ymm6,ymm0,30
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((416-256-128))+rbx]
-
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpand ymm6,ymm1,ymm0
- vpxor ymm13,ymm13,YMMWORD[((96-128))+rax]
-
- vpaddd ymm2,ymm2,ymm6
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm1,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vmovdqu YMMWORD[(320-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm13,31
- vpand ymm5,ymm5,ymm4
- vpaddd ymm13,ymm13,ymm13
-
- vpslld ymm6,ymm4,30
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((448-256-128))+rbx]
-
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpand ymm6,ymm0,ymm4
- vpxor ymm14,ymm14,YMMWORD[((128-128))+rax]
-
- vpaddd ymm1,ymm1,ymm6
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm0,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vmovdqu YMMWORD[(352-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm14,31
- vpand ymm5,ymm5,ymm3
- vpaddd ymm14,ymm14,ymm14
-
- vpslld ymm6,ymm3,30
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((480-256-128))+rbx]
-
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpand ymm6,ymm4,ymm3
- vpxor ymm10,ymm10,YMMWORD[((160-128))+rax]
-
- vpaddd ymm0,ymm0,ymm6
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm4,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vmovdqu YMMWORD[(384-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm10,31
- vpand ymm5,ymm5,ymm2
- vpaddd ymm10,ymm10,ymm10
-
- vpslld ymm6,ymm2,30
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((0-128))+rax]
-
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpand ymm6,ymm3,ymm2
- vpxor ymm11,ymm11,YMMWORD[((192-128))+rax]
-
- vpaddd ymm4,ymm4,ymm6
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm3,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vmovdqu YMMWORD[(416-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm11,31
- vpand ymm5,ymm5,ymm1
- vpaddd ymm11,ymm11,ymm11
-
- vpslld ymm6,ymm1,30
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((32-128))+rax]
-
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpand ymm6,ymm2,ymm1
- vpxor ymm12,ymm12,YMMWORD[((224-128))+rax]
-
- vpaddd ymm3,ymm3,ymm6
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm2,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vmovdqu YMMWORD[(448-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm12,31
- vpand ymm5,ymm5,ymm0
- vpaddd ymm12,ymm12,ymm12
-
- vpslld ymm6,ymm0,30
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((64-128))+rax]
-
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpand ymm6,ymm1,ymm0
- vpxor ymm13,ymm13,YMMWORD[((256-256-128))+rbx]
-
- vpaddd ymm2,ymm2,ymm6
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm1,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vmovdqu YMMWORD[(480-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm13,31
- vpand ymm5,ymm5,ymm4
- vpaddd ymm13,ymm13,ymm13
-
- vpslld ymm6,ymm4,30
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((96-128))+rax]
-
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpand ymm6,ymm0,ymm4
- vpxor ymm14,ymm14,YMMWORD[((288-256-128))+rbx]
-
- vpaddd ymm1,ymm1,ymm6
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm0,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vmovdqu YMMWORD[(0-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm14,31
- vpand ymm5,ymm5,ymm3
- vpaddd ymm14,ymm14,ymm14
-
- vpslld ymm6,ymm3,30
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((128-128))+rax]
-
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpand ymm6,ymm4,ymm3
- vpxor ymm10,ymm10,YMMWORD[((320-256-128))+rbx]
-
- vpaddd ymm0,ymm0,ymm6
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm4,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vmovdqu YMMWORD[(32-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm10,31
- vpand ymm5,ymm5,ymm2
- vpaddd ymm10,ymm10,ymm10
-
- vpslld ymm6,ymm2,30
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((160-128))+rax]
-
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpand ymm6,ymm3,ymm2
- vpxor ymm11,ymm11,YMMWORD[((352-256-128))+rbx]
-
- vpaddd ymm4,ymm4,ymm6
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm3,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vmovdqu YMMWORD[(64-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm11,31
- vpand ymm5,ymm5,ymm1
- vpaddd ymm11,ymm11,ymm11
-
- vpslld ymm6,ymm1,30
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((192-128))+rax]
-
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpand ymm6,ymm2,ymm1
- vpxor ymm12,ymm12,YMMWORD[((384-256-128))+rbx]
-
- vpaddd ymm3,ymm3,ymm6
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm2,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vmovdqu YMMWORD[(96-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm12,31
- vpand ymm5,ymm5,ymm0
- vpaddd ymm12,ymm12,ymm12
-
- vpslld ymm6,ymm0,30
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((224-128))+rax]
-
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpand ymm6,ymm1,ymm0
- vpxor ymm13,ymm13,YMMWORD[((416-256-128))+rbx]
-
- vpaddd ymm2,ymm2,ymm6
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm1,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vmovdqu YMMWORD[(128-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm13,31
- vpand ymm5,ymm5,ymm4
- vpaddd ymm13,ymm13,ymm13
-
- vpslld ymm6,ymm4,30
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((256-256-128))+rbx]
-
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpand ymm6,ymm0,ymm4
- vpxor ymm14,ymm14,YMMWORD[((448-256-128))+rbx]
-
- vpaddd ymm1,ymm1,ymm6
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm0,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vmovdqu YMMWORD[(160-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm14,31
- vpand ymm5,ymm5,ymm3
- vpaddd ymm14,ymm14,ymm14
-
- vpslld ymm6,ymm3,30
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((288-256-128))+rbx]
-
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpand ymm6,ymm4,ymm3
- vpxor ymm10,ymm10,YMMWORD[((480-256-128))+rbx]
-
- vpaddd ymm0,ymm0,ymm6
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm4,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vmovdqu YMMWORD[(192-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm10,31
- vpand ymm5,ymm5,ymm2
- vpaddd ymm10,ymm10,ymm10
-
- vpslld ymm6,ymm2,30
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((320-256-128))+rbx]
-
- vpaddd ymm4,ymm4,ymm15
- vpslld ymm7,ymm0,5
- vpand ymm6,ymm3,ymm2
- vpxor ymm11,ymm11,YMMWORD[((0-128))+rax]
-
- vpaddd ymm4,ymm4,ymm6
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm3,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vmovdqu YMMWORD[(224-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm11,31
- vpand ymm5,ymm5,ymm1
- vpaddd ymm11,ymm11,ymm11
-
- vpslld ymm6,ymm1,30
- vpaddd ymm4,ymm4,ymm5
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((352-256-128))+rbx]
-
- vpaddd ymm3,ymm3,ymm15
- vpslld ymm7,ymm4,5
- vpand ymm6,ymm2,ymm1
- vpxor ymm12,ymm12,YMMWORD[((32-128))+rax]
-
- vpaddd ymm3,ymm3,ymm6
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm2,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vmovdqu YMMWORD[(256-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm12,31
- vpand ymm5,ymm5,ymm0
- vpaddd ymm12,ymm12,ymm12
-
- vpslld ymm6,ymm0,30
- vpaddd ymm3,ymm3,ymm5
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((384-256-128))+rbx]
-
- vpaddd ymm2,ymm2,ymm15
- vpslld ymm7,ymm3,5
- vpand ymm6,ymm1,ymm0
- vpxor ymm13,ymm13,YMMWORD[((64-128))+rax]
-
- vpaddd ymm2,ymm2,ymm6
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm1,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vmovdqu YMMWORD[(288-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm13,31
- vpand ymm5,ymm5,ymm4
- vpaddd ymm13,ymm13,ymm13
-
- vpslld ymm6,ymm4,30
- vpaddd ymm2,ymm2,ymm5
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((416-256-128))+rbx]
-
- vpaddd ymm1,ymm1,ymm15
- vpslld ymm7,ymm2,5
- vpand ymm6,ymm0,ymm4
- vpxor ymm14,ymm14,YMMWORD[((96-128))+rax]
-
- vpaddd ymm1,ymm1,ymm6
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm0,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vmovdqu YMMWORD[(320-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm14,31
- vpand ymm5,ymm5,ymm3
- vpaddd ymm14,ymm14,ymm14
-
- vpslld ymm6,ymm3,30
- vpaddd ymm1,ymm1,ymm5
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((448-256-128))+rbx]
-
- vpaddd ymm0,ymm0,ymm15
- vpslld ymm7,ymm1,5
- vpand ymm6,ymm4,ymm3
- vpxor ymm10,ymm10,YMMWORD[((128-128))+rax]
-
- vpaddd ymm0,ymm0,ymm6
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm4,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vmovdqu YMMWORD[(352-256-128)+rbx],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpor ymm7,ymm7,ymm8
- vpsrld ymm9,ymm10,31
- vpand ymm5,ymm5,ymm2
- vpaddd ymm10,ymm10,ymm10
-
- vpslld ymm6,ymm2,30
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vmovdqa ymm15,YMMWORD[64+rbp]
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((480-256-128))+rbx]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(384-256-128)+rbx],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((160-128))+rax]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((0-128))+rax]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(416-256-128)+rbx],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((192-128))+rax]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((32-128))+rax]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(448-256-128)+rbx],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((224-128))+rax]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((64-128))+rax]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(480-256-128)+rbx],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((256-256-128))+rbx]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((96-128))+rax]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(0-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((288-256-128))+rbx]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((128-128))+rax]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(32-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((320-256-128))+rbx]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((160-128))+rax]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(64-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((352-256-128))+rbx]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((192-128))+rax]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vmovdqa YMMWORD[(96-128)+rax],ymm12
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((384-256-128))+rbx]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((224-128))+rax]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vmovdqa YMMWORD[(128-128)+rax],ymm13
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((416-256-128))+rbx]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((256-256-128))+rbx]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vmovdqa YMMWORD[(160-128)+rax],ymm14
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((448-256-128))+rbx]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((288-256-128))+rbx]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vmovdqa YMMWORD[(192-128)+rax],ymm10
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((480-256-128))+rbx]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((320-256-128))+rbx]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vmovdqa YMMWORD[(224-128)+rax],ymm11
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((0-128))+rax]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((352-256-128))+rbx]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((32-128))+rax]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((384-256-128))+rbx]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((64-128))+rax]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpxor ymm10,ymm10,ymm12
- vmovdqa ymm12,YMMWORD[((416-256-128))+rbx]
-
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm10,ymm10,YMMWORD[((96-128))+rax]
- vpsrld ymm8,ymm1,27
- vpxor ymm5,ymm5,ymm3
- vpxor ymm10,ymm10,ymm12
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
- vpsrld ymm9,ymm10,31
- vpaddd ymm10,ymm10,ymm10
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm10,ymm10,ymm9
- vpor ymm2,ymm2,ymm6
- vpxor ymm11,ymm11,ymm13
- vmovdqa ymm13,YMMWORD[((448-256-128))+rbx]
-
- vpslld ymm7,ymm0,5
- vpaddd ymm4,ymm4,ymm15
- vpxor ymm5,ymm3,ymm1
- vpaddd ymm4,ymm4,ymm10
- vpxor ymm11,ymm11,YMMWORD[((128-128))+rax]
- vpsrld ymm8,ymm0,27
- vpxor ymm5,ymm5,ymm2
- vpxor ymm11,ymm11,ymm13
-
- vpslld ymm6,ymm1,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm4,ymm4,ymm5
- vpsrld ymm9,ymm11,31
- vpaddd ymm11,ymm11,ymm11
-
- vpsrld ymm1,ymm1,2
- vpaddd ymm4,ymm4,ymm7
- vpor ymm11,ymm11,ymm9
- vpor ymm1,ymm1,ymm6
- vpxor ymm12,ymm12,ymm14
- vmovdqa ymm14,YMMWORD[((480-256-128))+rbx]
-
- vpslld ymm7,ymm4,5
- vpaddd ymm3,ymm3,ymm15
- vpxor ymm5,ymm2,ymm0
- vpaddd ymm3,ymm3,ymm11
- vpxor ymm12,ymm12,YMMWORD[((160-128))+rax]
- vpsrld ymm8,ymm4,27
- vpxor ymm5,ymm5,ymm1
- vpxor ymm12,ymm12,ymm14
-
- vpslld ymm6,ymm0,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm3,ymm3,ymm5
- vpsrld ymm9,ymm12,31
- vpaddd ymm12,ymm12,ymm12
-
- vpsrld ymm0,ymm0,2
- vpaddd ymm3,ymm3,ymm7
- vpor ymm12,ymm12,ymm9
- vpor ymm0,ymm0,ymm6
- vpxor ymm13,ymm13,ymm10
- vmovdqa ymm10,YMMWORD[((0-128))+rax]
-
- vpslld ymm7,ymm3,5
- vpaddd ymm2,ymm2,ymm15
- vpxor ymm5,ymm1,ymm4
- vpaddd ymm2,ymm2,ymm12
- vpxor ymm13,ymm13,YMMWORD[((192-128))+rax]
- vpsrld ymm8,ymm3,27
- vpxor ymm5,ymm5,ymm0
- vpxor ymm13,ymm13,ymm10
-
- vpslld ymm6,ymm4,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm2,ymm2,ymm5
- vpsrld ymm9,ymm13,31
- vpaddd ymm13,ymm13,ymm13
-
- vpsrld ymm4,ymm4,2
- vpaddd ymm2,ymm2,ymm7
- vpor ymm13,ymm13,ymm9
- vpor ymm4,ymm4,ymm6
- vpxor ymm14,ymm14,ymm11
- vmovdqa ymm11,YMMWORD[((32-128))+rax]
-
- vpslld ymm7,ymm2,5
- vpaddd ymm1,ymm1,ymm15
- vpxor ymm5,ymm0,ymm3
- vpaddd ymm1,ymm1,ymm13
- vpxor ymm14,ymm14,YMMWORD[((224-128))+rax]
- vpsrld ymm8,ymm2,27
- vpxor ymm5,ymm5,ymm4
- vpxor ymm14,ymm14,ymm11
-
- vpslld ymm6,ymm3,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm1,ymm1,ymm5
- vpsrld ymm9,ymm14,31
- vpaddd ymm14,ymm14,ymm14
-
- vpsrld ymm3,ymm3,2
- vpaddd ymm1,ymm1,ymm7
- vpor ymm14,ymm14,ymm9
- vpor ymm3,ymm3,ymm6
- vpslld ymm7,ymm1,5
- vpaddd ymm0,ymm0,ymm15
- vpxor ymm5,ymm4,ymm2
-
- vpsrld ymm8,ymm1,27
- vpaddd ymm0,ymm0,ymm14
- vpxor ymm5,ymm5,ymm3
-
- vpslld ymm6,ymm2,30
- vpor ymm7,ymm7,ymm8
- vpaddd ymm0,ymm0,ymm5
-
- vpsrld ymm2,ymm2,2
- vpaddd ymm0,ymm0,ymm7
- vpor ymm2,ymm2,ymm6
- mov ecx,1
- lea rbx,[512+rsp]
- cmp ecx,DWORD[rbx]
- cmovge r12,rbp
- cmp ecx,DWORD[4+rbx]
- cmovge r13,rbp
- cmp ecx,DWORD[8+rbx]
- cmovge r14,rbp
- cmp ecx,DWORD[12+rbx]
- cmovge r15,rbp
- cmp ecx,DWORD[16+rbx]
- cmovge r8,rbp
- cmp ecx,DWORD[20+rbx]
- cmovge r9,rbp
- cmp ecx,DWORD[24+rbx]
- cmovge r10,rbp
- cmp ecx,DWORD[28+rbx]
- cmovge r11,rbp
- vmovdqu ymm5,YMMWORD[rbx]
- vpxor ymm7,ymm7,ymm7
- vmovdqa ymm6,ymm5
- vpcmpgtd ymm6,ymm6,ymm7
- vpaddd ymm5,ymm5,ymm6
-
- vpand ymm0,ymm0,ymm6
- vpand ymm1,ymm1,ymm6
- vpaddd ymm0,ymm0,YMMWORD[rdi]
- vpand ymm2,ymm2,ymm6
- vpaddd ymm1,ymm1,YMMWORD[32+rdi]
- vpand ymm3,ymm3,ymm6
- vpaddd ymm2,ymm2,YMMWORD[64+rdi]
- vpand ymm4,ymm4,ymm6
- vpaddd ymm3,ymm3,YMMWORD[96+rdi]
- vpaddd ymm4,ymm4,YMMWORD[128+rdi]
- vmovdqu YMMWORD[rdi],ymm0
- vmovdqu YMMWORD[32+rdi],ymm1
- vmovdqu YMMWORD[64+rdi],ymm2
- vmovdqu YMMWORD[96+rdi],ymm3
- vmovdqu YMMWORD[128+rdi],ymm4
-
- vmovdqu YMMWORD[rbx],ymm5
- lea rbx,[((256+128))+rsp]
- vmovdqu ymm9,YMMWORD[96+rbp]
- dec edx
- jnz NEAR $L$oop_avx2
-
-
-
-
-
-
-
-$L$done_avx2:
- mov rax,QWORD[544+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha1_multi_block_avx2:
-
ALIGN 256
DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -7522,60 +3119,6 @@ $L$in_prologue:
pop rsi
DB 0F3h,0C3h ;repret
-
-ALIGN 16
-avx2_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$in_prologue
-
- mov rax,QWORD[544+r8]
-
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
- lea rsi,[((-56-160))+rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
-
- jmp NEAR $L$in_prologue
-
section .pdata rdata align=4
ALIGN 4
DD $L$SEH_begin_sha1_multi_block wrt ..imagebase
@@ -7584,12 +3127,6 @@ ALIGN 4
DD $L$SEH_begin_sha1_multi_block_shaext wrt ..imagebase
DD $L$SEH_end_sha1_multi_block_shaext wrt ..imagebase
DD $L$SEH_info_sha1_multi_block_shaext wrt ..imagebase
- DD $L$SEH_begin_sha1_multi_block_avx wrt ..imagebase
- DD $L$SEH_end_sha1_multi_block_avx wrt ..imagebase
- DD $L$SEH_info_sha1_multi_block_avx wrt ..imagebase
- DD $L$SEH_begin_sha1_multi_block_avx2 wrt ..imagebase
- DD $L$SEH_end_sha1_multi_block_avx2 wrt ..imagebase
- DD $L$SEH_info_sha1_multi_block_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha1_multi_block:
@@ -7600,11 +3137,3 @@ $L$SEH_info_sha1_multi_block_shaext:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase
-$L$SEH_info_sha1_multi_block_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_sha1_multi_block_avx2:
-DB 9,0,0,0
- DD avx2_handler wrt ..imagebase
- DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm
index 9d1f10e1ee6..e25a29d3951 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha1-x86_64.nasm
@@ -27,14 +27,6 @@ $L$SEH_begin_sha1_block_data_order:
jz NEAR $L$ialu
test r10d,536870912
jnz NEAR _shaext_shortcut
- and r10d,296
- cmp r10d,296
- je NEAR _avx2_shortcut
- and r8d,268435456
- and r9d,1073741824
- or r8d,r9d
- cmp r8d,1342177280
- je NEAR _avx_shortcut
jmp NEAR _ssse3_shortcut
ALIGN 16
@@ -2675,2876 +2667,6 @@ $L$epilogue_ssse3:
DB 0F3h,0C3h ;repret
$L$SEH_end_sha1_block_data_order_ssse3:
-
-ALIGN 16
-sha1_block_data_order_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha1_block_data_order_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-_avx_shortcut:
-
- mov r11,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- lea rsp,[((-160))+rsp]
- vzeroupper
- vmovaps XMMWORD[(-40-96)+r11],xmm6
- vmovaps XMMWORD[(-40-80)+r11],xmm7
- vmovaps XMMWORD[(-40-64)+r11],xmm8
- vmovaps XMMWORD[(-40-48)+r11],xmm9
- vmovaps XMMWORD[(-40-32)+r11],xmm10
- vmovaps XMMWORD[(-40-16)+r11],xmm11
-$L$prologue_avx:
- and rsp,-64
- mov r8,rdi
- mov r9,rsi
- mov r10,rdx
-
- shl r10,6
- add r10,r9
- lea r14,[((K_XX_XX+64))]
-
- mov eax,DWORD[r8]
- mov ebx,DWORD[4+r8]
- mov ecx,DWORD[8+r8]
- mov edx,DWORD[12+r8]
- mov esi,ebx
- mov ebp,DWORD[16+r8]
- mov edi,ecx
- xor edi,edx
- and esi,edi
-
- vmovdqa xmm6,XMMWORD[64+r14]
- vmovdqa xmm11,XMMWORD[((-64))+r14]
- vmovdqu xmm0,XMMWORD[r9]
- vmovdqu xmm1,XMMWORD[16+r9]
- vmovdqu xmm2,XMMWORD[32+r9]
- vmovdqu xmm3,XMMWORD[48+r9]
- vpshufb xmm0,xmm0,xmm6
- add r9,64
- vpshufb xmm1,xmm1,xmm6
- vpshufb xmm2,xmm2,xmm6
- vpshufb xmm3,xmm3,xmm6
- vpaddd xmm4,xmm0,xmm11
- vpaddd xmm5,xmm1,xmm11
- vpaddd xmm6,xmm2,xmm11
- vmovdqa XMMWORD[rsp],xmm4
- vmovdqa XMMWORD[16+rsp],xmm5
- vmovdqa XMMWORD[32+rsp],xmm6
- jmp NEAR $L$oop_avx
-ALIGN 16
-$L$oop_avx:
- shrd ebx,ebx,2
- xor esi,edx
- vpalignr xmm4,xmm1,xmm0,8
- mov edi,eax
- add ebp,DWORD[rsp]
- vpaddd xmm9,xmm11,xmm3
- xor ebx,ecx
- shld eax,eax,5
- vpsrldq xmm8,xmm3,4
- add ebp,esi
- and edi,ebx
- vpxor xmm4,xmm4,xmm0
- xor ebx,ecx
- add ebp,eax
- vpxor xmm8,xmm8,xmm2
- shrd eax,eax,7
- xor edi,ecx
- mov esi,ebp
- add edx,DWORD[4+rsp]
- vpxor xmm4,xmm4,xmm8
- xor eax,ebx
- shld ebp,ebp,5
- vmovdqa XMMWORD[48+rsp],xmm9
- add edx,edi
- and esi,eax
- vpsrld xmm8,xmm4,31
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor esi,ebx
- vpslldq xmm10,xmm4,12
- vpaddd xmm4,xmm4,xmm4
- mov edi,edx
- add ecx,DWORD[8+rsp]
- xor ebp,eax
- shld edx,edx,5
- vpsrld xmm9,xmm10,30
- vpor xmm4,xmm4,xmm8
- add ecx,esi
- and edi,ebp
- xor ebp,eax
- add ecx,edx
- vpslld xmm10,xmm10,2
- vpxor xmm4,xmm4,xmm9
- shrd edx,edx,7
- xor edi,eax
- mov esi,ecx
- add ebx,DWORD[12+rsp]
- vpxor xmm4,xmm4,xmm10
- xor edx,ebp
- shld ecx,ecx,5
- add ebx,edi
- and esi,edx
- xor edx,ebp
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,ebp
- vpalignr xmm5,xmm2,xmm1,8
- mov edi,ebx
- add eax,DWORD[16+rsp]
- vpaddd xmm9,xmm11,xmm4
- xor ecx,edx
- shld ebx,ebx,5
- vpsrldq xmm8,xmm4,4
- add eax,esi
- and edi,ecx
- vpxor xmm5,xmm5,xmm1
- xor ecx,edx
- add eax,ebx
- vpxor xmm8,xmm8,xmm3
- shrd ebx,ebx,7
- xor edi,edx
- mov esi,eax
- add ebp,DWORD[20+rsp]
- vpxor xmm5,xmm5,xmm8
- xor ebx,ecx
- shld eax,eax,5
- vmovdqa XMMWORD[rsp],xmm9
- add ebp,edi
- and esi,ebx
- vpsrld xmm8,xmm5,31
- xor ebx,ecx
- add ebp,eax
- shrd eax,eax,7
- xor esi,ecx
- vpslldq xmm10,xmm5,12
- vpaddd xmm5,xmm5,xmm5
- mov edi,ebp
- add edx,DWORD[24+rsp]
- xor eax,ebx
- shld ebp,ebp,5
- vpsrld xmm9,xmm10,30
- vpor xmm5,xmm5,xmm8
- add edx,esi
- and edi,eax
- xor eax,ebx
- add edx,ebp
- vpslld xmm10,xmm10,2
- vpxor xmm5,xmm5,xmm9
- shrd ebp,ebp,7
- xor edi,ebx
- mov esi,edx
- add ecx,DWORD[28+rsp]
- vpxor xmm5,xmm5,xmm10
- xor ebp,eax
- shld edx,edx,5
- vmovdqa xmm11,XMMWORD[((-32))+r14]
- add ecx,edi
- and esi,ebp
- xor ebp,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- vpalignr xmm6,xmm3,xmm2,8
- mov edi,ecx
- add ebx,DWORD[32+rsp]
- vpaddd xmm9,xmm11,xmm5
- xor edx,ebp
- shld ecx,ecx,5
- vpsrldq xmm8,xmm5,4
- add ebx,esi
- and edi,edx
- vpxor xmm6,xmm6,xmm2
- xor edx,ebp
- add ebx,ecx
- vpxor xmm8,xmm8,xmm4
- shrd ecx,ecx,7
- xor edi,ebp
- mov esi,ebx
- add eax,DWORD[36+rsp]
- vpxor xmm6,xmm6,xmm8
- xor ecx,edx
- shld ebx,ebx,5
- vmovdqa XMMWORD[16+rsp],xmm9
- add eax,edi
- and esi,ecx
- vpsrld xmm8,xmm6,31
- xor ecx,edx
- add eax,ebx
- shrd ebx,ebx,7
- xor esi,edx
- vpslldq xmm10,xmm6,12
- vpaddd xmm6,xmm6,xmm6
- mov edi,eax
- add ebp,DWORD[40+rsp]
- xor ebx,ecx
- shld eax,eax,5
- vpsrld xmm9,xmm10,30
- vpor xmm6,xmm6,xmm8
- add ebp,esi
- and edi,ebx
- xor ebx,ecx
- add ebp,eax
- vpslld xmm10,xmm10,2
- vpxor xmm6,xmm6,xmm9
- shrd eax,eax,7
- xor edi,ecx
- mov esi,ebp
- add edx,DWORD[44+rsp]
- vpxor xmm6,xmm6,xmm10
- xor eax,ebx
- shld ebp,ebp,5
- add edx,edi
- and esi,eax
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor esi,ebx
- vpalignr xmm7,xmm4,xmm3,8
- mov edi,edx
- add ecx,DWORD[48+rsp]
- vpaddd xmm9,xmm11,xmm6
- xor ebp,eax
- shld edx,edx,5
- vpsrldq xmm8,xmm6,4
- add ecx,esi
- and edi,ebp
- vpxor xmm7,xmm7,xmm3
- xor ebp,eax
- add ecx,edx
- vpxor xmm8,xmm8,xmm5
- shrd edx,edx,7
- xor edi,eax
- mov esi,ecx
- add ebx,DWORD[52+rsp]
- vpxor xmm7,xmm7,xmm8
- xor edx,ebp
- shld ecx,ecx,5
- vmovdqa XMMWORD[32+rsp],xmm9
- add ebx,edi
- and esi,edx
- vpsrld xmm8,xmm7,31
- xor edx,ebp
- add ebx,ecx
- shrd ecx,ecx,7
- xor esi,ebp
- vpslldq xmm10,xmm7,12
- vpaddd xmm7,xmm7,xmm7
- mov edi,ebx
- add eax,DWORD[56+rsp]
- xor ecx,edx
- shld ebx,ebx,5
- vpsrld xmm9,xmm10,30
- vpor xmm7,xmm7,xmm8
- add eax,esi
- and edi,ecx
- xor ecx,edx
- add eax,ebx
- vpslld xmm10,xmm10,2
- vpxor xmm7,xmm7,xmm9
- shrd ebx,ebx,7
- xor edi,edx
- mov esi,eax
- add ebp,DWORD[60+rsp]
- vpxor xmm7,xmm7,xmm10
- xor ebx,ecx
- shld eax,eax,5
- add ebp,edi
- and esi,ebx
- xor ebx,ecx
- add ebp,eax
- vpalignr xmm8,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- shrd eax,eax,7
- xor esi,ecx
- mov edi,ebp
- add edx,DWORD[rsp]
- vpxor xmm0,xmm0,xmm1
- xor eax,ebx
- shld ebp,ebp,5
- vpaddd xmm9,xmm11,xmm7
- add edx,esi
- and edi,eax
- vpxor xmm0,xmm0,xmm8
- xor eax,ebx
- add edx,ebp
- shrd ebp,ebp,7
- xor edi,ebx
- vpsrld xmm8,xmm0,30
- vmovdqa XMMWORD[48+rsp],xmm9
- mov esi,edx
- add ecx,DWORD[4+rsp]
- xor ebp,eax
- shld edx,edx,5
- vpslld xmm0,xmm0,2
- add ecx,edi
- and esi,ebp
- xor ebp,eax
- add ecx,edx
- shrd edx,edx,7
- xor esi,eax
- mov edi,ecx
- add ebx,DWORD[8+rsp]
- vpor xmm0,xmm0,xmm8
- xor edx,ebp
- shld ecx,ecx,5
- add ebx,esi
- and edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[12+rsp]
- xor edi,ebp
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpalignr xmm8,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ebp,DWORD[16+rsp]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- vpxor xmm1,xmm1,xmm2
- add ebp,esi
- xor edi,ecx
- vpaddd xmm9,xmm11,xmm0
- shrd ebx,ebx,7
- add ebp,eax
- vpxor xmm1,xmm1,xmm8
- add edx,DWORD[20+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- vpsrld xmm8,xmm1,30
- vmovdqa XMMWORD[rsp],xmm9
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpslld xmm1,xmm1,2
- add ecx,DWORD[24+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpor xmm1,xmm1,xmm8
- add ebx,DWORD[28+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpalignr xmm8,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add eax,DWORD[32+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- vpxor xmm2,xmm2,xmm3
- add eax,esi
- xor edi,edx
- vpaddd xmm9,xmm11,xmm1
- vmovdqa xmm11,XMMWORD[r14]
- shrd ecx,ecx,7
- add eax,ebx
- vpxor xmm2,xmm2,xmm8
- add ebp,DWORD[36+rsp]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- vpsrld xmm8,xmm2,30
- vmovdqa XMMWORD[16+rsp],xmm9
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpslld xmm2,xmm2,2
- add edx,DWORD[40+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpor xmm2,xmm2,xmm8
- add ecx,DWORD[44+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpalignr xmm8,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebx,DWORD[48+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- vpxor xmm3,xmm3,xmm4
- add ebx,esi
- xor edi,ebp
- vpaddd xmm9,xmm11,xmm2
- shrd edx,edx,7
- add ebx,ecx
- vpxor xmm3,xmm3,xmm8
- add eax,DWORD[52+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- vpsrld xmm8,xmm3,30
- vmovdqa XMMWORD[32+rsp],xmm9
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpslld xmm3,xmm3,2
- add ebp,DWORD[56+rsp]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpor xmm3,xmm3,xmm8
- add edx,DWORD[60+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpalignr xmm8,xmm3,xmm2,8
- vpxor xmm4,xmm4,xmm0
- add ecx,DWORD[rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- vpxor xmm4,xmm4,xmm5
- add ecx,esi
- xor edi,eax
- vpaddd xmm9,xmm11,xmm3
- shrd ebp,ebp,7
- add ecx,edx
- vpxor xmm4,xmm4,xmm8
- add ebx,DWORD[4+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- vpsrld xmm8,xmm4,30
- vmovdqa XMMWORD[48+rsp],xmm9
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpslld xmm4,xmm4,2
- add eax,DWORD[8+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- vpor xmm4,xmm4,xmm8
- add ebp,DWORD[12+rsp]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- vpalignr xmm8,xmm4,xmm3,8
- vpxor xmm5,xmm5,xmm1
- add edx,DWORD[16+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- vpxor xmm5,xmm5,xmm6
- add edx,esi
- xor edi,ebx
- vpaddd xmm9,xmm11,xmm4
- shrd eax,eax,7
- add edx,ebp
- vpxor xmm5,xmm5,xmm8
- add ecx,DWORD[20+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- vpsrld xmm8,xmm5,30
- vmovdqa XMMWORD[rsp],xmm9
- add ecx,edi
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpslld xmm5,xmm5,2
- add ebx,DWORD[24+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vpor xmm5,xmm5,xmm8
- add eax,DWORD[28+rsp]
- shrd ecx,ecx,7
- mov esi,ebx
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- vpalignr xmm8,xmm5,xmm4,8
- vpxor xmm6,xmm6,xmm2
- add ebp,DWORD[32+rsp]
- and esi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- vpxor xmm6,xmm6,xmm7
- mov edi,eax
- xor esi,ecx
- vpaddd xmm9,xmm11,xmm5
- shld eax,eax,5
- add ebp,esi
- vpxor xmm6,xmm6,xmm8
- xor edi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[36+rsp]
- vpsrld xmm8,xmm6,30
- vmovdqa XMMWORD[16+rsp],xmm9
- and edi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,ebp
- vpslld xmm6,xmm6,2
- xor edi,ebx
- shld ebp,ebp,5
- add edx,edi
- xor esi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[40+rsp]
- and esi,eax
- vpor xmm6,xmm6,xmm8
- xor eax,ebx
- shrd ebp,ebp,7
- mov edi,edx
- xor esi,eax
- shld edx,edx,5
- add ecx,esi
- xor edi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[44+rsp]
- and edi,ebp
- xor ebp,eax
- shrd edx,edx,7
- mov esi,ecx
- xor edi,ebp
- shld ecx,ecx,5
- add ebx,edi
- xor esi,edx
- xor edx,ebp
- add ebx,ecx
- vpalignr xmm8,xmm6,xmm5,8
- vpxor xmm7,xmm7,xmm3
- add eax,DWORD[48+rsp]
- and esi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- vpxor xmm7,xmm7,xmm0
- mov edi,ebx
- xor esi,edx
- vpaddd xmm9,xmm11,xmm6
- vmovdqa xmm11,XMMWORD[32+r14]
- shld ebx,ebx,5
- add eax,esi
- vpxor xmm7,xmm7,xmm8
- xor edi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[52+rsp]
- vpsrld xmm8,xmm7,30
- vmovdqa XMMWORD[32+rsp],xmm9
- and edi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- vpslld xmm7,xmm7,2
- xor edi,ecx
- shld eax,eax,5
- add ebp,edi
- xor esi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[56+rsp]
- and esi,ebx
- vpor xmm7,xmm7,xmm8
- xor ebx,ecx
- shrd eax,eax,7
- mov edi,ebp
- xor esi,ebx
- shld ebp,ebp,5
- add edx,esi
- xor edi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[60+rsp]
- and edi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- mov esi,edx
- xor edi,eax
- shld edx,edx,5
- add ecx,edi
- xor esi,ebp
- xor ebp,eax
- add ecx,edx
- vpalignr xmm8,xmm7,xmm6,8
- vpxor xmm0,xmm0,xmm4
- add ebx,DWORD[rsp]
- and esi,ebp
- xor ebp,eax
- shrd edx,edx,7
- vpxor xmm0,xmm0,xmm1
- mov edi,ecx
- xor esi,ebp
- vpaddd xmm9,xmm11,xmm7
- shld ecx,ecx,5
- add ebx,esi
- vpxor xmm0,xmm0,xmm8
- xor edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[4+rsp]
- vpsrld xmm8,xmm0,30
- vmovdqa XMMWORD[48+rsp],xmm9
- and edi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- mov esi,ebx
- vpslld xmm0,xmm0,2
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[8+rsp]
- and esi,ecx
- vpor xmm0,xmm0,xmm8
- xor ecx,edx
- shrd ebx,ebx,7
- mov edi,eax
- xor esi,ecx
- shld eax,eax,5
- add ebp,esi
- xor edi,ebx
- xor ebx,ecx
- add ebp,eax
- add edx,DWORD[12+rsp]
- and edi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- mov esi,ebp
- xor edi,ebx
- shld ebp,ebp,5
- add edx,edi
- xor esi,eax
- xor eax,ebx
- add edx,ebp
- vpalignr xmm8,xmm0,xmm7,8
- vpxor xmm1,xmm1,xmm5
- add ecx,DWORD[16+rsp]
- and esi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- vpxor xmm1,xmm1,xmm2
- mov edi,edx
- xor esi,eax
- vpaddd xmm9,xmm11,xmm0
- shld edx,edx,5
- add ecx,esi
- vpxor xmm1,xmm1,xmm8
- xor edi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[20+rsp]
- vpsrld xmm8,xmm1,30
- vmovdqa XMMWORD[rsp],xmm9
- and edi,ebp
- xor ebp,eax
- shrd edx,edx,7
- mov esi,ecx
- vpslld xmm1,xmm1,2
- xor edi,ebp
- shld ecx,ecx,5
- add ebx,edi
- xor esi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[24+rsp]
- and esi,edx
- vpor xmm1,xmm1,xmm8
- xor edx,ebp
- shrd ecx,ecx,7
- mov edi,ebx
- xor esi,edx
- shld ebx,ebx,5
- add eax,esi
- xor edi,ecx
- xor ecx,edx
- add eax,ebx
- add ebp,DWORD[28+rsp]
- and edi,ecx
- xor ecx,edx
- shrd ebx,ebx,7
- mov esi,eax
- xor edi,ecx
- shld eax,eax,5
- add ebp,edi
- xor esi,ebx
- xor ebx,ecx
- add ebp,eax
- vpalignr xmm8,xmm1,xmm0,8
- vpxor xmm2,xmm2,xmm6
- add edx,DWORD[32+rsp]
- and esi,ebx
- xor ebx,ecx
- shrd eax,eax,7
- vpxor xmm2,xmm2,xmm3
- mov edi,ebp
- xor esi,ebx
- vpaddd xmm9,xmm11,xmm1
- shld ebp,ebp,5
- add edx,esi
- vpxor xmm2,xmm2,xmm8
- xor edi,eax
- xor eax,ebx
- add edx,ebp
- add ecx,DWORD[36+rsp]
- vpsrld xmm8,xmm2,30
- vmovdqa XMMWORD[16+rsp],xmm9
- and edi,eax
- xor eax,ebx
- shrd ebp,ebp,7
- mov esi,edx
- vpslld xmm2,xmm2,2
- xor edi,eax
- shld edx,edx,5
- add ecx,edi
- xor esi,ebp
- xor ebp,eax
- add ecx,edx
- add ebx,DWORD[40+rsp]
- and esi,ebp
- vpor xmm2,xmm2,xmm8
- xor ebp,eax
- shrd edx,edx,7
- mov edi,ecx
- xor esi,ebp
- shld ecx,ecx,5
- add ebx,esi
- xor edi,edx
- xor edx,ebp
- add ebx,ecx
- add eax,DWORD[44+rsp]
- and edi,edx
- xor edx,ebp
- shrd ecx,ecx,7
- mov esi,ebx
- xor edi,edx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- add eax,ebx
- vpalignr xmm8,xmm2,xmm1,8
- vpxor xmm3,xmm3,xmm7
- add ebp,DWORD[48+rsp]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- vpxor xmm3,xmm3,xmm4
- add ebp,esi
- xor edi,ecx
- vpaddd xmm9,xmm11,xmm2
- shrd ebx,ebx,7
- add ebp,eax
- vpxor xmm3,xmm3,xmm8
- add edx,DWORD[52+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- vpsrld xmm8,xmm3,30
- vmovdqa XMMWORD[32+rsp],xmm9
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- vpslld xmm3,xmm3,2
- add ecx,DWORD[56+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vpor xmm3,xmm3,xmm8
- add ebx,DWORD[60+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[rsp]
- vpaddd xmm9,xmm11,xmm3
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- vmovdqa XMMWORD[48+rsp],xmm9
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[4+rsp]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[8+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[12+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- cmp r9,r10
- je NEAR $L$done_avx
- vmovdqa xmm6,XMMWORD[64+r14]
- vmovdqa xmm11,XMMWORD[((-64))+r14]
- vmovdqu xmm0,XMMWORD[r9]
- vmovdqu xmm1,XMMWORD[16+r9]
- vmovdqu xmm2,XMMWORD[32+r9]
- vmovdqu xmm3,XMMWORD[48+r9]
- vpshufb xmm0,xmm0,xmm6
- add r9,64
- add ebx,DWORD[16+rsp]
- xor esi,ebp
- vpshufb xmm1,xmm1,xmm6
- mov edi,ecx
- shld ecx,ecx,5
- vpaddd xmm4,xmm0,xmm11
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- vmovdqa XMMWORD[rsp],xmm4
- add eax,DWORD[20+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[24+rsp]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[28+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[32+rsp]
- xor esi,eax
- vpshufb xmm2,xmm2,xmm6
- mov edi,edx
- shld edx,edx,5
- vpaddd xmm5,xmm1,xmm11
- add ecx,esi
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- vmovdqa XMMWORD[16+rsp],xmm5
- add ebx,DWORD[36+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[40+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[44+rsp]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[48+rsp]
- xor esi,ebx
- vpshufb xmm3,xmm3,xmm6
- mov edi,ebp
- shld ebp,ebp,5
- vpaddd xmm6,xmm2,xmm11
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- vmovdqa XMMWORD[32+rsp],xmm6
- add ecx,DWORD[52+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[56+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[60+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- shrd ecx,ecx,7
- add eax,ebx
- add eax,DWORD[r8]
- add esi,DWORD[4+r8]
- add ecx,DWORD[8+r8]
- add edx,DWORD[12+r8]
- mov DWORD[r8],eax
- add ebp,DWORD[16+r8]
- mov DWORD[4+r8],esi
- mov ebx,esi
- mov DWORD[8+r8],ecx
- mov edi,ecx
- mov DWORD[12+r8],edx
- xor edi,edx
- mov DWORD[16+r8],ebp
- and esi,edi
- jmp NEAR $L$oop_avx
-
-ALIGN 16
-$L$done_avx:
- add ebx,DWORD[16+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[20+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- xor esi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[24+rsp]
- xor esi,ecx
- mov edi,eax
- shld eax,eax,5
- add ebp,esi
- xor edi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[28+rsp]
- xor edi,ebx
- mov esi,ebp
- shld ebp,ebp,5
- add edx,edi
- xor esi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[32+rsp]
- xor esi,eax
- mov edi,edx
- shld edx,edx,5
- add ecx,esi
- xor edi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[36+rsp]
- xor edi,ebp
- mov esi,ecx
- shld ecx,ecx,5
- add ebx,edi
- xor esi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[40+rsp]
- xor esi,edx
- mov edi,ebx
- shld ebx,ebx,5
- add eax,esi
- xor edi,edx
- shrd ecx,ecx,7
- add eax,ebx
- add ebp,DWORD[44+rsp]
- xor edi,ecx
- mov esi,eax
- shld eax,eax,5
- add ebp,edi
- xor esi,ecx
- shrd ebx,ebx,7
- add ebp,eax
- add edx,DWORD[48+rsp]
- xor esi,ebx
- mov edi,ebp
- shld ebp,ebp,5
- add edx,esi
- xor edi,ebx
- shrd eax,eax,7
- add edx,ebp
- add ecx,DWORD[52+rsp]
- xor edi,eax
- mov esi,edx
- shld edx,edx,5
- add ecx,edi
- xor esi,eax
- shrd ebp,ebp,7
- add ecx,edx
- add ebx,DWORD[56+rsp]
- xor esi,ebp
- mov edi,ecx
- shld ecx,ecx,5
- add ebx,esi
- xor edi,ebp
- shrd edx,edx,7
- add ebx,ecx
- add eax,DWORD[60+rsp]
- xor edi,edx
- mov esi,ebx
- shld ebx,ebx,5
- add eax,edi
- shrd ecx,ecx,7
- add eax,ebx
- vzeroupper
-
- add eax,DWORD[r8]
- add esi,DWORD[4+r8]
- add ecx,DWORD[8+r8]
- mov DWORD[r8],eax
- add edx,DWORD[12+r8]
- mov DWORD[4+r8],esi
- add ebp,DWORD[16+r8]
- mov DWORD[8+r8],ecx
- mov DWORD[12+r8],edx
- mov DWORD[16+r8],ebp
- movaps xmm6,XMMWORD[((-40-96))+r11]
- movaps xmm7,XMMWORD[((-40-80))+r11]
- movaps xmm8,XMMWORD[((-40-64))+r11]
- movaps xmm9,XMMWORD[((-40-48))+r11]
- movaps xmm10,XMMWORD[((-40-32))+r11]
- movaps xmm11,XMMWORD[((-40-16))+r11]
- mov r14,QWORD[((-40))+r11]
-
- mov r13,QWORD[((-32))+r11]
-
- mov r12,QWORD[((-24))+r11]
-
- mov rbp,QWORD[((-16))+r11]
-
- mov rbx,QWORD[((-8))+r11]
-
- lea rsp,[r11]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha1_block_data_order_avx:
-
-ALIGN 16
-sha1_block_data_order_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha1_block_data_order_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-_avx2_shortcut:
-
- mov r11,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- vzeroupper
- lea rsp,[((-96))+rsp]
- vmovaps XMMWORD[(-40-96)+r11],xmm6
- vmovaps XMMWORD[(-40-80)+r11],xmm7
- vmovaps XMMWORD[(-40-64)+r11],xmm8
- vmovaps XMMWORD[(-40-48)+r11],xmm9
- vmovaps XMMWORD[(-40-32)+r11],xmm10
- vmovaps XMMWORD[(-40-16)+r11],xmm11
-$L$prologue_avx2:
- mov r8,rdi
- mov r9,rsi
- mov r10,rdx
-
- lea rsp,[((-640))+rsp]
- shl r10,6
- lea r13,[64+r9]
- and rsp,-128
- add r10,r9
- lea r14,[((K_XX_XX+64))]
-
- mov eax,DWORD[r8]
- cmp r13,r10
- cmovae r13,r9
- mov ebp,DWORD[4+r8]
- mov ecx,DWORD[8+r8]
- mov edx,DWORD[12+r8]
- mov esi,DWORD[16+r8]
- vmovdqu ymm6,YMMWORD[64+r14]
-
- vmovdqu xmm0,XMMWORD[r9]
- vmovdqu xmm1,XMMWORD[16+r9]
- vmovdqu xmm2,XMMWORD[32+r9]
- vmovdqu xmm3,XMMWORD[48+r9]
- lea r9,[64+r9]
- vinserti128 ymm0,ymm0,XMMWORD[r13],1
- vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
- vpshufb ymm0,ymm0,ymm6
- vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
- vpshufb ymm1,ymm1,ymm6
- vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
- vpshufb ymm2,ymm2,ymm6
- vmovdqu ymm11,YMMWORD[((-64))+r14]
- vpshufb ymm3,ymm3,ymm6
-
- vpaddd ymm4,ymm0,ymm11
- vpaddd ymm5,ymm1,ymm11
- vmovdqu YMMWORD[rsp],ymm4
- vpaddd ymm6,ymm2,ymm11
- vmovdqu YMMWORD[32+rsp],ymm5
- vpaddd ymm7,ymm3,ymm11
- vmovdqu YMMWORD[64+rsp],ymm6
- vmovdqu YMMWORD[96+rsp],ymm7
- vpalignr ymm4,ymm1,ymm0,8
- vpsrldq ymm8,ymm3,4
- vpxor ymm4,ymm4,ymm0
- vpxor ymm8,ymm8,ymm2
- vpxor ymm4,ymm4,ymm8
- vpsrld ymm8,ymm4,31
- vpslldq ymm10,ymm4,12
- vpaddd ymm4,ymm4,ymm4
- vpsrld ymm9,ymm10,30
- vpor ymm4,ymm4,ymm8
- vpslld ymm10,ymm10,2
- vpxor ymm4,ymm4,ymm9
- vpxor ymm4,ymm4,ymm10
- vpaddd ymm9,ymm4,ymm11
- vmovdqu YMMWORD[128+rsp],ymm9
- vpalignr ymm5,ymm2,ymm1,8
- vpsrldq ymm8,ymm4,4
- vpxor ymm5,ymm5,ymm1
- vpxor ymm8,ymm8,ymm3
- vpxor ymm5,ymm5,ymm8
- vpsrld ymm8,ymm5,31
- vmovdqu ymm11,YMMWORD[((-32))+r14]
- vpslldq ymm10,ymm5,12
- vpaddd ymm5,ymm5,ymm5
- vpsrld ymm9,ymm10,30
- vpor ymm5,ymm5,ymm8
- vpslld ymm10,ymm10,2
- vpxor ymm5,ymm5,ymm9
- vpxor ymm5,ymm5,ymm10
- vpaddd ymm9,ymm5,ymm11
- vmovdqu YMMWORD[160+rsp],ymm9
- vpalignr ymm6,ymm3,ymm2,8
- vpsrldq ymm8,ymm5,4
- vpxor ymm6,ymm6,ymm2
- vpxor ymm8,ymm8,ymm4
- vpxor ymm6,ymm6,ymm8
- vpsrld ymm8,ymm6,31
- vpslldq ymm10,ymm6,12
- vpaddd ymm6,ymm6,ymm6
- vpsrld ymm9,ymm10,30
- vpor ymm6,ymm6,ymm8
- vpslld ymm10,ymm10,2
- vpxor ymm6,ymm6,ymm9
- vpxor ymm6,ymm6,ymm10
- vpaddd ymm9,ymm6,ymm11
- vmovdqu YMMWORD[192+rsp],ymm9
- vpalignr ymm7,ymm4,ymm3,8
- vpsrldq ymm8,ymm6,4
- vpxor ymm7,ymm7,ymm3
- vpxor ymm8,ymm8,ymm5
- vpxor ymm7,ymm7,ymm8
- vpsrld ymm8,ymm7,31
- vpslldq ymm10,ymm7,12
- vpaddd ymm7,ymm7,ymm7
- vpsrld ymm9,ymm10,30
- vpor ymm7,ymm7,ymm8
- vpslld ymm10,ymm10,2
- vpxor ymm7,ymm7,ymm9
- vpxor ymm7,ymm7,ymm10
- vpaddd ymm9,ymm7,ymm11
- vmovdqu YMMWORD[224+rsp],ymm9
- lea r13,[128+rsp]
- jmp NEAR $L$oop_avx2
-ALIGN 32
-$L$oop_avx2:
- rorx ebx,ebp,2
- andn edi,ebp,edx
- and ebp,ecx
- xor ebp,edi
- jmp NEAR $L$align32_1
-ALIGN 32
-$L$align32_1:
- vpalignr ymm8,ymm7,ymm6,8
- vpxor ymm0,ymm0,ymm4
- add esi,DWORD[((-128))+r13]
- andn edi,eax,ecx
- vpxor ymm0,ymm0,ymm1
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- vpxor ymm0,ymm0,ymm8
- and eax,ebx
- add esi,r12d
- xor eax,edi
- vpsrld ymm8,ymm0,30
- vpslld ymm0,ymm0,2
- add edx,DWORD[((-124))+r13]
- andn edi,esi,ebx
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- vpor ymm0,ymm0,ymm8
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-120))+r13]
- andn edi,edx,ebp
- vpaddd ymm9,ymm0,ymm11
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- vmovdqu YMMWORD[256+rsp],ymm9
- add ecx,r12d
- xor edx,edi
- add ebx,DWORD[((-116))+r13]
- andn edi,ecx,eax
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- add ebp,DWORD[((-96))+r13]
- andn edi,ebx,esi
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- and ebx,edx
- add ebp,r12d
- xor ebx,edi
- vpalignr ymm8,ymm0,ymm7,8
- vpxor ymm1,ymm1,ymm5
- add eax,DWORD[((-92))+r13]
- andn edi,ebp,edx
- vpxor ymm1,ymm1,ymm2
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- vpxor ymm1,ymm1,ymm8
- and ebp,ecx
- add eax,r12d
- xor ebp,edi
- vpsrld ymm8,ymm1,30
- vpslld ymm1,ymm1,2
- add esi,DWORD[((-88))+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- vpor ymm1,ymm1,ymm8
- add esi,r12d
- xor eax,edi
- add edx,DWORD[((-84))+r13]
- andn edi,esi,ebx
- vpaddd ymm9,ymm1,ymm11
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- vmovdqu YMMWORD[288+rsp],ymm9
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-64))+r13]
- andn edi,edx,ebp
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- add ecx,r12d
- xor edx,edi
- add ebx,DWORD[((-60))+r13]
- andn edi,ecx,eax
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- vpalignr ymm8,ymm1,ymm0,8
- vpxor ymm2,ymm2,ymm6
- add ebp,DWORD[((-56))+r13]
- andn edi,ebx,esi
- vpxor ymm2,ymm2,ymm3
- vmovdqu ymm11,YMMWORD[r14]
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- vpxor ymm2,ymm2,ymm8
- and ebx,edx
- add ebp,r12d
- xor ebx,edi
- vpsrld ymm8,ymm2,30
- vpslld ymm2,ymm2,2
- add eax,DWORD[((-52))+r13]
- andn edi,ebp,edx
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- and ebp,ecx
- vpor ymm2,ymm2,ymm8
- add eax,r12d
- xor ebp,edi
- add esi,DWORD[((-32))+r13]
- andn edi,eax,ecx
- vpaddd ymm9,ymm2,ymm11
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- vmovdqu YMMWORD[320+rsp],ymm9
- add esi,r12d
- xor eax,edi
- add edx,DWORD[((-28))+r13]
- andn edi,esi,ebx
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-24))+r13]
- andn edi,edx,ebp
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- add ecx,r12d
- xor edx,edi
- vpalignr ymm8,ymm2,ymm1,8
- vpxor ymm3,ymm3,ymm7
- add ebx,DWORD[((-20))+r13]
- andn edi,ecx,eax
- vpxor ymm3,ymm3,ymm4
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- vpxor ymm3,ymm3,ymm8
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- vpsrld ymm8,ymm3,30
- vpslld ymm3,ymm3,2
- add ebp,DWORD[r13]
- andn edi,ebx,esi
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- and ebx,edx
- vpor ymm3,ymm3,ymm8
- add ebp,r12d
- xor ebx,edi
- add eax,DWORD[4+r13]
- andn edi,ebp,edx
- vpaddd ymm9,ymm3,ymm11
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- and ebp,ecx
- vmovdqu YMMWORD[352+rsp],ymm9
- add eax,r12d
- xor ebp,edi
- add esi,DWORD[8+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- add esi,r12d
- xor eax,edi
- add edx,DWORD[12+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- vpalignr ymm8,ymm3,ymm2,8
- vpxor ymm4,ymm4,ymm0
- add ecx,DWORD[32+r13]
- lea ecx,[rsi*1+rcx]
- vpxor ymm4,ymm4,ymm5
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- vpxor ymm4,ymm4,ymm8
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[36+r13]
- vpsrld ymm8,ymm4,30
- vpslld ymm4,ymm4,2
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- vpor ymm4,ymm4,ymm8
- add ebp,DWORD[40+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- vpaddd ymm9,ymm4,ymm11
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[44+r13]
- vmovdqu YMMWORD[384+rsp],ymm9
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[64+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- vpalignr ymm8,ymm4,ymm3,8
- vpxor ymm5,ymm5,ymm1
- add edx,DWORD[68+r13]
- lea edx,[rax*1+rdx]
- vpxor ymm5,ymm5,ymm6
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- vpxor ymm5,ymm5,ymm8
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[72+r13]
- vpsrld ymm8,ymm5,30
- vpslld ymm5,ymm5,2
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- vpor ymm5,ymm5,ymm8
- add ebx,DWORD[76+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- vpaddd ymm9,ymm5,ymm11
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[96+r13]
- vmovdqu YMMWORD[416+rsp],ymm9
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[100+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- vpalignr ymm8,ymm5,ymm4,8
- vpxor ymm6,ymm6,ymm2
- add esi,DWORD[104+r13]
- lea esi,[rbp*1+rsi]
- vpxor ymm6,ymm6,ymm7
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- vpxor ymm6,ymm6,ymm8
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[108+r13]
- lea r13,[256+r13]
- vpsrld ymm8,ymm6,30
- vpslld ymm6,ymm6,2
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- vpor ymm6,ymm6,ymm8
- add ecx,DWORD[((-128))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- vpaddd ymm9,ymm6,ymm11
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-124))+r13]
- vmovdqu YMMWORD[448+rsp],ymm9
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-120))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- vpalignr ymm8,ymm6,ymm5,8
- vpxor ymm7,ymm7,ymm3
- add eax,DWORD[((-116))+r13]
- lea eax,[rbx*1+rax]
- vpxor ymm7,ymm7,ymm0
- vmovdqu ymm11,YMMWORD[32+r14]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- vpxor ymm7,ymm7,ymm8
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-96))+r13]
- vpsrld ymm8,ymm7,30
- vpslld ymm7,ymm7,2
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- vpor ymm7,ymm7,ymm8
- add edx,DWORD[((-92))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- vpaddd ymm9,ymm7,ymm11
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[((-88))+r13]
- vmovdqu YMMWORD[480+rsp],ymm9
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-84))+r13]
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- and ecx,edi
- jmp NEAR $L$align32_2
-ALIGN 32
-$L$align32_2:
- vpalignr ymm8,ymm7,ymm6,8
- vpxor ymm0,ymm0,ymm4
- add ebp,DWORD[((-64))+r13]
- xor ecx,esi
- vpxor ymm0,ymm0,ymm1
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- vpxor ymm0,ymm0,ymm8
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- vpsrld ymm8,ymm0,30
- vpslld ymm0,ymm0,2
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[((-60))+r13]
- xor ebx,edx
- mov edi,ecx
- xor edi,edx
- vpor ymm0,ymm0,ymm8
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- vpaddd ymm9,ymm0,ymm11
- add eax,r12d
- and ebp,edi
- add esi,DWORD[((-56))+r13]
- xor ebp,ecx
- vmovdqu YMMWORD[512+rsp],ymm9
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- add edx,DWORD[((-52))+r13]
- xor eax,ebx
- mov edi,ebp
- xor edi,ebx
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- and esi,edi
- add ecx,DWORD[((-32))+r13]
- xor esi,ebp
- mov edi,eax
- xor edi,ebp
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- and edx,edi
- vpalignr ymm8,ymm0,ymm7,8
- vpxor ymm1,ymm1,ymm5
- add ebx,DWORD[((-28))+r13]
- xor edx,eax
- vpxor ymm1,ymm1,ymm2
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- vpxor ymm1,ymm1,ymm8
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- vpsrld ymm8,ymm1,30
- vpslld ymm1,ymm1,2
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[((-24))+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- vpor ymm1,ymm1,ymm8
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- vpaddd ymm9,ymm1,ymm11
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[((-20))+r13]
- xor ebx,edx
- vmovdqu YMMWORD[544+rsp],ymm9
- mov edi,ecx
- xor edi,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- and ebp,edi
- add esi,DWORD[r13]
- xor ebp,ecx
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- add edx,DWORD[4+r13]
- xor eax,ebx
- mov edi,ebp
- xor edi,ebx
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- and esi,edi
- vpalignr ymm8,ymm1,ymm0,8
- vpxor ymm2,ymm2,ymm6
- add ecx,DWORD[8+r13]
- xor esi,ebp
- vpxor ymm2,ymm2,ymm3
- mov edi,eax
- xor edi,ebp
- lea ecx,[rsi*1+rcx]
- vpxor ymm2,ymm2,ymm8
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- vpsrld ymm8,ymm2,30
- vpslld ymm2,ymm2,2
- add ecx,r12d
- and edx,edi
- add ebx,DWORD[12+r13]
- xor edx,eax
- mov edi,esi
- xor edi,eax
- vpor ymm2,ymm2,ymm8
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- vpaddd ymm9,ymm2,ymm11
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[32+r13]
- xor ecx,esi
- vmovdqu YMMWORD[576+rsp],ymm9
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[36+r13]
- xor ebx,edx
- mov edi,ecx
- xor edi,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- and ebp,edi
- add esi,DWORD[40+r13]
- xor ebp,ecx
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- vpalignr ymm8,ymm2,ymm1,8
- vpxor ymm3,ymm3,ymm7
- add edx,DWORD[44+r13]
- xor eax,ebx
- vpxor ymm3,ymm3,ymm4
- mov edi,ebp
- xor edi,ebx
- lea edx,[rax*1+rdx]
- vpxor ymm3,ymm3,ymm8
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- vpsrld ymm8,ymm3,30
- vpslld ymm3,ymm3,2
- add edx,r12d
- and esi,edi
- add ecx,DWORD[64+r13]
- xor esi,ebp
- mov edi,eax
- xor edi,ebp
- vpor ymm3,ymm3,ymm8
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- vpaddd ymm9,ymm3,ymm11
- add ecx,r12d
- and edx,edi
- add ebx,DWORD[68+r13]
- xor edx,eax
- vmovdqu YMMWORD[608+rsp],ymm9
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[72+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[76+r13]
- xor ebx,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[96+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[100+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[104+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[108+r13]
- lea r13,[256+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-128))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[((-124))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-120))+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[((-116))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[((-96))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-92))+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-88))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[((-84))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-64))+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[((-60))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[((-56))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-52))+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-32))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[((-28))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-24))+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[((-20))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- add edx,r12d
- lea r13,[128+r9]
- lea rdi,[128+r9]
- cmp r13,r10
- cmovae r13,r9
-
-
- add edx,DWORD[r8]
- add esi,DWORD[4+r8]
- add ebp,DWORD[8+r8]
- mov DWORD[r8],edx
- add ebx,DWORD[12+r8]
- mov DWORD[4+r8],esi
- mov eax,edx
- add ecx,DWORD[16+r8]
- mov r12d,ebp
- mov DWORD[8+r8],ebp
- mov edx,ebx
-
- mov DWORD[12+r8],ebx
- mov ebp,esi
- mov DWORD[16+r8],ecx
-
- mov esi,ecx
- mov ecx,r12d
-
-
- cmp r9,r10
- je NEAR $L$done_avx2
- vmovdqu ymm6,YMMWORD[64+r14]
- cmp rdi,r10
- ja NEAR $L$ast_avx2
-
- vmovdqu xmm0,XMMWORD[((-64))+rdi]
- vmovdqu xmm1,XMMWORD[((-48))+rdi]
- vmovdqu xmm2,XMMWORD[((-32))+rdi]
- vmovdqu xmm3,XMMWORD[((-16))+rdi]
- vinserti128 ymm0,ymm0,XMMWORD[r13],1
- vinserti128 ymm1,ymm1,XMMWORD[16+r13],1
- vinserti128 ymm2,ymm2,XMMWORD[32+r13],1
- vinserti128 ymm3,ymm3,XMMWORD[48+r13],1
- jmp NEAR $L$ast_avx2
-
-ALIGN 32
-$L$ast_avx2:
- lea r13,[((128+16))+rsp]
- rorx ebx,ebp,2
- andn edi,ebp,edx
- and ebp,ecx
- xor ebp,edi
- sub r9,-128
- add esi,DWORD[((-128))+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- add esi,r12d
- xor eax,edi
- add edx,DWORD[((-124))+r13]
- andn edi,esi,ebx
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-120))+r13]
- andn edi,edx,ebp
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- add ecx,r12d
- xor edx,edi
- add ebx,DWORD[((-116))+r13]
- andn edi,ecx,eax
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- add ebp,DWORD[((-96))+r13]
- andn edi,ebx,esi
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- and ebx,edx
- add ebp,r12d
- xor ebx,edi
- add eax,DWORD[((-92))+r13]
- andn edi,ebp,edx
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- and ebp,ecx
- add eax,r12d
- xor ebp,edi
- add esi,DWORD[((-88))+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- add esi,r12d
- xor eax,edi
- add edx,DWORD[((-84))+r13]
- andn edi,esi,ebx
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-64))+r13]
- andn edi,edx,ebp
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- add ecx,r12d
- xor edx,edi
- add ebx,DWORD[((-60))+r13]
- andn edi,ecx,eax
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- add ebp,DWORD[((-56))+r13]
- andn edi,ebx,esi
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- and ebx,edx
- add ebp,r12d
- xor ebx,edi
- add eax,DWORD[((-52))+r13]
- andn edi,ebp,edx
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- and ebp,ecx
- add eax,r12d
- xor ebp,edi
- add esi,DWORD[((-32))+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- add esi,r12d
- xor eax,edi
- add edx,DWORD[((-28))+r13]
- andn edi,esi,ebx
- add edx,eax
- rorx r12d,esi,27
- rorx eax,esi,2
- and esi,ebp
- add edx,r12d
- xor esi,edi
- add ecx,DWORD[((-24))+r13]
- andn edi,edx,ebp
- add ecx,esi
- rorx r12d,edx,27
- rorx esi,edx,2
- and edx,eax
- add ecx,r12d
- xor edx,edi
- add ebx,DWORD[((-20))+r13]
- andn edi,ecx,eax
- add ebx,edx
- rorx r12d,ecx,27
- rorx edx,ecx,2
- and ecx,esi
- add ebx,r12d
- xor ecx,edi
- add ebp,DWORD[r13]
- andn edi,ebx,esi
- add ebp,ecx
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- and ebx,edx
- add ebp,r12d
- xor ebx,edi
- add eax,DWORD[4+r13]
- andn edi,ebp,edx
- add eax,ebx
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- and ebp,ecx
- add eax,r12d
- xor ebp,edi
- add esi,DWORD[8+r13]
- andn edi,eax,ecx
- add esi,ebp
- rorx r12d,eax,27
- rorx ebp,eax,2
- and eax,ebx
- add esi,r12d
- xor eax,edi
- add edx,DWORD[12+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[32+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[36+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[40+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[44+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[64+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- vmovdqu ymm11,YMMWORD[((-64))+r14]
- vpshufb ymm0,ymm0,ymm6
- add edx,DWORD[68+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[72+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[76+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[96+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[100+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- vpshufb ymm1,ymm1,ymm6
- vpaddd ymm8,ymm0,ymm11
- add esi,DWORD[104+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[108+r13]
- lea r13,[256+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[((-128))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-124))+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-120))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- vmovdqu YMMWORD[rsp],ymm8
- vpshufb ymm2,ymm2,ymm6
- vpaddd ymm9,ymm1,ymm11
- add eax,DWORD[((-116))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-96))+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[((-92))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- add ecx,DWORD[((-88))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-84))+r13]
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- and ecx,edi
- vmovdqu YMMWORD[32+rsp],ymm9
- vpshufb ymm3,ymm3,ymm6
- vpaddd ymm6,ymm2,ymm11
- add ebp,DWORD[((-64))+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[((-60))+r13]
- xor ebx,edx
- mov edi,ecx
- xor edi,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- and ebp,edi
- add esi,DWORD[((-56))+r13]
- xor ebp,ecx
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- add edx,DWORD[((-52))+r13]
- xor eax,ebx
- mov edi,ebp
- xor edi,ebx
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- and esi,edi
- add ecx,DWORD[((-32))+r13]
- xor esi,ebp
- mov edi,eax
- xor edi,ebp
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- and edx,edi
- jmp NEAR $L$align32_3
-ALIGN 32
-$L$align32_3:
- vmovdqu YMMWORD[64+rsp],ymm6
- vpaddd ymm7,ymm3,ymm11
- add ebx,DWORD[((-28))+r13]
- xor edx,eax
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[((-24))+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[((-20))+r13]
- xor ebx,edx
- mov edi,ecx
- xor edi,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- and ebp,edi
- add esi,DWORD[r13]
- xor ebp,ecx
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- add edx,DWORD[4+r13]
- xor eax,ebx
- mov edi,ebp
- xor edi,ebx
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- and esi,edi
- vmovdqu YMMWORD[96+rsp],ymm7
- add ecx,DWORD[8+r13]
- xor esi,ebp
- mov edi,eax
- xor edi,ebp
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- and edx,edi
- add ebx,DWORD[12+r13]
- xor edx,eax
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[32+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[36+r13]
- xor ebx,edx
- mov edi,ecx
- xor edi,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- and ebp,edi
- add esi,DWORD[40+r13]
- xor ebp,ecx
- mov edi,ebx
- xor edi,ecx
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- and eax,edi
- vpalignr ymm4,ymm1,ymm0,8
- add edx,DWORD[44+r13]
- xor eax,ebx
- mov edi,ebp
- xor edi,ebx
- vpsrldq ymm8,ymm3,4
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- vpxor ymm4,ymm4,ymm0
- vpxor ymm8,ymm8,ymm2
- xor esi,ebp
- add edx,r12d
- vpxor ymm4,ymm4,ymm8
- and esi,edi
- add ecx,DWORD[64+r13]
- xor esi,ebp
- mov edi,eax
- vpsrld ymm8,ymm4,31
- xor edi,ebp
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- vpslldq ymm10,ymm4,12
- vpaddd ymm4,ymm4,ymm4
- rorx esi,edx,2
- xor edx,eax
- vpsrld ymm9,ymm10,30
- vpor ymm4,ymm4,ymm8
- add ecx,r12d
- and edx,edi
- vpslld ymm10,ymm10,2
- vpxor ymm4,ymm4,ymm9
- add ebx,DWORD[68+r13]
- xor edx,eax
- vpxor ymm4,ymm4,ymm10
- mov edi,esi
- xor edi,eax
- lea ebx,[rdx*1+rbx]
- vpaddd ymm9,ymm4,ymm11
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- vmovdqu YMMWORD[128+rsp],ymm9
- add ebx,r12d
- and ecx,edi
- add ebp,DWORD[72+r13]
- xor ecx,esi
- mov edi,edx
- xor edi,esi
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- and ebx,edi
- add eax,DWORD[76+r13]
- xor ebx,edx
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- vpalignr ymm5,ymm2,ymm1,8
- add esi,DWORD[96+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- vpsrldq ymm8,ymm4,4
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- vpxor ymm5,ymm5,ymm1
- vpxor ymm8,ymm8,ymm3
- add edx,DWORD[100+r13]
- lea edx,[rax*1+rdx]
- vpxor ymm5,ymm5,ymm8
- rorx r12d,esi,27
- rorx eax,esi,2
- xor esi,ebp
- add edx,r12d
- vpsrld ymm8,ymm5,31
- vmovdqu ymm11,YMMWORD[((-32))+r14]
- xor esi,ebx
- add ecx,DWORD[104+r13]
- lea ecx,[rsi*1+rcx]
- vpslldq ymm10,ymm5,12
- vpaddd ymm5,ymm5,ymm5
- rorx r12d,edx,27
- rorx esi,edx,2
- vpsrld ymm9,ymm10,30
- vpor ymm5,ymm5,ymm8
- xor edx,eax
- add ecx,r12d
- vpslld ymm10,ymm10,2
- vpxor ymm5,ymm5,ymm9
- xor edx,ebp
- add ebx,DWORD[108+r13]
- lea r13,[256+r13]
- vpxor ymm5,ymm5,ymm10
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- vpaddd ymm9,ymm5,ymm11
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- vmovdqu YMMWORD[160+rsp],ymm9
- add ebp,DWORD[((-128))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- vpalignr ymm6,ymm3,ymm2,8
- add eax,DWORD[((-124))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- vpsrldq ymm8,ymm5,4
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- vpxor ymm6,ymm6,ymm2
- vpxor ymm8,ymm8,ymm4
- add esi,DWORD[((-120))+r13]
- lea esi,[rbp*1+rsi]
- vpxor ymm6,ymm6,ymm8
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- vpsrld ymm8,ymm6,31
- xor eax,ecx
- add edx,DWORD[((-116))+r13]
- lea edx,[rax*1+rdx]
- vpslldq ymm10,ymm6,12
- vpaddd ymm6,ymm6,ymm6
- rorx r12d,esi,27
- rorx eax,esi,2
- vpsrld ymm9,ymm10,30
- vpor ymm6,ymm6,ymm8
- xor esi,ebp
- add edx,r12d
- vpslld ymm10,ymm10,2
- vpxor ymm6,ymm6,ymm9
- xor esi,ebx
- add ecx,DWORD[((-96))+r13]
- vpxor ymm6,ymm6,ymm10
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- vpaddd ymm9,ymm6,ymm11
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- vmovdqu YMMWORD[192+rsp],ymm9
- add ebx,DWORD[((-92))+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- vpalignr ymm7,ymm4,ymm3,8
- add ebp,DWORD[((-88))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- vpsrldq ymm8,ymm6,4
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- vpxor ymm7,ymm7,ymm3
- vpxor ymm8,ymm8,ymm5
- add eax,DWORD[((-84))+r13]
- lea eax,[rbx*1+rax]
- vpxor ymm7,ymm7,ymm8
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- vpsrld ymm8,ymm7,31
- xor ebp,edx
- add esi,DWORD[((-64))+r13]
- lea esi,[rbp*1+rsi]
- vpslldq ymm10,ymm7,12
- vpaddd ymm7,ymm7,ymm7
- rorx r12d,eax,27
- rorx ebp,eax,2
- vpsrld ymm9,ymm10,30
- vpor ymm7,ymm7,ymm8
- xor eax,ebx
- add esi,r12d
- vpslld ymm10,ymm10,2
- vpxor ymm7,ymm7,ymm9
- xor eax,ecx
- add edx,DWORD[((-60))+r13]
- vpxor ymm7,ymm7,ymm10
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- rorx eax,esi,2
- vpaddd ymm9,ymm7,ymm11
- xor esi,ebp
- add edx,r12d
- xor esi,ebx
- vmovdqu YMMWORD[224+rsp],ymm9
- add ecx,DWORD[((-56))+r13]
- lea ecx,[rsi*1+rcx]
- rorx r12d,edx,27
- rorx esi,edx,2
- xor edx,eax
- add ecx,r12d
- xor edx,ebp
- add ebx,DWORD[((-52))+r13]
- lea ebx,[rdx*1+rbx]
- rorx r12d,ecx,27
- rorx edx,ecx,2
- xor ecx,esi
- add ebx,r12d
- xor ecx,eax
- add ebp,DWORD[((-32))+r13]
- lea ebp,[rbp*1+rcx]
- rorx r12d,ebx,27
- rorx ecx,ebx,2
- xor ebx,edx
- add ebp,r12d
- xor ebx,esi
- add eax,DWORD[((-28))+r13]
- lea eax,[rbx*1+rax]
- rorx r12d,ebp,27
- rorx ebx,ebp,2
- xor ebp,ecx
- add eax,r12d
- xor ebp,edx
- add esi,DWORD[((-24))+r13]
- lea esi,[rbp*1+rsi]
- rorx r12d,eax,27
- rorx ebp,eax,2
- xor eax,ebx
- add esi,r12d
- xor eax,ecx
- add edx,DWORD[((-20))+r13]
- lea edx,[rax*1+rdx]
- rorx r12d,esi,27
- add edx,r12d
- lea r13,[128+rsp]
-
-
- add edx,DWORD[r8]
- add esi,DWORD[4+r8]
- add ebp,DWORD[8+r8]
- mov DWORD[r8],edx
- add ebx,DWORD[12+r8]
- mov DWORD[4+r8],esi
- mov eax,edx
- add ecx,DWORD[16+r8]
- mov r12d,ebp
- mov DWORD[8+r8],ebp
- mov edx,ebx
-
- mov DWORD[12+r8],ebx
- mov ebp,esi
- mov DWORD[16+r8],ecx
-
- mov esi,ecx
- mov ecx,r12d
-
-
- cmp r9,r10
- jbe NEAR $L$oop_avx2
-
-$L$done_avx2:
- vzeroupper
- movaps xmm6,XMMWORD[((-40-96))+r11]
- movaps xmm7,XMMWORD[((-40-80))+r11]
- movaps xmm8,XMMWORD[((-40-64))+r11]
- movaps xmm9,XMMWORD[((-40-48))+r11]
- movaps xmm10,XMMWORD[((-40-32))+r11]
- movaps xmm11,XMMWORD[((-40-16))+r11]
- mov r14,QWORD[((-40))+r11]
-
- mov r13,QWORD[((-32))+r11]
-
- mov r12,QWORD[((-24))+r11]
-
- mov rbp,QWORD[((-16))+r11]
-
- mov rbx,QWORD[((-8))+r11]
-
- lea rsp,[r11]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha1_block_data_order_avx2:
ALIGN 64
K_XX_XX:
DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -5738,12 +2860,6 @@ ALIGN 4
DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
- DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase
- DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
- DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
- DD $L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha1_block_data_order:
@@ -5756,11 +2872,3 @@ $L$SEH_info_sha1_block_data_order_ssse3:
DB 9,0,0,0
DD ssse3_handler wrt ..imagebase
DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
-$L$SEH_info_sha1_block_data_order_avx:
-DB 9,0,0,0
- DD ssse3_handler wrt ..imagebase
- DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_sha1_block_data_order_avx2:
-DB 9,0,0,0
- DD ssse3_handler wrt ..imagebase
- DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm
index 58c00d6b92c..c4fd2666ab0 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm
@@ -24,8 +24,6 @@ $L$SEH_begin_sha256_multi_block:
mov rcx,QWORD[((OPENSSL_ia32cap_P+4))]
bt rcx,61
jc NEAR _shaext_shortcut
- test ecx,268435456
- jnz NEAR _avx_shortcut
mov rax,rsp
push rbx
@@ -3206,4764 +3204,6 @@ $L$epilogue_shaext:
DB 0F3h,0C3h ;repret
$L$SEH_end_sha256_multi_block_shaext:
-
-ALIGN 32
-sha256_multi_block_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha256_multi_block_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx_shortcut:
- shr rcx,32
- cmp edx,2
- jb NEAR $L$avx
- test ecx,32
- jnz NEAR _avx2_shortcut
- jmp NEAR $L$avx
-ALIGN 32
-$L$avx:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[(-120)+rax],xmm10
- movaps XMMWORD[(-104)+rax],xmm11
- movaps XMMWORD[(-88)+rax],xmm12
- movaps XMMWORD[(-72)+rax],xmm13
- movaps XMMWORD[(-56)+rax],xmm14
- movaps XMMWORD[(-40)+rax],xmm15
- sub rsp,288
- and rsp,-256
- mov QWORD[272+rsp],rax
-
-$L$body_avx:
- lea rbp,[((K256+128))]
- lea rbx,[256+rsp]
- lea rdi,[128+rdi]
-
-$L$oop_grande_avx:
- mov DWORD[280+rsp],edx
- xor edx,edx
-
- mov r8,QWORD[rsi]
-
- mov ecx,DWORD[8+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[rbx],ecx
- cmovle r8,rbp
-
- mov r9,QWORD[16+rsi]
-
- mov ecx,DWORD[24+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[4+rbx],ecx
- cmovle r9,rbp
-
- mov r10,QWORD[32+rsi]
-
- mov ecx,DWORD[40+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[8+rbx],ecx
- cmovle r10,rbp
-
- mov r11,QWORD[48+rsi]
-
- mov ecx,DWORD[56+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[12+rbx],ecx
- cmovle r11,rbp
- test edx,edx
- jz NEAR $L$done_avx
-
- vmovdqu xmm8,XMMWORD[((0-128))+rdi]
- lea rax,[128+rsp]
- vmovdqu xmm9,XMMWORD[((32-128))+rdi]
- vmovdqu xmm10,XMMWORD[((64-128))+rdi]
- vmovdqu xmm11,XMMWORD[((96-128))+rdi]
- vmovdqu xmm12,XMMWORD[((128-128))+rdi]
- vmovdqu xmm13,XMMWORD[((160-128))+rdi]
- vmovdqu xmm14,XMMWORD[((192-128))+rdi]
- vmovdqu xmm15,XMMWORD[((224-128))+rdi]
- vmovdqu xmm6,XMMWORD[$L$pbswap]
- jmp NEAR $L$oop_avx
-
-ALIGN 32
-$L$oop_avx:
- vpxor xmm4,xmm10,xmm9
- vmovd xmm5,DWORD[r8]
- vmovd xmm0,DWORD[r9]
- vpinsrd xmm5,xmm5,DWORD[r10],1
- vpinsrd xmm0,xmm0,DWORD[r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm12,6
- vpslld xmm2,xmm12,26
- vmovdqu XMMWORD[(0-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm15
-
- vpsrld xmm1,xmm12,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm12,21
- vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm12,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,7
- vpandn xmm0,xmm12,xmm14
- vpand xmm3,xmm12,xmm13
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm15,xmm8,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm8,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm9,xmm8
-
- vpxor xmm15,xmm15,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm8,13
-
- vpslld xmm2,xmm8,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm15,xmm1
-
- vpsrld xmm1,xmm8,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,10
- vpxor xmm15,xmm9,xmm4
- vpaddd xmm11,xmm11,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm15,xmm15,xmm5
- vpaddd xmm15,xmm15,xmm7
- vmovd xmm5,DWORD[4+r8]
- vmovd xmm0,DWORD[4+r9]
- vpinsrd xmm5,xmm5,DWORD[4+r10],1
- vpinsrd xmm0,xmm0,DWORD[4+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm11,6
- vpslld xmm2,xmm11,26
- vmovdqu XMMWORD[(16-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm14
-
- vpsrld xmm1,xmm11,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm11,21
- vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm11,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,7
- vpandn xmm0,xmm11,xmm13
- vpand xmm4,xmm11,xmm12
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm14,xmm15,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm15,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm8,xmm15
-
- vpxor xmm14,xmm14,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm15,13
-
- vpslld xmm2,xmm15,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm14,xmm1
-
- vpsrld xmm1,xmm15,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,10
- vpxor xmm14,xmm8,xmm3
- vpaddd xmm10,xmm10,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm14,xmm14,xmm5
- vpaddd xmm14,xmm14,xmm7
- vmovd xmm5,DWORD[8+r8]
- vmovd xmm0,DWORD[8+r9]
- vpinsrd xmm5,xmm5,DWORD[8+r10],1
- vpinsrd xmm0,xmm0,DWORD[8+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm10,6
- vpslld xmm2,xmm10,26
- vmovdqu XMMWORD[(32-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm13
-
- vpsrld xmm1,xmm10,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm10,21
- vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm10,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,7
- vpandn xmm0,xmm10,xmm12
- vpand xmm3,xmm10,xmm11
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm13,xmm14,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm14,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm15,xmm14
-
- vpxor xmm13,xmm13,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm14,13
-
- vpslld xmm2,xmm14,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm13,xmm1
-
- vpsrld xmm1,xmm14,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,10
- vpxor xmm13,xmm15,xmm4
- vpaddd xmm9,xmm9,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm13,xmm13,xmm5
- vpaddd xmm13,xmm13,xmm7
- vmovd xmm5,DWORD[12+r8]
- vmovd xmm0,DWORD[12+r9]
- vpinsrd xmm5,xmm5,DWORD[12+r10],1
- vpinsrd xmm0,xmm0,DWORD[12+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm9,6
- vpslld xmm2,xmm9,26
- vmovdqu XMMWORD[(48-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm12
-
- vpsrld xmm1,xmm9,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm9,21
- vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm9,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,7
- vpandn xmm0,xmm9,xmm11
- vpand xmm4,xmm9,xmm10
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm12,xmm13,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm13,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm14,xmm13
-
- vpxor xmm12,xmm12,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm13,13
-
- vpslld xmm2,xmm13,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm12,xmm1
-
- vpsrld xmm1,xmm13,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,10
- vpxor xmm12,xmm14,xmm3
- vpaddd xmm8,xmm8,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm12,xmm12,xmm5
- vpaddd xmm12,xmm12,xmm7
- vmovd xmm5,DWORD[16+r8]
- vmovd xmm0,DWORD[16+r9]
- vpinsrd xmm5,xmm5,DWORD[16+r10],1
- vpinsrd xmm0,xmm0,DWORD[16+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm8,6
- vpslld xmm2,xmm8,26
- vmovdqu XMMWORD[(64-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm11
-
- vpsrld xmm1,xmm8,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm8,21
- vpaddd xmm5,xmm5,XMMWORD[rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm8,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,7
- vpandn xmm0,xmm8,xmm10
- vpand xmm3,xmm8,xmm9
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm11,xmm12,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm12,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm13,xmm12
-
- vpxor xmm11,xmm11,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm12,13
-
- vpslld xmm2,xmm12,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm11,xmm1
-
- vpsrld xmm1,xmm12,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,10
- vpxor xmm11,xmm13,xmm4
- vpaddd xmm15,xmm15,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm11,xmm11,xmm5
- vpaddd xmm11,xmm11,xmm7
- vmovd xmm5,DWORD[20+r8]
- vmovd xmm0,DWORD[20+r9]
- vpinsrd xmm5,xmm5,DWORD[20+r10],1
- vpinsrd xmm0,xmm0,DWORD[20+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm15,6
- vpslld xmm2,xmm15,26
- vmovdqu XMMWORD[(80-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm10
-
- vpsrld xmm1,xmm15,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm15,21
- vpaddd xmm5,xmm5,XMMWORD[32+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm15,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,7
- vpandn xmm0,xmm15,xmm9
- vpand xmm4,xmm15,xmm8
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm10,xmm11,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm11,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm12,xmm11
-
- vpxor xmm10,xmm10,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm11,13
-
- vpslld xmm2,xmm11,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm10,xmm1
-
- vpsrld xmm1,xmm11,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,10
- vpxor xmm10,xmm12,xmm3
- vpaddd xmm14,xmm14,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm10,xmm10,xmm5
- vpaddd xmm10,xmm10,xmm7
- vmovd xmm5,DWORD[24+r8]
- vmovd xmm0,DWORD[24+r9]
- vpinsrd xmm5,xmm5,DWORD[24+r10],1
- vpinsrd xmm0,xmm0,DWORD[24+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm14,6
- vpslld xmm2,xmm14,26
- vmovdqu XMMWORD[(96-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm9
-
- vpsrld xmm1,xmm14,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm14,21
- vpaddd xmm5,xmm5,XMMWORD[64+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm14,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,7
- vpandn xmm0,xmm14,xmm8
- vpand xmm3,xmm14,xmm15
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm9,xmm10,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm10,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm11,xmm10
-
- vpxor xmm9,xmm9,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm10,13
-
- vpslld xmm2,xmm10,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm9,xmm1
-
- vpsrld xmm1,xmm10,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,10
- vpxor xmm9,xmm11,xmm4
- vpaddd xmm13,xmm13,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm9,xmm9,xmm5
- vpaddd xmm9,xmm9,xmm7
- vmovd xmm5,DWORD[28+r8]
- vmovd xmm0,DWORD[28+r9]
- vpinsrd xmm5,xmm5,DWORD[28+r10],1
- vpinsrd xmm0,xmm0,DWORD[28+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm13,6
- vpslld xmm2,xmm13,26
- vmovdqu XMMWORD[(112-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm8
-
- vpsrld xmm1,xmm13,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm13,21
- vpaddd xmm5,xmm5,XMMWORD[96+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm13,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,7
- vpandn xmm0,xmm13,xmm15
- vpand xmm4,xmm13,xmm14
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm8,xmm9,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm9,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm10,xmm9
-
- vpxor xmm8,xmm8,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm9,13
-
- vpslld xmm2,xmm9,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm8,xmm1
-
- vpsrld xmm1,xmm9,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,10
- vpxor xmm8,xmm10,xmm3
- vpaddd xmm12,xmm12,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm8,xmm8,xmm5
- vpaddd xmm8,xmm8,xmm7
- add rbp,256
- vmovd xmm5,DWORD[32+r8]
- vmovd xmm0,DWORD[32+r9]
- vpinsrd xmm5,xmm5,DWORD[32+r10],1
- vpinsrd xmm0,xmm0,DWORD[32+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm12,6
- vpslld xmm2,xmm12,26
- vmovdqu XMMWORD[(128-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm15
-
- vpsrld xmm1,xmm12,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm12,21
- vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm12,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,7
- vpandn xmm0,xmm12,xmm14
- vpand xmm3,xmm12,xmm13
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm15,xmm8,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm8,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm9,xmm8
-
- vpxor xmm15,xmm15,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm8,13
-
- vpslld xmm2,xmm8,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm15,xmm1
-
- vpsrld xmm1,xmm8,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,10
- vpxor xmm15,xmm9,xmm4
- vpaddd xmm11,xmm11,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm15,xmm15,xmm5
- vpaddd xmm15,xmm15,xmm7
- vmovd xmm5,DWORD[36+r8]
- vmovd xmm0,DWORD[36+r9]
- vpinsrd xmm5,xmm5,DWORD[36+r10],1
- vpinsrd xmm0,xmm0,DWORD[36+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm11,6
- vpslld xmm2,xmm11,26
- vmovdqu XMMWORD[(144-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm14
-
- vpsrld xmm1,xmm11,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm11,21
- vpaddd xmm5,xmm5,XMMWORD[((-96))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm11,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,7
- vpandn xmm0,xmm11,xmm13
- vpand xmm4,xmm11,xmm12
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm14,xmm15,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm15,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm8,xmm15
-
- vpxor xmm14,xmm14,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm15,13
-
- vpslld xmm2,xmm15,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm14,xmm1
-
- vpsrld xmm1,xmm15,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,10
- vpxor xmm14,xmm8,xmm3
- vpaddd xmm10,xmm10,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm14,xmm14,xmm5
- vpaddd xmm14,xmm14,xmm7
- vmovd xmm5,DWORD[40+r8]
- vmovd xmm0,DWORD[40+r9]
- vpinsrd xmm5,xmm5,DWORD[40+r10],1
- vpinsrd xmm0,xmm0,DWORD[40+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm10,6
- vpslld xmm2,xmm10,26
- vmovdqu XMMWORD[(160-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm13
-
- vpsrld xmm1,xmm10,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm10,21
- vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm10,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,7
- vpandn xmm0,xmm10,xmm12
- vpand xmm3,xmm10,xmm11
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm13,xmm14,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm14,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm15,xmm14
-
- vpxor xmm13,xmm13,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm14,13
-
- vpslld xmm2,xmm14,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm13,xmm1
-
- vpsrld xmm1,xmm14,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,10
- vpxor xmm13,xmm15,xmm4
- vpaddd xmm9,xmm9,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm13,xmm13,xmm5
- vpaddd xmm13,xmm13,xmm7
- vmovd xmm5,DWORD[44+r8]
- vmovd xmm0,DWORD[44+r9]
- vpinsrd xmm5,xmm5,DWORD[44+r10],1
- vpinsrd xmm0,xmm0,DWORD[44+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm9,6
- vpslld xmm2,xmm9,26
- vmovdqu XMMWORD[(176-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm12
-
- vpsrld xmm1,xmm9,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm9,21
- vpaddd xmm5,xmm5,XMMWORD[((-32))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm9,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,7
- vpandn xmm0,xmm9,xmm11
- vpand xmm4,xmm9,xmm10
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm12,xmm13,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm13,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm14,xmm13
-
- vpxor xmm12,xmm12,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm13,13
-
- vpslld xmm2,xmm13,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm12,xmm1
-
- vpsrld xmm1,xmm13,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,10
- vpxor xmm12,xmm14,xmm3
- vpaddd xmm8,xmm8,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm12,xmm12,xmm5
- vpaddd xmm12,xmm12,xmm7
- vmovd xmm5,DWORD[48+r8]
- vmovd xmm0,DWORD[48+r9]
- vpinsrd xmm5,xmm5,DWORD[48+r10],1
- vpinsrd xmm0,xmm0,DWORD[48+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm8,6
- vpslld xmm2,xmm8,26
- vmovdqu XMMWORD[(192-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm11
-
- vpsrld xmm1,xmm8,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm8,21
- vpaddd xmm5,xmm5,XMMWORD[rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm8,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,7
- vpandn xmm0,xmm8,xmm10
- vpand xmm3,xmm8,xmm9
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm11,xmm12,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm12,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm13,xmm12
-
- vpxor xmm11,xmm11,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm12,13
-
- vpslld xmm2,xmm12,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm11,xmm1
-
- vpsrld xmm1,xmm12,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,10
- vpxor xmm11,xmm13,xmm4
- vpaddd xmm15,xmm15,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm11,xmm11,xmm5
- vpaddd xmm11,xmm11,xmm7
- vmovd xmm5,DWORD[52+r8]
- vmovd xmm0,DWORD[52+r9]
- vpinsrd xmm5,xmm5,DWORD[52+r10],1
- vpinsrd xmm0,xmm0,DWORD[52+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm15,6
- vpslld xmm2,xmm15,26
- vmovdqu XMMWORD[(208-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm10
-
- vpsrld xmm1,xmm15,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm15,21
- vpaddd xmm5,xmm5,XMMWORD[32+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm15,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,7
- vpandn xmm0,xmm15,xmm9
- vpand xmm4,xmm15,xmm8
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm10,xmm11,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm11,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm12,xmm11
-
- vpxor xmm10,xmm10,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm11,13
-
- vpslld xmm2,xmm11,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm10,xmm1
-
- vpsrld xmm1,xmm11,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,10
- vpxor xmm10,xmm12,xmm3
- vpaddd xmm14,xmm14,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm10,xmm10,xmm5
- vpaddd xmm10,xmm10,xmm7
- vmovd xmm5,DWORD[56+r8]
- vmovd xmm0,DWORD[56+r9]
- vpinsrd xmm5,xmm5,DWORD[56+r10],1
- vpinsrd xmm0,xmm0,DWORD[56+r11],1
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm14,6
- vpslld xmm2,xmm14,26
- vmovdqu XMMWORD[(224-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm9
-
- vpsrld xmm1,xmm14,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm14,21
- vpaddd xmm5,xmm5,XMMWORD[64+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm14,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,7
- vpandn xmm0,xmm14,xmm8
- vpand xmm3,xmm14,xmm15
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm9,xmm10,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm10,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm11,xmm10
-
- vpxor xmm9,xmm9,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm10,13
-
- vpslld xmm2,xmm10,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm9,xmm1
-
- vpsrld xmm1,xmm10,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,10
- vpxor xmm9,xmm11,xmm4
- vpaddd xmm13,xmm13,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm9,xmm9,xmm5
- vpaddd xmm9,xmm9,xmm7
- vmovd xmm5,DWORD[60+r8]
- lea r8,[64+r8]
- vmovd xmm0,DWORD[60+r9]
- lea r9,[64+r9]
- vpinsrd xmm5,xmm5,DWORD[60+r10],1
- lea r10,[64+r10]
- vpinsrd xmm0,xmm0,DWORD[60+r11],1
- lea r11,[64+r11]
- vpunpckldq xmm5,xmm5,xmm0
- vpshufb xmm5,xmm5,xmm6
- vpsrld xmm7,xmm13,6
- vpslld xmm2,xmm13,26
- vmovdqu XMMWORD[(240-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm8
-
- vpsrld xmm1,xmm13,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm13,21
- vpaddd xmm5,xmm5,XMMWORD[96+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm13,25
- vpxor xmm7,xmm7,xmm2
- prefetcht0 [63+r8]
- vpslld xmm2,xmm13,7
- vpandn xmm0,xmm13,xmm15
- vpand xmm4,xmm13,xmm14
- prefetcht0 [63+r9]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm8,xmm9,2
- vpxor xmm7,xmm7,xmm2
- prefetcht0 [63+r10]
- vpslld xmm1,xmm9,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm10,xmm9
- prefetcht0 [63+r11]
- vpxor xmm8,xmm8,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm9,13
-
- vpslld xmm2,xmm9,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm8,xmm1
-
- vpsrld xmm1,xmm9,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,10
- vpxor xmm8,xmm10,xmm3
- vpaddd xmm12,xmm12,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm8,xmm8,xmm5
- vpaddd xmm8,xmm8,xmm7
- add rbp,256
- vmovdqu xmm5,XMMWORD[((0-128))+rax]
- mov ecx,3
- jmp NEAR $L$oop_16_xx_avx
-ALIGN 32
-$L$oop_16_xx_avx:
- vmovdqu xmm6,XMMWORD[((16-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((144-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((224-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm12,6
- vpslld xmm2,xmm12,26
- vmovdqu XMMWORD[(0-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm15
-
- vpsrld xmm1,xmm12,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm12,21
- vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm12,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,7
- vpandn xmm0,xmm12,xmm14
- vpand xmm3,xmm12,xmm13
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm15,xmm8,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm8,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm9,xmm8
-
- vpxor xmm15,xmm15,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm8,13
-
- vpslld xmm2,xmm8,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm15,xmm1
-
- vpsrld xmm1,xmm8,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,10
- vpxor xmm15,xmm9,xmm4
- vpaddd xmm11,xmm11,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm15,xmm15,xmm5
- vpaddd xmm15,xmm15,xmm7
- vmovdqu xmm5,XMMWORD[((32-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((160-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((240-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm11,6
- vpslld xmm2,xmm11,26
- vmovdqu XMMWORD[(16-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm14
-
- vpsrld xmm1,xmm11,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm11,21
- vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm11,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,7
- vpandn xmm0,xmm11,xmm13
- vpand xmm4,xmm11,xmm12
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm14,xmm15,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm15,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm8,xmm15
-
- vpxor xmm14,xmm14,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm15,13
-
- vpslld xmm2,xmm15,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm14,xmm1
-
- vpsrld xmm1,xmm15,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,10
- vpxor xmm14,xmm8,xmm3
- vpaddd xmm10,xmm10,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm14,xmm14,xmm6
- vpaddd xmm14,xmm14,xmm7
- vmovdqu xmm6,XMMWORD[((48-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((176-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((0-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm10,6
- vpslld xmm2,xmm10,26
- vmovdqu XMMWORD[(32-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm13
-
- vpsrld xmm1,xmm10,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm10,21
- vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm10,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,7
- vpandn xmm0,xmm10,xmm12
- vpand xmm3,xmm10,xmm11
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm13,xmm14,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm14,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm15,xmm14
-
- vpxor xmm13,xmm13,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm14,13
-
- vpslld xmm2,xmm14,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm13,xmm1
-
- vpsrld xmm1,xmm14,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,10
- vpxor xmm13,xmm15,xmm4
- vpaddd xmm9,xmm9,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm13,xmm13,xmm5
- vpaddd xmm13,xmm13,xmm7
- vmovdqu xmm5,XMMWORD[((64-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((192-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((16-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm9,6
- vpslld xmm2,xmm9,26
- vmovdqu XMMWORD[(48-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm12
-
- vpsrld xmm1,xmm9,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm9,21
- vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm9,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,7
- vpandn xmm0,xmm9,xmm11
- vpand xmm4,xmm9,xmm10
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm12,xmm13,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm13,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm14,xmm13
-
- vpxor xmm12,xmm12,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm13,13
-
- vpslld xmm2,xmm13,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm12,xmm1
-
- vpsrld xmm1,xmm13,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,10
- vpxor xmm12,xmm14,xmm3
- vpaddd xmm8,xmm8,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm12,xmm12,xmm6
- vpaddd xmm12,xmm12,xmm7
- vmovdqu xmm6,XMMWORD[((80-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((208-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((32-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm8,6
- vpslld xmm2,xmm8,26
- vmovdqu XMMWORD[(64-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm11
-
- vpsrld xmm1,xmm8,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm8,21
- vpaddd xmm5,xmm5,XMMWORD[rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm8,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,7
- vpandn xmm0,xmm8,xmm10
- vpand xmm3,xmm8,xmm9
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm11,xmm12,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm12,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm13,xmm12
-
- vpxor xmm11,xmm11,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm12,13
-
- vpslld xmm2,xmm12,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm11,xmm1
-
- vpsrld xmm1,xmm12,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,10
- vpxor xmm11,xmm13,xmm4
- vpaddd xmm15,xmm15,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm11,xmm11,xmm5
- vpaddd xmm11,xmm11,xmm7
- vmovdqu xmm5,XMMWORD[((96-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((224-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((48-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm15,6
- vpslld xmm2,xmm15,26
- vmovdqu XMMWORD[(80-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm10
-
- vpsrld xmm1,xmm15,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm15,21
- vpaddd xmm6,xmm6,XMMWORD[32+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm15,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,7
- vpandn xmm0,xmm15,xmm9
- vpand xmm4,xmm15,xmm8
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm10,xmm11,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm11,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm12,xmm11
-
- vpxor xmm10,xmm10,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm11,13
-
- vpslld xmm2,xmm11,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm10,xmm1
-
- vpsrld xmm1,xmm11,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,10
- vpxor xmm10,xmm12,xmm3
- vpaddd xmm14,xmm14,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm10,xmm10,xmm6
- vpaddd xmm10,xmm10,xmm7
- vmovdqu xmm6,XMMWORD[((112-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((240-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((64-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm14,6
- vpslld xmm2,xmm14,26
- vmovdqu XMMWORD[(96-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm9
-
- vpsrld xmm1,xmm14,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm14,21
- vpaddd xmm5,xmm5,XMMWORD[64+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm14,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,7
- vpandn xmm0,xmm14,xmm8
- vpand xmm3,xmm14,xmm15
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm9,xmm10,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm10,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm11,xmm10
-
- vpxor xmm9,xmm9,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm10,13
-
- vpslld xmm2,xmm10,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm9,xmm1
-
- vpsrld xmm1,xmm10,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,10
- vpxor xmm9,xmm11,xmm4
- vpaddd xmm13,xmm13,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm9,xmm9,xmm5
- vpaddd xmm9,xmm9,xmm7
- vmovdqu xmm5,XMMWORD[((128-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((0-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((80-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm13,6
- vpslld xmm2,xmm13,26
- vmovdqu XMMWORD[(112-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm8
-
- vpsrld xmm1,xmm13,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm13,21
- vpaddd xmm6,xmm6,XMMWORD[96+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm13,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,7
- vpandn xmm0,xmm13,xmm15
- vpand xmm4,xmm13,xmm14
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm8,xmm9,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm9,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm10,xmm9
-
- vpxor xmm8,xmm8,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm9,13
-
- vpslld xmm2,xmm9,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm8,xmm1
-
- vpsrld xmm1,xmm9,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,10
- vpxor xmm8,xmm10,xmm3
- vpaddd xmm12,xmm12,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm8,xmm8,xmm6
- vpaddd xmm8,xmm8,xmm7
- add rbp,256
- vmovdqu xmm6,XMMWORD[((144-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((16-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((96-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm12,6
- vpslld xmm2,xmm12,26
- vmovdqu XMMWORD[(128-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm15
-
- vpsrld xmm1,xmm12,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm12,21
- vpaddd xmm5,xmm5,XMMWORD[((-128))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm12,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,7
- vpandn xmm0,xmm12,xmm14
- vpand xmm3,xmm12,xmm13
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm15,xmm8,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm8,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm9,xmm8
-
- vpxor xmm15,xmm15,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm8,13
-
- vpslld xmm2,xmm8,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm15,xmm1
-
- vpsrld xmm1,xmm8,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,10
- vpxor xmm15,xmm9,xmm4
- vpaddd xmm11,xmm11,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm15,xmm15,xmm5
- vpaddd xmm15,xmm15,xmm7
- vmovdqu xmm5,XMMWORD[((160-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((32-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((112-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm11,6
- vpslld xmm2,xmm11,26
- vmovdqu XMMWORD[(144-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm14
-
- vpsrld xmm1,xmm11,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm11,21
- vpaddd xmm6,xmm6,XMMWORD[((-96))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm11,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,7
- vpandn xmm0,xmm11,xmm13
- vpand xmm4,xmm11,xmm12
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm14,xmm15,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm15,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm8,xmm15
-
- vpxor xmm14,xmm14,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm15,13
-
- vpslld xmm2,xmm15,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm14,xmm1
-
- vpsrld xmm1,xmm15,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,10
- vpxor xmm14,xmm8,xmm3
- vpaddd xmm10,xmm10,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm14,xmm14,xmm6
- vpaddd xmm14,xmm14,xmm7
- vmovdqu xmm6,XMMWORD[((176-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((48-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((128-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm10,6
- vpslld xmm2,xmm10,26
- vmovdqu XMMWORD[(160-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm13
-
- vpsrld xmm1,xmm10,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm10,21
- vpaddd xmm5,xmm5,XMMWORD[((-64))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm10,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,7
- vpandn xmm0,xmm10,xmm12
- vpand xmm3,xmm10,xmm11
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm13,xmm14,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm14,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm15,xmm14
-
- vpxor xmm13,xmm13,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm14,13
-
- vpslld xmm2,xmm14,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm13,xmm1
-
- vpsrld xmm1,xmm14,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,10
- vpxor xmm13,xmm15,xmm4
- vpaddd xmm9,xmm9,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm13,xmm13,xmm5
- vpaddd xmm13,xmm13,xmm7
- vmovdqu xmm5,XMMWORD[((192-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((64-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((144-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm9,6
- vpslld xmm2,xmm9,26
- vmovdqu XMMWORD[(176-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm12
-
- vpsrld xmm1,xmm9,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm9,21
- vpaddd xmm6,xmm6,XMMWORD[((-32))+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm9,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,7
- vpandn xmm0,xmm9,xmm11
- vpand xmm4,xmm9,xmm10
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm12,xmm13,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm13,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm14,xmm13
-
- vpxor xmm12,xmm12,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm13,13
-
- vpslld xmm2,xmm13,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm12,xmm1
-
- vpsrld xmm1,xmm13,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,10
- vpxor xmm12,xmm14,xmm3
- vpaddd xmm8,xmm8,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm12,xmm12,xmm6
- vpaddd xmm12,xmm12,xmm7
- vmovdqu xmm6,XMMWORD[((208-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((80-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((160-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm8,6
- vpslld xmm2,xmm8,26
- vmovdqu XMMWORD[(192-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm11
-
- vpsrld xmm1,xmm8,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm8,21
- vpaddd xmm5,xmm5,XMMWORD[rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm8,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm8,7
- vpandn xmm0,xmm8,xmm10
- vpand xmm3,xmm8,xmm9
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm11,xmm12,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm12,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm13,xmm12
-
- vpxor xmm11,xmm11,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm12,13
-
- vpslld xmm2,xmm12,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm11,xmm1
-
- vpsrld xmm1,xmm12,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm12,10
- vpxor xmm11,xmm13,xmm4
- vpaddd xmm15,xmm15,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm11,xmm11,xmm5
- vpaddd xmm11,xmm11,xmm7
- vmovdqu xmm5,XMMWORD[((224-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((96-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((176-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm15,6
- vpslld xmm2,xmm15,26
- vmovdqu XMMWORD[(208-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm10
-
- vpsrld xmm1,xmm15,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm15,21
- vpaddd xmm6,xmm6,XMMWORD[32+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm15,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm15,7
- vpandn xmm0,xmm15,xmm9
- vpand xmm4,xmm15,xmm8
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm10,xmm11,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm11,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm12,xmm11
-
- vpxor xmm10,xmm10,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm11,13
-
- vpslld xmm2,xmm11,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm10,xmm1
-
- vpsrld xmm1,xmm11,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm11,10
- vpxor xmm10,xmm12,xmm3
- vpaddd xmm14,xmm14,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm10,xmm10,xmm6
- vpaddd xmm10,xmm10,xmm7
- vmovdqu xmm6,XMMWORD[((240-128))+rax]
- vpaddd xmm5,xmm5,XMMWORD[((112-128))+rax]
-
- vpsrld xmm7,xmm6,3
- vpsrld xmm1,xmm6,7
- vpslld xmm2,xmm6,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm6,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm6,14
- vmovdqu xmm0,XMMWORD[((192-128))+rax]
- vpsrld xmm3,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm5,xmm5,xmm7
- vpxor xmm7,xmm3,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm5,xmm5,xmm7
- vpsrld xmm7,xmm14,6
- vpslld xmm2,xmm14,26
- vmovdqu XMMWORD[(224-128)+rax],xmm5
- vpaddd xmm5,xmm5,xmm9
-
- vpsrld xmm1,xmm14,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm14,21
- vpaddd xmm5,xmm5,XMMWORD[64+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm14,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm14,7
- vpandn xmm0,xmm14,xmm8
- vpand xmm3,xmm14,xmm15
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm9,xmm10,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm10,30
- vpxor xmm0,xmm0,xmm3
- vpxor xmm3,xmm11,xmm10
-
- vpxor xmm9,xmm9,xmm1
- vpaddd xmm5,xmm5,xmm7
-
- vpsrld xmm1,xmm10,13
-
- vpslld xmm2,xmm10,19
- vpaddd xmm5,xmm5,xmm0
- vpand xmm4,xmm4,xmm3
-
- vpxor xmm7,xmm9,xmm1
-
- vpsrld xmm1,xmm10,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm10,10
- vpxor xmm9,xmm11,xmm4
- vpaddd xmm13,xmm13,xmm5
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm9,xmm9,xmm5
- vpaddd xmm9,xmm9,xmm7
- vmovdqu xmm5,XMMWORD[((0-128))+rax]
- vpaddd xmm6,xmm6,XMMWORD[((128-128))+rax]
-
- vpsrld xmm7,xmm5,3
- vpsrld xmm1,xmm5,7
- vpslld xmm2,xmm5,25
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm5,18
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm5,14
- vmovdqu xmm0,XMMWORD[((208-128))+rax]
- vpsrld xmm4,xmm0,10
-
- vpxor xmm7,xmm7,xmm1
- vpsrld xmm1,xmm0,17
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,15
- vpaddd xmm6,xmm6,xmm7
- vpxor xmm7,xmm4,xmm1
- vpsrld xmm1,xmm0,19
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm0,13
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
- vpaddd xmm6,xmm6,xmm7
- vpsrld xmm7,xmm13,6
- vpslld xmm2,xmm13,26
- vmovdqu XMMWORD[(240-128)+rax],xmm6
- vpaddd xmm6,xmm6,xmm8
-
- vpsrld xmm1,xmm13,11
- vpxor xmm7,xmm7,xmm2
- vpslld xmm2,xmm13,21
- vpaddd xmm6,xmm6,XMMWORD[96+rbp]
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm1,xmm13,25
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm13,7
- vpandn xmm0,xmm13,xmm15
- vpand xmm4,xmm13,xmm14
-
- vpxor xmm7,xmm7,xmm1
-
- vpsrld xmm8,xmm9,2
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm1,xmm9,30
- vpxor xmm0,xmm0,xmm4
- vpxor xmm4,xmm10,xmm9
-
- vpxor xmm8,xmm8,xmm1
- vpaddd xmm6,xmm6,xmm7
-
- vpsrld xmm1,xmm9,13
-
- vpslld xmm2,xmm9,19
- vpaddd xmm6,xmm6,xmm0
- vpand xmm3,xmm3,xmm4
-
- vpxor xmm7,xmm8,xmm1
-
- vpsrld xmm1,xmm9,22
- vpxor xmm7,xmm7,xmm2
-
- vpslld xmm2,xmm9,10
- vpxor xmm8,xmm10,xmm3
- vpaddd xmm12,xmm12,xmm6
-
- vpxor xmm7,xmm7,xmm1
- vpxor xmm7,xmm7,xmm2
-
- vpaddd xmm8,xmm8,xmm6
- vpaddd xmm8,xmm8,xmm7
- add rbp,256
- dec ecx
- jnz NEAR $L$oop_16_xx_avx
-
- mov ecx,1
- lea rbp,[((K256+128))]
- cmp ecx,DWORD[rbx]
- cmovge r8,rbp
- cmp ecx,DWORD[4+rbx]
- cmovge r9,rbp
- cmp ecx,DWORD[8+rbx]
- cmovge r10,rbp
- cmp ecx,DWORD[12+rbx]
- cmovge r11,rbp
- vmovdqa xmm7,XMMWORD[rbx]
- vpxor xmm0,xmm0,xmm0
- vmovdqa xmm6,xmm7
- vpcmpgtd xmm6,xmm6,xmm0
- vpaddd xmm7,xmm7,xmm6
-
- vmovdqu xmm0,XMMWORD[((0-128))+rdi]
- vpand xmm8,xmm8,xmm6
- vmovdqu xmm1,XMMWORD[((32-128))+rdi]
- vpand xmm9,xmm9,xmm6
- vmovdqu xmm2,XMMWORD[((64-128))+rdi]
- vpand xmm10,xmm10,xmm6
- vmovdqu xmm5,XMMWORD[((96-128))+rdi]
- vpand xmm11,xmm11,xmm6
- vpaddd xmm8,xmm8,xmm0
- vmovdqu xmm0,XMMWORD[((128-128))+rdi]
- vpand xmm12,xmm12,xmm6
- vpaddd xmm9,xmm9,xmm1
- vmovdqu xmm1,XMMWORD[((160-128))+rdi]
- vpand xmm13,xmm13,xmm6
- vpaddd xmm10,xmm10,xmm2
- vmovdqu xmm2,XMMWORD[((192-128))+rdi]
- vpand xmm14,xmm14,xmm6
- vpaddd xmm11,xmm11,xmm5
- vmovdqu xmm5,XMMWORD[((224-128))+rdi]
- vpand xmm15,xmm15,xmm6
- vpaddd xmm12,xmm12,xmm0
- vpaddd xmm13,xmm13,xmm1
- vmovdqu XMMWORD[(0-128)+rdi],xmm8
- vpaddd xmm14,xmm14,xmm2
- vmovdqu XMMWORD[(32-128)+rdi],xmm9
- vpaddd xmm15,xmm15,xmm5
- vmovdqu XMMWORD[(64-128)+rdi],xmm10
- vmovdqu XMMWORD[(96-128)+rdi],xmm11
- vmovdqu XMMWORD[(128-128)+rdi],xmm12
- vmovdqu XMMWORD[(160-128)+rdi],xmm13
- vmovdqu XMMWORD[(192-128)+rdi],xmm14
- vmovdqu XMMWORD[(224-128)+rdi],xmm15
-
- vmovdqu XMMWORD[rbx],xmm7
- vmovdqu xmm6,XMMWORD[$L$pbswap]
- dec edx
- jnz NEAR $L$oop_avx
-
- mov edx,DWORD[280+rsp]
- lea rdi,[16+rdi]
- lea rsi,[64+rsi]
- dec edx
- jnz NEAR $L$oop_grande_avx
-
-$L$done_avx:
- mov rax,QWORD[272+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((-184))+rax]
- movaps xmm7,XMMWORD[((-168))+rax]
- movaps xmm8,XMMWORD[((-152))+rax]
- movaps xmm9,XMMWORD[((-136))+rax]
- movaps xmm10,XMMWORD[((-120))+rax]
- movaps xmm11,XMMWORD[((-104))+rax]
- movaps xmm12,XMMWORD[((-88))+rax]
- movaps xmm13,XMMWORD[((-72))+rax]
- movaps xmm14,XMMWORD[((-56))+rax]
- movaps xmm15,XMMWORD[((-40))+rax]
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha256_multi_block_avx:
-
-ALIGN 32
-sha256_multi_block_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha256_multi_block_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-_avx2_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-168))+rsp]
- movaps XMMWORD[rsp],xmm6
- movaps XMMWORD[16+rsp],xmm7
- movaps XMMWORD[32+rsp],xmm8
- movaps XMMWORD[48+rsp],xmm9
- movaps XMMWORD[64+rsp],xmm10
- movaps XMMWORD[80+rsp],xmm11
- movaps XMMWORD[(-120)+rax],xmm12
- movaps XMMWORD[(-104)+rax],xmm13
- movaps XMMWORD[(-88)+rax],xmm14
- movaps XMMWORD[(-72)+rax],xmm15
- sub rsp,576
- and rsp,-256
- mov QWORD[544+rsp],rax
-
-$L$body_avx2:
- lea rbp,[((K256+128))]
- lea rdi,[128+rdi]
-
-$L$oop_grande_avx2:
- mov DWORD[552+rsp],edx
- xor edx,edx
- lea rbx,[512+rsp]
-
- mov r12,QWORD[rsi]
-
- mov ecx,DWORD[8+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[rbx],ecx
- cmovle r12,rbp
-
- mov r13,QWORD[16+rsi]
-
- mov ecx,DWORD[24+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[4+rbx],ecx
- cmovle r13,rbp
-
- mov r14,QWORD[32+rsi]
-
- mov ecx,DWORD[40+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[8+rbx],ecx
- cmovle r14,rbp
-
- mov r15,QWORD[48+rsi]
-
- mov ecx,DWORD[56+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[12+rbx],ecx
- cmovle r15,rbp
-
- mov r8,QWORD[64+rsi]
-
- mov ecx,DWORD[72+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[16+rbx],ecx
- cmovle r8,rbp
-
- mov r9,QWORD[80+rsi]
-
- mov ecx,DWORD[88+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[20+rbx],ecx
- cmovle r9,rbp
-
- mov r10,QWORD[96+rsi]
-
- mov ecx,DWORD[104+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[24+rbx],ecx
- cmovle r10,rbp
-
- mov r11,QWORD[112+rsi]
-
- mov ecx,DWORD[120+rsi]
- cmp ecx,edx
- cmovg edx,ecx
- test ecx,ecx
- mov DWORD[28+rbx],ecx
- cmovle r11,rbp
- vmovdqu ymm8,YMMWORD[((0-128))+rdi]
- lea rax,[128+rsp]
- vmovdqu ymm9,YMMWORD[((32-128))+rdi]
- lea rbx,[((256+128))+rsp]
- vmovdqu ymm10,YMMWORD[((64-128))+rdi]
- vmovdqu ymm11,YMMWORD[((96-128))+rdi]
- vmovdqu ymm12,YMMWORD[((128-128))+rdi]
- vmovdqu ymm13,YMMWORD[((160-128))+rdi]
- vmovdqu ymm14,YMMWORD[((192-128))+rdi]
- vmovdqu ymm15,YMMWORD[((224-128))+rdi]
- vmovdqu ymm6,YMMWORD[$L$pbswap]
- jmp NEAR $L$oop_avx2
-
-ALIGN 32
-$L$oop_avx2:
- vpxor ymm4,ymm10,ymm9
- vmovd xmm5,DWORD[r12]
- vmovd xmm0,DWORD[r8]
- vmovd xmm1,DWORD[r13]
- vmovd xmm2,DWORD[r9]
- vpinsrd xmm5,xmm5,DWORD[r14],1
- vpinsrd xmm0,xmm0,DWORD[r10],1
- vpinsrd xmm1,xmm1,DWORD[r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm12,6
- vpslld ymm2,ymm12,26
- vmovdqu YMMWORD[(0-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm15
-
- vpsrld ymm1,ymm12,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm12,21
- vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm12,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,7
- vpandn ymm0,ymm12,ymm14
- vpand ymm3,ymm12,ymm13
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm15,ymm8,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm8,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm9,ymm8
-
- vpxor ymm15,ymm15,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm8,13
-
- vpslld ymm2,ymm8,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm15,ymm1
-
- vpsrld ymm1,ymm8,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,10
- vpxor ymm15,ymm9,ymm4
- vpaddd ymm11,ymm11,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm15,ymm15,ymm5
- vpaddd ymm15,ymm15,ymm7
- vmovd xmm5,DWORD[4+r12]
- vmovd xmm0,DWORD[4+r8]
- vmovd xmm1,DWORD[4+r13]
- vmovd xmm2,DWORD[4+r9]
- vpinsrd xmm5,xmm5,DWORD[4+r14],1
- vpinsrd xmm0,xmm0,DWORD[4+r10],1
- vpinsrd xmm1,xmm1,DWORD[4+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[4+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm11,6
- vpslld ymm2,ymm11,26
- vmovdqu YMMWORD[(32-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm14
-
- vpsrld ymm1,ymm11,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm11,21
- vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm11,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,7
- vpandn ymm0,ymm11,ymm13
- vpand ymm4,ymm11,ymm12
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm14,ymm15,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm15,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm8,ymm15
-
- vpxor ymm14,ymm14,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm15,13
-
- vpslld ymm2,ymm15,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm14,ymm1
-
- vpsrld ymm1,ymm15,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,10
- vpxor ymm14,ymm8,ymm3
- vpaddd ymm10,ymm10,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm14,ymm14,ymm5
- vpaddd ymm14,ymm14,ymm7
- vmovd xmm5,DWORD[8+r12]
- vmovd xmm0,DWORD[8+r8]
- vmovd xmm1,DWORD[8+r13]
- vmovd xmm2,DWORD[8+r9]
- vpinsrd xmm5,xmm5,DWORD[8+r14],1
- vpinsrd xmm0,xmm0,DWORD[8+r10],1
- vpinsrd xmm1,xmm1,DWORD[8+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[8+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm10,6
- vpslld ymm2,ymm10,26
- vmovdqu YMMWORD[(64-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm13
-
- vpsrld ymm1,ymm10,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm10,21
- vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm10,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,7
- vpandn ymm0,ymm10,ymm12
- vpand ymm3,ymm10,ymm11
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm13,ymm14,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm14,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm15,ymm14
-
- vpxor ymm13,ymm13,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm14,13
-
- vpslld ymm2,ymm14,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm13,ymm1
-
- vpsrld ymm1,ymm14,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,10
- vpxor ymm13,ymm15,ymm4
- vpaddd ymm9,ymm9,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm13,ymm13,ymm5
- vpaddd ymm13,ymm13,ymm7
- vmovd xmm5,DWORD[12+r12]
- vmovd xmm0,DWORD[12+r8]
- vmovd xmm1,DWORD[12+r13]
- vmovd xmm2,DWORD[12+r9]
- vpinsrd xmm5,xmm5,DWORD[12+r14],1
- vpinsrd xmm0,xmm0,DWORD[12+r10],1
- vpinsrd xmm1,xmm1,DWORD[12+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[12+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm9,6
- vpslld ymm2,ymm9,26
- vmovdqu YMMWORD[(96-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm12
-
- vpsrld ymm1,ymm9,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm9,21
- vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm9,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,7
- vpandn ymm0,ymm9,ymm11
- vpand ymm4,ymm9,ymm10
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm12,ymm13,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm13,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm14,ymm13
-
- vpxor ymm12,ymm12,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm13,13
-
- vpslld ymm2,ymm13,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm12,ymm1
-
- vpsrld ymm1,ymm13,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,10
- vpxor ymm12,ymm14,ymm3
- vpaddd ymm8,ymm8,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm12,ymm12,ymm5
- vpaddd ymm12,ymm12,ymm7
- vmovd xmm5,DWORD[16+r12]
- vmovd xmm0,DWORD[16+r8]
- vmovd xmm1,DWORD[16+r13]
- vmovd xmm2,DWORD[16+r9]
- vpinsrd xmm5,xmm5,DWORD[16+r14],1
- vpinsrd xmm0,xmm0,DWORD[16+r10],1
- vpinsrd xmm1,xmm1,DWORD[16+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[16+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm8,6
- vpslld ymm2,ymm8,26
- vmovdqu YMMWORD[(128-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm11
-
- vpsrld ymm1,ymm8,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm8,21
- vpaddd ymm5,ymm5,YMMWORD[rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm8,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,7
- vpandn ymm0,ymm8,ymm10
- vpand ymm3,ymm8,ymm9
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm11,ymm12,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm12,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm13,ymm12
-
- vpxor ymm11,ymm11,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm12,13
-
- vpslld ymm2,ymm12,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm11,ymm1
-
- vpsrld ymm1,ymm12,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,10
- vpxor ymm11,ymm13,ymm4
- vpaddd ymm15,ymm15,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm11,ymm11,ymm5
- vpaddd ymm11,ymm11,ymm7
- vmovd xmm5,DWORD[20+r12]
- vmovd xmm0,DWORD[20+r8]
- vmovd xmm1,DWORD[20+r13]
- vmovd xmm2,DWORD[20+r9]
- vpinsrd xmm5,xmm5,DWORD[20+r14],1
- vpinsrd xmm0,xmm0,DWORD[20+r10],1
- vpinsrd xmm1,xmm1,DWORD[20+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[20+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm15,6
- vpslld ymm2,ymm15,26
- vmovdqu YMMWORD[(160-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm10
-
- vpsrld ymm1,ymm15,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm15,21
- vpaddd ymm5,ymm5,YMMWORD[32+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm15,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,7
- vpandn ymm0,ymm15,ymm9
- vpand ymm4,ymm15,ymm8
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm10,ymm11,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm11,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm12,ymm11
-
- vpxor ymm10,ymm10,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm11,13
-
- vpslld ymm2,ymm11,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm10,ymm1
-
- vpsrld ymm1,ymm11,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,10
- vpxor ymm10,ymm12,ymm3
- vpaddd ymm14,ymm14,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm10,ymm10,ymm5
- vpaddd ymm10,ymm10,ymm7
- vmovd xmm5,DWORD[24+r12]
- vmovd xmm0,DWORD[24+r8]
- vmovd xmm1,DWORD[24+r13]
- vmovd xmm2,DWORD[24+r9]
- vpinsrd xmm5,xmm5,DWORD[24+r14],1
- vpinsrd xmm0,xmm0,DWORD[24+r10],1
- vpinsrd xmm1,xmm1,DWORD[24+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[24+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm14,6
- vpslld ymm2,ymm14,26
- vmovdqu YMMWORD[(192-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm9
-
- vpsrld ymm1,ymm14,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm14,21
- vpaddd ymm5,ymm5,YMMWORD[64+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm14,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,7
- vpandn ymm0,ymm14,ymm8
- vpand ymm3,ymm14,ymm15
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm9,ymm10,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm10,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm11,ymm10
-
- vpxor ymm9,ymm9,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm10,13
-
- vpslld ymm2,ymm10,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm9,ymm1
-
- vpsrld ymm1,ymm10,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,10
- vpxor ymm9,ymm11,ymm4
- vpaddd ymm13,ymm13,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm9,ymm9,ymm5
- vpaddd ymm9,ymm9,ymm7
- vmovd xmm5,DWORD[28+r12]
- vmovd xmm0,DWORD[28+r8]
- vmovd xmm1,DWORD[28+r13]
- vmovd xmm2,DWORD[28+r9]
- vpinsrd xmm5,xmm5,DWORD[28+r14],1
- vpinsrd xmm0,xmm0,DWORD[28+r10],1
- vpinsrd xmm1,xmm1,DWORD[28+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[28+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm13,6
- vpslld ymm2,ymm13,26
- vmovdqu YMMWORD[(224-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm8
-
- vpsrld ymm1,ymm13,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm13,21
- vpaddd ymm5,ymm5,YMMWORD[96+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm13,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,7
- vpandn ymm0,ymm13,ymm15
- vpand ymm4,ymm13,ymm14
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm8,ymm9,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm9,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm10,ymm9
-
- vpxor ymm8,ymm8,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm9,13
-
- vpslld ymm2,ymm9,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm8,ymm1
-
- vpsrld ymm1,ymm9,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,10
- vpxor ymm8,ymm10,ymm3
- vpaddd ymm12,ymm12,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm8,ymm8,ymm5
- vpaddd ymm8,ymm8,ymm7
- add rbp,256
- vmovd xmm5,DWORD[32+r12]
- vmovd xmm0,DWORD[32+r8]
- vmovd xmm1,DWORD[32+r13]
- vmovd xmm2,DWORD[32+r9]
- vpinsrd xmm5,xmm5,DWORD[32+r14],1
- vpinsrd xmm0,xmm0,DWORD[32+r10],1
- vpinsrd xmm1,xmm1,DWORD[32+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[32+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm12,6
- vpslld ymm2,ymm12,26
- vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm15
-
- vpsrld ymm1,ymm12,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm12,21
- vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm12,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,7
- vpandn ymm0,ymm12,ymm14
- vpand ymm3,ymm12,ymm13
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm15,ymm8,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm8,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm9,ymm8
-
- vpxor ymm15,ymm15,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm8,13
-
- vpslld ymm2,ymm8,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm15,ymm1
-
- vpsrld ymm1,ymm8,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,10
- vpxor ymm15,ymm9,ymm4
- vpaddd ymm11,ymm11,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm15,ymm15,ymm5
- vpaddd ymm15,ymm15,ymm7
- vmovd xmm5,DWORD[36+r12]
- vmovd xmm0,DWORD[36+r8]
- vmovd xmm1,DWORD[36+r13]
- vmovd xmm2,DWORD[36+r9]
- vpinsrd xmm5,xmm5,DWORD[36+r14],1
- vpinsrd xmm0,xmm0,DWORD[36+r10],1
- vpinsrd xmm1,xmm1,DWORD[36+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[36+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm11,6
- vpslld ymm2,ymm11,26
- vmovdqu YMMWORD[(288-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm14
-
- vpsrld ymm1,ymm11,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm11,21
- vpaddd ymm5,ymm5,YMMWORD[((-96))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm11,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,7
- vpandn ymm0,ymm11,ymm13
- vpand ymm4,ymm11,ymm12
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm14,ymm15,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm15,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm8,ymm15
-
- vpxor ymm14,ymm14,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm15,13
-
- vpslld ymm2,ymm15,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm14,ymm1
-
- vpsrld ymm1,ymm15,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,10
- vpxor ymm14,ymm8,ymm3
- vpaddd ymm10,ymm10,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm14,ymm14,ymm5
- vpaddd ymm14,ymm14,ymm7
- vmovd xmm5,DWORD[40+r12]
- vmovd xmm0,DWORD[40+r8]
- vmovd xmm1,DWORD[40+r13]
- vmovd xmm2,DWORD[40+r9]
- vpinsrd xmm5,xmm5,DWORD[40+r14],1
- vpinsrd xmm0,xmm0,DWORD[40+r10],1
- vpinsrd xmm1,xmm1,DWORD[40+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[40+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm10,6
- vpslld ymm2,ymm10,26
- vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm13
-
- vpsrld ymm1,ymm10,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm10,21
- vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm10,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,7
- vpandn ymm0,ymm10,ymm12
- vpand ymm3,ymm10,ymm11
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm13,ymm14,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm14,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm15,ymm14
-
- vpxor ymm13,ymm13,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm14,13
-
- vpslld ymm2,ymm14,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm13,ymm1
-
- vpsrld ymm1,ymm14,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,10
- vpxor ymm13,ymm15,ymm4
- vpaddd ymm9,ymm9,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm13,ymm13,ymm5
- vpaddd ymm13,ymm13,ymm7
- vmovd xmm5,DWORD[44+r12]
- vmovd xmm0,DWORD[44+r8]
- vmovd xmm1,DWORD[44+r13]
- vmovd xmm2,DWORD[44+r9]
- vpinsrd xmm5,xmm5,DWORD[44+r14],1
- vpinsrd xmm0,xmm0,DWORD[44+r10],1
- vpinsrd xmm1,xmm1,DWORD[44+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[44+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm9,6
- vpslld ymm2,ymm9,26
- vmovdqu YMMWORD[(352-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm12
-
- vpsrld ymm1,ymm9,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm9,21
- vpaddd ymm5,ymm5,YMMWORD[((-32))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm9,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,7
- vpandn ymm0,ymm9,ymm11
- vpand ymm4,ymm9,ymm10
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm12,ymm13,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm13,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm14,ymm13
-
- vpxor ymm12,ymm12,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm13,13
-
- vpslld ymm2,ymm13,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm12,ymm1
-
- vpsrld ymm1,ymm13,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,10
- vpxor ymm12,ymm14,ymm3
- vpaddd ymm8,ymm8,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm12,ymm12,ymm5
- vpaddd ymm12,ymm12,ymm7
- vmovd xmm5,DWORD[48+r12]
- vmovd xmm0,DWORD[48+r8]
- vmovd xmm1,DWORD[48+r13]
- vmovd xmm2,DWORD[48+r9]
- vpinsrd xmm5,xmm5,DWORD[48+r14],1
- vpinsrd xmm0,xmm0,DWORD[48+r10],1
- vpinsrd xmm1,xmm1,DWORD[48+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[48+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm8,6
- vpslld ymm2,ymm8,26
- vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm11
-
- vpsrld ymm1,ymm8,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm8,21
- vpaddd ymm5,ymm5,YMMWORD[rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm8,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,7
- vpandn ymm0,ymm8,ymm10
- vpand ymm3,ymm8,ymm9
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm11,ymm12,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm12,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm13,ymm12
-
- vpxor ymm11,ymm11,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm12,13
-
- vpslld ymm2,ymm12,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm11,ymm1
-
- vpsrld ymm1,ymm12,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,10
- vpxor ymm11,ymm13,ymm4
- vpaddd ymm15,ymm15,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm11,ymm11,ymm5
- vpaddd ymm11,ymm11,ymm7
- vmovd xmm5,DWORD[52+r12]
- vmovd xmm0,DWORD[52+r8]
- vmovd xmm1,DWORD[52+r13]
- vmovd xmm2,DWORD[52+r9]
- vpinsrd xmm5,xmm5,DWORD[52+r14],1
- vpinsrd xmm0,xmm0,DWORD[52+r10],1
- vpinsrd xmm1,xmm1,DWORD[52+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[52+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm15,6
- vpslld ymm2,ymm15,26
- vmovdqu YMMWORD[(416-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm10
-
- vpsrld ymm1,ymm15,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm15,21
- vpaddd ymm5,ymm5,YMMWORD[32+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm15,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,7
- vpandn ymm0,ymm15,ymm9
- vpand ymm4,ymm15,ymm8
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm10,ymm11,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm11,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm12,ymm11
-
- vpxor ymm10,ymm10,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm11,13
-
- vpslld ymm2,ymm11,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm10,ymm1
-
- vpsrld ymm1,ymm11,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,10
- vpxor ymm10,ymm12,ymm3
- vpaddd ymm14,ymm14,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm10,ymm10,ymm5
- vpaddd ymm10,ymm10,ymm7
- vmovd xmm5,DWORD[56+r12]
- vmovd xmm0,DWORD[56+r8]
- vmovd xmm1,DWORD[56+r13]
- vmovd xmm2,DWORD[56+r9]
- vpinsrd xmm5,xmm5,DWORD[56+r14],1
- vpinsrd xmm0,xmm0,DWORD[56+r10],1
- vpinsrd xmm1,xmm1,DWORD[56+r15],1
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[56+r11],1
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm14,6
- vpslld ymm2,ymm14,26
- vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm9
-
- vpsrld ymm1,ymm14,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm14,21
- vpaddd ymm5,ymm5,YMMWORD[64+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm14,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,7
- vpandn ymm0,ymm14,ymm8
- vpand ymm3,ymm14,ymm15
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm9,ymm10,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm10,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm11,ymm10
-
- vpxor ymm9,ymm9,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm10,13
-
- vpslld ymm2,ymm10,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm9,ymm1
-
- vpsrld ymm1,ymm10,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,10
- vpxor ymm9,ymm11,ymm4
- vpaddd ymm13,ymm13,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm9,ymm9,ymm5
- vpaddd ymm9,ymm9,ymm7
- vmovd xmm5,DWORD[60+r12]
- lea r12,[64+r12]
- vmovd xmm0,DWORD[60+r8]
- lea r8,[64+r8]
- vmovd xmm1,DWORD[60+r13]
- lea r13,[64+r13]
- vmovd xmm2,DWORD[60+r9]
- lea r9,[64+r9]
- vpinsrd xmm5,xmm5,DWORD[60+r14],1
- lea r14,[64+r14]
- vpinsrd xmm0,xmm0,DWORD[60+r10],1
- lea r10,[64+r10]
- vpinsrd xmm1,xmm1,DWORD[60+r15],1
- lea r15,[64+r15]
- vpunpckldq ymm5,ymm5,ymm1
- vpinsrd xmm2,xmm2,DWORD[60+r11],1
- lea r11,[64+r11]
- vpunpckldq ymm0,ymm0,ymm2
- vinserti128 ymm5,ymm5,xmm0,1
- vpshufb ymm5,ymm5,ymm6
- vpsrld ymm7,ymm13,6
- vpslld ymm2,ymm13,26
- vmovdqu YMMWORD[(480-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm8
-
- vpsrld ymm1,ymm13,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm13,21
- vpaddd ymm5,ymm5,YMMWORD[96+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm13,25
- vpxor ymm7,ymm7,ymm2
- prefetcht0 [63+r12]
- vpslld ymm2,ymm13,7
- vpandn ymm0,ymm13,ymm15
- vpand ymm4,ymm13,ymm14
- prefetcht0 [63+r13]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm8,ymm9,2
- vpxor ymm7,ymm7,ymm2
- prefetcht0 [63+r14]
- vpslld ymm1,ymm9,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm10,ymm9
- prefetcht0 [63+r15]
- vpxor ymm8,ymm8,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm9,13
- prefetcht0 [63+r8]
- vpslld ymm2,ymm9,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm3,ymm3,ymm4
- prefetcht0 [63+r9]
- vpxor ymm7,ymm8,ymm1
-
- vpsrld ymm1,ymm9,22
- vpxor ymm7,ymm7,ymm2
- prefetcht0 [63+r10]
- vpslld ymm2,ymm9,10
- vpxor ymm8,ymm10,ymm3
- vpaddd ymm12,ymm12,ymm5
- prefetcht0 [63+r11]
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm8,ymm8,ymm5
- vpaddd ymm8,ymm8,ymm7
- add rbp,256
- vmovdqu ymm5,YMMWORD[((0-128))+rax]
- mov ecx,3
- jmp NEAR $L$oop_16_xx_avx2
-ALIGN 32
-$L$oop_16_xx_avx2:
- vmovdqu ymm6,YMMWORD[((32-128))+rax]
- vpaddd ymm5,ymm5,YMMWORD[((288-256-128))+rbx]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((448-256-128))+rbx]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm12,6
- vpslld ymm2,ymm12,26
- vmovdqu YMMWORD[(0-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm15
-
- vpsrld ymm1,ymm12,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm12,21
- vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm12,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,7
- vpandn ymm0,ymm12,ymm14
- vpand ymm3,ymm12,ymm13
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm15,ymm8,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm8,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm9,ymm8
-
- vpxor ymm15,ymm15,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm8,13
-
- vpslld ymm2,ymm8,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm15,ymm1
-
- vpsrld ymm1,ymm8,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,10
- vpxor ymm15,ymm9,ymm4
- vpaddd ymm11,ymm11,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm15,ymm15,ymm5
- vpaddd ymm15,ymm15,ymm7
- vmovdqu ymm5,YMMWORD[((64-128))+rax]
- vpaddd ymm6,ymm6,YMMWORD[((320-256-128))+rbx]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((480-256-128))+rbx]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm11,6
- vpslld ymm2,ymm11,26
- vmovdqu YMMWORD[(32-128)+rax],ymm6
- vpaddd ymm6,ymm6,ymm14
-
- vpsrld ymm1,ymm11,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm11,21
- vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm11,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,7
- vpandn ymm0,ymm11,ymm13
- vpand ymm4,ymm11,ymm12
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm14,ymm15,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm15,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm8,ymm15
-
- vpxor ymm14,ymm14,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm15,13
-
- vpslld ymm2,ymm15,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm14,ymm1
-
- vpsrld ymm1,ymm15,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,10
- vpxor ymm14,ymm8,ymm3
- vpaddd ymm10,ymm10,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm14,ymm14,ymm6
- vpaddd ymm14,ymm14,ymm7
- vmovdqu ymm6,YMMWORD[((96-128))+rax]
- vpaddd ymm5,ymm5,YMMWORD[((352-256-128))+rbx]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((0-128))+rax]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm10,6
- vpslld ymm2,ymm10,26
- vmovdqu YMMWORD[(64-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm13
-
- vpsrld ymm1,ymm10,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm10,21
- vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm10,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,7
- vpandn ymm0,ymm10,ymm12
- vpand ymm3,ymm10,ymm11
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm13,ymm14,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm14,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm15,ymm14
-
- vpxor ymm13,ymm13,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm14,13
-
- vpslld ymm2,ymm14,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm13,ymm1
-
- vpsrld ymm1,ymm14,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,10
- vpxor ymm13,ymm15,ymm4
- vpaddd ymm9,ymm9,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm13,ymm13,ymm5
- vpaddd ymm13,ymm13,ymm7
- vmovdqu ymm5,YMMWORD[((128-128))+rax]
- vpaddd ymm6,ymm6,YMMWORD[((384-256-128))+rbx]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((32-128))+rax]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm9,6
- vpslld ymm2,ymm9,26
- vmovdqu YMMWORD[(96-128)+rax],ymm6
- vpaddd ymm6,ymm6,ymm12
-
- vpsrld ymm1,ymm9,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm9,21
- vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm9,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,7
- vpandn ymm0,ymm9,ymm11
- vpand ymm4,ymm9,ymm10
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm12,ymm13,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm13,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm14,ymm13
-
- vpxor ymm12,ymm12,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm13,13
-
- vpslld ymm2,ymm13,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm12,ymm1
-
- vpsrld ymm1,ymm13,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,10
- vpxor ymm12,ymm14,ymm3
- vpaddd ymm8,ymm8,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm12,ymm12,ymm6
- vpaddd ymm12,ymm12,ymm7
- vmovdqu ymm6,YMMWORD[((160-128))+rax]
- vpaddd ymm5,ymm5,YMMWORD[((416-256-128))+rbx]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((64-128))+rax]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm8,6
- vpslld ymm2,ymm8,26
- vmovdqu YMMWORD[(128-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm11
-
- vpsrld ymm1,ymm8,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm8,21
- vpaddd ymm5,ymm5,YMMWORD[rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm8,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,7
- vpandn ymm0,ymm8,ymm10
- vpand ymm3,ymm8,ymm9
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm11,ymm12,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm12,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm13,ymm12
-
- vpxor ymm11,ymm11,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm12,13
-
- vpslld ymm2,ymm12,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm11,ymm1
-
- vpsrld ymm1,ymm12,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,10
- vpxor ymm11,ymm13,ymm4
- vpaddd ymm15,ymm15,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm11,ymm11,ymm5
- vpaddd ymm11,ymm11,ymm7
- vmovdqu ymm5,YMMWORD[((192-128))+rax]
- vpaddd ymm6,ymm6,YMMWORD[((448-256-128))+rbx]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((96-128))+rax]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm15,6
- vpslld ymm2,ymm15,26
- vmovdqu YMMWORD[(160-128)+rax],ymm6
- vpaddd ymm6,ymm6,ymm10
-
- vpsrld ymm1,ymm15,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm15,21
- vpaddd ymm6,ymm6,YMMWORD[32+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm15,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,7
- vpandn ymm0,ymm15,ymm9
- vpand ymm4,ymm15,ymm8
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm10,ymm11,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm11,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm12,ymm11
-
- vpxor ymm10,ymm10,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm11,13
-
- vpslld ymm2,ymm11,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm10,ymm1
-
- vpsrld ymm1,ymm11,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,10
- vpxor ymm10,ymm12,ymm3
- vpaddd ymm14,ymm14,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm10,ymm10,ymm6
- vpaddd ymm10,ymm10,ymm7
- vmovdqu ymm6,YMMWORD[((224-128))+rax]
- vpaddd ymm5,ymm5,YMMWORD[((480-256-128))+rbx]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((128-128))+rax]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm14,6
- vpslld ymm2,ymm14,26
- vmovdqu YMMWORD[(192-128)+rax],ymm5
- vpaddd ymm5,ymm5,ymm9
-
- vpsrld ymm1,ymm14,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm14,21
- vpaddd ymm5,ymm5,YMMWORD[64+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm14,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,7
- vpandn ymm0,ymm14,ymm8
- vpand ymm3,ymm14,ymm15
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm9,ymm10,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm10,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm11,ymm10
-
- vpxor ymm9,ymm9,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm10,13
-
- vpslld ymm2,ymm10,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm9,ymm1
-
- vpsrld ymm1,ymm10,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,10
- vpxor ymm9,ymm11,ymm4
- vpaddd ymm13,ymm13,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm9,ymm9,ymm5
- vpaddd ymm9,ymm9,ymm7
- vmovdqu ymm5,YMMWORD[((256-256-128))+rbx]
- vpaddd ymm6,ymm6,YMMWORD[((0-128))+rax]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((160-128))+rax]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm13,6
- vpslld ymm2,ymm13,26
- vmovdqu YMMWORD[(224-128)+rax],ymm6
- vpaddd ymm6,ymm6,ymm8
-
- vpsrld ymm1,ymm13,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm13,21
- vpaddd ymm6,ymm6,YMMWORD[96+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm13,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,7
- vpandn ymm0,ymm13,ymm15
- vpand ymm4,ymm13,ymm14
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm8,ymm9,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm9,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm10,ymm9
-
- vpxor ymm8,ymm8,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm9,13
-
- vpslld ymm2,ymm9,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm8,ymm1
-
- vpsrld ymm1,ymm9,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,10
- vpxor ymm8,ymm10,ymm3
- vpaddd ymm12,ymm12,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm8,ymm8,ymm6
- vpaddd ymm8,ymm8,ymm7
- add rbp,256
- vmovdqu ymm6,YMMWORD[((288-256-128))+rbx]
- vpaddd ymm5,ymm5,YMMWORD[((32-128))+rax]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((192-128))+rax]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm12,6
- vpslld ymm2,ymm12,26
- vmovdqu YMMWORD[(256-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm15
-
- vpsrld ymm1,ymm12,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm12,21
- vpaddd ymm5,ymm5,YMMWORD[((-128))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm12,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,7
- vpandn ymm0,ymm12,ymm14
- vpand ymm3,ymm12,ymm13
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm15,ymm8,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm8,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm9,ymm8
-
- vpxor ymm15,ymm15,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm8,13
-
- vpslld ymm2,ymm8,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm15,ymm1
-
- vpsrld ymm1,ymm8,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,10
- vpxor ymm15,ymm9,ymm4
- vpaddd ymm11,ymm11,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm15,ymm15,ymm5
- vpaddd ymm15,ymm15,ymm7
- vmovdqu ymm5,YMMWORD[((320-256-128))+rbx]
- vpaddd ymm6,ymm6,YMMWORD[((64-128))+rax]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((224-128))+rax]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm11,6
- vpslld ymm2,ymm11,26
- vmovdqu YMMWORD[(288-256-128)+rbx],ymm6
- vpaddd ymm6,ymm6,ymm14
-
- vpsrld ymm1,ymm11,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm11,21
- vpaddd ymm6,ymm6,YMMWORD[((-96))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm11,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,7
- vpandn ymm0,ymm11,ymm13
- vpand ymm4,ymm11,ymm12
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm14,ymm15,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm15,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm8,ymm15
-
- vpxor ymm14,ymm14,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm15,13
-
- vpslld ymm2,ymm15,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm14,ymm1
-
- vpsrld ymm1,ymm15,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,10
- vpxor ymm14,ymm8,ymm3
- vpaddd ymm10,ymm10,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm14,ymm14,ymm6
- vpaddd ymm14,ymm14,ymm7
- vmovdqu ymm6,YMMWORD[((352-256-128))+rbx]
- vpaddd ymm5,ymm5,YMMWORD[((96-128))+rax]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((256-256-128))+rbx]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm10,6
- vpslld ymm2,ymm10,26
- vmovdqu YMMWORD[(320-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm13
-
- vpsrld ymm1,ymm10,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm10,21
- vpaddd ymm5,ymm5,YMMWORD[((-64))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm10,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,7
- vpandn ymm0,ymm10,ymm12
- vpand ymm3,ymm10,ymm11
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm13,ymm14,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm14,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm15,ymm14
-
- vpxor ymm13,ymm13,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm14,13
-
- vpslld ymm2,ymm14,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm13,ymm1
-
- vpsrld ymm1,ymm14,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,10
- vpxor ymm13,ymm15,ymm4
- vpaddd ymm9,ymm9,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm13,ymm13,ymm5
- vpaddd ymm13,ymm13,ymm7
- vmovdqu ymm5,YMMWORD[((384-256-128))+rbx]
- vpaddd ymm6,ymm6,YMMWORD[((128-128))+rax]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((288-256-128))+rbx]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm9,6
- vpslld ymm2,ymm9,26
- vmovdqu YMMWORD[(352-256-128)+rbx],ymm6
- vpaddd ymm6,ymm6,ymm12
-
- vpsrld ymm1,ymm9,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm9,21
- vpaddd ymm6,ymm6,YMMWORD[((-32))+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm9,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,7
- vpandn ymm0,ymm9,ymm11
- vpand ymm4,ymm9,ymm10
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm12,ymm13,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm13,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm14,ymm13
-
- vpxor ymm12,ymm12,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm13,13
-
- vpslld ymm2,ymm13,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm12,ymm1
-
- vpsrld ymm1,ymm13,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,10
- vpxor ymm12,ymm14,ymm3
- vpaddd ymm8,ymm8,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm12,ymm12,ymm6
- vpaddd ymm12,ymm12,ymm7
- vmovdqu ymm6,YMMWORD[((416-256-128))+rbx]
- vpaddd ymm5,ymm5,YMMWORD[((160-128))+rax]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((320-256-128))+rbx]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm8,6
- vpslld ymm2,ymm8,26
- vmovdqu YMMWORD[(384-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm11
-
- vpsrld ymm1,ymm8,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm8,21
- vpaddd ymm5,ymm5,YMMWORD[rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm8,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm8,7
- vpandn ymm0,ymm8,ymm10
- vpand ymm3,ymm8,ymm9
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm11,ymm12,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm12,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm13,ymm12
-
- vpxor ymm11,ymm11,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm12,13
-
- vpslld ymm2,ymm12,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm11,ymm1
-
- vpsrld ymm1,ymm12,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm12,10
- vpxor ymm11,ymm13,ymm4
- vpaddd ymm15,ymm15,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm11,ymm11,ymm5
- vpaddd ymm11,ymm11,ymm7
- vmovdqu ymm5,YMMWORD[((448-256-128))+rbx]
- vpaddd ymm6,ymm6,YMMWORD[((192-128))+rax]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((352-256-128))+rbx]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm15,6
- vpslld ymm2,ymm15,26
- vmovdqu YMMWORD[(416-256-128)+rbx],ymm6
- vpaddd ymm6,ymm6,ymm10
-
- vpsrld ymm1,ymm15,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm15,21
- vpaddd ymm6,ymm6,YMMWORD[32+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm15,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm15,7
- vpandn ymm0,ymm15,ymm9
- vpand ymm4,ymm15,ymm8
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm10,ymm11,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm11,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm12,ymm11
-
- vpxor ymm10,ymm10,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm11,13
-
- vpslld ymm2,ymm11,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm10,ymm1
-
- vpsrld ymm1,ymm11,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm11,10
- vpxor ymm10,ymm12,ymm3
- vpaddd ymm14,ymm14,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm10,ymm10,ymm6
- vpaddd ymm10,ymm10,ymm7
- vmovdqu ymm6,YMMWORD[((480-256-128))+rbx]
- vpaddd ymm5,ymm5,YMMWORD[((224-128))+rax]
-
- vpsrld ymm7,ymm6,3
- vpsrld ymm1,ymm6,7
- vpslld ymm2,ymm6,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm6,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm6,14
- vmovdqu ymm0,YMMWORD[((384-256-128))+rbx]
- vpsrld ymm3,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm5,ymm5,ymm7
- vpxor ymm7,ymm3,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm5,ymm5,ymm7
- vpsrld ymm7,ymm14,6
- vpslld ymm2,ymm14,26
- vmovdqu YMMWORD[(448-256-128)+rbx],ymm5
- vpaddd ymm5,ymm5,ymm9
-
- vpsrld ymm1,ymm14,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm14,21
- vpaddd ymm5,ymm5,YMMWORD[64+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm14,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm14,7
- vpandn ymm0,ymm14,ymm8
- vpand ymm3,ymm14,ymm15
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm9,ymm10,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm10,30
- vpxor ymm0,ymm0,ymm3
- vpxor ymm3,ymm11,ymm10
-
- vpxor ymm9,ymm9,ymm1
- vpaddd ymm5,ymm5,ymm7
-
- vpsrld ymm1,ymm10,13
-
- vpslld ymm2,ymm10,19
- vpaddd ymm5,ymm5,ymm0
- vpand ymm4,ymm4,ymm3
-
- vpxor ymm7,ymm9,ymm1
-
- vpsrld ymm1,ymm10,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm10,10
- vpxor ymm9,ymm11,ymm4
- vpaddd ymm13,ymm13,ymm5
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm9,ymm9,ymm5
- vpaddd ymm9,ymm9,ymm7
- vmovdqu ymm5,YMMWORD[((0-128))+rax]
- vpaddd ymm6,ymm6,YMMWORD[((256-256-128))+rbx]
-
- vpsrld ymm7,ymm5,3
- vpsrld ymm1,ymm5,7
- vpslld ymm2,ymm5,25
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm5,18
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm5,14
- vmovdqu ymm0,YMMWORD[((416-256-128))+rbx]
- vpsrld ymm4,ymm0,10
-
- vpxor ymm7,ymm7,ymm1
- vpsrld ymm1,ymm0,17
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,15
- vpaddd ymm6,ymm6,ymm7
- vpxor ymm7,ymm4,ymm1
- vpsrld ymm1,ymm0,19
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm0,13
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
- vpaddd ymm6,ymm6,ymm7
- vpsrld ymm7,ymm13,6
- vpslld ymm2,ymm13,26
- vmovdqu YMMWORD[(480-256-128)+rbx],ymm6
- vpaddd ymm6,ymm6,ymm8
-
- vpsrld ymm1,ymm13,11
- vpxor ymm7,ymm7,ymm2
- vpslld ymm2,ymm13,21
- vpaddd ymm6,ymm6,YMMWORD[96+rbp]
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm1,ymm13,25
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm13,7
- vpandn ymm0,ymm13,ymm15
- vpand ymm4,ymm13,ymm14
-
- vpxor ymm7,ymm7,ymm1
-
- vpsrld ymm8,ymm9,2
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm1,ymm9,30
- vpxor ymm0,ymm0,ymm4
- vpxor ymm4,ymm10,ymm9
-
- vpxor ymm8,ymm8,ymm1
- vpaddd ymm6,ymm6,ymm7
-
- vpsrld ymm1,ymm9,13
-
- vpslld ymm2,ymm9,19
- vpaddd ymm6,ymm6,ymm0
- vpand ymm3,ymm3,ymm4
-
- vpxor ymm7,ymm8,ymm1
-
- vpsrld ymm1,ymm9,22
- vpxor ymm7,ymm7,ymm2
-
- vpslld ymm2,ymm9,10
- vpxor ymm8,ymm10,ymm3
- vpaddd ymm12,ymm12,ymm6
-
- vpxor ymm7,ymm7,ymm1
- vpxor ymm7,ymm7,ymm2
-
- vpaddd ymm8,ymm8,ymm6
- vpaddd ymm8,ymm8,ymm7
- add rbp,256
- dec ecx
- jnz NEAR $L$oop_16_xx_avx2
-
- mov ecx,1
- lea rbx,[512+rsp]
- lea rbp,[((K256+128))]
- cmp ecx,DWORD[rbx]
- cmovge r12,rbp
- cmp ecx,DWORD[4+rbx]
- cmovge r13,rbp
- cmp ecx,DWORD[8+rbx]
- cmovge r14,rbp
- cmp ecx,DWORD[12+rbx]
- cmovge r15,rbp
- cmp ecx,DWORD[16+rbx]
- cmovge r8,rbp
- cmp ecx,DWORD[20+rbx]
- cmovge r9,rbp
- cmp ecx,DWORD[24+rbx]
- cmovge r10,rbp
- cmp ecx,DWORD[28+rbx]
- cmovge r11,rbp
- vmovdqa ymm7,YMMWORD[rbx]
- vpxor ymm0,ymm0,ymm0
- vmovdqa ymm6,ymm7
- vpcmpgtd ymm6,ymm6,ymm0
- vpaddd ymm7,ymm7,ymm6
-
- vmovdqu ymm0,YMMWORD[((0-128))+rdi]
- vpand ymm8,ymm8,ymm6
- vmovdqu ymm1,YMMWORD[((32-128))+rdi]
- vpand ymm9,ymm9,ymm6
- vmovdqu ymm2,YMMWORD[((64-128))+rdi]
- vpand ymm10,ymm10,ymm6
- vmovdqu ymm5,YMMWORD[((96-128))+rdi]
- vpand ymm11,ymm11,ymm6
- vpaddd ymm8,ymm8,ymm0
- vmovdqu ymm0,YMMWORD[((128-128))+rdi]
- vpand ymm12,ymm12,ymm6
- vpaddd ymm9,ymm9,ymm1
- vmovdqu ymm1,YMMWORD[((160-128))+rdi]
- vpand ymm13,ymm13,ymm6
- vpaddd ymm10,ymm10,ymm2
- vmovdqu ymm2,YMMWORD[((192-128))+rdi]
- vpand ymm14,ymm14,ymm6
- vpaddd ymm11,ymm11,ymm5
- vmovdqu ymm5,YMMWORD[((224-128))+rdi]
- vpand ymm15,ymm15,ymm6
- vpaddd ymm12,ymm12,ymm0
- vpaddd ymm13,ymm13,ymm1
- vmovdqu YMMWORD[(0-128)+rdi],ymm8
- vpaddd ymm14,ymm14,ymm2
- vmovdqu YMMWORD[(32-128)+rdi],ymm9
- vpaddd ymm15,ymm15,ymm5
- vmovdqu YMMWORD[(64-128)+rdi],ymm10
- vmovdqu YMMWORD[(96-128)+rdi],ymm11
- vmovdqu YMMWORD[(128-128)+rdi],ymm12
- vmovdqu YMMWORD[(160-128)+rdi],ymm13
- vmovdqu YMMWORD[(192-128)+rdi],ymm14
- vmovdqu YMMWORD[(224-128)+rdi],ymm15
-
- vmovdqu YMMWORD[rbx],ymm7
- lea rbx,[((256+128))+rsp]
- vmovdqu ymm6,YMMWORD[$L$pbswap]
- dec edx
- jnz NEAR $L$oop_avx2
-
-
-
-
-
-
-
-$L$done_avx2:
- mov rax,QWORD[544+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((-216))+rax]
- movaps xmm7,XMMWORD[((-200))+rax]
- movaps xmm8,XMMWORD[((-184))+rax]
- movaps xmm9,XMMWORD[((-168))+rax]
- movaps xmm10,XMMWORD[((-152))+rax]
- movaps xmm11,XMMWORD[((-136))+rax]
- movaps xmm12,XMMWORD[((-120))+rax]
- movaps xmm13,XMMWORD[((-104))+rax]
- movaps xmm14,XMMWORD[((-88))+rax]
- movaps xmm15,XMMWORD[((-72))+rax]
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbp,QWORD[((-16))+rax]
-
- mov rbx,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha256_multi_block_avx2:
ALIGN 256
K256:
DD 1116352408,1116352408,1116352408,1116352408
@@ -8203,60 +3443,6 @@ $L$in_prologue:
pop rsi
DB 0F3h,0C3h ;repret
-
-ALIGN 16
-avx2_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$in_prologue
-
- mov rax,QWORD[544+r8]
-
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
- lea rsi,[((-56-160))+rax]
- lea rdi,[512+r8]
- mov ecx,20
- DD 0xa548f3fc
-
- jmp NEAR $L$in_prologue
-
section .pdata rdata align=4
ALIGN 4
DD $L$SEH_begin_sha256_multi_block wrt ..imagebase
@@ -8265,12 +3451,6 @@ ALIGN 4
DD $L$SEH_begin_sha256_multi_block_shaext wrt ..imagebase
DD $L$SEH_end_sha256_multi_block_shaext wrt ..imagebase
DD $L$SEH_info_sha256_multi_block_shaext wrt ..imagebase
- DD $L$SEH_begin_sha256_multi_block_avx wrt ..imagebase
- DD $L$SEH_end_sha256_multi_block_avx wrt ..imagebase
- DD $L$SEH_info_sha256_multi_block_avx wrt ..imagebase
- DD $L$SEH_begin_sha256_multi_block_avx2 wrt ..imagebase
- DD $L$SEH_end_sha256_multi_block_avx2 wrt ..imagebase
- DD $L$SEH_info_sha256_multi_block_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha256_multi_block:
@@ -8281,11 +3461,3 @@ $L$SEH_info_sha256_multi_block_shaext:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$body_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase
-$L$SEH_info_sha256_multi_block_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$body_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_sha256_multi_block_avx2:
-DB 9,0,0,0
- DD avx2_handler wrt ..imagebase
- DD $L$body_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm
index 8238c4e4636..c20586762eb 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha256-x86_64.nasm
@@ -26,14 +26,6 @@ $L$SEH_begin_sha256_block_data_order:
mov r11d,DWORD[8+r11]
test r11d,536870912
jnz NEAR _shaext_shortcut
- and r11d,296
- cmp r11d,296
- je NEAR $L$avx2_shortcut
- and r9d,1073741824
- and r10d,268435968
- or r10d,r9d
- cmp r10d,1342177792
- je NEAR $L$avx_shortcut
test r10d,512
jnz NEAR $L$ssse3_shortcut
mov rax,rsp
@@ -3157,2385 +3149,6 @@ $L$epilogue_ssse3:
DB 0F3h,0C3h ;repret
$L$SEH_end_sha256_block_data_order_ssse3:
-
-ALIGN 64
-sha256_block_data_order_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha256_block_data_order_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$avx_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- shl rdx,4
- sub rsp,160
- lea rdx,[rdx*4+rsi]
- and rsp,-64
- mov QWORD[((64+0))+rsp],rdi
- mov QWORD[((64+8))+rsp],rsi
- mov QWORD[((64+16))+rsp],rdx
- mov QWORD[88+rsp],rax
-
- movaps XMMWORD[(64+32)+rsp],xmm6
- movaps XMMWORD[(64+48)+rsp],xmm7
- movaps XMMWORD[(64+64)+rsp],xmm8
- movaps XMMWORD[(64+80)+rsp],xmm9
-$L$prologue_avx:
-
- vzeroupper
- mov eax,DWORD[rdi]
- mov ebx,DWORD[4+rdi]
- mov ecx,DWORD[8+rdi]
- mov edx,DWORD[12+rdi]
- mov r8d,DWORD[16+rdi]
- mov r9d,DWORD[20+rdi]
- mov r10d,DWORD[24+rdi]
- mov r11d,DWORD[28+rdi]
- vmovdqa xmm8,XMMWORD[((K256+512+32))]
- vmovdqa xmm9,XMMWORD[((K256+512+64))]
- jmp NEAR $L$loop_avx
-ALIGN 16
-$L$loop_avx:
- vmovdqa xmm7,XMMWORD[((K256+512))]
- vmovdqu xmm0,XMMWORD[rsi]
- vmovdqu xmm1,XMMWORD[16+rsi]
- vmovdqu xmm2,XMMWORD[32+rsi]
- vmovdqu xmm3,XMMWORD[48+rsi]
- vpshufb xmm0,xmm0,xmm7
- lea rbp,[K256]
- vpshufb xmm1,xmm1,xmm7
- vpshufb xmm2,xmm2,xmm7
- vpaddd xmm4,xmm0,XMMWORD[rbp]
- vpshufb xmm3,xmm3,xmm7
- vpaddd xmm5,xmm1,XMMWORD[32+rbp]
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- vpaddd xmm7,xmm3,XMMWORD[96+rbp]
- vmovdqa XMMWORD[rsp],xmm4
- mov r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm5
- mov edi,ebx
- vmovdqa XMMWORD[32+rsp],xmm6
- xor edi,ecx
- vmovdqa XMMWORD[48+rsp],xmm7
- mov r13d,r8d
- jmp NEAR $L$avx_00_47
-
-ALIGN 16
-$L$avx_00_47:
- sub rbp,-128
- vpalignr xmm4,xmm1,xmm0,4
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- vpalignr xmm7,xmm3,xmm2,4
- shrd r14d,r14d,9
- xor r13d,r8d
- xor r12d,r10d
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vpaddd xmm0,xmm0,xmm7
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
- vpsrld xmm7,xmm4,3
- xor r12d,r10d
- shrd r14d,r14d,11
- xor r15d,ebx
- vpslld xmm5,xmm4,14
- add r11d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,eax
- add r11d,r13d
- xor edi,ebx
- vpshufd xmm7,xmm3,250
- shrd r14d,r14d,2
- add edx,r11d
- add r11d,edi
- vpsrld xmm6,xmm6,11
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov r11d,r14d
- mov r12d,r8d
- shrd r14d,r14d,9
- vpslld xmm5,xmm5,11
- xor r13d,edx
- xor r12d,r9d
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,r11d
- and r12d,edx
- xor r13d,edx
- vpsrld xmm6,xmm7,10
- add r10d,DWORD[4+rsp]
- mov edi,r11d
- xor r12d,r9d
- vpxor xmm4,xmm4,xmm5
- shrd r14d,r14d,11
- xor edi,eax
- add r10d,r12d
- vpsrlq xmm7,xmm7,17
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r11d
- vpaddd xmm0,xmm0,xmm4
- add r10d,r13d
- xor r15d,eax
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add ecx,r10d
- add r10d,r15d
- mov r13d,ecx
- vpsrlq xmm7,xmm7,2
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,edx
- shrd r14d,r14d,9
- xor r13d,ecx
- vpshufb xmm6,xmm6,xmm8
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- vpaddd xmm0,xmm0,xmm6
- and r12d,ecx
- xor r13d,ecx
- add r9d,DWORD[8+rsp]
- vpshufd xmm7,xmm0,80
- mov r15d,r10d
- xor r12d,r8d
- shrd r14d,r14d,11
- vpsrld xmm6,xmm7,10
- xor r15d,r11d
- add r9d,r12d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- and edi,r15d
- xor r14d,r10d
- add r9d,r13d
- vpxor xmm6,xmm6,xmm7
- xor edi,r11d
- shrd r14d,r14d,2
- add ebx,r9d
- vpsrlq xmm7,xmm7,2
- add r9d,edi
- mov r13d,ebx
- add r14d,r9d
- vpxor xmm6,xmm6,xmm7
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- vpshufb xmm6,xmm6,xmm9
- shrd r14d,r14d,9
- xor r13d,ebx
- xor r12d,edx
- vpaddd xmm0,xmm0,xmm6
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vpaddd xmm6,xmm0,XMMWORD[rbp]
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov edi,r9d
- xor r12d,edx
- shrd r14d,r14d,11
- xor edi,r10d
- add r8d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- shrd r14d,r14d,2
- add eax,r8d
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[rsp],xmm6
- vpalignr xmm4,xmm2,xmm1,4
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- vpalignr xmm7,xmm0,xmm3,4
- shrd r14d,r14d,9
- xor r13d,eax
- xor r12d,ecx
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vpaddd xmm1,xmm1,xmm7
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
- vpsrld xmm7,xmm4,3
- xor r12d,ecx
- shrd r14d,r14d,11
- xor r15d,r9d
- vpslld xmm5,xmm4,14
- add edx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,r8d
- add edx,r13d
- xor edi,r9d
- vpshufd xmm7,xmm0,250
- shrd r14d,r14d,2
- add r11d,edx
- add edx,edi
- vpsrld xmm6,xmm6,11
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov edx,r14d
- mov r12d,eax
- shrd r14d,r14d,9
- vpslld xmm5,xmm5,11
- xor r13d,r11d
- xor r12d,ebx
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,edx
- and r12d,r11d
- xor r13d,r11d
- vpsrld xmm6,xmm7,10
- add ecx,DWORD[20+rsp]
- mov edi,edx
- xor r12d,ebx
- vpxor xmm4,xmm4,xmm5
- shrd r14d,r14d,11
- xor edi,r8d
- add ecx,r12d
- vpsrlq xmm7,xmm7,17
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,edx
- vpaddd xmm1,xmm1,xmm4
- add ecx,r13d
- xor r15d,r8d
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add r10d,ecx
- add ecx,r15d
- mov r13d,r10d
- vpsrlq xmm7,xmm7,2
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,r11d
- shrd r14d,r14d,9
- xor r13d,r10d
- vpshufb xmm6,xmm6,xmm8
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- vpaddd xmm1,xmm1,xmm6
- and r12d,r10d
- xor r13d,r10d
- add ebx,DWORD[24+rsp]
- vpshufd xmm7,xmm1,80
- mov r15d,ecx
- xor r12d,eax
- shrd r14d,r14d,11
- vpsrld xmm6,xmm7,10
- xor r15d,edx
- add ebx,r12d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- and edi,r15d
- xor r14d,ecx
- add ebx,r13d
- vpxor xmm6,xmm6,xmm7
- xor edi,edx
- shrd r14d,r14d,2
- add r9d,ebx
- vpsrlq xmm7,xmm7,2
- add ebx,edi
- mov r13d,r9d
- add r14d,ebx
- vpxor xmm6,xmm6,xmm7
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- vpshufb xmm6,xmm6,xmm9
- shrd r14d,r14d,9
- xor r13d,r9d
- xor r12d,r11d
- vpaddd xmm1,xmm1,xmm6
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpaddd xmm6,xmm1,XMMWORD[32+rbp]
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov edi,ebx
- xor r12d,r11d
- shrd r14d,r14d,11
- xor edi,ecx
- add eax,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- shrd r14d,r14d,2
- add r8d,eax
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[16+rsp],xmm6
- vpalignr xmm4,xmm3,xmm2,4
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- vpalignr xmm7,xmm1,xmm0,4
- shrd r14d,r14d,9
- xor r13d,r8d
- xor r12d,r10d
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- vpaddd xmm2,xmm2,xmm7
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
- vpsrld xmm7,xmm4,3
- xor r12d,r10d
- shrd r14d,r14d,11
- xor r15d,ebx
- vpslld xmm5,xmm4,14
- add r11d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,eax
- add r11d,r13d
- xor edi,ebx
- vpshufd xmm7,xmm1,250
- shrd r14d,r14d,2
- add edx,r11d
- add r11d,edi
- vpsrld xmm6,xmm6,11
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov r11d,r14d
- mov r12d,r8d
- shrd r14d,r14d,9
- vpslld xmm5,xmm5,11
- xor r13d,edx
- xor r12d,r9d
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,r11d
- and r12d,edx
- xor r13d,edx
- vpsrld xmm6,xmm7,10
- add r10d,DWORD[36+rsp]
- mov edi,r11d
- xor r12d,r9d
- vpxor xmm4,xmm4,xmm5
- shrd r14d,r14d,11
- xor edi,eax
- add r10d,r12d
- vpsrlq xmm7,xmm7,17
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r11d
- vpaddd xmm2,xmm2,xmm4
- add r10d,r13d
- xor r15d,eax
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add ecx,r10d
- add r10d,r15d
- mov r13d,ecx
- vpsrlq xmm7,xmm7,2
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,edx
- shrd r14d,r14d,9
- xor r13d,ecx
- vpshufb xmm6,xmm6,xmm8
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- vpaddd xmm2,xmm2,xmm6
- and r12d,ecx
- xor r13d,ecx
- add r9d,DWORD[40+rsp]
- vpshufd xmm7,xmm2,80
- mov r15d,r10d
- xor r12d,r8d
- shrd r14d,r14d,11
- vpsrld xmm6,xmm7,10
- xor r15d,r11d
- add r9d,r12d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- and edi,r15d
- xor r14d,r10d
- add r9d,r13d
- vpxor xmm6,xmm6,xmm7
- xor edi,r11d
- shrd r14d,r14d,2
- add ebx,r9d
- vpsrlq xmm7,xmm7,2
- add r9d,edi
- mov r13d,ebx
- add r14d,r9d
- vpxor xmm6,xmm6,xmm7
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- vpshufb xmm6,xmm6,xmm9
- shrd r14d,r14d,9
- xor r13d,ebx
- xor r12d,edx
- vpaddd xmm2,xmm2,xmm6
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- vpaddd xmm6,xmm2,XMMWORD[64+rbp]
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov edi,r9d
- xor r12d,edx
- shrd r14d,r14d,11
- xor edi,r10d
- add r8d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- shrd r14d,r14d,2
- add eax,r8d
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- vmovdqa XMMWORD[32+rsp],xmm6
- vpalignr xmm4,xmm0,xmm3,4
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- vpalignr xmm7,xmm2,xmm1,4
- shrd r14d,r14d,9
- xor r13d,eax
- xor r12d,ecx
- vpsrld xmm6,xmm4,7
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- vpaddd xmm3,xmm3,xmm7
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
- vpsrld xmm7,xmm4,3
- xor r12d,ecx
- shrd r14d,r14d,11
- xor r15d,r9d
- vpslld xmm5,xmm4,14
- add edx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- vpxor xmm4,xmm7,xmm6
- xor r14d,r8d
- add edx,r13d
- xor edi,r9d
- vpshufd xmm7,xmm2,250
- shrd r14d,r14d,2
- add r11d,edx
- add edx,edi
- vpsrld xmm6,xmm6,11
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- vpxor xmm4,xmm4,xmm5
- mov edx,r14d
- mov r12d,eax
- shrd r14d,r14d,9
- vpslld xmm5,xmm5,11
- xor r13d,r11d
- xor r12d,ebx
- shrd r13d,r13d,5
- vpxor xmm4,xmm4,xmm6
- xor r14d,edx
- and r12d,r11d
- xor r13d,r11d
- vpsrld xmm6,xmm7,10
- add ecx,DWORD[52+rsp]
- mov edi,edx
- xor r12d,ebx
- vpxor xmm4,xmm4,xmm5
- shrd r14d,r14d,11
- xor edi,r8d
- add ecx,r12d
- vpsrlq xmm7,xmm7,17
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,edx
- vpaddd xmm3,xmm3,xmm4
- add ecx,r13d
- xor r15d,r8d
- shrd r14d,r14d,2
- vpxor xmm6,xmm6,xmm7
- add r10d,ecx
- add ecx,r15d
- mov r13d,r10d
- vpsrlq xmm7,xmm7,2
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- vpxor xmm6,xmm6,xmm7
- mov r12d,r11d
- shrd r14d,r14d,9
- xor r13d,r10d
- vpshufb xmm6,xmm6,xmm8
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- vpaddd xmm3,xmm3,xmm6
- and r12d,r10d
- xor r13d,r10d
- add ebx,DWORD[56+rsp]
- vpshufd xmm7,xmm3,80
- mov r15d,ecx
- xor r12d,eax
- shrd r14d,r14d,11
- vpsrld xmm6,xmm7,10
- xor r15d,edx
- add ebx,r12d
- shrd r13d,r13d,6
- vpsrlq xmm7,xmm7,17
- and edi,r15d
- xor r14d,ecx
- add ebx,r13d
- vpxor xmm6,xmm6,xmm7
- xor edi,edx
- shrd r14d,r14d,2
- add r9d,ebx
- vpsrlq xmm7,xmm7,2
- add ebx,edi
- mov r13d,r9d
- add r14d,ebx
- vpxor xmm6,xmm6,xmm7
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- vpshufb xmm6,xmm6,xmm9
- shrd r14d,r14d,9
- xor r13d,r9d
- xor r12d,r11d
- vpaddd xmm3,xmm3,xmm6
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- vpaddd xmm6,xmm3,XMMWORD[96+rbp]
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov edi,ebx
- xor r12d,r11d
- shrd r14d,r14d,11
- xor edi,ecx
- add eax,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- shrd r14d,r14d,2
- add r8d,eax
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- vmovdqa XMMWORD[48+rsp],xmm6
- cmp BYTE[131+rbp],0
- jne NEAR $L$avx_00_47
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- shrd r14d,r14d,9
- xor r13d,r8d
- xor r12d,r10d
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- xor r13d,r8d
- add r11d,DWORD[rsp]
- mov r15d,eax
- xor r12d,r10d
- shrd r14d,r14d,11
- xor r15d,ebx
- add r11d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,eax
- add r11d,r13d
- xor edi,ebx
- shrd r14d,r14d,2
- add edx,r11d
- add r11d,edi
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- mov r11d,r14d
- mov r12d,r8d
- shrd r14d,r14d,9
- xor r13d,edx
- xor r12d,r9d
- shrd r13d,r13d,5
- xor r14d,r11d
- and r12d,edx
- xor r13d,edx
- add r10d,DWORD[4+rsp]
- mov edi,r11d
- xor r12d,r9d
- shrd r14d,r14d,11
- xor edi,eax
- add r10d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- shrd r14d,r14d,2
- add ecx,r10d
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- mov r12d,edx
- shrd r14d,r14d,9
- xor r13d,ecx
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- and r12d,ecx
- xor r13d,ecx
- add r9d,DWORD[8+rsp]
- mov r15d,r10d
- xor r12d,r8d
- shrd r14d,r14d,11
- xor r15d,r11d
- add r9d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor edi,r11d
- shrd r14d,r14d,2
- add ebx,r9d
- add r9d,edi
- mov r13d,ebx
- add r14d,r9d
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- shrd r14d,r14d,9
- xor r13d,ebx
- xor r12d,edx
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- xor r13d,ebx
- add r8d,DWORD[12+rsp]
- mov edi,r9d
- xor r12d,edx
- shrd r14d,r14d,11
- xor edi,r10d
- add r8d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- shrd r14d,r14d,2
- add eax,r8d
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- shrd r14d,r14d,9
- xor r13d,eax
- xor r12d,ecx
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- xor r13d,eax
- add edx,DWORD[16+rsp]
- mov r15d,r8d
- xor r12d,ecx
- shrd r14d,r14d,11
- xor r15d,r9d
- add edx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,r8d
- add edx,r13d
- xor edi,r9d
- shrd r14d,r14d,2
- add r11d,edx
- add edx,edi
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- mov edx,r14d
- mov r12d,eax
- shrd r14d,r14d,9
- xor r13d,r11d
- xor r12d,ebx
- shrd r13d,r13d,5
- xor r14d,edx
- and r12d,r11d
- xor r13d,r11d
- add ecx,DWORD[20+rsp]
- mov edi,edx
- xor r12d,ebx
- shrd r14d,r14d,11
- xor edi,r8d
- add ecx,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- shrd r14d,r14d,2
- add r10d,ecx
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- mov r12d,r11d
- shrd r14d,r14d,9
- xor r13d,r10d
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- and r12d,r10d
- xor r13d,r10d
- add ebx,DWORD[24+rsp]
- mov r15d,ecx
- xor r12d,eax
- shrd r14d,r14d,11
- xor r15d,edx
- add ebx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor edi,edx
- shrd r14d,r14d,2
- add r9d,ebx
- add ebx,edi
- mov r13d,r9d
- add r14d,ebx
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- shrd r14d,r14d,9
- xor r13d,r9d
- xor r12d,r11d
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- xor r13d,r9d
- add eax,DWORD[28+rsp]
- mov edi,ebx
- xor r12d,r11d
- shrd r14d,r14d,11
- xor edi,ecx
- add eax,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- shrd r14d,r14d,2
- add r8d,eax
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- shrd r13d,r13d,14
- mov eax,r14d
- mov r12d,r9d
- shrd r14d,r14d,9
- xor r13d,r8d
- xor r12d,r10d
- shrd r13d,r13d,5
- xor r14d,eax
- and r12d,r8d
- xor r13d,r8d
- add r11d,DWORD[32+rsp]
- mov r15d,eax
- xor r12d,r10d
- shrd r14d,r14d,11
- xor r15d,ebx
- add r11d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,eax
- add r11d,r13d
- xor edi,ebx
- shrd r14d,r14d,2
- add edx,r11d
- add r11d,edi
- mov r13d,edx
- add r14d,r11d
- shrd r13d,r13d,14
- mov r11d,r14d
- mov r12d,r8d
- shrd r14d,r14d,9
- xor r13d,edx
- xor r12d,r9d
- shrd r13d,r13d,5
- xor r14d,r11d
- and r12d,edx
- xor r13d,edx
- add r10d,DWORD[36+rsp]
- mov edi,r11d
- xor r12d,r9d
- shrd r14d,r14d,11
- xor edi,eax
- add r10d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r11d
- add r10d,r13d
- xor r15d,eax
- shrd r14d,r14d,2
- add ecx,r10d
- add r10d,r15d
- mov r13d,ecx
- add r14d,r10d
- shrd r13d,r13d,14
- mov r10d,r14d
- mov r12d,edx
- shrd r14d,r14d,9
- xor r13d,ecx
- xor r12d,r8d
- shrd r13d,r13d,5
- xor r14d,r10d
- and r12d,ecx
- xor r13d,ecx
- add r9d,DWORD[40+rsp]
- mov r15d,r10d
- xor r12d,r8d
- shrd r14d,r14d,11
- xor r15d,r11d
- add r9d,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,r10d
- add r9d,r13d
- xor edi,r11d
- shrd r14d,r14d,2
- add ebx,r9d
- add r9d,edi
- mov r13d,ebx
- add r14d,r9d
- shrd r13d,r13d,14
- mov r9d,r14d
- mov r12d,ecx
- shrd r14d,r14d,9
- xor r13d,ebx
- xor r12d,edx
- shrd r13d,r13d,5
- xor r14d,r9d
- and r12d,ebx
- xor r13d,ebx
- add r8d,DWORD[44+rsp]
- mov edi,r9d
- xor r12d,edx
- shrd r14d,r14d,11
- xor edi,r10d
- add r8d,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,r9d
- add r8d,r13d
- xor r15d,r10d
- shrd r14d,r14d,2
- add eax,r8d
- add r8d,r15d
- mov r13d,eax
- add r14d,r8d
- shrd r13d,r13d,14
- mov r8d,r14d
- mov r12d,ebx
- shrd r14d,r14d,9
- xor r13d,eax
- xor r12d,ecx
- shrd r13d,r13d,5
- xor r14d,r8d
- and r12d,eax
- xor r13d,eax
- add edx,DWORD[48+rsp]
- mov r15d,r8d
- xor r12d,ecx
- shrd r14d,r14d,11
- xor r15d,r9d
- add edx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,r8d
- add edx,r13d
- xor edi,r9d
- shrd r14d,r14d,2
- add r11d,edx
- add edx,edi
- mov r13d,r11d
- add r14d,edx
- shrd r13d,r13d,14
- mov edx,r14d
- mov r12d,eax
- shrd r14d,r14d,9
- xor r13d,r11d
- xor r12d,ebx
- shrd r13d,r13d,5
- xor r14d,edx
- and r12d,r11d
- xor r13d,r11d
- add ecx,DWORD[52+rsp]
- mov edi,edx
- xor r12d,ebx
- shrd r14d,r14d,11
- xor edi,r8d
- add ecx,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,edx
- add ecx,r13d
- xor r15d,r8d
- shrd r14d,r14d,2
- add r10d,ecx
- add ecx,r15d
- mov r13d,r10d
- add r14d,ecx
- shrd r13d,r13d,14
- mov ecx,r14d
- mov r12d,r11d
- shrd r14d,r14d,9
- xor r13d,r10d
- xor r12d,eax
- shrd r13d,r13d,5
- xor r14d,ecx
- and r12d,r10d
- xor r13d,r10d
- add ebx,DWORD[56+rsp]
- mov r15d,ecx
- xor r12d,eax
- shrd r14d,r14d,11
- xor r15d,edx
- add ebx,r12d
- shrd r13d,r13d,6
- and edi,r15d
- xor r14d,ecx
- add ebx,r13d
- xor edi,edx
- shrd r14d,r14d,2
- add r9d,ebx
- add ebx,edi
- mov r13d,r9d
- add r14d,ebx
- shrd r13d,r13d,14
- mov ebx,r14d
- mov r12d,r10d
- shrd r14d,r14d,9
- xor r13d,r9d
- xor r12d,r11d
- shrd r13d,r13d,5
- xor r14d,ebx
- and r12d,r9d
- xor r13d,r9d
- add eax,DWORD[60+rsp]
- mov edi,ebx
- xor r12d,r11d
- shrd r14d,r14d,11
- xor edi,ecx
- add eax,r12d
- shrd r13d,r13d,6
- and r15d,edi
- xor r14d,ebx
- add eax,r13d
- xor r15d,ecx
- shrd r14d,r14d,2
- add r8d,eax
- add eax,r15d
- mov r13d,r8d
- add r14d,eax
- mov rdi,QWORD[((64+0))+rsp]
- mov eax,r14d
-
- add eax,DWORD[rdi]
- lea rsi,[64+rsi]
- add ebx,DWORD[4+rdi]
- add ecx,DWORD[8+rdi]
- add edx,DWORD[12+rdi]
- add r8d,DWORD[16+rdi]
- add r9d,DWORD[20+rdi]
- add r10d,DWORD[24+rdi]
- add r11d,DWORD[28+rdi]
-
- cmp rsi,QWORD[((64+16))+rsp]
-
- mov DWORD[rdi],eax
- mov DWORD[4+rdi],ebx
- mov DWORD[8+rdi],ecx
- mov DWORD[12+rdi],edx
- mov DWORD[16+rdi],r8d
- mov DWORD[20+rdi],r9d
- mov DWORD[24+rdi],r10d
- mov DWORD[28+rdi],r11d
- jb NEAR $L$loop_avx
-
- mov rsi,QWORD[88+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((64+32))+rsp]
- movaps xmm7,XMMWORD[((64+48))+rsp]
- movaps xmm8,XMMWORD[((64+64))+rsp]
- movaps xmm9,XMMWORD[((64+80))+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha256_block_data_order_avx:
-
-ALIGN 64
-sha256_block_data_order_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha256_block_data_order_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$avx2_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,608
- shl rdx,4
- and rsp,-256*4
- lea rdx,[rdx*4+rsi]
- add rsp,448
- mov QWORD[((64+0))+rsp],rdi
- mov QWORD[((64+8))+rsp],rsi
- mov QWORD[((64+16))+rsp],rdx
- mov QWORD[88+rsp],rax
-
- movaps XMMWORD[(64+32)+rsp],xmm6
- movaps XMMWORD[(64+48)+rsp],xmm7
- movaps XMMWORD[(64+64)+rsp],xmm8
- movaps XMMWORD[(64+80)+rsp],xmm9
-$L$prologue_avx2:
-
- vzeroupper
- sub rsi,-16*4
- mov eax,DWORD[rdi]
- mov r12,rsi
- mov ebx,DWORD[4+rdi]
- cmp rsi,rdx
- mov ecx,DWORD[8+rdi]
- cmove r12,rsp
- mov edx,DWORD[12+rdi]
- mov r8d,DWORD[16+rdi]
- mov r9d,DWORD[20+rdi]
- mov r10d,DWORD[24+rdi]
- mov r11d,DWORD[28+rdi]
- vmovdqa ymm8,YMMWORD[((K256+512+32))]
- vmovdqa ymm9,YMMWORD[((K256+512+64))]
- jmp NEAR $L$oop_avx2
-ALIGN 16
-$L$oop_avx2:
- vmovdqa ymm7,YMMWORD[((K256+512))]
- vmovdqu xmm0,XMMWORD[((-64+0))+rsi]
- vmovdqu xmm1,XMMWORD[((-64+16))+rsi]
- vmovdqu xmm2,XMMWORD[((-64+32))+rsi]
- vmovdqu xmm3,XMMWORD[((-64+48))+rsi]
-
- vinserti128 ymm0,ymm0,XMMWORD[r12],1
- vinserti128 ymm1,ymm1,XMMWORD[16+r12],1
- vpshufb ymm0,ymm0,ymm7
- vinserti128 ymm2,ymm2,XMMWORD[32+r12],1
- vpshufb ymm1,ymm1,ymm7
- vinserti128 ymm3,ymm3,XMMWORD[48+r12],1
-
- lea rbp,[K256]
- vpshufb ymm2,ymm2,ymm7
- vpaddd ymm4,ymm0,YMMWORD[rbp]
- vpshufb ymm3,ymm3,ymm7
- vpaddd ymm5,ymm1,YMMWORD[32+rbp]
- vpaddd ymm6,ymm2,YMMWORD[64+rbp]
- vpaddd ymm7,ymm3,YMMWORD[96+rbp]
- vmovdqa YMMWORD[rsp],ymm4
- xor r14d,r14d
- vmovdqa YMMWORD[32+rsp],ymm5
- lea rsp,[((-64))+rsp]
- mov edi,ebx
- vmovdqa YMMWORD[rsp],ymm6
- xor edi,ecx
- vmovdqa YMMWORD[32+rsp],ymm7
- mov r12d,r9d
- sub rbp,-16*2*4
- jmp NEAR $L$avx2_00_47
-
-ALIGN 16
-$L$avx2_00_47:
- lea rsp,[((-64))+rsp]
- vpalignr ymm4,ymm1,ymm0,4
- add r11d,DWORD[((0+128))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- vpalignr ymm7,ymm3,ymm2,4
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- vpsrld ymm6,ymm4,7
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- vpaddd ymm0,ymm0,ymm7
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- vpsrld ymm7,ymm4,3
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- vpslld ymm5,ymm4,14
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- vpxor ymm4,ymm7,ymm6
- and edi,r15d
- xor r14d,r12d
- xor edi,ebx
- vpshufd ymm7,ymm3,250
- xor r14d,r13d
- lea r11d,[rdi*1+r11]
- mov r12d,r8d
- vpsrld ymm6,ymm6,11
- add r10d,DWORD[((4+128))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- vpxor ymm4,ymm4,ymm5
- rorx edi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- vpslld ymm5,ymm5,11
- andn r12d,edx,r9d
- xor r13d,edi
- rorx r14d,edx,6
- vpxor ymm4,ymm4,ymm6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov edi,r11d
- vpsrld ymm6,ymm7,10
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor edi,eax
- vpxor ymm4,ymm4,ymm5
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- vpsrlq ymm7,ymm7,17
- and r15d,edi
- xor r14d,r12d
- xor r15d,eax
- vpaddd ymm0,ymm0,ymm4
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- vpxor ymm6,ymm6,ymm7
- add r9d,DWORD[((8+128))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- vpshufb ymm6,ymm6,ymm8
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- vpaddd ymm0,ymm0,ymm6
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- vpshufd ymm7,ymm0,80
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- vpsrld ymm6,ymm7,10
- and edi,r15d
- xor r14d,r12d
- xor edi,r11d
- vpsrlq ymm7,ymm7,17
- xor r14d,r13d
- lea r9d,[rdi*1+r9]
- mov r12d,ecx
- vpxor ymm6,ymm6,ymm7
- add r8d,DWORD[((12+128))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- vpsrlq ymm7,ymm7,2
- rorx edi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ebx,edx
- xor r13d,edi
- rorx r14d,ebx,6
- vpshufb ymm6,ymm6,ymm9
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov edi,r9d
- vpaddd ymm0,ymm0,ymm6
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor edi,r10d
- vpaddd ymm6,ymm0,YMMWORD[rbp]
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- vmovdqa YMMWORD[rsp],ymm6
- vpalignr ymm4,ymm2,ymm1,4
- add edx,DWORD[((32+128))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- vpalignr ymm7,ymm0,ymm3,4
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- vpsrld ymm6,ymm4,7
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- vpaddd ymm1,ymm1,ymm7
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- vpsrld ymm7,ymm4,3
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- vpslld ymm5,ymm4,14
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- vpxor ymm4,ymm7,ymm6
- and edi,r15d
- xor r14d,r12d
- xor edi,r9d
- vpshufd ymm7,ymm0,250
- xor r14d,r13d
- lea edx,[rdi*1+rdx]
- mov r12d,eax
- vpsrld ymm6,ymm6,11
- add ecx,DWORD[((36+128))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- vpxor ymm4,ymm4,ymm5
- rorx edi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- vpslld ymm5,ymm5,11
- andn r12d,r11d,ebx
- xor r13d,edi
- rorx r14d,r11d,6
- vpxor ymm4,ymm4,ymm6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov edi,edx
- vpsrld ymm6,ymm7,10
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor edi,r8d
- vpxor ymm4,ymm4,ymm5
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- vpsrlq ymm7,ymm7,17
- and r15d,edi
- xor r14d,r12d
- xor r15d,r8d
- vpaddd ymm1,ymm1,ymm4
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- vpxor ymm6,ymm6,ymm7
- add ebx,DWORD[((40+128))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- vpshufb ymm6,ymm6,ymm8
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- vpaddd ymm1,ymm1,ymm6
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- vpshufd ymm7,ymm1,80
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- vpsrld ymm6,ymm7,10
- and edi,r15d
- xor r14d,r12d
- xor edi,edx
- vpsrlq ymm7,ymm7,17
- xor r14d,r13d
- lea ebx,[rdi*1+rbx]
- mov r12d,r10d
- vpxor ymm6,ymm6,ymm7
- add eax,DWORD[((44+128))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- vpsrlq ymm7,ymm7,2
- rorx edi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r9d,r11d
- xor r13d,edi
- rorx r14d,r9d,6
- vpshufb ymm6,ymm6,ymm9
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov edi,ebx
- vpaddd ymm1,ymm1,ymm6
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor edi,ecx
- vpaddd ymm6,ymm1,YMMWORD[32+rbp]
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,edi
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vmovdqa YMMWORD[32+rsp],ymm6
- lea rsp,[((-64))+rsp]
- vpalignr ymm4,ymm3,ymm2,4
- add r11d,DWORD[((0+128))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- vpalignr ymm7,ymm1,ymm0,4
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- vpsrld ymm6,ymm4,7
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- vpaddd ymm2,ymm2,ymm7
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- vpsrld ymm7,ymm4,3
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- vpslld ymm5,ymm4,14
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- vpxor ymm4,ymm7,ymm6
- and edi,r15d
- xor r14d,r12d
- xor edi,ebx
- vpshufd ymm7,ymm1,250
- xor r14d,r13d
- lea r11d,[rdi*1+r11]
- mov r12d,r8d
- vpsrld ymm6,ymm6,11
- add r10d,DWORD[((4+128))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- vpxor ymm4,ymm4,ymm5
- rorx edi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- vpslld ymm5,ymm5,11
- andn r12d,edx,r9d
- xor r13d,edi
- rorx r14d,edx,6
- vpxor ymm4,ymm4,ymm6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov edi,r11d
- vpsrld ymm6,ymm7,10
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor edi,eax
- vpxor ymm4,ymm4,ymm5
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- vpsrlq ymm7,ymm7,17
- and r15d,edi
- xor r14d,r12d
- xor r15d,eax
- vpaddd ymm2,ymm2,ymm4
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- vpxor ymm6,ymm6,ymm7
- add r9d,DWORD[((8+128))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- vpshufb ymm6,ymm6,ymm8
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- vpaddd ymm2,ymm2,ymm6
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- vpshufd ymm7,ymm2,80
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- vpsrld ymm6,ymm7,10
- and edi,r15d
- xor r14d,r12d
- xor edi,r11d
- vpsrlq ymm7,ymm7,17
- xor r14d,r13d
- lea r9d,[rdi*1+r9]
- mov r12d,ecx
- vpxor ymm6,ymm6,ymm7
- add r8d,DWORD[((12+128))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- vpsrlq ymm7,ymm7,2
- rorx edi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- vpxor ymm6,ymm6,ymm7
- andn r12d,ebx,edx
- xor r13d,edi
- rorx r14d,ebx,6
- vpshufb ymm6,ymm6,ymm9
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov edi,r9d
- vpaddd ymm2,ymm2,ymm6
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor edi,r10d
- vpaddd ymm6,ymm2,YMMWORD[64+rbp]
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- vmovdqa YMMWORD[rsp],ymm6
- vpalignr ymm4,ymm0,ymm3,4
- add edx,DWORD[((32+128))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- vpalignr ymm7,ymm2,ymm1,4
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- vpsrld ymm6,ymm4,7
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- vpaddd ymm3,ymm3,ymm7
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- vpsrld ymm7,ymm4,3
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- vpslld ymm5,ymm4,14
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- vpxor ymm4,ymm7,ymm6
- and edi,r15d
- xor r14d,r12d
- xor edi,r9d
- vpshufd ymm7,ymm2,250
- xor r14d,r13d
- lea edx,[rdi*1+rdx]
- mov r12d,eax
- vpsrld ymm6,ymm6,11
- add ecx,DWORD[((36+128))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- vpxor ymm4,ymm4,ymm5
- rorx edi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- vpslld ymm5,ymm5,11
- andn r12d,r11d,ebx
- xor r13d,edi
- rorx r14d,r11d,6
- vpxor ymm4,ymm4,ymm6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov edi,edx
- vpsrld ymm6,ymm7,10
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor edi,r8d
- vpxor ymm4,ymm4,ymm5
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- vpsrlq ymm7,ymm7,17
- and r15d,edi
- xor r14d,r12d
- xor r15d,r8d
- vpaddd ymm3,ymm3,ymm4
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- vpxor ymm6,ymm6,ymm7
- add ebx,DWORD[((40+128))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- vpsrlq ymm7,ymm7,2
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- vpshufb ymm6,ymm6,ymm8
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- vpaddd ymm3,ymm3,ymm6
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- vpshufd ymm7,ymm3,80
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- vpsrld ymm6,ymm7,10
- and edi,r15d
- xor r14d,r12d
- xor edi,edx
- vpsrlq ymm7,ymm7,17
- xor r14d,r13d
- lea ebx,[rdi*1+rbx]
- mov r12d,r10d
- vpxor ymm6,ymm6,ymm7
- add eax,DWORD[((44+128))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- vpsrlq ymm7,ymm7,2
- rorx edi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- vpxor ymm6,ymm6,ymm7
- andn r12d,r9d,r11d
- xor r13d,edi
- rorx r14d,r9d,6
- vpshufb ymm6,ymm6,ymm9
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov edi,ebx
- vpaddd ymm3,ymm3,ymm6
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor edi,ecx
- vpaddd ymm6,ymm3,YMMWORD[96+rbp]
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,edi
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- vmovdqa YMMWORD[32+rsp],ymm6
- lea rbp,[128+rbp]
- cmp BYTE[3+rbp],0
- jne NEAR $L$avx2_00_47
- add r11d,DWORD[((0+64))+rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and edi,r15d
- xor r14d,r12d
- xor edi,ebx
- xor r14d,r13d
- lea r11d,[rdi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[((4+64))+rsp]
- and r12d,edx
- rorx r13d,edx,25
- rorx edi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,edi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov edi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor edi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,edi
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[((8+64))+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and edi,r15d
- xor r14d,r12d
- xor edi,r11d
- xor r14d,r13d
- lea r9d,[rdi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[((12+64))+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx edi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,edi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov edi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor edi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[((32+64))+rsp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and edi,r15d
- xor r14d,r12d
- xor edi,r9d
- xor r14d,r13d
- lea edx,[rdi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[((36+64))+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx edi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,edi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov edi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor edi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[((40+64))+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and edi,r15d
- xor r14d,r12d
- xor edi,edx
- xor r14d,r13d
- lea ebx,[rdi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[((44+64))+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx edi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,edi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov edi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor edi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,edi
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- add r11d,DWORD[rsp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and edi,r15d
- xor r14d,r12d
- xor edi,ebx
- xor r14d,r13d
- lea r11d,[rdi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[4+rsp]
- and r12d,edx
- rorx r13d,edx,25
- rorx edi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,edi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov edi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor edi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,edi
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[8+rsp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and edi,r15d
- xor r14d,r12d
- xor edi,r11d
- xor r14d,r13d
- lea r9d,[rdi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[12+rsp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx edi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,edi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov edi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor edi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[32+rsp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and edi,r15d
- xor r14d,r12d
- xor edi,r9d
- xor r14d,r13d
- lea edx,[rdi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[36+rsp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx edi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,edi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov edi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor edi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[40+rsp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and edi,r15d
- xor r14d,r12d
- xor edi,edx
- xor r14d,r13d
- lea ebx,[rdi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[44+rsp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx edi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,edi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov edi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor edi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,edi
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- mov rdi,QWORD[512+rsp]
- add eax,r14d
-
- lea rbp,[448+rsp]
-
- add eax,DWORD[rdi]
- add ebx,DWORD[4+rdi]
- add ecx,DWORD[8+rdi]
- add edx,DWORD[12+rdi]
- add r8d,DWORD[16+rdi]
- add r9d,DWORD[20+rdi]
- add r10d,DWORD[24+rdi]
- add r11d,DWORD[28+rdi]
-
- mov DWORD[rdi],eax
- mov DWORD[4+rdi],ebx
- mov DWORD[8+rdi],ecx
- mov DWORD[12+rdi],edx
- mov DWORD[16+rdi],r8d
- mov DWORD[20+rdi],r9d
- mov DWORD[24+rdi],r10d
- mov DWORD[28+rdi],r11d
-
- cmp rsi,QWORD[80+rbp]
- je NEAR $L$done_avx2
-
- xor r14d,r14d
- mov edi,ebx
- xor edi,ecx
- mov r12d,r9d
- jmp NEAR $L$ower_avx2
-ALIGN 16
-$L$ower_avx2:
- add r11d,DWORD[((0+16))+rbp]
- and r12d,r8d
- rorx r13d,r8d,25
- rorx r15d,r8d,11
- lea eax,[r14*1+rax]
- lea r11d,[r12*1+r11]
- andn r12d,r8d,r10d
- xor r13d,r15d
- rorx r14d,r8d,6
- lea r11d,[r12*1+r11]
- xor r13d,r14d
- mov r15d,eax
- rorx r12d,eax,22
- lea r11d,[r13*1+r11]
- xor r15d,ebx
- rorx r14d,eax,13
- rorx r13d,eax,2
- lea edx,[r11*1+rdx]
- and edi,r15d
- xor r14d,r12d
- xor edi,ebx
- xor r14d,r13d
- lea r11d,[rdi*1+r11]
- mov r12d,r8d
- add r10d,DWORD[((4+16))+rbp]
- and r12d,edx
- rorx r13d,edx,25
- rorx edi,edx,11
- lea r11d,[r14*1+r11]
- lea r10d,[r12*1+r10]
- andn r12d,edx,r9d
- xor r13d,edi
- rorx r14d,edx,6
- lea r10d,[r12*1+r10]
- xor r13d,r14d
- mov edi,r11d
- rorx r12d,r11d,22
- lea r10d,[r13*1+r10]
- xor edi,eax
- rorx r14d,r11d,13
- rorx r13d,r11d,2
- lea ecx,[r10*1+rcx]
- and r15d,edi
- xor r14d,r12d
- xor r15d,eax
- xor r14d,r13d
- lea r10d,[r15*1+r10]
- mov r12d,edx
- add r9d,DWORD[((8+16))+rbp]
- and r12d,ecx
- rorx r13d,ecx,25
- rorx r15d,ecx,11
- lea r10d,[r14*1+r10]
- lea r9d,[r12*1+r9]
- andn r12d,ecx,r8d
- xor r13d,r15d
- rorx r14d,ecx,6
- lea r9d,[r12*1+r9]
- xor r13d,r14d
- mov r15d,r10d
- rorx r12d,r10d,22
- lea r9d,[r13*1+r9]
- xor r15d,r11d
- rorx r14d,r10d,13
- rorx r13d,r10d,2
- lea ebx,[r9*1+rbx]
- and edi,r15d
- xor r14d,r12d
- xor edi,r11d
- xor r14d,r13d
- lea r9d,[rdi*1+r9]
- mov r12d,ecx
- add r8d,DWORD[((12+16))+rbp]
- and r12d,ebx
- rorx r13d,ebx,25
- rorx edi,ebx,11
- lea r9d,[r14*1+r9]
- lea r8d,[r12*1+r8]
- andn r12d,ebx,edx
- xor r13d,edi
- rorx r14d,ebx,6
- lea r8d,[r12*1+r8]
- xor r13d,r14d
- mov edi,r9d
- rorx r12d,r9d,22
- lea r8d,[r13*1+r8]
- xor edi,r10d
- rorx r14d,r9d,13
- rorx r13d,r9d,2
- lea eax,[r8*1+rax]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r10d
- xor r14d,r13d
- lea r8d,[r15*1+r8]
- mov r12d,ebx
- add edx,DWORD[((32+16))+rbp]
- and r12d,eax
- rorx r13d,eax,25
- rorx r15d,eax,11
- lea r8d,[r14*1+r8]
- lea edx,[r12*1+rdx]
- andn r12d,eax,ecx
- xor r13d,r15d
- rorx r14d,eax,6
- lea edx,[r12*1+rdx]
- xor r13d,r14d
- mov r15d,r8d
- rorx r12d,r8d,22
- lea edx,[r13*1+rdx]
- xor r15d,r9d
- rorx r14d,r8d,13
- rorx r13d,r8d,2
- lea r11d,[rdx*1+r11]
- and edi,r15d
- xor r14d,r12d
- xor edi,r9d
- xor r14d,r13d
- lea edx,[rdi*1+rdx]
- mov r12d,eax
- add ecx,DWORD[((36+16))+rbp]
- and r12d,r11d
- rorx r13d,r11d,25
- rorx edi,r11d,11
- lea edx,[r14*1+rdx]
- lea ecx,[r12*1+rcx]
- andn r12d,r11d,ebx
- xor r13d,edi
- rorx r14d,r11d,6
- lea ecx,[r12*1+rcx]
- xor r13d,r14d
- mov edi,edx
- rorx r12d,edx,22
- lea ecx,[r13*1+rcx]
- xor edi,r8d
- rorx r14d,edx,13
- rorx r13d,edx,2
- lea r10d,[rcx*1+r10]
- and r15d,edi
- xor r14d,r12d
- xor r15d,r8d
- xor r14d,r13d
- lea ecx,[r15*1+rcx]
- mov r12d,r11d
- add ebx,DWORD[((40+16))+rbp]
- and r12d,r10d
- rorx r13d,r10d,25
- rorx r15d,r10d,11
- lea ecx,[r14*1+rcx]
- lea ebx,[r12*1+rbx]
- andn r12d,r10d,eax
- xor r13d,r15d
- rorx r14d,r10d,6
- lea ebx,[r12*1+rbx]
- xor r13d,r14d
- mov r15d,ecx
- rorx r12d,ecx,22
- lea ebx,[r13*1+rbx]
- xor r15d,edx
- rorx r14d,ecx,13
- rorx r13d,ecx,2
- lea r9d,[rbx*1+r9]
- and edi,r15d
- xor r14d,r12d
- xor edi,edx
- xor r14d,r13d
- lea ebx,[rdi*1+rbx]
- mov r12d,r10d
- add eax,DWORD[((44+16))+rbp]
- and r12d,r9d
- rorx r13d,r9d,25
- rorx edi,r9d,11
- lea ebx,[r14*1+rbx]
- lea eax,[r12*1+rax]
- andn r12d,r9d,r11d
- xor r13d,edi
- rorx r14d,r9d,6
- lea eax,[r12*1+rax]
- xor r13d,r14d
- mov edi,ebx
- rorx r12d,ebx,22
- lea eax,[r13*1+rax]
- xor edi,ecx
- rorx r14d,ebx,13
- rorx r13d,ebx,2
- lea r8d,[rax*1+r8]
- and r15d,edi
- xor r14d,r12d
- xor r15d,ecx
- xor r14d,r13d
- lea eax,[r15*1+rax]
- mov r12d,r9d
- lea rbp,[((-64))+rbp]
- cmp rbp,rsp
- jae NEAR $L$ower_avx2
-
- mov rdi,QWORD[512+rsp]
- add eax,r14d
-
- lea rsp,[448+rsp]
-
-
-
- add eax,DWORD[rdi]
- add ebx,DWORD[4+rdi]
- add ecx,DWORD[8+rdi]
- add edx,DWORD[12+rdi]
- add r8d,DWORD[16+rdi]
- add r9d,DWORD[20+rdi]
- lea rsi,[128+rsi]
- add r10d,DWORD[24+rdi]
- mov r12,rsi
- add r11d,DWORD[28+rdi]
- cmp rsi,QWORD[((64+16))+rsp]
-
- mov DWORD[rdi],eax
- cmove r12,rsp
- mov DWORD[4+rdi],ebx
- mov DWORD[8+rdi],ecx
- mov DWORD[12+rdi],edx
- mov DWORD[16+rdi],r8d
- mov DWORD[20+rdi],r9d
- mov DWORD[24+rdi],r10d
- mov DWORD[28+rdi],r11d
-
- jbe NEAR $L$oop_avx2
- lea rbp,[rsp]
-
-
-
-
-$L$done_avx2:
- mov rsi,QWORD[88+rbp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((64+32))+rbp]
- movaps xmm7,XMMWORD[((64+48))+rbp]
- movaps xmm8,XMMWORD[((64+64))+rbp]
- movaps xmm9,XMMWORD[((64+80))+rbp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha256_block_data_order_avx2:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
@@ -5568,13 +3181,6 @@ se_handler:
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$in_prologue
- lea r10,[$L$avx2_shortcut]
- cmp rbx,r10
- jb NEAR $L$not_in_avx2
-
- and rax,-256*4
- add rax,448
-$L$not_in_avx2:
mov rsi,rax
mov rax,QWORD[((64+24))+rax]
@@ -5682,12 +3288,6 @@ ALIGN 4
DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
- DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase
- DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase
- DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase
- DD $L$SEH_begin_sha256_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_end_sha256_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_info_sha256_block_data_order_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha256_block_data_order:
@@ -5701,11 +3301,3 @@ $L$SEH_info_sha256_block_data_order_ssse3:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
-$L$SEH_info_sha256_block_data_order_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_sha256_block_data_order_avx2:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm
index 5ddba53d1c5..f75e7fe2629 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/X64-MSFT/crypto/sha/sha512-x86_64.nasm
@@ -20,20 +20,6 @@ $L$SEH_begin_sha512_block_data_order:
- lea r11,[OPENSSL_ia32cap_P]
- mov r9d,DWORD[r11]
- mov r10d,DWORD[4+r11]
- mov r11d,DWORD[8+r11]
- test r10d,2048
- jnz NEAR $L$xop_shortcut
- and r11d,296
- cmp r11d,296
- je NEAR $L$avx2_shortcut
- and r9d,1073741824
- and r10d,268435968
- or r10d,r9d
- cmp r10d,1342177792
- je NEAR $L$avx_shortcut
mov rax,rsp
push rbx
@@ -1833,3833 +1819,110 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
DB 111,114,103,62,0
+EXTERN __imp_RtlVirtualUnwind
-ALIGN 64
-sha512_block_data_order_xop:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha512_block_data_order_xop:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
-$L$xop_shortcut:
- mov rax,rsp
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
- push rbx
+ mov rax,QWORD[152+r8]
- push rbp
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+ mov rsi,rax
+ mov rax,QWORD[((128+24))+rax]
- push r12
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
- push r13
+ lea r10,[$L$epilogue]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
- push r14
+ lea rsi,[((128+32))+rsi]
+ lea rdi,[512+r8]
+ mov ecx,12
+ DD 0xa548f3fc
- push r15
+$L$in_prologue:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
- shl rdx,4
- sub rsp,256
- lea rdx,[rdx*8+rsi]
- and rsp,-64
- mov QWORD[((128+0))+rsp],rdi
- mov QWORD[((128+8))+rsp],rsi
- mov QWORD[((128+16))+rsp],rdx
- mov QWORD[152+rsp],rax
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
- movaps XMMWORD[(128+32)+rsp],xmm6
- movaps XMMWORD[(128+48)+rsp],xmm7
- movaps XMMWORD[(128+64)+rsp],xmm8
- movaps XMMWORD[(128+80)+rsp],xmm9
- movaps XMMWORD[(128+96)+rsp],xmm10
- movaps XMMWORD[(128+112)+rsp],xmm11
-$L$prologue_xop:
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
- vzeroupper
- mov rax,QWORD[rdi]
- mov rbx,QWORD[8+rdi]
- mov rcx,QWORD[16+rdi]
- mov rdx,QWORD[24+rdi]
- mov r8,QWORD[32+rdi]
- mov r9,QWORD[40+rdi]
- mov r10,QWORD[48+rdi]
- mov r11,QWORD[56+rdi]
- jmp NEAR $L$loop_xop
-ALIGN 16
-$L$loop_xop:
- vmovdqa xmm11,XMMWORD[((K512+1280))]
- vmovdqu xmm0,XMMWORD[rsi]
- lea rbp,[((K512+128))]
- vmovdqu xmm1,XMMWORD[16+rsi]
- vmovdqu xmm2,XMMWORD[32+rsi]
- vpshufb xmm0,xmm0,xmm11
- vmovdqu xmm3,XMMWORD[48+rsi]
- vpshufb xmm1,xmm1,xmm11
- vmovdqu xmm4,XMMWORD[64+rsi]
- vpshufb xmm2,xmm2,xmm11
- vmovdqu xmm5,XMMWORD[80+rsi]
- vpshufb xmm3,xmm3,xmm11
- vmovdqu xmm6,XMMWORD[96+rsi]
- vpshufb xmm4,xmm4,xmm11
- vmovdqu xmm7,XMMWORD[112+rsi]
- vpshufb xmm5,xmm5,xmm11
- vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
- vpshufb xmm6,xmm6,xmm11
- vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
- vpshufb xmm7,xmm7,xmm11
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
- vmovdqa XMMWORD[rsp],xmm8
- vpaddq xmm8,xmm4,XMMWORD[rbp]
- vmovdqa XMMWORD[16+rsp],xmm9
- vpaddq xmm9,xmm5,XMMWORD[32+rbp]
- vmovdqa XMMWORD[32+rsp],xmm10
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- vmovdqa XMMWORD[48+rsp],xmm11
- vpaddq xmm11,xmm7,XMMWORD[96+rbp]
- vmovdqa XMMWORD[64+rsp],xmm8
- mov r14,rax
- vmovdqa XMMWORD[80+rsp],xmm9
- mov rdi,rbx
- vmovdqa XMMWORD[96+rsp],xmm10
- xor rdi,rcx
- vmovdqa XMMWORD[112+rsp],xmm11
- mov r13,r8
- jmp NEAR $L$xop_00_47
-
-ALIGN 16
-$L$xop_00_47:
- add rbp,256
- vpalignr xmm8,xmm1,xmm0,8
- ror r13,23
- mov rax,r14
- vpalignr xmm11,xmm5,xmm4,8
- mov r12,r9
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r8
- xor r12,r10
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rax
- vpaddq xmm0,xmm0,xmm11
- and r12,r8
- xor r13,r8
- add r11,QWORD[rsp]
- mov r15,rax
-DB 143,72,120,195,209,7
- xor r12,r10
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,223,3
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- ror r14,28
- vpsrlq xmm10,xmm7,6
- add rdx,r11
- add r11,rdi
- vpaddq xmm0,xmm0,xmm8
- mov r13,rdx
- add r14,r11
-DB 143,72,120,195,203,42
- ror r13,23
- mov r11,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- vpaddq xmm0,xmm0,xmm11
- add r10,QWORD[8+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[rsp],xmm10
- vpalignr xmm8,xmm2,xmm1,8
- ror r13,23
- mov r10,r14
- vpalignr xmm11,xmm6,xmm5,8
- mov r12,rdx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rcx
- xor r12,r8
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r10
- vpaddq xmm1,xmm1,xmm11
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[16+rsp]
- mov r15,r10
-DB 143,72,120,195,209,7
- xor r12,r8
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,216,3
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- ror r14,28
- vpsrlq xmm10,xmm0,6
- add rbx,r9
- add r9,rdi
- vpaddq xmm1,xmm1,xmm8
- mov r13,rbx
- add r14,r9
-DB 143,72,120,195,203,42
- ror r13,23
- mov r9,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- vpaddq xmm1,xmm1,xmm11
- add r8,QWORD[24+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[16+rsp],xmm10
- vpalignr xmm8,xmm3,xmm2,8
- ror r13,23
- mov r8,r14
- vpalignr xmm11,xmm7,xmm6,8
- mov r12,rbx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rax
- xor r12,rcx
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r8
- vpaddq xmm2,xmm2,xmm11
- and r12,rax
- xor r13,rax
- add rdx,QWORD[32+rsp]
- mov r15,r8
-DB 143,72,120,195,209,7
- xor r12,rcx
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,217,3
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- ror r14,28
- vpsrlq xmm10,xmm1,6
- add r11,rdx
- add rdx,rdi
- vpaddq xmm2,xmm2,xmm8
- mov r13,r11
- add r14,rdx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rdx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- vpaddq xmm2,xmm2,xmm11
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[32+rsp],xmm10
- vpalignr xmm8,xmm4,xmm3,8
- ror r13,23
- mov rcx,r14
- vpalignr xmm11,xmm0,xmm7,8
- mov r12,r11
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r10
- xor r12,rax
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rcx
- vpaddq xmm3,xmm3,xmm11
- and r12,r10
- xor r13,r10
- add rbx,QWORD[48+rsp]
- mov r15,rcx
-DB 143,72,120,195,209,7
- xor r12,rax
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,218,3
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- ror r14,28
- vpsrlq xmm10,xmm2,6
- add r9,rbx
- add rbx,rdi
- vpaddq xmm3,xmm3,xmm8
- mov r13,r9
- add r14,rbx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rbx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- vpaddq xmm3,xmm3,xmm11
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[48+rsp],xmm10
- vpalignr xmm8,xmm5,xmm4,8
- ror r13,23
- mov rax,r14
- vpalignr xmm11,xmm1,xmm0,8
- mov r12,r9
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r8
- xor r12,r10
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rax
- vpaddq xmm4,xmm4,xmm11
- and r12,r8
- xor r13,r8
- add r11,QWORD[64+rsp]
- mov r15,rax
-DB 143,72,120,195,209,7
- xor r12,r10
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,219,3
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- ror r14,28
- vpsrlq xmm10,xmm3,6
- add rdx,r11
- add r11,rdi
- vpaddq xmm4,xmm4,xmm8
- mov r13,rdx
- add r14,r11
-DB 143,72,120,195,203,42
- ror r13,23
- mov r11,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- vpaddq xmm4,xmm4,xmm11
- add r10,QWORD[72+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- vpaddq xmm10,xmm4,XMMWORD[rbp]
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[64+rsp],xmm10
- vpalignr xmm8,xmm6,xmm5,8
- ror r13,23
- mov r10,r14
- vpalignr xmm11,xmm2,xmm1,8
- mov r12,rdx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rcx
- xor r12,r8
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r10
- vpaddq xmm5,xmm5,xmm11
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[80+rsp]
- mov r15,r10
-DB 143,72,120,195,209,7
- xor r12,r8
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,220,3
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- ror r14,28
- vpsrlq xmm10,xmm4,6
- add rbx,r9
- add r9,rdi
- vpaddq xmm5,xmm5,xmm8
- mov r13,rbx
- add r14,r9
-DB 143,72,120,195,203,42
- ror r13,23
- mov r9,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- vpaddq xmm5,xmm5,xmm11
- add r8,QWORD[88+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- vpaddq xmm10,xmm5,XMMWORD[32+rbp]
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[80+rsp],xmm10
- vpalignr xmm8,xmm7,xmm6,8
- ror r13,23
- mov r8,r14
- vpalignr xmm11,xmm3,xmm2,8
- mov r12,rbx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rax
- xor r12,rcx
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r8
- vpaddq xmm6,xmm6,xmm11
- and r12,rax
- xor r13,rax
- add rdx,QWORD[96+rsp]
- mov r15,r8
-DB 143,72,120,195,209,7
- xor r12,rcx
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,221,3
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- ror r14,28
- vpsrlq xmm10,xmm5,6
- add r11,rdx
- add rdx,rdi
- vpaddq xmm6,xmm6,xmm8
- mov r13,r11
- add r14,rdx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rdx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- vpaddq xmm6,xmm6,xmm11
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[96+rsp],xmm10
- vpalignr xmm8,xmm0,xmm7,8
- ror r13,23
- mov rcx,r14
- vpalignr xmm11,xmm4,xmm3,8
- mov r12,r11
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r10
- xor r12,rax
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rcx
- vpaddq xmm7,xmm7,xmm11
- and r12,r10
- xor r13,r10
- add rbx,QWORD[112+rsp]
- mov r15,rcx
-DB 143,72,120,195,209,7
- xor r12,rax
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,222,3
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- ror r14,28
- vpsrlq xmm10,xmm6,6
- add r9,rbx
- add rbx,rdi
- vpaddq xmm7,xmm7,xmm8
- mov r13,r9
- add r14,rbx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rbx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- vpaddq xmm7,xmm7,xmm11
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- vpaddq xmm10,xmm7,XMMWORD[96+rbp]
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[112+rsp],xmm10
- cmp BYTE[135+rbp],0
- jne NEAR $L$xop_00_47
- ror r13,23
- mov rax,r14
- mov r12,r9
- ror r14,5
- xor r13,r8
- xor r12,r10
- ror r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[rsp]
- mov r15,rax
- xor r12,r10
- ror r14,6
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- ror r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- ror r13,23
- mov r11,r14
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[8+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- ror r13,23
- mov r10,r14
- mov r12,rdx
- ror r14,5
- xor r13,rcx
- xor r12,r8
- ror r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[16+rsp]
- mov r15,r10
- xor r12,r8
- ror r14,6
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- ror r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- ror r13,23
- mov r9,r14
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[24+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- ror r13,23
- mov r8,r14
- mov r12,rbx
- ror r14,5
- xor r13,rax
- xor r12,rcx
- ror r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[32+rsp]
- mov r15,r8
- xor r12,rcx
- ror r14,6
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- ror r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- ror r13,23
- mov rdx,r14
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- ror r13,23
- mov rcx,r14
- mov r12,r11
- ror r14,5
- xor r13,r10
- xor r12,rax
- ror r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[48+rsp]
- mov r15,rcx
- xor r12,rax
- ror r14,6
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- ror r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- ror r13,23
- mov rbx,r14
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- ror r13,23
- mov rax,r14
- mov r12,r9
- ror r14,5
- xor r13,r8
- xor r12,r10
- ror r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[64+rsp]
- mov r15,rax
- xor r12,r10
- ror r14,6
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- ror r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- ror r13,23
- mov r11,r14
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[72+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- ror r13,23
- mov r10,r14
- mov r12,rdx
- ror r14,5
- xor r13,rcx
- xor r12,r8
- ror r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[80+rsp]
- mov r15,r10
- xor r12,r8
- ror r14,6
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- ror r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- ror r13,23
- mov r9,r14
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[88+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- ror r13,23
- mov r8,r14
- mov r12,rbx
- ror r14,5
- xor r13,rax
- xor r12,rcx
- ror r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[96+rsp]
- mov r15,r8
- xor r12,rcx
- ror r14,6
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- ror r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- ror r13,23
- mov rdx,r14
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- ror r13,23
- mov rcx,r14
- mov r12,r11
- ror r14,5
- xor r13,r10
- xor r12,rax
- ror r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[112+rsp]
- mov r15,rcx
- xor r12,rax
- ror r14,6
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- ror r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- ror r13,23
- mov rbx,r14
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- mov rdi,QWORD[((128+0))+rsp]
- mov rax,r14
-
- add rax,QWORD[rdi]
- lea rsi,[128+rsi]
- add rbx,QWORD[8+rdi]
- add rcx,QWORD[16+rdi]
- add rdx,QWORD[24+rdi]
- add r8,QWORD[32+rdi]
- add r9,QWORD[40+rdi]
- add r10,QWORD[48+rdi]
- add r11,QWORD[56+rdi]
-
- cmp rsi,QWORD[((128+16))+rsp]
-
- mov QWORD[rdi],rax
- mov QWORD[8+rdi],rbx
- mov QWORD[16+rdi],rcx
- mov QWORD[24+rdi],rdx
- mov QWORD[32+rdi],r8
- mov QWORD[40+rdi],r9
- mov QWORD[48+rdi],r10
- mov QWORD[56+rdi],r11
- jb NEAR $L$loop_xop
-
- mov rsi,QWORD[152+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((128+32))+rsp]
- movaps xmm7,XMMWORD[((128+48))+rsp]
- movaps xmm8,XMMWORD[((128+64))+rsp]
- movaps xmm9,XMMWORD[((128+80))+rsp]
- movaps xmm10,XMMWORD[((128+96))+rsp]
- movaps xmm11,XMMWORD[((128+112))+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_xop:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha512_block_data_order_xop:
-
-ALIGN 64
-sha512_block_data_order_avx:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha512_block_data_order_avx:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$avx_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- shl rdx,4
- sub rsp,256
- lea rdx,[rdx*8+rsi]
- and rsp,-64
- mov QWORD[((128+0))+rsp],rdi
- mov QWORD[((128+8))+rsp],rsi
- mov QWORD[((128+16))+rsp],rdx
- mov QWORD[152+rsp],rax
-
- movaps XMMWORD[(128+32)+rsp],xmm6
- movaps XMMWORD[(128+48)+rsp],xmm7
- movaps XMMWORD[(128+64)+rsp],xmm8
- movaps XMMWORD[(128+80)+rsp],xmm9
- movaps XMMWORD[(128+96)+rsp],xmm10
- movaps XMMWORD[(128+112)+rsp],xmm11
-$L$prologue_avx:
-
- vzeroupper
- mov rax,QWORD[rdi]
- mov rbx,QWORD[8+rdi]
- mov rcx,QWORD[16+rdi]
- mov rdx,QWORD[24+rdi]
- mov r8,QWORD[32+rdi]
- mov r9,QWORD[40+rdi]
- mov r10,QWORD[48+rdi]
- mov r11,QWORD[56+rdi]
- jmp NEAR $L$loop_avx
-ALIGN 16
-$L$loop_avx:
- vmovdqa xmm11,XMMWORD[((K512+1280))]
- vmovdqu xmm0,XMMWORD[rsi]
- lea rbp,[((K512+128))]
- vmovdqu xmm1,XMMWORD[16+rsi]
- vmovdqu xmm2,XMMWORD[32+rsi]
- vpshufb xmm0,xmm0,xmm11
- vmovdqu xmm3,XMMWORD[48+rsi]
- vpshufb xmm1,xmm1,xmm11
- vmovdqu xmm4,XMMWORD[64+rsi]
- vpshufb xmm2,xmm2,xmm11
- vmovdqu xmm5,XMMWORD[80+rsi]
- vpshufb xmm3,xmm3,xmm11
- vmovdqu xmm6,XMMWORD[96+rsi]
- vpshufb xmm4,xmm4,xmm11
- vmovdqu xmm7,XMMWORD[112+rsi]
- vpshufb xmm5,xmm5,xmm11
- vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
- vpshufb xmm6,xmm6,xmm11
- vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
- vpshufb xmm7,xmm7,xmm11
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
- vmovdqa XMMWORD[rsp],xmm8
- vpaddq xmm8,xmm4,XMMWORD[rbp]
- vmovdqa XMMWORD[16+rsp],xmm9
- vpaddq xmm9,xmm5,XMMWORD[32+rbp]
- vmovdqa XMMWORD[32+rsp],xmm10
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- vmovdqa XMMWORD[48+rsp],xmm11
- vpaddq xmm11,xmm7,XMMWORD[96+rbp]
- vmovdqa XMMWORD[64+rsp],xmm8
- mov r14,rax
- vmovdqa XMMWORD[80+rsp],xmm9
- mov rdi,rbx
- vmovdqa XMMWORD[96+rsp],xmm10
- xor rdi,rcx
- vmovdqa XMMWORD[112+rsp],xmm11
- mov r13,r8
- jmp NEAR $L$avx_00_47
-
-ALIGN 16
-$L$avx_00_47:
- add rbp,256
- vpalignr xmm8,xmm1,xmm0,8
- shrd r13,r13,23
- mov rax,r14
- vpalignr xmm11,xmm5,xmm4,8
- mov r12,r9
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,r8
- xor r12,r10
- vpaddq xmm0,xmm0,xmm11
- shrd r13,r13,4
- xor r14,rax
- vpsrlq xmm11,xmm8,7
- and r12,r8
- xor r13,r8
- vpsllq xmm9,xmm8,56
- add r11,QWORD[rsp]
- mov r15,rax
- vpxor xmm8,xmm11,xmm10
- xor r12,r10
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,rbx
- add r11,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- shrd r14,r14,28
- vpsrlq xmm11,xmm7,6
- add rdx,r11
- add r11,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,rdx
- add r14,r11
- vpsllq xmm10,xmm7,3
- shrd r13,r13,23
- mov r11,r14
- vpaddq xmm0,xmm0,xmm8
- mov r12,r8
- shrd r14,r14,5
- vpsrlq xmm9,xmm7,19
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,r11
- vpsllq xmm10,xmm10,42
- and r12,rdx
- xor r13,rdx
- vpxor xmm11,xmm11,xmm9
- add r10,QWORD[8+rsp]
- mov rdi,r11
- vpsrlq xmm9,xmm9,42
- xor r12,r9
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,rax
- add r10,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm0,xmm0,xmm11
- xor r14,r11
- add r10,r13
- vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
- xor r15,rax
- shrd r14,r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[rsp],xmm10
- vpalignr xmm8,xmm2,xmm1,8
- shrd r13,r13,23
- mov r10,r14
- vpalignr xmm11,xmm6,xmm5,8
- mov r12,rdx
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,rcx
- xor r12,r8
- vpaddq xmm1,xmm1,xmm11
- shrd r13,r13,4
- xor r14,r10
- vpsrlq xmm11,xmm8,7
- and r12,rcx
- xor r13,rcx
- vpsllq xmm9,xmm8,56
- add r9,QWORD[16+rsp]
- mov r15,r10
- vpxor xmm8,xmm11,xmm10
- xor r12,r8
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,r11
- add r9,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- shrd r14,r14,28
- vpsrlq xmm11,xmm0,6
- add rbx,r9
- add r9,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,rbx
- add r14,r9
- vpsllq xmm10,xmm0,3
- shrd r13,r13,23
- mov r9,r14
- vpaddq xmm1,xmm1,xmm8
- mov r12,rcx
- shrd r14,r14,5
- vpsrlq xmm9,xmm0,19
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,r9
- vpsllq xmm10,xmm10,42
- and r12,rbx
- xor r13,rbx
- vpxor xmm11,xmm11,xmm9
- add r8,QWORD[24+rsp]
- mov rdi,r9
- vpsrlq xmm9,xmm9,42
- xor r12,rdx
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,r10
- add r8,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm1,xmm1,xmm11
- xor r14,r9
- add r8,r13
- vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
- xor r15,r10
- shrd r14,r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[16+rsp],xmm10
- vpalignr xmm8,xmm3,xmm2,8
- shrd r13,r13,23
- mov r8,r14
- vpalignr xmm11,xmm7,xmm6,8
- mov r12,rbx
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,rax
- xor r12,rcx
- vpaddq xmm2,xmm2,xmm11
- shrd r13,r13,4
- xor r14,r8
- vpsrlq xmm11,xmm8,7
- and r12,rax
- xor r13,rax
- vpsllq xmm9,xmm8,56
- add rdx,QWORD[32+rsp]
- mov r15,r8
- vpxor xmm8,xmm11,xmm10
- xor r12,rcx
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,r9
- add rdx,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- shrd r14,r14,28
- vpsrlq xmm11,xmm1,6
- add r11,rdx
- add rdx,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,r11
- add r14,rdx
- vpsllq xmm10,xmm1,3
- shrd r13,r13,23
- mov rdx,r14
- vpaddq xmm2,xmm2,xmm8
- mov r12,rax
- shrd r14,r14,5
- vpsrlq xmm9,xmm1,19
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,rdx
- vpsllq xmm10,xmm10,42
- and r12,r11
- xor r13,r11
- vpxor xmm11,xmm11,xmm9
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- vpsrlq xmm9,xmm9,42
- xor r12,rbx
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,r8
- add rcx,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm2,xmm2,xmm11
- xor r14,rdx
- add rcx,r13
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- xor r15,r8
- shrd r14,r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[32+rsp],xmm10
- vpalignr xmm8,xmm4,xmm3,8
- shrd r13,r13,23
- mov rcx,r14
- vpalignr xmm11,xmm0,xmm7,8
- mov r12,r11
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,r10
- xor r12,rax
- vpaddq xmm3,xmm3,xmm11
- shrd r13,r13,4
- xor r14,rcx
- vpsrlq xmm11,xmm8,7
- and r12,r10
- xor r13,r10
- vpsllq xmm9,xmm8,56
- add rbx,QWORD[48+rsp]
- mov r15,rcx
- vpxor xmm8,xmm11,xmm10
- xor r12,rax
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,rdx
- add rbx,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- shrd r14,r14,28
- vpsrlq xmm11,xmm2,6
- add r9,rbx
- add rbx,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,r9
- add r14,rbx
- vpsllq xmm10,xmm2,3
- shrd r13,r13,23
- mov rbx,r14
- vpaddq xmm3,xmm3,xmm8
- mov r12,r10
- shrd r14,r14,5
- vpsrlq xmm9,xmm2,19
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,rbx
- vpsllq xmm10,xmm10,42
- and r12,r9
- xor r13,r9
- vpxor xmm11,xmm11,xmm9
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- vpsrlq xmm9,xmm9,42
- xor r12,r11
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,rcx
- add rax,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm3,xmm3,xmm11
- xor r14,rbx
- add rax,r13
- vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
- xor r15,rcx
- shrd r14,r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[48+rsp],xmm10
- vpalignr xmm8,xmm5,xmm4,8
- shrd r13,r13,23
- mov rax,r14
- vpalignr xmm11,xmm1,xmm0,8
- mov r12,r9
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,r8
- xor r12,r10
- vpaddq xmm4,xmm4,xmm11
- shrd r13,r13,4
- xor r14,rax
- vpsrlq xmm11,xmm8,7
- and r12,r8
- xor r13,r8
- vpsllq xmm9,xmm8,56
- add r11,QWORD[64+rsp]
- mov r15,rax
- vpxor xmm8,xmm11,xmm10
- xor r12,r10
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,rbx
- add r11,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- shrd r14,r14,28
- vpsrlq xmm11,xmm3,6
- add rdx,r11
- add r11,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,rdx
- add r14,r11
- vpsllq xmm10,xmm3,3
- shrd r13,r13,23
- mov r11,r14
- vpaddq xmm4,xmm4,xmm8
- mov r12,r8
- shrd r14,r14,5
- vpsrlq xmm9,xmm3,19
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,r11
- vpsllq xmm10,xmm10,42
- and r12,rdx
- xor r13,rdx
- vpxor xmm11,xmm11,xmm9
- add r10,QWORD[72+rsp]
- mov rdi,r11
- vpsrlq xmm9,xmm9,42
- xor r12,r9
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,rax
- add r10,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm4,xmm4,xmm11
- xor r14,r11
- add r10,r13
- vpaddq xmm10,xmm4,XMMWORD[rbp]
- xor r15,rax
- shrd r14,r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[64+rsp],xmm10
- vpalignr xmm8,xmm6,xmm5,8
- shrd r13,r13,23
- mov r10,r14
- vpalignr xmm11,xmm2,xmm1,8
- mov r12,rdx
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,rcx
- xor r12,r8
- vpaddq xmm5,xmm5,xmm11
- shrd r13,r13,4
- xor r14,r10
- vpsrlq xmm11,xmm8,7
- and r12,rcx
- xor r13,rcx
- vpsllq xmm9,xmm8,56
- add r9,QWORD[80+rsp]
- mov r15,r10
- vpxor xmm8,xmm11,xmm10
- xor r12,r8
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,r11
- add r9,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- shrd r14,r14,28
- vpsrlq xmm11,xmm4,6
- add rbx,r9
- add r9,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,rbx
- add r14,r9
- vpsllq xmm10,xmm4,3
- shrd r13,r13,23
- mov r9,r14
- vpaddq xmm5,xmm5,xmm8
- mov r12,rcx
- shrd r14,r14,5
- vpsrlq xmm9,xmm4,19
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,r9
- vpsllq xmm10,xmm10,42
- and r12,rbx
- xor r13,rbx
- vpxor xmm11,xmm11,xmm9
- add r8,QWORD[88+rsp]
- mov rdi,r9
- vpsrlq xmm9,xmm9,42
- xor r12,rdx
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,r10
- add r8,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm5,xmm5,xmm11
- xor r14,r9
- add r8,r13
- vpaddq xmm10,xmm5,XMMWORD[32+rbp]
- xor r15,r10
- shrd r14,r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[80+rsp],xmm10
- vpalignr xmm8,xmm7,xmm6,8
- shrd r13,r13,23
- mov r8,r14
- vpalignr xmm11,xmm3,xmm2,8
- mov r12,rbx
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,rax
- xor r12,rcx
- vpaddq xmm6,xmm6,xmm11
- shrd r13,r13,4
- xor r14,r8
- vpsrlq xmm11,xmm8,7
- and r12,rax
- xor r13,rax
- vpsllq xmm9,xmm8,56
- add rdx,QWORD[96+rsp]
- mov r15,r8
- vpxor xmm8,xmm11,xmm10
- xor r12,rcx
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,r9
- add rdx,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- shrd r14,r14,28
- vpsrlq xmm11,xmm5,6
- add r11,rdx
- add rdx,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,r11
- add r14,rdx
- vpsllq xmm10,xmm5,3
- shrd r13,r13,23
- mov rdx,r14
- vpaddq xmm6,xmm6,xmm8
- mov r12,rax
- shrd r14,r14,5
- vpsrlq xmm9,xmm5,19
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,rdx
- vpsllq xmm10,xmm10,42
- and r12,r11
- xor r13,r11
- vpxor xmm11,xmm11,xmm9
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- vpsrlq xmm9,xmm9,42
- xor r12,rbx
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,r8
- add rcx,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm6,xmm6,xmm11
- xor r14,rdx
- add rcx,r13
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- xor r15,r8
- shrd r14,r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[96+rsp],xmm10
- vpalignr xmm8,xmm0,xmm7,8
- shrd r13,r13,23
- mov rcx,r14
- vpalignr xmm11,xmm4,xmm3,8
- mov r12,r11
- shrd r14,r14,5
- vpsrlq xmm10,xmm8,1
- xor r13,r10
- xor r12,rax
- vpaddq xmm7,xmm7,xmm11
- shrd r13,r13,4
- xor r14,rcx
- vpsrlq xmm11,xmm8,7
- and r12,r10
- xor r13,r10
- vpsllq xmm9,xmm8,56
- add rbx,QWORD[112+rsp]
- mov r15,rcx
- vpxor xmm8,xmm11,xmm10
- xor r12,rax
- shrd r14,r14,6
- vpsrlq xmm10,xmm10,7
- xor r15,rdx
- add rbx,r12
- vpxor xmm8,xmm8,xmm9
- shrd r13,r13,14
- and rdi,r15
- vpsllq xmm9,xmm9,7
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- shrd r14,r14,28
- vpsrlq xmm11,xmm6,6
- add r9,rbx
- add rbx,rdi
- vpxor xmm8,xmm8,xmm9
- mov r13,r9
- add r14,rbx
- vpsllq xmm10,xmm6,3
- shrd r13,r13,23
- mov rbx,r14
- vpaddq xmm7,xmm7,xmm8
- mov r12,r10
- shrd r14,r14,5
- vpsrlq xmm9,xmm6,19
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm10
- shrd r13,r13,4
- xor r14,rbx
- vpsllq xmm10,xmm10,42
- and r12,r9
- xor r13,r9
- vpxor xmm11,xmm11,xmm9
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- vpsrlq xmm9,xmm9,42
- xor r12,r11
- shrd r14,r14,6
- vpxor xmm11,xmm11,xmm10
- xor rdi,rcx
- add rax,r12
- vpxor xmm11,xmm11,xmm9
- shrd r13,r13,14
- and r15,rdi
- vpaddq xmm7,xmm7,xmm11
- xor r14,rbx
- add rax,r13
- vpaddq xmm10,xmm7,XMMWORD[96+rbp]
- xor r15,rcx
- shrd r14,r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[112+rsp],xmm10
- cmp BYTE[135+rbp],0
- jne NEAR $L$avx_00_47
- shrd r13,r13,23
- mov rax,r14
- mov r12,r9
- shrd r14,r14,5
- xor r13,r8
- xor r12,r10
- shrd r13,r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[rsp]
- mov r15,rax
- xor r12,r10
- shrd r14,r14,6
- xor r15,rbx
- add r11,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- shrd r14,r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- shrd r13,r13,23
- mov r11,r14
- mov r12,r8
- shrd r14,r14,5
- xor r13,rdx
- xor r12,r9
- shrd r13,r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[8+rsp]
- mov rdi,r11
- xor r12,r9
- shrd r14,r14,6
- xor rdi,rax
- add r10,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- shrd r14,r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- shrd r13,r13,23
- mov r10,r14
- mov r12,rdx
- shrd r14,r14,5
- xor r13,rcx
- xor r12,r8
- shrd r13,r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[16+rsp]
- mov r15,r10
- xor r12,r8
- shrd r14,r14,6
- xor r15,r11
- add r9,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- shrd r14,r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- shrd r13,r13,23
- mov r9,r14
- mov r12,rcx
- shrd r14,r14,5
- xor r13,rbx
- xor r12,rdx
- shrd r13,r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[24+rsp]
- mov rdi,r9
- xor r12,rdx
- shrd r14,r14,6
- xor rdi,r10
- add r8,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- shrd r14,r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- shrd r13,r13,23
- mov r8,r14
- mov r12,rbx
- shrd r14,r14,5
- xor r13,rax
- xor r12,rcx
- shrd r13,r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[32+rsp]
- mov r15,r8
- xor r12,rcx
- shrd r14,r14,6
- xor r15,r9
- add rdx,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- shrd r14,r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- shrd r13,r13,23
- mov rdx,r14
- mov r12,rax
- shrd r14,r14,5
- xor r13,r11
- xor r12,rbx
- shrd r13,r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- xor r12,rbx
- shrd r14,r14,6
- xor rdi,r8
- add rcx,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- shrd r14,r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- shrd r13,r13,23
- mov rcx,r14
- mov r12,r11
- shrd r14,r14,5
- xor r13,r10
- xor r12,rax
- shrd r13,r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[48+rsp]
- mov r15,rcx
- xor r12,rax
- shrd r14,r14,6
- xor r15,rdx
- add rbx,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- shrd r14,r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- shrd r13,r13,23
- mov rbx,r14
- mov r12,r10
- shrd r14,r14,5
- xor r13,r9
- xor r12,r11
- shrd r13,r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- xor r12,r11
- shrd r14,r14,6
- xor rdi,rcx
- add rax,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- shrd r14,r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- shrd r13,r13,23
- mov rax,r14
- mov r12,r9
- shrd r14,r14,5
- xor r13,r8
- xor r12,r10
- shrd r13,r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[64+rsp]
- mov r15,rax
- xor r12,r10
- shrd r14,r14,6
- xor r15,rbx
- add r11,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- shrd r14,r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- shrd r13,r13,23
- mov r11,r14
- mov r12,r8
- shrd r14,r14,5
- xor r13,rdx
- xor r12,r9
- shrd r13,r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[72+rsp]
- mov rdi,r11
- xor r12,r9
- shrd r14,r14,6
- xor rdi,rax
- add r10,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- shrd r14,r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- shrd r13,r13,23
- mov r10,r14
- mov r12,rdx
- shrd r14,r14,5
- xor r13,rcx
- xor r12,r8
- shrd r13,r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[80+rsp]
- mov r15,r10
- xor r12,r8
- shrd r14,r14,6
- xor r15,r11
- add r9,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- shrd r14,r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- shrd r13,r13,23
- mov r9,r14
- mov r12,rcx
- shrd r14,r14,5
- xor r13,rbx
- xor r12,rdx
- shrd r13,r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[88+rsp]
- mov rdi,r9
- xor r12,rdx
- shrd r14,r14,6
- xor rdi,r10
- add r8,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- shrd r14,r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- shrd r13,r13,23
- mov r8,r14
- mov r12,rbx
- shrd r14,r14,5
- xor r13,rax
- xor r12,rcx
- shrd r13,r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[96+rsp]
- mov r15,r8
- xor r12,rcx
- shrd r14,r14,6
- xor r15,r9
- add rdx,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- shrd r14,r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- shrd r13,r13,23
- mov rdx,r14
- mov r12,rax
- shrd r14,r14,5
- xor r13,r11
- xor r12,rbx
- shrd r13,r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- xor r12,rbx
- shrd r14,r14,6
- xor rdi,r8
- add rcx,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- shrd r14,r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- shrd r13,r13,23
- mov rcx,r14
- mov r12,r11
- shrd r14,r14,5
- xor r13,r10
- xor r12,rax
- shrd r13,r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[112+rsp]
- mov r15,rcx
- xor r12,rax
- shrd r14,r14,6
- xor r15,rdx
- add rbx,r12
- shrd r13,r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- shrd r14,r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- shrd r13,r13,23
- mov rbx,r14
- mov r12,r10
- shrd r14,r14,5
- xor r13,r9
- xor r12,r11
- shrd r13,r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- xor r12,r11
- shrd r14,r14,6
- xor rdi,rcx
- add rax,r12
- shrd r13,r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- shrd r14,r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- mov rdi,QWORD[((128+0))+rsp]
- mov rax,r14
-
- add rax,QWORD[rdi]
- lea rsi,[128+rsi]
- add rbx,QWORD[8+rdi]
- add rcx,QWORD[16+rdi]
- add rdx,QWORD[24+rdi]
- add r8,QWORD[32+rdi]
- add r9,QWORD[40+rdi]
- add r10,QWORD[48+rdi]
- add r11,QWORD[56+rdi]
-
- cmp rsi,QWORD[((128+16))+rsp]
-
- mov QWORD[rdi],rax
- mov QWORD[8+rdi],rbx
- mov QWORD[16+rdi],rcx
- mov QWORD[24+rdi],rdx
- mov QWORD[32+rdi],r8
- mov QWORD[40+rdi],r9
- mov QWORD[48+rdi],r10
- mov QWORD[56+rdi],r11
- jb NEAR $L$loop_avx
-
- mov rsi,QWORD[152+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((128+32))+rsp]
- movaps xmm7,XMMWORD[((128+48))+rsp]
- movaps xmm8,XMMWORD[((128+64))+rsp]
- movaps xmm9,XMMWORD[((128+80))+rsp]
- movaps xmm10,XMMWORD[((128+96))+rsp]
- movaps xmm11,XMMWORD[((128+112))+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha512_block_data_order_avx:
-
-ALIGN 64
-sha512_block_data_order_avx2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha512_block_data_order_avx2:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$avx2_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- sub rsp,1408
- shl rdx,4
- and rsp,-256*8
- lea rdx,[rdx*8+rsi]
- add rsp,1152
- mov QWORD[((128+0))+rsp],rdi
- mov QWORD[((128+8))+rsp],rsi
- mov QWORD[((128+16))+rsp],rdx
- mov QWORD[152+rsp],rax
-
- movaps XMMWORD[(128+32)+rsp],xmm6
- movaps XMMWORD[(128+48)+rsp],xmm7
- movaps XMMWORD[(128+64)+rsp],xmm8
- movaps XMMWORD[(128+80)+rsp],xmm9
- movaps XMMWORD[(128+96)+rsp],xmm10
- movaps XMMWORD[(128+112)+rsp],xmm11
-$L$prologue_avx2:
-
- vzeroupper
- sub rsi,-16*8
- mov rax,QWORD[rdi]
- mov r12,rsi
- mov rbx,QWORD[8+rdi]
- cmp rsi,rdx
- mov rcx,QWORD[16+rdi]
- cmove r12,rsp
- mov rdx,QWORD[24+rdi]
- mov r8,QWORD[32+rdi]
- mov r9,QWORD[40+rdi]
- mov r10,QWORD[48+rdi]
- mov r11,QWORD[56+rdi]
- jmp NEAR $L$oop_avx2
-ALIGN 16
-$L$oop_avx2:
- vmovdqu xmm0,XMMWORD[((-128))+rsi]
- vmovdqu xmm1,XMMWORD[((-128+16))+rsi]
- vmovdqu xmm2,XMMWORD[((-128+32))+rsi]
- lea rbp,[((K512+128))]
- vmovdqu xmm3,XMMWORD[((-128+48))+rsi]
- vmovdqu xmm4,XMMWORD[((-128+64))+rsi]
- vmovdqu xmm5,XMMWORD[((-128+80))+rsi]
- vmovdqu xmm6,XMMWORD[((-128+96))+rsi]
- vmovdqu xmm7,XMMWORD[((-128+112))+rsi]
-
- vmovdqa ymm10,YMMWORD[1152+rbp]
- vinserti128 ymm0,ymm0,XMMWORD[r12],1
- vinserti128 ymm1,ymm1,XMMWORD[16+r12],1
- vpshufb ymm0,ymm0,ymm10
- vinserti128 ymm2,ymm2,XMMWORD[32+r12],1
- vpshufb ymm1,ymm1,ymm10
- vinserti128 ymm3,ymm3,XMMWORD[48+r12],1
- vpshufb ymm2,ymm2,ymm10
- vinserti128 ymm4,ymm4,XMMWORD[64+r12],1
- vpshufb ymm3,ymm3,ymm10
- vinserti128 ymm5,ymm5,XMMWORD[80+r12],1
- vpshufb ymm4,ymm4,ymm10
- vinserti128 ymm6,ymm6,XMMWORD[96+r12],1
- vpshufb ymm5,ymm5,ymm10
- vinserti128 ymm7,ymm7,XMMWORD[112+r12],1
-
- vpaddq ymm8,ymm0,YMMWORD[((-128))+rbp]
- vpshufb ymm6,ymm6,ymm10
- vpaddq ymm9,ymm1,YMMWORD[((-96))+rbp]
- vpshufb ymm7,ymm7,ymm10
- vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp]
- vpaddq ymm11,ymm3,YMMWORD[((-32))+rbp]
- vmovdqa YMMWORD[rsp],ymm8
- vpaddq ymm8,ymm4,YMMWORD[rbp]
- vmovdqa YMMWORD[32+rsp],ymm9
- vpaddq ymm9,ymm5,YMMWORD[32+rbp]
- vmovdqa YMMWORD[64+rsp],ymm10
- vpaddq ymm10,ymm6,YMMWORD[64+rbp]
- vmovdqa YMMWORD[96+rsp],ymm11
- lea rsp,[((-128))+rsp]
- vpaddq ymm11,ymm7,YMMWORD[96+rbp]
- vmovdqa YMMWORD[rsp],ymm8
- xor r14,r14
- vmovdqa YMMWORD[32+rsp],ymm9
- mov rdi,rbx
- vmovdqa YMMWORD[64+rsp],ymm10
- xor rdi,rcx
- vmovdqa YMMWORD[96+rsp],ymm11
- mov r12,r9
- add rbp,16*2*8
- jmp NEAR $L$avx2_00_47
-
-ALIGN 16
-$L$avx2_00_47:
- lea rsp,[((-128))+rsp]
- vpalignr ymm8,ymm1,ymm0,8
- add r11,QWORD[((0+256))+rsp]
- and r12,r8
- rorx r13,r8,41
- vpalignr ymm11,ymm5,ymm4,8
- rorx r15,r8,18
- lea rax,[r14*1+rax]
- lea r11,[r12*1+r11]
- vpsrlq ymm10,ymm8,1
- andn r12,r8,r10
- xor r13,r15
- rorx r14,r8,14
- vpaddq ymm0,ymm0,ymm11
- vpsrlq ymm11,ymm8,7
- lea r11,[r12*1+r11]
- xor r13,r14
- mov r15,rax
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,rax,39
- lea r11,[r13*1+r11]
- xor r15,rbx
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,rax,34
- rorx r13,rax,28
- lea rdx,[r11*1+rdx]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,rbx
- vpsrlq ymm11,ymm7,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea r11,[rdi*1+r11]
- mov r12,r8
- vpsllq ymm10,ymm7,3
- vpaddq ymm0,ymm0,ymm8
- add r10,QWORD[((8+256))+rsp]
- and r12,rdx
- rorx r13,rdx,41
- vpsrlq ymm9,ymm7,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,rdx,18
- lea r11,[r14*1+r11]
- lea r10,[r12*1+r10]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,rdx,r9
- xor r13,rdi
- rorx r14,rdx,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea r10,[r12*1+r10]
- xor r13,r14
- mov rdi,r11
- vpxor ymm11,ymm11,ymm9
- rorx r12,r11,39
- lea r10,[r13*1+r10]
- xor rdi,rax
- vpaddq ymm0,ymm0,ymm11
- rorx r14,r11,34
- rorx r13,r11,28
- lea rcx,[r10*1+rcx]
- vpaddq ymm10,ymm0,YMMWORD[((-128))+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,rax
- xor r14,r13
- lea r10,[r15*1+r10]
- mov r12,rdx
- vmovdqa YMMWORD[rsp],ymm10
- vpalignr ymm8,ymm2,ymm1,8
- add r9,QWORD[((32+256))+rsp]
- and r12,rcx
- rorx r13,rcx,41
- vpalignr ymm11,ymm6,ymm5,8
- rorx r15,rcx,18
- lea r10,[r14*1+r10]
- lea r9,[r12*1+r9]
- vpsrlq ymm10,ymm8,1
- andn r12,rcx,r8
- xor r13,r15
- rorx r14,rcx,14
- vpaddq ymm1,ymm1,ymm11
- vpsrlq ymm11,ymm8,7
- lea r9,[r12*1+r9]
- xor r13,r14
- mov r15,r10
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,r10,39
- lea r9,[r13*1+r9]
- xor r15,r11
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,r10,34
- rorx r13,r10,28
- lea rbx,[r9*1+rbx]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,r11
- vpsrlq ymm11,ymm0,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea r9,[rdi*1+r9]
- mov r12,rcx
- vpsllq ymm10,ymm0,3
- vpaddq ymm1,ymm1,ymm8
- add r8,QWORD[((40+256))+rsp]
- and r12,rbx
- rorx r13,rbx,41
- vpsrlq ymm9,ymm0,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,rbx,18
- lea r9,[r14*1+r9]
- lea r8,[r12*1+r8]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,rbx,rdx
- xor r13,rdi
- rorx r14,rbx,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea r8,[r12*1+r8]
- xor r13,r14
- mov rdi,r9
- vpxor ymm11,ymm11,ymm9
- rorx r12,r9,39
- lea r8,[r13*1+r8]
- xor rdi,r10
- vpaddq ymm1,ymm1,ymm11
- rorx r14,r9,34
- rorx r13,r9,28
- lea rax,[r8*1+rax]
- vpaddq ymm10,ymm1,YMMWORD[((-96))+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,r10
- xor r14,r13
- lea r8,[r15*1+r8]
- mov r12,rbx
- vmovdqa YMMWORD[32+rsp],ymm10
- vpalignr ymm8,ymm3,ymm2,8
- add rdx,QWORD[((64+256))+rsp]
- and r12,rax
- rorx r13,rax,41
- vpalignr ymm11,ymm7,ymm6,8
- rorx r15,rax,18
- lea r8,[r14*1+r8]
- lea rdx,[r12*1+rdx]
- vpsrlq ymm10,ymm8,1
- andn r12,rax,rcx
- xor r13,r15
- rorx r14,rax,14
- vpaddq ymm2,ymm2,ymm11
- vpsrlq ymm11,ymm8,7
- lea rdx,[r12*1+rdx]
- xor r13,r14
- mov r15,r8
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,r8,39
- lea rdx,[r13*1+rdx]
- xor r15,r9
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,r8,34
- rorx r13,r8,28
- lea r11,[rdx*1+r11]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,r9
- vpsrlq ymm11,ymm1,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea rdx,[rdi*1+rdx]
- mov r12,rax
- vpsllq ymm10,ymm1,3
- vpaddq ymm2,ymm2,ymm8
- add rcx,QWORD[((72+256))+rsp]
- and r12,r11
- rorx r13,r11,41
- vpsrlq ymm9,ymm1,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,r11,18
- lea rdx,[r14*1+rdx]
- lea rcx,[r12*1+rcx]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,r11,rbx
- xor r13,rdi
- rorx r14,r11,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea rcx,[r12*1+rcx]
- xor r13,r14
- mov rdi,rdx
- vpxor ymm11,ymm11,ymm9
- rorx r12,rdx,39
- lea rcx,[r13*1+rcx]
- xor rdi,r8
- vpaddq ymm2,ymm2,ymm11
- rorx r14,rdx,34
- rorx r13,rdx,28
- lea r10,[rcx*1+r10]
- vpaddq ymm10,ymm2,YMMWORD[((-64))+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,r8
- xor r14,r13
- lea rcx,[r15*1+rcx]
- mov r12,r11
- vmovdqa YMMWORD[64+rsp],ymm10
- vpalignr ymm8,ymm4,ymm3,8
- add rbx,QWORD[((96+256))+rsp]
- and r12,r10
- rorx r13,r10,41
- vpalignr ymm11,ymm0,ymm7,8
- rorx r15,r10,18
- lea rcx,[r14*1+rcx]
- lea rbx,[r12*1+rbx]
- vpsrlq ymm10,ymm8,1
- andn r12,r10,rax
- xor r13,r15
- rorx r14,r10,14
- vpaddq ymm3,ymm3,ymm11
- vpsrlq ymm11,ymm8,7
- lea rbx,[r12*1+rbx]
- xor r13,r14
- mov r15,rcx
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,rcx,39
- lea rbx,[r13*1+rbx]
- xor r15,rdx
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,rcx,34
- rorx r13,rcx,28
- lea r9,[rbx*1+r9]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,rdx
- vpsrlq ymm11,ymm2,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea rbx,[rdi*1+rbx]
- mov r12,r10
- vpsllq ymm10,ymm2,3
- vpaddq ymm3,ymm3,ymm8
- add rax,QWORD[((104+256))+rsp]
- and r12,r9
- rorx r13,r9,41
- vpsrlq ymm9,ymm2,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,r9,18
- lea rbx,[r14*1+rbx]
- lea rax,[r12*1+rax]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,r9,r11
- xor r13,rdi
- rorx r14,r9,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea rax,[r12*1+rax]
- xor r13,r14
- mov rdi,rbx
- vpxor ymm11,ymm11,ymm9
- rorx r12,rbx,39
- lea rax,[r13*1+rax]
- xor rdi,rcx
- vpaddq ymm3,ymm3,ymm11
- rorx r14,rbx,34
- rorx r13,rbx,28
- lea r8,[rax*1+r8]
- vpaddq ymm10,ymm3,YMMWORD[((-32))+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,rcx
- xor r14,r13
- lea rax,[r15*1+rax]
- mov r12,r9
- vmovdqa YMMWORD[96+rsp],ymm10
- lea rsp,[((-128))+rsp]
- vpalignr ymm8,ymm5,ymm4,8
- add r11,QWORD[((0+256))+rsp]
- and r12,r8
- rorx r13,r8,41
- vpalignr ymm11,ymm1,ymm0,8
- rorx r15,r8,18
- lea rax,[r14*1+rax]
- lea r11,[r12*1+r11]
- vpsrlq ymm10,ymm8,1
- andn r12,r8,r10
- xor r13,r15
- rorx r14,r8,14
- vpaddq ymm4,ymm4,ymm11
- vpsrlq ymm11,ymm8,7
- lea r11,[r12*1+r11]
- xor r13,r14
- mov r15,rax
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,rax,39
- lea r11,[r13*1+r11]
- xor r15,rbx
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,rax,34
- rorx r13,rax,28
- lea rdx,[r11*1+rdx]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,rbx
- vpsrlq ymm11,ymm3,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea r11,[rdi*1+r11]
- mov r12,r8
- vpsllq ymm10,ymm3,3
- vpaddq ymm4,ymm4,ymm8
- add r10,QWORD[((8+256))+rsp]
- and r12,rdx
- rorx r13,rdx,41
- vpsrlq ymm9,ymm3,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,rdx,18
- lea r11,[r14*1+r11]
- lea r10,[r12*1+r10]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,rdx,r9
- xor r13,rdi
- rorx r14,rdx,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea r10,[r12*1+r10]
- xor r13,r14
- mov rdi,r11
- vpxor ymm11,ymm11,ymm9
- rorx r12,r11,39
- lea r10,[r13*1+r10]
- xor rdi,rax
- vpaddq ymm4,ymm4,ymm11
- rorx r14,r11,34
- rorx r13,r11,28
- lea rcx,[r10*1+rcx]
- vpaddq ymm10,ymm4,YMMWORD[rbp]
- and r15,rdi
- xor r14,r12
- xor r15,rax
- xor r14,r13
- lea r10,[r15*1+r10]
- mov r12,rdx
- vmovdqa YMMWORD[rsp],ymm10
- vpalignr ymm8,ymm6,ymm5,8
- add r9,QWORD[((32+256))+rsp]
- and r12,rcx
- rorx r13,rcx,41
- vpalignr ymm11,ymm2,ymm1,8
- rorx r15,rcx,18
- lea r10,[r14*1+r10]
- lea r9,[r12*1+r9]
- vpsrlq ymm10,ymm8,1
- andn r12,rcx,r8
- xor r13,r15
- rorx r14,rcx,14
- vpaddq ymm5,ymm5,ymm11
- vpsrlq ymm11,ymm8,7
- lea r9,[r12*1+r9]
- xor r13,r14
- mov r15,r10
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,r10,39
- lea r9,[r13*1+r9]
- xor r15,r11
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,r10,34
- rorx r13,r10,28
- lea rbx,[r9*1+rbx]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,r11
- vpsrlq ymm11,ymm4,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea r9,[rdi*1+r9]
- mov r12,rcx
- vpsllq ymm10,ymm4,3
- vpaddq ymm5,ymm5,ymm8
- add r8,QWORD[((40+256))+rsp]
- and r12,rbx
- rorx r13,rbx,41
- vpsrlq ymm9,ymm4,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,rbx,18
- lea r9,[r14*1+r9]
- lea r8,[r12*1+r8]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,rbx,rdx
- xor r13,rdi
- rorx r14,rbx,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea r8,[r12*1+r8]
- xor r13,r14
- mov rdi,r9
- vpxor ymm11,ymm11,ymm9
- rorx r12,r9,39
- lea r8,[r13*1+r8]
- xor rdi,r10
- vpaddq ymm5,ymm5,ymm11
- rorx r14,r9,34
- rorx r13,r9,28
- lea rax,[r8*1+rax]
- vpaddq ymm10,ymm5,YMMWORD[32+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,r10
- xor r14,r13
- lea r8,[r15*1+r8]
- mov r12,rbx
- vmovdqa YMMWORD[32+rsp],ymm10
- vpalignr ymm8,ymm7,ymm6,8
- add rdx,QWORD[((64+256))+rsp]
- and r12,rax
- rorx r13,rax,41
- vpalignr ymm11,ymm3,ymm2,8
- rorx r15,rax,18
- lea r8,[r14*1+r8]
- lea rdx,[r12*1+rdx]
- vpsrlq ymm10,ymm8,1
- andn r12,rax,rcx
- xor r13,r15
- rorx r14,rax,14
- vpaddq ymm6,ymm6,ymm11
- vpsrlq ymm11,ymm8,7
- lea rdx,[r12*1+rdx]
- xor r13,r14
- mov r15,r8
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,r8,39
- lea rdx,[r13*1+rdx]
- xor r15,r9
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,r8,34
- rorx r13,r8,28
- lea r11,[rdx*1+r11]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,r9
- vpsrlq ymm11,ymm5,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea rdx,[rdi*1+rdx]
- mov r12,rax
- vpsllq ymm10,ymm5,3
- vpaddq ymm6,ymm6,ymm8
- add rcx,QWORD[((72+256))+rsp]
- and r12,r11
- rorx r13,r11,41
- vpsrlq ymm9,ymm5,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,r11,18
- lea rdx,[r14*1+rdx]
- lea rcx,[r12*1+rcx]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,r11,rbx
- xor r13,rdi
- rorx r14,r11,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea rcx,[r12*1+rcx]
- xor r13,r14
- mov rdi,rdx
- vpxor ymm11,ymm11,ymm9
- rorx r12,rdx,39
- lea rcx,[r13*1+rcx]
- xor rdi,r8
- vpaddq ymm6,ymm6,ymm11
- rorx r14,rdx,34
- rorx r13,rdx,28
- lea r10,[rcx*1+r10]
- vpaddq ymm10,ymm6,YMMWORD[64+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,r8
- xor r14,r13
- lea rcx,[r15*1+rcx]
- mov r12,r11
- vmovdqa YMMWORD[64+rsp],ymm10
- vpalignr ymm8,ymm0,ymm7,8
- add rbx,QWORD[((96+256))+rsp]
- and r12,r10
- rorx r13,r10,41
- vpalignr ymm11,ymm4,ymm3,8
- rorx r15,r10,18
- lea rcx,[r14*1+rcx]
- lea rbx,[r12*1+rbx]
- vpsrlq ymm10,ymm8,1
- andn r12,r10,rax
- xor r13,r15
- rorx r14,r10,14
- vpaddq ymm7,ymm7,ymm11
- vpsrlq ymm11,ymm8,7
- lea rbx,[r12*1+rbx]
- xor r13,r14
- mov r15,rcx
- vpsllq ymm9,ymm8,56
- vpxor ymm8,ymm11,ymm10
- rorx r12,rcx,39
- lea rbx,[r13*1+rbx]
- xor r15,rdx
- vpsrlq ymm10,ymm10,7
- vpxor ymm8,ymm8,ymm9
- rorx r14,rcx,34
- rorx r13,rcx,28
- lea r9,[rbx*1+r9]
- vpsllq ymm9,ymm9,7
- vpxor ymm8,ymm8,ymm10
- and rdi,r15
- xor r14,r12
- xor rdi,rdx
- vpsrlq ymm11,ymm6,6
- vpxor ymm8,ymm8,ymm9
- xor r14,r13
- lea rbx,[rdi*1+rbx]
- mov r12,r10
- vpsllq ymm10,ymm6,3
- vpaddq ymm7,ymm7,ymm8
- add rax,QWORD[((104+256))+rsp]
- and r12,r9
- rorx r13,r9,41
- vpsrlq ymm9,ymm6,19
- vpxor ymm11,ymm11,ymm10
- rorx rdi,r9,18
- lea rbx,[r14*1+rbx]
- lea rax,[r12*1+rax]
- vpsllq ymm10,ymm10,42
- vpxor ymm11,ymm11,ymm9
- andn r12,r9,r11
- xor r13,rdi
- rorx r14,r9,14
- vpsrlq ymm9,ymm9,42
- vpxor ymm11,ymm11,ymm10
- lea rax,[r12*1+rax]
- xor r13,r14
- mov rdi,rbx
- vpxor ymm11,ymm11,ymm9
- rorx r12,rbx,39
- lea rax,[r13*1+rax]
- xor rdi,rcx
- vpaddq ymm7,ymm7,ymm11
- rorx r14,rbx,34
- rorx r13,rbx,28
- lea r8,[rax*1+r8]
- vpaddq ymm10,ymm7,YMMWORD[96+rbp]
- and r15,rdi
- xor r14,r12
- xor r15,rcx
- xor r14,r13
- lea rax,[r15*1+rax]
- mov r12,r9
- vmovdqa YMMWORD[96+rsp],ymm10
- lea rbp,[256+rbp]
- cmp BYTE[((-121))+rbp],0
- jne NEAR $L$avx2_00_47
- add r11,QWORD[((0+128))+rsp]
- and r12,r8
- rorx r13,r8,41
- rorx r15,r8,18
- lea rax,[r14*1+rax]
- lea r11,[r12*1+r11]
- andn r12,r8,r10
- xor r13,r15
- rorx r14,r8,14
- lea r11,[r12*1+r11]
- xor r13,r14
- mov r15,rax
- rorx r12,rax,39
- lea r11,[r13*1+r11]
- xor r15,rbx
- rorx r14,rax,34
- rorx r13,rax,28
- lea rdx,[r11*1+rdx]
- and rdi,r15
- xor r14,r12
- xor rdi,rbx
- xor r14,r13
- lea r11,[rdi*1+r11]
- mov r12,r8
- add r10,QWORD[((8+128))+rsp]
- and r12,rdx
- rorx r13,rdx,41
- rorx rdi,rdx,18
- lea r11,[r14*1+r11]
- lea r10,[r12*1+r10]
- andn r12,rdx,r9
- xor r13,rdi
- rorx r14,rdx,14
- lea r10,[r12*1+r10]
- xor r13,r14
- mov rdi,r11
- rorx r12,r11,39
- lea r10,[r13*1+r10]
- xor rdi,rax
- rorx r14,r11,34
- rorx r13,r11,28
- lea rcx,[r10*1+rcx]
- and r15,rdi
- xor r14,r12
- xor r15,rax
- xor r14,r13
- lea r10,[r15*1+r10]
- mov r12,rdx
- add r9,QWORD[((32+128))+rsp]
- and r12,rcx
- rorx r13,rcx,41
- rorx r15,rcx,18
- lea r10,[r14*1+r10]
- lea r9,[r12*1+r9]
- andn r12,rcx,r8
- xor r13,r15
- rorx r14,rcx,14
- lea r9,[r12*1+r9]
- xor r13,r14
- mov r15,r10
- rorx r12,r10,39
- lea r9,[r13*1+r9]
- xor r15,r11
- rorx r14,r10,34
- rorx r13,r10,28
- lea rbx,[r9*1+rbx]
- and rdi,r15
- xor r14,r12
- xor rdi,r11
- xor r14,r13
- lea r9,[rdi*1+r9]
- mov r12,rcx
- add r8,QWORD[((40+128))+rsp]
- and r12,rbx
- rorx r13,rbx,41
- rorx rdi,rbx,18
- lea r9,[r14*1+r9]
- lea r8,[r12*1+r8]
- andn r12,rbx,rdx
- xor r13,rdi
- rorx r14,rbx,14
- lea r8,[r12*1+r8]
- xor r13,r14
- mov rdi,r9
- rorx r12,r9,39
- lea r8,[r13*1+r8]
- xor rdi,r10
- rorx r14,r9,34
- rorx r13,r9,28
- lea rax,[r8*1+rax]
- and r15,rdi
- xor r14,r12
- xor r15,r10
- xor r14,r13
- lea r8,[r15*1+r8]
- mov r12,rbx
- add rdx,QWORD[((64+128))+rsp]
- and r12,rax
- rorx r13,rax,41
- rorx r15,rax,18
- lea r8,[r14*1+r8]
- lea rdx,[r12*1+rdx]
- andn r12,rax,rcx
- xor r13,r15
- rorx r14,rax,14
- lea rdx,[r12*1+rdx]
- xor r13,r14
- mov r15,r8
- rorx r12,r8,39
- lea rdx,[r13*1+rdx]
- xor r15,r9
- rorx r14,r8,34
- rorx r13,r8,28
- lea r11,[rdx*1+r11]
- and rdi,r15
- xor r14,r12
- xor rdi,r9
- xor r14,r13
- lea rdx,[rdi*1+rdx]
- mov r12,rax
- add rcx,QWORD[((72+128))+rsp]
- and r12,r11
- rorx r13,r11,41
- rorx rdi,r11,18
- lea rdx,[r14*1+rdx]
- lea rcx,[r12*1+rcx]
- andn r12,r11,rbx
- xor r13,rdi
- rorx r14,r11,14
- lea rcx,[r12*1+rcx]
- xor r13,r14
- mov rdi,rdx
- rorx r12,rdx,39
- lea rcx,[r13*1+rcx]
- xor rdi,r8
- rorx r14,rdx,34
- rorx r13,rdx,28
- lea r10,[rcx*1+r10]
- and r15,rdi
- xor r14,r12
- xor r15,r8
- xor r14,r13
- lea rcx,[r15*1+rcx]
- mov r12,r11
- add rbx,QWORD[((96+128))+rsp]
- and r12,r10
- rorx r13,r10,41
- rorx r15,r10,18
- lea rcx,[r14*1+rcx]
- lea rbx,[r12*1+rbx]
- andn r12,r10,rax
- xor r13,r15
- rorx r14,r10,14
- lea rbx,[r12*1+rbx]
- xor r13,r14
- mov r15,rcx
- rorx r12,rcx,39
- lea rbx,[r13*1+rbx]
- xor r15,rdx
- rorx r14,rcx,34
- rorx r13,rcx,28
- lea r9,[rbx*1+r9]
- and rdi,r15
- xor r14,r12
- xor rdi,rdx
- xor r14,r13
- lea rbx,[rdi*1+rbx]
- mov r12,r10
- add rax,QWORD[((104+128))+rsp]
- and r12,r9
- rorx r13,r9,41
- rorx rdi,r9,18
- lea rbx,[r14*1+rbx]
- lea rax,[r12*1+rax]
- andn r12,r9,r11
- xor r13,rdi
- rorx r14,r9,14
- lea rax,[r12*1+rax]
- xor r13,r14
- mov rdi,rbx
- rorx r12,rbx,39
- lea rax,[r13*1+rax]
- xor rdi,rcx
- rorx r14,rbx,34
- rorx r13,rbx,28
- lea r8,[rax*1+r8]
- and r15,rdi
- xor r14,r12
- xor r15,rcx
- xor r14,r13
- lea rax,[r15*1+rax]
- mov r12,r9
- add r11,QWORD[rsp]
- and r12,r8
- rorx r13,r8,41
- rorx r15,r8,18
- lea rax,[r14*1+rax]
- lea r11,[r12*1+r11]
- andn r12,r8,r10
- xor r13,r15
- rorx r14,r8,14
- lea r11,[r12*1+r11]
- xor r13,r14
- mov r15,rax
- rorx r12,rax,39
- lea r11,[r13*1+r11]
- xor r15,rbx
- rorx r14,rax,34
- rorx r13,rax,28
- lea rdx,[r11*1+rdx]
- and rdi,r15
- xor r14,r12
- xor rdi,rbx
- xor r14,r13
- lea r11,[rdi*1+r11]
- mov r12,r8
- add r10,QWORD[8+rsp]
- and r12,rdx
- rorx r13,rdx,41
- rorx rdi,rdx,18
- lea r11,[r14*1+r11]
- lea r10,[r12*1+r10]
- andn r12,rdx,r9
- xor r13,rdi
- rorx r14,rdx,14
- lea r10,[r12*1+r10]
- xor r13,r14
- mov rdi,r11
- rorx r12,r11,39
- lea r10,[r13*1+r10]
- xor rdi,rax
- rorx r14,r11,34
- rorx r13,r11,28
- lea rcx,[r10*1+rcx]
- and r15,rdi
- xor r14,r12
- xor r15,rax
- xor r14,r13
- lea r10,[r15*1+r10]
- mov r12,rdx
- add r9,QWORD[32+rsp]
- and r12,rcx
- rorx r13,rcx,41
- rorx r15,rcx,18
- lea r10,[r14*1+r10]
- lea r9,[r12*1+r9]
- andn r12,rcx,r8
- xor r13,r15
- rorx r14,rcx,14
- lea r9,[r12*1+r9]
- xor r13,r14
- mov r15,r10
- rorx r12,r10,39
- lea r9,[r13*1+r9]
- xor r15,r11
- rorx r14,r10,34
- rorx r13,r10,28
- lea rbx,[r9*1+rbx]
- and rdi,r15
- xor r14,r12
- xor rdi,r11
- xor r14,r13
- lea r9,[rdi*1+r9]
- mov r12,rcx
- add r8,QWORD[40+rsp]
- and r12,rbx
- rorx r13,rbx,41
- rorx rdi,rbx,18
- lea r9,[r14*1+r9]
- lea r8,[r12*1+r8]
- andn r12,rbx,rdx
- xor r13,rdi
- rorx r14,rbx,14
- lea r8,[r12*1+r8]
- xor r13,r14
- mov rdi,r9
- rorx r12,r9,39
- lea r8,[r13*1+r8]
- xor rdi,r10
- rorx r14,r9,34
- rorx r13,r9,28
- lea rax,[r8*1+rax]
- and r15,rdi
- xor r14,r12
- xor r15,r10
- xor r14,r13
- lea r8,[r15*1+r8]
- mov r12,rbx
- add rdx,QWORD[64+rsp]
- and r12,rax
- rorx r13,rax,41
- rorx r15,rax,18
- lea r8,[r14*1+r8]
- lea rdx,[r12*1+rdx]
- andn r12,rax,rcx
- xor r13,r15
- rorx r14,rax,14
- lea rdx,[r12*1+rdx]
- xor r13,r14
- mov r15,r8
- rorx r12,r8,39
- lea rdx,[r13*1+rdx]
- xor r15,r9
- rorx r14,r8,34
- rorx r13,r8,28
- lea r11,[rdx*1+r11]
- and rdi,r15
- xor r14,r12
- xor rdi,r9
- xor r14,r13
- lea rdx,[rdi*1+rdx]
- mov r12,rax
- add rcx,QWORD[72+rsp]
- and r12,r11
- rorx r13,r11,41
- rorx rdi,r11,18
- lea rdx,[r14*1+rdx]
- lea rcx,[r12*1+rcx]
- andn r12,r11,rbx
- xor r13,rdi
- rorx r14,r11,14
- lea rcx,[r12*1+rcx]
- xor r13,r14
- mov rdi,rdx
- rorx r12,rdx,39
- lea rcx,[r13*1+rcx]
- xor rdi,r8
- rorx r14,rdx,34
- rorx r13,rdx,28
- lea r10,[rcx*1+r10]
- and r15,rdi
- xor r14,r12
- xor r15,r8
- xor r14,r13
- lea rcx,[r15*1+rcx]
- mov r12,r11
- add rbx,QWORD[96+rsp]
- and r12,r10
- rorx r13,r10,41
- rorx r15,r10,18
- lea rcx,[r14*1+rcx]
- lea rbx,[r12*1+rbx]
- andn r12,r10,rax
- xor r13,r15
- rorx r14,r10,14
- lea rbx,[r12*1+rbx]
- xor r13,r14
- mov r15,rcx
- rorx r12,rcx,39
- lea rbx,[r13*1+rbx]
- xor r15,rdx
- rorx r14,rcx,34
- rorx r13,rcx,28
- lea r9,[rbx*1+r9]
- and rdi,r15
- xor r14,r12
- xor rdi,rdx
- xor r14,r13
- lea rbx,[rdi*1+rbx]
- mov r12,r10
- add rax,QWORD[104+rsp]
- and r12,r9
- rorx r13,r9,41
- rorx rdi,r9,18
- lea rbx,[r14*1+rbx]
- lea rax,[r12*1+rax]
- andn r12,r9,r11
- xor r13,rdi
- rorx r14,r9,14
- lea rax,[r12*1+rax]
- xor r13,r14
- mov rdi,rbx
- rorx r12,rbx,39
- lea rax,[r13*1+rax]
- xor rdi,rcx
- rorx r14,rbx,34
- rorx r13,rbx,28
- lea r8,[rax*1+r8]
- and r15,rdi
- xor r14,r12
- xor r15,rcx
- xor r14,r13
- lea rax,[r15*1+rax]
- mov r12,r9
- mov rdi,QWORD[1280+rsp]
- add rax,r14
-
- lea rbp,[1152+rsp]
-
- add rax,QWORD[rdi]
- add rbx,QWORD[8+rdi]
- add rcx,QWORD[16+rdi]
- add rdx,QWORD[24+rdi]
- add r8,QWORD[32+rdi]
- add r9,QWORD[40+rdi]
- add r10,QWORD[48+rdi]
- add r11,QWORD[56+rdi]
-
- mov QWORD[rdi],rax
- mov QWORD[8+rdi],rbx
- mov QWORD[16+rdi],rcx
- mov QWORD[24+rdi],rdx
- mov QWORD[32+rdi],r8
- mov QWORD[40+rdi],r9
- mov QWORD[48+rdi],r10
- mov QWORD[56+rdi],r11
-
- cmp rsi,QWORD[144+rbp]
- je NEAR $L$done_avx2
-
- xor r14,r14
- mov rdi,rbx
- xor rdi,rcx
- mov r12,r9
- jmp NEAR $L$ower_avx2
-ALIGN 16
-$L$ower_avx2:
- add r11,QWORD[((0+16))+rbp]
- and r12,r8
- rorx r13,r8,41
- rorx r15,r8,18
- lea rax,[r14*1+rax]
- lea r11,[r12*1+r11]
- andn r12,r8,r10
- xor r13,r15
- rorx r14,r8,14
- lea r11,[r12*1+r11]
- xor r13,r14
- mov r15,rax
- rorx r12,rax,39
- lea r11,[r13*1+r11]
- xor r15,rbx
- rorx r14,rax,34
- rorx r13,rax,28
- lea rdx,[r11*1+rdx]
- and rdi,r15
- xor r14,r12
- xor rdi,rbx
- xor r14,r13
- lea r11,[rdi*1+r11]
- mov r12,r8
- add r10,QWORD[((8+16))+rbp]
- and r12,rdx
- rorx r13,rdx,41
- rorx rdi,rdx,18
- lea r11,[r14*1+r11]
- lea r10,[r12*1+r10]
- andn r12,rdx,r9
- xor r13,rdi
- rorx r14,rdx,14
- lea r10,[r12*1+r10]
- xor r13,r14
- mov rdi,r11
- rorx r12,r11,39
- lea r10,[r13*1+r10]
- xor rdi,rax
- rorx r14,r11,34
- rorx r13,r11,28
- lea rcx,[r10*1+rcx]
- and r15,rdi
- xor r14,r12
- xor r15,rax
- xor r14,r13
- lea r10,[r15*1+r10]
- mov r12,rdx
- add r9,QWORD[((32+16))+rbp]
- and r12,rcx
- rorx r13,rcx,41
- rorx r15,rcx,18
- lea r10,[r14*1+r10]
- lea r9,[r12*1+r9]
- andn r12,rcx,r8
- xor r13,r15
- rorx r14,rcx,14
- lea r9,[r12*1+r9]
- xor r13,r14
- mov r15,r10
- rorx r12,r10,39
- lea r9,[r13*1+r9]
- xor r15,r11
- rorx r14,r10,34
- rorx r13,r10,28
- lea rbx,[r9*1+rbx]
- and rdi,r15
- xor r14,r12
- xor rdi,r11
- xor r14,r13
- lea r9,[rdi*1+r9]
- mov r12,rcx
- add r8,QWORD[((40+16))+rbp]
- and r12,rbx
- rorx r13,rbx,41
- rorx rdi,rbx,18
- lea r9,[r14*1+r9]
- lea r8,[r12*1+r8]
- andn r12,rbx,rdx
- xor r13,rdi
- rorx r14,rbx,14
- lea r8,[r12*1+r8]
- xor r13,r14
- mov rdi,r9
- rorx r12,r9,39
- lea r8,[r13*1+r8]
- xor rdi,r10
- rorx r14,r9,34
- rorx r13,r9,28
- lea rax,[r8*1+rax]
- and r15,rdi
- xor r14,r12
- xor r15,r10
- xor r14,r13
- lea r8,[r15*1+r8]
- mov r12,rbx
- add rdx,QWORD[((64+16))+rbp]
- and r12,rax
- rorx r13,rax,41
- rorx r15,rax,18
- lea r8,[r14*1+r8]
- lea rdx,[r12*1+rdx]
- andn r12,rax,rcx
- xor r13,r15
- rorx r14,rax,14
- lea rdx,[r12*1+rdx]
- xor r13,r14
- mov r15,r8
- rorx r12,r8,39
- lea rdx,[r13*1+rdx]
- xor r15,r9
- rorx r14,r8,34
- rorx r13,r8,28
- lea r11,[rdx*1+r11]
- and rdi,r15
- xor r14,r12
- xor rdi,r9
- xor r14,r13
- lea rdx,[rdi*1+rdx]
- mov r12,rax
- add rcx,QWORD[((72+16))+rbp]
- and r12,r11
- rorx r13,r11,41
- rorx rdi,r11,18
- lea rdx,[r14*1+rdx]
- lea rcx,[r12*1+rcx]
- andn r12,r11,rbx
- xor r13,rdi
- rorx r14,r11,14
- lea rcx,[r12*1+rcx]
- xor r13,r14
- mov rdi,rdx
- rorx r12,rdx,39
- lea rcx,[r13*1+rcx]
- xor rdi,r8
- rorx r14,rdx,34
- rorx r13,rdx,28
- lea r10,[rcx*1+r10]
- and r15,rdi
- xor r14,r12
- xor r15,r8
- xor r14,r13
- lea rcx,[r15*1+rcx]
- mov r12,r11
- add rbx,QWORD[((96+16))+rbp]
- and r12,r10
- rorx r13,r10,41
- rorx r15,r10,18
- lea rcx,[r14*1+rcx]
- lea rbx,[r12*1+rbx]
- andn r12,r10,rax
- xor r13,r15
- rorx r14,r10,14
- lea rbx,[r12*1+rbx]
- xor r13,r14
- mov r15,rcx
- rorx r12,rcx,39
- lea rbx,[r13*1+rbx]
- xor r15,rdx
- rorx r14,rcx,34
- rorx r13,rcx,28
- lea r9,[rbx*1+r9]
- and rdi,r15
- xor r14,r12
- xor rdi,rdx
- xor r14,r13
- lea rbx,[rdi*1+rbx]
- mov r12,r10
- add rax,QWORD[((104+16))+rbp]
- and r12,r9
- rorx r13,r9,41
- rorx rdi,r9,18
- lea rbx,[r14*1+rbx]
- lea rax,[r12*1+rax]
- andn r12,r9,r11
- xor r13,rdi
- rorx r14,r9,14
- lea rax,[r12*1+rax]
- xor r13,r14
- mov rdi,rbx
- rorx r12,rbx,39
- lea rax,[r13*1+rax]
- xor rdi,rcx
- rorx r14,rbx,34
- rorx r13,rbx,28
- lea r8,[rax*1+r8]
- and r15,rdi
- xor r14,r12
- xor r15,rcx
- xor r14,r13
- lea rax,[r15*1+rax]
- mov r12,r9
- lea rbp,[((-128))+rbp]
- cmp rbp,rsp
- jae NEAR $L$ower_avx2
-
- mov rdi,QWORD[1280+rsp]
- add rax,r14
-
- lea rsp,[1152+rsp]
-
-
-
- add rax,QWORD[rdi]
- add rbx,QWORD[8+rdi]
- add rcx,QWORD[16+rdi]
- add rdx,QWORD[24+rdi]
- add r8,QWORD[32+rdi]
- add r9,QWORD[40+rdi]
- lea rsi,[256+rsi]
- add r10,QWORD[48+rdi]
- mov r12,rsi
- add r11,QWORD[56+rdi]
- cmp rsi,QWORD[((128+16))+rsp]
-
- mov QWORD[rdi],rax
- cmove r12,rsp
- mov QWORD[8+rdi],rbx
- mov QWORD[16+rdi],rcx
- mov QWORD[24+rdi],rdx
- mov QWORD[32+rdi],r8
- mov QWORD[40+rdi],r9
- mov QWORD[48+rdi],r10
- mov QWORD[56+rdi],r11
-
- jbe NEAR $L$oop_avx2
- lea rbp,[rsp]
-
-
-
-
-$L$done_avx2:
- mov rsi,QWORD[152+rbp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((128+32))+rbp]
- movaps xmm7,XMMWORD[((128+48))+rbp]
- movaps xmm8,XMMWORD[((128+64))+rbp]
- movaps xmm9,XMMWORD[((128+80))+rbp]
- movaps xmm10,XMMWORD[((128+96))+rbp]
- movaps xmm11,XMMWORD[((128+112))+rbp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_avx2:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha512_block_data_order_avx2:
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- mov rsi,QWORD[8+r9]
- mov r11,QWORD[56+r9]
-
- mov r10d,DWORD[r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
-
- mov r10d,DWORD[4+r11]
- lea r10,[r10*1+rsi]
- cmp rbx,r10
- jae NEAR $L$in_prologue
- lea r10,[$L$avx2_shortcut]
- cmp rbx,r10
- jb NEAR $L$not_in_avx2
-
- and rax,-256*8
- add rax,1152
-$L$not_in_avx2:
- mov rsi,rax
- mov rax,QWORD[((128+24))+rax]
-
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
- lea r10,[$L$epilogue]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- lea rsi,[((128+32))+rsi]
- lea rdi,[512+r8]
- mov ecx,12
- DD 0xa548f3fc
-
-$L$in_prologue:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- DB 0F3h,0C3h ;repret
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
section .pdata rdata align=4
ALIGN 4
DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
- DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
- DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
- DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
- DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
- DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
- DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
- DD $L$SEH_begin_sha512_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_end_sha512_block_data_order_avx2 wrt ..imagebase
- DD $L$SEH_info_sha512_block_data_order_avx2 wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha512_block_data_order:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
-$L$SEH_info_sha512_block_data_order_xop:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
-$L$SEH_info_sha512_block_data_order_avx:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
-$L$SEH_info_sha512_block_data_order_avx2:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h
index 437ede74d7a..8ed2c462485 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-ec.h
@@ -109,6 +109,9 @@ extern "C" {
# ifndef OPENSSL_NO_DGRAM
# define OPENSSL_NO_DGRAM
# endif
+# ifndef OPENSSL_NO_DH
+# define OPENSSL_NO_DH
+# endif
# ifndef OPENSSL_NO_DSA
# define OPENSSL_NO_DSA
# endif
@@ -241,6 +244,9 @@ extern "C" {
# ifndef OPENSSL_NO_SM2
# define OPENSSL_NO_SM2
# endif
+# ifndef OPENSSL_NO_SM3
+# define OPENSSL_NO_SM3
+# endif
# ifndef OPENSSL_NO_SM4
# define OPENSSL_NO_SM4
# endif
diff --git a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h
index 018225780b3..867ad08006a 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h
+++ b/CryptoPkg/Library/OpensslLib/OpensslGen/include/openssl/configuration-noec.h
@@ -109,6 +109,9 @@ extern "C" {
# ifndef OPENSSL_NO_DGRAM
# define OPENSSL_NO_DGRAM
# endif
+# ifndef OPENSSL_NO_DH
+# define OPENSSL_NO_DH
+# endif
# ifndef OPENSSL_NO_DSA
# define OPENSSL_NO_DSA
# endif
@@ -250,6 +253,9 @@ extern "C" {
# ifndef OPENSSL_NO_SM2
# define OPENSSL_NO_SM2
# endif
+# ifndef OPENSSL_NO_SM3
+# define OPENSSL_NO_SM3
+# endif
# ifndef OPENSSL_NO_SM4
# define OPENSSL_NO_SM4
# endif
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLib.inf b/CryptoPkg/Library/OpensslLib/OpensslLib.inf
index b5e436a0168..cdb30e81ab0 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslLib.inf
+++ b/CryptoPkg/Library/OpensslLib/OpensslLib.inf
@@ -181,20 +181,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -427,8 +413,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -543,7 +527,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -551,7 +534,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
$(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
@@ -565,7 +547,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf
index 673dba23621..6315e6edb32 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf
+++ b/CryptoPkg/Library/OpensslLib/OpensslLibAccel.inf
@@ -196,20 +196,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -441,8 +427,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -557,7 +541,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -565,7 +548,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
$(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
@@ -579,7 +561,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
@@ -829,20 +810,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -1073,8 +1040,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -1189,7 +1154,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -1197,7 +1161,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
$(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
@@ -1211,7 +1174,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf
index 35162b90fe8..9f09af4ee9f 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf
+++ b/CryptoPkg/Library/OpensslLib/OpensslLibCrypto.inf
@@ -182,20 +182,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -428,8 +414,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -544,7 +528,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -552,7 +535,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
$(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
@@ -566,7 +548,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf
index 55c63429048..b821fa8f8c4 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf
+++ b/CryptoPkg/Library/OpensslLib/OpensslLibFull.inf
@@ -186,20 +186,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -470,8 +456,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -586,7 +570,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -594,7 +577,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
@@ -610,7 +592,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf
index 3e3efa13d79..106edab99e2 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf
+++ b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccel.inf
@@ -202,20 +202,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -489,8 +475,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -605,7 +589,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -613,7 +596,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
@@ -629,7 +611,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
@@ -888,20 +869,6 @@
$(OPENSSL_PATH)/crypto/conf/conf_mod.c
$(OPENSSL_PATH)/crypto/conf/conf_sap.c
$(OPENSSL_PATH)/crypto/conf/conf_ssl.c
- $(OPENSSL_PATH)/crypto/dh/dh_ameth.c
- $(OPENSSL_PATH)/crypto/dh/dh_asn1.c
- $(OPENSSL_PATH)/crypto/dh/dh_backend.c
- $(OPENSSL_PATH)/crypto/dh/dh_check.c
- $(OPENSSL_PATH)/crypto/dh/dh_err.c
- $(OPENSSL_PATH)/crypto/dh/dh_gen.c
- $(OPENSSL_PATH)/crypto/dh/dh_group_params.c
- $(OPENSSL_PATH)/crypto/dh/dh_kdf.c
- $(OPENSSL_PATH)/crypto/dh/dh_key.c
- $(OPENSSL_PATH)/crypto/dh/dh_lib.c
- $(OPENSSL_PATH)/crypto/dh/dh_meth.c
- $(OPENSSL_PATH)/crypto/dh/dh_pmeth.c
- $(OPENSSL_PATH)/crypto/dh/dh_prn.c
- $(OPENSSL_PATH)/crypto/dh/dh_rfc5114.c
$(OPENSSL_PATH)/crypto/dso/dso_dl.c
$(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
$(OPENSSL_PATH)/crypto/dso/dso_err.c
@@ -1170,8 +1137,6 @@
$(OPENSSL_PATH)/crypto/sha/sha256.c
$(OPENSSL_PATH)/crypto/sha/sha3.c
$(OPENSSL_PATH)/crypto/sha/sha512.c
- $(OPENSSL_PATH)/crypto/sm3/legacy_sm3.c
- $(OPENSSL_PATH)/crypto/sm3/sm3.c
$(OPENSSL_PATH)/crypto/stack/stack.c
$(OPENSSL_PATH)/crypto/txt_db/txt_db.c
$(OPENSSL_PATH)/crypto/ui/ui_err.c
@@ -1286,7 +1251,6 @@
$(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
$(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
- $(OPENSSL_PATH)/providers/implementations/digests/sm3_prov.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
@@ -1294,7 +1258,6 @@
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
$(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
- $(OPENSSL_PATH)/providers/implementations/exchange/dh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c
$(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
@@ -1310,7 +1273,6 @@
$(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
$(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
$(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
- $(OPENSSL_PATH)/providers/implementations/keymgmt/dh_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c
$(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
diff --git a/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf
new file mode 100644
index 00000000000..07ccf0b6eb3
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslLibFullAccelTest.inf
@@ -0,0 +1,1473 @@
+## @file
+# This module provides OpenSSL Library implementation with ECC and TLS
+# features along with performance-optimized implementations of SHA1,
+# SHA256, SHA512, AESNI, VPAES, and GHASH for IA32 and X64.
+#
+# This library should be used if a module needs ECC in TLS, or
+# asymmetric cryptography services such as X509 certificate or PEM format
+# data processing. This library increases the size overhead by up to ~115 KB
+# compared to the OpensslLibAccel.inf library instance.
+#
+# Copyright (c) 2010 - 2020, Intel Corporation. All rights reserved.
+# (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+# SPDX-License-Identifier: BSD-2-Clause-Patent
+#
+##
+
+[Defines]
+ INF_VERSION = 0x00010005
+ BASE_NAME = OpensslLibFullAccel
+ MODULE_UNI_FILE = OpensslLibFullAccel.uni
+ FILE_GUID = AC649FB2-ADCF-450A-9C61-ED3CAFF12864
+ MODULE_TYPE = BASE
+ VERSION_STRING = 1.0
+ LIBRARY_CLASS = OpensslLib
+ CONSTRUCTOR = OpensslLibConstructor
+
+ DEFINE OPENSSL_PATH = openssl
+ DEFINE OPENSSL_GEN_PATH = OpensslGen
+ DEFINE OPENSSL_FLAGS = -DL_ENDIAN -DOPENSSL_SMALL_FOOTPRINT -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE
+ DEFINE OPENSSL_FLAGS_IA32 = -DAES_ASM -DGHASH_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM
+ DEFINE OPENSSL_FLAGS_X64 = -DAES_ASM -DBSAES_ASM -DGHASH_ASM -DKECCAK1600_ASM -DMD5_ASM -DOPENSSL_CPUID_OBJ -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM -DVPAES_ASM
+
+#
+# VALID_ARCHITECTURES = IA32 X64
+#
+
+[Sources]
+ OpensslLibConstructor.c
+ $(OPENSSL_PATH)/e_os.h
+ $(OPENSSL_PATH)/ms/uplink.h
+ $(OPENSSL_PATH)/crypto/bn/bn_asm.c
+# Autogenerated files list starts here
+# Autogenerated files list ends here
+ buildinf.h
+ buildinf.c
+ OpensslStub/ossl_store.c
+ OpensslStub/rand_pool.c
+# OpensslStub/SslNull.c
+# OpensslStub/EcSm2Null.c
+ OpensslStub/uefiprov.c
+ OpensslStub/EncoderNull.c
+ OpensslStub/SslStatServNull.c
+ OpensslStub/SslExtServNull.c
+ OpensslStub/Pkcs12Null.c
+ OpensslStub/CipherNull.c
+
+[Sources.IA32]
+# Autogenerated files list starts here
+ #$(OPENSSL_PATH)/crypto/aes/aes_cfb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ecb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ige.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_misc.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ofb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_wrap.c
+ $(OPENSSL_PATH)/crypto/asn1/a_bitstr.c
+ $(OPENSSL_PATH)/crypto/asn1/a_d2i_fp.c
+ $(OPENSSL_PATH)/crypto/asn1/a_digest.c
+ $(OPENSSL_PATH)/crypto/asn1/a_dup.c
+ $(OPENSSL_PATH)/crypto/asn1/a_gentm.c
+ $(OPENSSL_PATH)/crypto/asn1/a_i2d_fp.c
+ $(OPENSSL_PATH)/crypto/asn1/a_int.c
+ $(OPENSSL_PATH)/crypto/asn1/a_mbstr.c
+ $(OPENSSL_PATH)/crypto/asn1/a_object.c
+ $(OPENSSL_PATH)/crypto/asn1/a_octet.c
+ $(OPENSSL_PATH)/crypto/asn1/a_print.c
+ $(OPENSSL_PATH)/crypto/asn1/a_sign.c
+ $(OPENSSL_PATH)/crypto/asn1/a_strex.c
+ $(OPENSSL_PATH)/crypto/asn1/a_strnid.c
+ $(OPENSSL_PATH)/crypto/asn1/a_time.c
+ $(OPENSSL_PATH)/crypto/asn1/a_type.c
+ $(OPENSSL_PATH)/crypto/asn1/a_utctm.c
+ $(OPENSSL_PATH)/crypto/asn1/a_utf8.c
+ $(OPENSSL_PATH)/crypto/asn1/a_verify.c
+ $(OPENSSL_PATH)/crypto/asn1/ameth_lib.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_err.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_gen.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_item_list.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_lib.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_parse.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_mime.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_moid.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_mstbl.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_pack.c
+ $(OPENSSL_PATH)/crypto/asn1/bio_asn1.c
+ $(OPENSSL_PATH)/crypto/asn1/bio_ndef.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_param.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_pr.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_pu.c
+ $(OPENSSL_PATH)/crypto/asn1/evp_asn1.c
+ $(OPENSSL_PATH)/crypto/asn1/f_int.c
+ $(OPENSSL_PATH)/crypto/asn1/f_string.c
+ $(OPENSSL_PATH)/crypto/asn1/i2d_evp.c
+ $(OPENSSL_PATH)/crypto/asn1/nsseq.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_pbe.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_pbev2.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_scrypt.c
+ $(OPENSSL_PATH)/crypto/asn1/p8_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/t_bitst.c
+ $(OPENSSL_PATH)/crypto/asn1/t_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/t_spki.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_dec.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_enc.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_fre.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_new.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_prn.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_scn.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_typ.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_utl.c
+ $(OPENSSL_PATH)/crypto/asn1/x_algor.c
+ $(OPENSSL_PATH)/crypto/asn1/x_bignum.c
+ $(OPENSSL_PATH)/crypto/asn1/x_info.c
+ $(OPENSSL_PATH)/crypto/asn1/x_int64.c
+ $(OPENSSL_PATH)/crypto/asn1/x_long.c
+ $(OPENSSL_PATH)/crypto/asn1/x_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/x_sig.c
+ $(OPENSSL_PATH)/crypto/asn1/x_spki.c
+ $(OPENSSL_PATH)/crypto/asn1/x_val.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_null.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_posix.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_win.c
+ $(OPENSSL_PATH)/crypto/async/async.c
+ $(OPENSSL_PATH)/crypto/async/async_err.c
+ $(OPENSSL_PATH)/crypto/async/async_wait.c
+ $(OPENSSL_PATH)/crypto/bio/bf_buff.c
+ $(OPENSSL_PATH)/crypto/bio/bf_lbuf.c
+ $(OPENSSL_PATH)/crypto/bio/bf_nbio.c
+ $(OPENSSL_PATH)/crypto/bio/bf_null.c
+ $(OPENSSL_PATH)/crypto/bio/bf_prefix.c
+ $(OPENSSL_PATH)/crypto/bio/bf_readbuff.c
+ $(OPENSSL_PATH)/crypto/bio/bio_addr.c
+ $(OPENSSL_PATH)/crypto/bio/bio_cb.c
+ $(OPENSSL_PATH)/crypto/bio/bio_dump.c
+ $(OPENSSL_PATH)/crypto/bio/bio_err.c
+ $(OPENSSL_PATH)/crypto/bio/bio_lib.c
+ $(OPENSSL_PATH)/crypto/bio/bio_meth.c
+ $(OPENSSL_PATH)/crypto/bio/bio_print.c
+ $(OPENSSL_PATH)/crypto/bio/bio_sock.c
+ $(OPENSSL_PATH)/crypto/bio/bio_sock2.c
+ $(OPENSSL_PATH)/crypto/bio/bss_acpt.c
+ $(OPENSSL_PATH)/crypto/bio/bss_bio.c
+ $(OPENSSL_PATH)/crypto/bio/bss_conn.c
+ $(OPENSSL_PATH)/crypto/bio/bss_core.c
+ $(OPENSSL_PATH)/crypto/bio/bss_dgram.c
+ $(OPENSSL_PATH)/crypto/bio/bss_fd.c
+ $(OPENSSL_PATH)/crypto/bio/bss_file.c
+ $(OPENSSL_PATH)/crypto/bio/bss_log.c
+ $(OPENSSL_PATH)/crypto/bio/bss_mem.c
+ $(OPENSSL_PATH)/crypto/bio/bss_null.c
+ $(OPENSSL_PATH)/crypto/bio/bss_sock.c
+ $(OPENSSL_PATH)/crypto/bio/ossl_core_bio.c
+ $(OPENSSL_PATH)/crypto/bn/bn_add.c
+ $(OPENSSL_PATH)/crypto/bn/bn_blind.c
+ $(OPENSSL_PATH)/crypto/bn/bn_const.c
+ $(OPENSSL_PATH)/crypto/bn/bn_conv.c
+ $(OPENSSL_PATH)/crypto/bn/bn_ctx.c
+ $(OPENSSL_PATH)/crypto/bn/bn_dh.c
+ $(OPENSSL_PATH)/crypto/bn/bn_div.c
+ $(OPENSSL_PATH)/crypto/bn/bn_err.c
+ $(OPENSSL_PATH)/crypto/bn/bn_exp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_exp2.c
+ $(OPENSSL_PATH)/crypto/bn/bn_gcd.c
+ $(OPENSSL_PATH)/crypto/bn/bn_gf2m.c
+ $(OPENSSL_PATH)/crypto/bn/bn_intern.c
+ $(OPENSSL_PATH)/crypto/bn/bn_kron.c
+ $(OPENSSL_PATH)/crypto/bn/bn_lib.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mod.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mont.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mpi.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mul.c
+ $(OPENSSL_PATH)/crypto/bn/bn_nist.c
+ $(OPENSSL_PATH)/crypto/bn/bn_prime.c
+ $(OPENSSL_PATH)/crypto/bn/bn_print.c
+ $(OPENSSL_PATH)/crypto/bn/bn_rand.c
+ $(OPENSSL_PATH)/crypto/bn/bn_recp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_rsa_fips186_4.c
+ $(OPENSSL_PATH)/crypto/bn/bn_shift.c
+ $(OPENSSL_PATH)/crypto/bn/bn_sqr.c
+ $(OPENSSL_PATH)/crypto/bn/bn_sqrt.c
+ $(OPENSSL_PATH)/crypto/bn/bn_srp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_word.c
+ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c
+ $(OPENSSL_PATH)/crypto/buffer/buf_err.c
+ $(OPENSSL_PATH)/crypto/buffer/buffer.c
+ $(OPENSSL_PATH)/crypto/comp/c_zlib.c
+ $(OPENSSL_PATH)/crypto/comp/comp_err.c
+ $(OPENSSL_PATH)/crypto/comp/comp_lib.c
+ $(OPENSSL_PATH)/crypto/conf/conf_api.c
+ $(OPENSSL_PATH)/crypto/conf/conf_def.c
+ $(OPENSSL_PATH)/crypto/conf/conf_err.c
+ $(OPENSSL_PATH)/crypto/conf/conf_lib.c
+ $(OPENSSL_PATH)/crypto/conf/conf_mall.c
+ $(OPENSSL_PATH)/crypto/conf/conf_mod.c
+ $(OPENSSL_PATH)/crypto/conf/conf_sap.c
+ $(OPENSSL_PATH)/crypto/conf/conf_ssl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_dl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
+ $(OPENSSL_PATH)/crypto/dso/dso_err.c
+ $(OPENSSL_PATH)/crypto/dso/dso_lib.c
+ $(OPENSSL_PATH)/crypto/dso/dso_openssl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_vms.c
+ $(OPENSSL_PATH)/crypto/dso/dso_win32.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/arch_32/f_impl32.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/arch_64/f_impl64.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/curve448.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/curve448_tables.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/eddsa.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/f_generic.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/scalar.c
+ $(OPENSSL_PATH)/crypto/ec/curve25519.c
+ $(OPENSSL_PATH)/crypto/ec/ec2_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ec2_smpl.c
+ $(OPENSSL_PATH)/crypto/ec/ec_ameth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_asn1.c
+ $(OPENSSL_PATH)/crypto/ec/ec_backend.c
+ $(OPENSSL_PATH)/crypto/ec/ec_check.c
+ $(OPENSSL_PATH)/crypto/ec/ec_curve.c
+ $(OPENSSL_PATH)/crypto/ec/ec_cvt.c
+ $(OPENSSL_PATH)/crypto/ec/ec_deprecated.c
+ $(OPENSSL_PATH)/crypto/ec/ec_err.c
+ $(OPENSSL_PATH)/crypto/ec/ec_key.c
+ $(OPENSSL_PATH)/crypto/ec/ec_kmeth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_lib.c
+ $(OPENSSL_PATH)/crypto/ec/ec_mult.c
+ $(OPENSSL_PATH)/crypto/ec/ec_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ec_pmeth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_print.c
+ $(OPENSSL_PATH)/crypto/ec/ecdh_kdf.c
+ $(OPENSSL_PATH)/crypto/ec/ecdh_ossl.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_ossl.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_sign.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_vrf.c
+ $(OPENSSL_PATH)/crypto/ec/eck_prn.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_mont.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_nist.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_smpl.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_backend.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_key.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_meth.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_err.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_lib.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_meth.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_pkey.c
+ $(OPENSSL_PATH)/crypto/err/err.c
+ $(OPENSSL_PATH)/crypto/err/err_all.c
+ $(OPENSSL_PATH)/crypto/err/err_all_legacy.c
+ $(OPENSSL_PATH)/crypto/err/err_blocks.c
+ $(OPENSSL_PATH)/crypto/err/err_prn.c
+ $(OPENSSL_PATH)/crypto/ess/ess_asn1.c
+ $(OPENSSL_PATH)/crypto/ess/ess_err.c
+ $(OPENSSL_PATH)/crypto/ess/ess_lib.c
+ $(OPENSSL_PATH)/crypto/evp/asymcipher.c
+ $(OPENSSL_PATH)/crypto/evp/bio_b64.c
+ $(OPENSSL_PATH)/crypto/evp/bio_enc.c
+ $(OPENSSL_PATH)/crypto/evp/bio_md.c
+ $(OPENSSL_PATH)/crypto/evp/bio_ok.c
+ #$(OPENSSL_PATH)/crypto/evp/c_allc.c
+ $(OPENSSL_PATH)/crypto/evp/c_alld.c
+ $(OPENSSL_PATH)/crypto/evp/cmeth_lib.c
+ $(OPENSSL_PATH)/crypto/evp/ctrl_params_translate.c
+ $(OPENSSL_PATH)/crypto/evp/dh_ctrl.c
+ $(OPENSSL_PATH)/crypto/evp/dh_support.c
+ $(OPENSSL_PATH)/crypto/evp/digest.c
+ $(OPENSSL_PATH)/crypto/evp/dsa_ctrl.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha1.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha256.c
+ $(OPENSSL_PATH)/crypto/evp/e_aria.c
+ $(OPENSSL_PATH)/crypto/evp/e_bf.c
+ $(OPENSSL_PATH)/crypto/evp/e_cast.c
+ $(OPENSSL_PATH)/crypto/evp/e_chacha20_poly1305.c
+ $(OPENSSL_PATH)/crypto/evp/e_des.c
+ $(OPENSSL_PATH)/crypto/evp/e_des3.c
+ $(OPENSSL_PATH)/crypto/evp/e_idea.c
+ $(OPENSSL_PATH)/crypto/evp/e_null.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc2.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc4.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc4_hmac_md5.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc5.c
+ $(OPENSSL_PATH)/crypto/evp/e_sm4.c
+ $(OPENSSL_PATH)/crypto/evp/e_xcbc_d.c
+ $(OPENSSL_PATH)/crypto/evp/ec_ctrl.c
+ $(OPENSSL_PATH)/crypto/evp/ec_support.c
+ $(OPENSSL_PATH)/crypto/evp/encode.c
+ $(OPENSSL_PATH)/crypto/evp/evp_cnf.c
+ $(OPENSSL_PATH)/crypto/evp/evp_enc.c
+ $(OPENSSL_PATH)/crypto/evp/evp_err.c
+ $(OPENSSL_PATH)/crypto/evp/evp_fetch.c
+ $(OPENSSL_PATH)/crypto/evp/evp_key.c
+ $(OPENSSL_PATH)/crypto/evp/evp_lib.c
+ $(OPENSSL_PATH)/crypto/evp/evp_pbe.c
+ $(OPENSSL_PATH)/crypto/evp/evp_pkey.c
+ $(OPENSSL_PATH)/crypto/evp/evp_rand.c
+ $(OPENSSL_PATH)/crypto/evp/evp_utils.c
+ $(OPENSSL_PATH)/crypto/evp/exchange.c
+ $(OPENSSL_PATH)/crypto/evp/kdf_lib.c
+ $(OPENSSL_PATH)/crypto/evp/kdf_meth.c
+ $(OPENSSL_PATH)/crypto/evp/kem.c
+ $(OPENSSL_PATH)/crypto/evp/keymgmt_lib.c
+ $(OPENSSL_PATH)/crypto/evp/keymgmt_meth.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_md5.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_md5_sha1.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_sha.c
+ $(OPENSSL_PATH)/crypto/evp/m_null.c
+ $(OPENSSL_PATH)/crypto/evp/m_sigver.c
+ $(OPENSSL_PATH)/crypto/evp/mac_lib.c
+ $(OPENSSL_PATH)/crypto/evp/mac_meth.c
+ $(OPENSSL_PATH)/crypto/evp/names.c
+ $(OPENSSL_PATH)/crypto/evp/p5_crpt.c
+ $(OPENSSL_PATH)/crypto/evp/p5_crpt2.c
+ $(OPENSSL_PATH)/crypto/evp/p_dec.c
+ $(OPENSSL_PATH)/crypto/evp/p_enc.c
+ $(OPENSSL_PATH)/crypto/evp/p_legacy.c
+ $(OPENSSL_PATH)/crypto/evp/p_lib.c
+ $(OPENSSL_PATH)/crypto/evp/p_open.c
+ $(OPENSSL_PATH)/crypto/evp/p_seal.c
+ $(OPENSSL_PATH)/crypto/evp/p_sign.c
+ $(OPENSSL_PATH)/crypto/evp/p_verify.c
+ $(OPENSSL_PATH)/crypto/evp/pbe_scrypt.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_check.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_gn.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_lib.c
+ $(OPENSSL_PATH)/crypto/evp/signature.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_backend.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_dh.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_key_generate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_key_validate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c
+ $(OPENSSL_PATH)/crypto/hmac/hmac.c
+ $(OPENSSL_PATH)/crypto/http/http_client.c
+ $(OPENSSL_PATH)/crypto/http/http_err.c
+ $(OPENSSL_PATH)/crypto/http/http_lib.c
+ $(OPENSSL_PATH)/crypto/kdf/kdf_err.c
+ $(OPENSSL_PATH)/crypto/lhash/lh_stats.c
+ $(OPENSSL_PATH)/crypto/lhash/lhash.c
+ $(OPENSSL_PATH)/crypto/asn1_dsa.c
+ $(OPENSSL_PATH)/crypto/bsearch.c
+ $(OPENSSL_PATH)/crypto/context.c
+ $(OPENSSL_PATH)/crypto/core_algorithm.c
+ $(OPENSSL_PATH)/crypto/core_fetch.c
+ $(OPENSSL_PATH)/crypto/core_namemap.c
+ $(OPENSSL_PATH)/crypto/cpt_err.c
+ $(OPENSSL_PATH)/crypto/cpuid.c
+ $(OPENSSL_PATH)/crypto/cryptlib.c
+ $(OPENSSL_PATH)/crypto/ctype.c
+ $(OPENSSL_PATH)/crypto/cversion.c
+ $(OPENSSL_PATH)/crypto/der_writer.c
+ $(OPENSSL_PATH)/crypto/ebcdic.c
+ $(OPENSSL_PATH)/crypto/ex_data.c
+ $(OPENSSL_PATH)/crypto/getenv.c
+ $(OPENSSL_PATH)/crypto/info.c
+ $(OPENSSL_PATH)/crypto/init.c
+ $(OPENSSL_PATH)/crypto/initthread.c
+ $(OPENSSL_PATH)/crypto/mem.c
+ $(OPENSSL_PATH)/crypto/mem_sec.c
+ $(OPENSSL_PATH)/crypto/o_dir.c
+ $(OPENSSL_PATH)/crypto/o_fopen.c
+ $(OPENSSL_PATH)/crypto/o_init.c
+ $(OPENSSL_PATH)/crypto/o_str.c
+ $(OPENSSL_PATH)/crypto/o_time.c
+ $(OPENSSL_PATH)/crypto/packet.c
+ $(OPENSSL_PATH)/crypto/param_build.c
+ $(OPENSSL_PATH)/crypto/param_build_set.c
+ $(OPENSSL_PATH)/crypto/params.c
+ $(OPENSSL_PATH)/crypto/params_dup.c
+ $(OPENSSL_PATH)/crypto/params_from_text.c
+ $(OPENSSL_PATH)/crypto/passphrase.c
+ $(OPENSSL_PATH)/crypto/provider.c
+ $(OPENSSL_PATH)/crypto/provider_child.c
+ $(OPENSSL_PATH)/crypto/provider_conf.c
+ $(OPENSSL_PATH)/crypto/provider_core.c
+ $(OPENSSL_PATH)/crypto/punycode.c
+ $(OPENSSL_PATH)/crypto/self_test_core.c
+ $(OPENSSL_PATH)/crypto/sparse_array.c
+ $(OPENSSL_PATH)/crypto/threads_lib.c
+ $(OPENSSL_PATH)/crypto/threads_none.c
+ $(OPENSSL_PATH)/crypto/threads_pthread.c
+ $(OPENSSL_PATH)/crypto/threads_win.c
+ $(OPENSSL_PATH)/crypto/trace.c
+ $(OPENSSL_PATH)/crypto/uid.c
+ $(OPENSSL_PATH)/crypto/md5/md5_dgst.c
+ $(OPENSSL_PATH)/crypto/md5/md5_one.c
+ $(OPENSSL_PATH)/crypto/md5/md5_sha1.c
+ $(OPENSSL_PATH)/crypto/modes/cbc128.c
+ $(OPENSSL_PATH)/crypto/modes/ccm128.c
+ $(OPENSSL_PATH)/crypto/modes/cfb128.c
+ $(OPENSSL_PATH)/crypto/modes/ctr128.c
+ $(OPENSSL_PATH)/crypto/modes/cts128.c
+ $(OPENSSL_PATH)/crypto/modes/gcm128.c
+ $(OPENSSL_PATH)/crypto/modes/ocb128.c
+ $(OPENSSL_PATH)/crypto/modes/ofb128.c
+ $(OPENSSL_PATH)/crypto/modes/siv128.c
+ $(OPENSSL_PATH)/crypto/modes/wrap128.c
+ $(OPENSSL_PATH)/crypto/modes/xts128.c
+ $(OPENSSL_PATH)/crypto/objects/o_names.c
+ $(OPENSSL_PATH)/crypto/objects/obj_dat.c
+ $(OPENSSL_PATH)/crypto/objects/obj_err.c
+ $(OPENSSL_PATH)/crypto/objects/obj_lib.c
+ $(OPENSSL_PATH)/crypto/objects/obj_xref.c
+ $(OPENSSL_PATH)/crypto/pem/pem_all.c
+ $(OPENSSL_PATH)/crypto/pem/pem_err.c
+ $(OPENSSL_PATH)/crypto/pem/pem_info.c
+ $(OPENSSL_PATH)/crypto/pem/pem_lib.c
+ $(OPENSSL_PATH)/crypto/pem/pem_oth.c
+ $(OPENSSL_PATH)/crypto/pem/pem_pk8.c
+ $(OPENSSL_PATH)/crypto/pem/pem_pkey.c
+ $(OPENSSL_PATH)/crypto/pem/pem_sign.c
+ $(OPENSSL_PATH)/crypto/pem/pem_x509.c
+ $(OPENSSL_PATH)/crypto/pem/pem_xaux.c
+ $(OPENSSL_PATH)/crypto/pem/pvkfmt.c
+ $(OPENSSL_PATH)/crypto/pkcs7/bio_pk7.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_asn1.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_attr.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_doit.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_lib.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_mime.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_smime.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pkcs7err.c
+ $(OPENSSL_PATH)/crypto/property/defn_cache.c
+ $(OPENSSL_PATH)/crypto/property/property.c
+ $(OPENSSL_PATH)/crypto/property/property_err.c
+ $(OPENSSL_PATH)/crypto/property/property_parse.c
+ $(OPENSSL_PATH)/crypto/property/property_query.c
+ $(OPENSSL_PATH)/crypto/property/property_string.c
+ $(OPENSSL_PATH)/crypto/rand/prov_seed.c
+ $(OPENSSL_PATH)/crypto/rand/rand_deprecated.c
+ $(OPENSSL_PATH)/crypto/rand/rand_err.c
+ $(OPENSSL_PATH)/crypto/rand/rand_lib.c
+ $(OPENSSL_PATH)/crypto/rand/rand_meth.c
+ $(OPENSSL_PATH)/crypto/rand/rand_pool.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_chk.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_crpt.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_err.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_gen.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_lib.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_meth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_mp.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_mp_names.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_none.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_oaep.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_ossl.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pk1.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pmeth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_prn.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pss.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_saos.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_schemes.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sign.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_check.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_gen.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_x931.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_x931g.c
+ $(OPENSSL_PATH)/crypto/sha/keccak1600.c
+ $(OPENSSL_PATH)/crypto/sha/sha1_one.c
+ $(OPENSSL_PATH)/crypto/sha/sha1dgst.c
+ $(OPENSSL_PATH)/crypto/sha/sha256.c
+ $(OPENSSL_PATH)/crypto/sha/sha3.c
+ $(OPENSSL_PATH)/crypto/sha/sha512.c
+ $(OPENSSL_PATH)/crypto/stack/stack.c
+ $(OPENSSL_PATH)/crypto/txt_db/txt_db.c
+ $(OPENSSL_PATH)/crypto/ui/ui_err.c
+ $(OPENSSL_PATH)/crypto/ui/ui_lib.c
+ $(OPENSSL_PATH)/crypto/ui/ui_null.c
+ $(OPENSSL_PATH)/crypto/ui/ui_openssl.c
+ $(OPENSSL_PATH)/crypto/ui/ui_util.c
+ $(OPENSSL_PATH)/crypto/x509/by_dir.c
+ $(OPENSSL_PATH)/crypto/x509/by_file.c
+ $(OPENSSL_PATH)/crypto/x509/by_store.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_cache.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_data.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_lib.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_map.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_node.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_tree.c
+ $(OPENSSL_PATH)/crypto/x509/t_crl.c
+ $(OPENSSL_PATH)/crypto/x509/t_req.c
+ $(OPENSSL_PATH)/crypto/x509/t_x509.c
+ $(OPENSSL_PATH)/crypto/x509/v3_addr.c
+ $(OPENSSL_PATH)/crypto/x509/v3_admis.c
+ $(OPENSSL_PATH)/crypto/x509/v3_akeya.c
+ $(OPENSSL_PATH)/crypto/x509/v3_akid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_asid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_bcons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_bitst.c
+ $(OPENSSL_PATH)/crypto/x509/v3_conf.c
+ $(OPENSSL_PATH)/crypto/x509/v3_cpols.c
+ $(OPENSSL_PATH)/crypto/x509/v3_crld.c
+ $(OPENSSL_PATH)/crypto/x509/v3_enum.c
+ $(OPENSSL_PATH)/crypto/x509/v3_extku.c
+ $(OPENSSL_PATH)/crypto/x509/v3_genn.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ia5.c
+ $(OPENSSL_PATH)/crypto/x509/v3_info.c
+ $(OPENSSL_PATH)/crypto/x509/v3_int.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ist.c
+ $(OPENSSL_PATH)/crypto/x509/v3_lib.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ncons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pci.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pcia.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pcons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pku.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pmaps.c
+ $(OPENSSL_PATH)/crypto/x509/v3_prn.c
+ $(OPENSSL_PATH)/crypto/x509/v3_purp.c
+ $(OPENSSL_PATH)/crypto/x509/v3_san.c
+ $(OPENSSL_PATH)/crypto/x509/v3_skid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c
+ $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c
+ $(OPENSSL_PATH)/crypto/x509/v3_utf8.c
+ $(OPENSSL_PATH)/crypto/x509/v3_utl.c
+ $(OPENSSL_PATH)/crypto/x509/v3err.c
+ $(OPENSSL_PATH)/crypto/x509/x509_att.c
+ $(OPENSSL_PATH)/crypto/x509/x509_cmp.c
+ $(OPENSSL_PATH)/crypto/x509/x509_d2.c
+ $(OPENSSL_PATH)/crypto/x509/x509_def.c
+ $(OPENSSL_PATH)/crypto/x509/x509_err.c
+ $(OPENSSL_PATH)/crypto/x509/x509_ext.c
+ $(OPENSSL_PATH)/crypto/x509/x509_lu.c
+ $(OPENSSL_PATH)/crypto/x509/x509_meth.c
+ $(OPENSSL_PATH)/crypto/x509/x509_obj.c
+ $(OPENSSL_PATH)/crypto/x509/x509_r2x.c
+ $(OPENSSL_PATH)/crypto/x509/x509_req.c
+ $(OPENSSL_PATH)/crypto/x509/x509_set.c
+ $(OPENSSL_PATH)/crypto/x509/x509_trust.c
+ $(OPENSSL_PATH)/crypto/x509/x509_txt.c
+ $(OPENSSL_PATH)/crypto/x509/x509_v3.c
+ $(OPENSSL_PATH)/crypto/x509/x509_vfy.c
+ $(OPENSSL_PATH)/crypto/x509/x509_vpm.c
+ $(OPENSSL_PATH)/crypto/x509/x509cset.c
+ $(OPENSSL_PATH)/crypto/x509/x509name.c
+ $(OPENSSL_PATH)/crypto/x509/x509rset.c
+ $(OPENSSL_PATH)/crypto/x509/x509spki.c
+ $(OPENSSL_PATH)/crypto/x509/x509type.c
+ $(OPENSSL_PATH)/crypto/x509/x_all.c
+ $(OPENSSL_PATH)/crypto/x509/x_attrib.c
+ $(OPENSSL_PATH)/crypto/x509/x_crl.c
+ $(OPENSSL_PATH)/crypto/x509/x_exten.c
+ $(OPENSSL_PATH)/crypto/x509/x_name.c
+ $(OPENSSL_PATH)/crypto/x509/x_pubkey.c
+ $(OPENSSL_PATH)/crypto/x509/x_req.c
+ $(OPENSSL_PATH)/crypto/x509/x_x509.c
+ $(OPENSSL_PATH)/crypto/x509/x_x509a.c
+ $(OPENSSL_PATH)/providers/nullprov.c
+ $(OPENSSL_PATH)/providers/prov_running.c
+ $(OPENSSL_PATH)/providers/common/der/der_rsa_sig.c
+ $(OPENSSL_PATH)/providers/common/bio_prov.c
+ $(OPENSSL_PATH)/providers/common/capabilities.c
+ $(OPENSSL_PATH)/providers/common/digest_to_nid.c
+ $(OPENSSL_PATH)/providers/common/provider_seeding.c
+ $(OPENSSL_PATH)/providers/common/provider_util.c
+ $(OPENSSL_PATH)/providers/common/securitycheck.c
+ $(OPENSSL_PATH)/providers/common/securitycheck_default.c
+ $(OPENSSL_PATH)/providers/implementations/asymciphers/rsa_enc.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_wrp.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_fips.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_cts.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_null.c
+ $(OPENSSL_PATH)/providers/implementations/digests/md5_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/md5_sha1_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pem2der.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2_fips.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pkcs12kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/scrypt.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/sshkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/rands/crngt.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c
+ $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_unix.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c
+ $(OPENSSL_PATH)/providers/implementations/signature/ecdsa_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c
+ $(OPENSSL_PATH)/ssl/s3_cbc.c
+ $(OPENSSL_PATH)/providers/common/der/der_ec_key.c
+ $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c
+ $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c
+ $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c
+ $(OPENSSL_PATH)/providers/common/provider_ctx.c
+ $(OPENSSL_PATH)/providers/common/provider_err.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_block.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c
+ $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c
+ $(OPENSSL_PATH)/ssl/record/tls_pad.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c
+ $(OPENSSL_PATH)/ssl/bio_ssl.c
+ $(OPENSSL_PATH)/ssl/d1_lib.c
+ $(OPENSSL_PATH)/ssl/d1_msg.c
+ $(OPENSSL_PATH)/ssl/d1_srtp.c
+ $(OPENSSL_PATH)/ssl/methods.c
+ $(OPENSSL_PATH)/ssl/pqueue.c
+ $(OPENSSL_PATH)/ssl/s3_enc.c
+ $(OPENSSL_PATH)/ssl/s3_lib.c
+ $(OPENSSL_PATH)/ssl/s3_msg.c
+ $(OPENSSL_PATH)/ssl/ssl_asn1.c
+ $(OPENSSL_PATH)/ssl/ssl_cert.c
+ $(OPENSSL_PATH)/ssl/ssl_ciph.c
+ $(OPENSSL_PATH)/ssl/ssl_conf.c
+ $(OPENSSL_PATH)/ssl/ssl_err.c
+ $(OPENSSL_PATH)/ssl/ssl_err_legacy.c
+ $(OPENSSL_PATH)/ssl/ssl_init.c
+ $(OPENSSL_PATH)/ssl/ssl_lib.c
+ $(OPENSSL_PATH)/ssl/ssl_mcnf.c
+ $(OPENSSL_PATH)/ssl/ssl_rsa.c
+ $(OPENSSL_PATH)/ssl/ssl_rsa_legacy.c
+ $(OPENSSL_PATH)/ssl/ssl_sess.c
+ $(OPENSSL_PATH)/ssl/ssl_stat.c
+ $(OPENSSL_PATH)/ssl/ssl_txt.c
+ $(OPENSSL_PATH)/ssl/ssl_utst.c
+ $(OPENSSL_PATH)/ssl/t1_enc.c
+ $(OPENSSL_PATH)/ssl/t1_lib.c
+ $(OPENSSL_PATH)/ssl/t1_trce.c
+ $(OPENSSL_PATH)/ssl/tls13_enc.c
+ $(OPENSSL_PATH)/ssl/tls_depr.c
+ $(OPENSSL_PATH)/ssl/tls_srp.c
+ $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c
+ $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c
+ $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_record.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c
+ $(OPENSSL_PATH)/ssl/statem/extensions.c
+ $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c
+ $(OPENSSL_PATH)/ssl/statem/extensions_cust.c
+ $(OPENSSL_PATH)/ssl/statem/statem.c
+ $(OPENSSL_PATH)/ssl/statem/statem_clnt.c
+ $(OPENSSL_PATH)/ssl/statem/statem_dtls.c
+ $(OPENSSL_PATH)/ssl/statem/statem_lib.c
+ #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/aes-586.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/aesni-x86.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/aes/vpaes-x86.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/x86cpuid.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/md5/md5-586.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/modes/ghash-x86.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha1-586.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha256-586.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/IA32-MSFT/crypto/sha/sha512-586.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/aes-586.S | GCC
+ #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/aesni-x86.S | GCC
+ #$(OPENSSL_GEN_PATH)/IA32-GCC/crypto/aes/vpaes-x86.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/x86cpuid.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/md5/md5-586.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/modes/ghash-x86.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha1-586.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha256-586.S | GCC
+ $(OPENSSL_GEN_PATH)/IA32-GCC/crypto/sha/sha512-586.S | GCC
+# Autogenerated files list ends here
+
+[Sources.X64]
+ X64/ApiHooks.c
+# Autogenerated files list starts here
+ #$(OPENSSL_PATH)/crypto/aes/aes_cfb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ecb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ige.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_misc.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_ofb.c
+ #$(OPENSSL_PATH)/crypto/aes/aes_wrap.c
+ $(OPENSSL_PATH)/crypto/asn1/a_bitstr.c
+ $(OPENSSL_PATH)/crypto/asn1/a_d2i_fp.c
+ $(OPENSSL_PATH)/crypto/asn1/a_digest.c
+ $(OPENSSL_PATH)/crypto/asn1/a_dup.c
+ $(OPENSSL_PATH)/crypto/asn1/a_gentm.c
+ $(OPENSSL_PATH)/crypto/asn1/a_i2d_fp.c
+ $(OPENSSL_PATH)/crypto/asn1/a_int.c
+ $(OPENSSL_PATH)/crypto/asn1/a_mbstr.c
+ $(OPENSSL_PATH)/crypto/asn1/a_object.c
+ $(OPENSSL_PATH)/crypto/asn1/a_octet.c
+ $(OPENSSL_PATH)/crypto/asn1/a_print.c
+ $(OPENSSL_PATH)/crypto/asn1/a_sign.c
+ $(OPENSSL_PATH)/crypto/asn1/a_strex.c
+ $(OPENSSL_PATH)/crypto/asn1/a_strnid.c
+ $(OPENSSL_PATH)/crypto/asn1/a_time.c
+ $(OPENSSL_PATH)/crypto/asn1/a_type.c
+ $(OPENSSL_PATH)/crypto/asn1/a_utctm.c
+ $(OPENSSL_PATH)/crypto/asn1/a_utf8.c
+ $(OPENSSL_PATH)/crypto/asn1/a_verify.c
+ $(OPENSSL_PATH)/crypto/asn1/ameth_lib.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_err.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_gen.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_item_list.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_lib.c
+ $(OPENSSL_PATH)/crypto/asn1/asn1_parse.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_mime.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_moid.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_mstbl.c
+ $(OPENSSL_PATH)/crypto/asn1/asn_pack.c
+ $(OPENSSL_PATH)/crypto/asn1/bio_asn1.c
+ $(OPENSSL_PATH)/crypto/asn1/bio_ndef.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_param.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_pr.c
+ $(OPENSSL_PATH)/crypto/asn1/d2i_pu.c
+ $(OPENSSL_PATH)/crypto/asn1/evp_asn1.c
+ $(OPENSSL_PATH)/crypto/asn1/f_int.c
+ $(OPENSSL_PATH)/crypto/asn1/f_string.c
+ $(OPENSSL_PATH)/crypto/asn1/i2d_evp.c
+ $(OPENSSL_PATH)/crypto/asn1/nsseq.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_pbe.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_pbev2.c
+ $(OPENSSL_PATH)/crypto/asn1/p5_scrypt.c
+ $(OPENSSL_PATH)/crypto/asn1/p8_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/t_bitst.c
+ $(OPENSSL_PATH)/crypto/asn1/t_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/t_spki.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_dec.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_enc.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_fre.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_new.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_prn.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_scn.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_typ.c
+ $(OPENSSL_PATH)/crypto/asn1/tasn_utl.c
+ $(OPENSSL_PATH)/crypto/asn1/x_algor.c
+ $(OPENSSL_PATH)/crypto/asn1/x_bignum.c
+ $(OPENSSL_PATH)/crypto/asn1/x_info.c
+ $(OPENSSL_PATH)/crypto/asn1/x_int64.c
+ $(OPENSSL_PATH)/crypto/asn1/x_long.c
+ $(OPENSSL_PATH)/crypto/asn1/x_pkey.c
+ $(OPENSSL_PATH)/crypto/asn1/x_sig.c
+ $(OPENSSL_PATH)/crypto/asn1/x_spki.c
+ $(OPENSSL_PATH)/crypto/asn1/x_val.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_null.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_posix.c
+ $(OPENSSL_PATH)/crypto/async/arch/async_win.c
+ $(OPENSSL_PATH)/crypto/async/async.c
+ $(OPENSSL_PATH)/crypto/async/async_err.c
+ $(OPENSSL_PATH)/crypto/async/async_wait.c
+ $(OPENSSL_PATH)/crypto/bio/bf_buff.c
+ $(OPENSSL_PATH)/crypto/bio/bf_lbuf.c
+ $(OPENSSL_PATH)/crypto/bio/bf_nbio.c
+ $(OPENSSL_PATH)/crypto/bio/bf_null.c
+ $(OPENSSL_PATH)/crypto/bio/bf_prefix.c
+ $(OPENSSL_PATH)/crypto/bio/bf_readbuff.c
+ $(OPENSSL_PATH)/crypto/bio/bio_addr.c
+ $(OPENSSL_PATH)/crypto/bio/bio_cb.c
+ $(OPENSSL_PATH)/crypto/bio/bio_dump.c
+ $(OPENSSL_PATH)/crypto/bio/bio_err.c
+ $(OPENSSL_PATH)/crypto/bio/bio_lib.c
+ $(OPENSSL_PATH)/crypto/bio/bio_meth.c
+ $(OPENSSL_PATH)/crypto/bio/bio_print.c
+ $(OPENSSL_PATH)/crypto/bio/bio_sock.c
+ $(OPENSSL_PATH)/crypto/bio/bio_sock2.c
+ $(OPENSSL_PATH)/crypto/bio/bss_acpt.c
+ $(OPENSSL_PATH)/crypto/bio/bss_bio.c
+ $(OPENSSL_PATH)/crypto/bio/bss_conn.c
+ $(OPENSSL_PATH)/crypto/bio/bss_core.c
+ $(OPENSSL_PATH)/crypto/bio/bss_dgram.c
+ $(OPENSSL_PATH)/crypto/bio/bss_fd.c
+ $(OPENSSL_PATH)/crypto/bio/bss_file.c
+ $(OPENSSL_PATH)/crypto/bio/bss_log.c
+ $(OPENSSL_PATH)/crypto/bio/bss_mem.c
+ $(OPENSSL_PATH)/crypto/bio/bss_null.c
+ $(OPENSSL_PATH)/crypto/bio/bss_sock.c
+ $(OPENSSL_PATH)/crypto/bio/ossl_core_bio.c
+ $(OPENSSL_PATH)/crypto/bn/bn_add.c
+ $(OPENSSL_PATH)/crypto/bn/bn_blind.c
+ $(OPENSSL_PATH)/crypto/bn/bn_const.c
+ $(OPENSSL_PATH)/crypto/bn/bn_conv.c
+ $(OPENSSL_PATH)/crypto/bn/bn_ctx.c
+ $(OPENSSL_PATH)/crypto/bn/bn_dh.c
+ $(OPENSSL_PATH)/crypto/bn/bn_div.c
+ $(OPENSSL_PATH)/crypto/bn/bn_err.c
+ $(OPENSSL_PATH)/crypto/bn/bn_exp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_exp2.c
+ $(OPENSSL_PATH)/crypto/bn/bn_gcd.c
+ $(OPENSSL_PATH)/crypto/bn/bn_gf2m.c
+ $(OPENSSL_PATH)/crypto/bn/bn_intern.c
+ $(OPENSSL_PATH)/crypto/bn/bn_kron.c
+ $(OPENSSL_PATH)/crypto/bn/bn_lib.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mod.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mont.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mpi.c
+ $(OPENSSL_PATH)/crypto/bn/bn_mul.c
+ $(OPENSSL_PATH)/crypto/bn/bn_nist.c
+ $(OPENSSL_PATH)/crypto/bn/bn_prime.c
+ $(OPENSSL_PATH)/crypto/bn/bn_print.c
+ $(OPENSSL_PATH)/crypto/bn/bn_rand.c
+ $(OPENSSL_PATH)/crypto/bn/bn_recp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_rsa_fips186_4.c
+ $(OPENSSL_PATH)/crypto/bn/bn_shift.c
+ $(OPENSSL_PATH)/crypto/bn/bn_sqr.c
+ $(OPENSSL_PATH)/crypto/bn/bn_sqrt.c
+ $(OPENSSL_PATH)/crypto/bn/bn_srp.c
+ $(OPENSSL_PATH)/crypto/bn/bn_word.c
+ $(OPENSSL_PATH)/crypto/bn/bn_x931p.c
+ $(OPENSSL_PATH)/crypto/bn/rsaz_exp.c
+ $(OPENSSL_PATH)/crypto/bn/rsaz_exp_x2.c
+ $(OPENSSL_PATH)/crypto/buffer/buf_err.c
+ $(OPENSSL_PATH)/crypto/buffer/buffer.c
+ $(OPENSSL_PATH)/crypto/comp/c_zlib.c
+ $(OPENSSL_PATH)/crypto/comp/comp_err.c
+ $(OPENSSL_PATH)/crypto/comp/comp_lib.c
+ $(OPENSSL_PATH)/crypto/conf/conf_api.c
+ $(OPENSSL_PATH)/crypto/conf/conf_def.c
+ $(OPENSSL_PATH)/crypto/conf/conf_err.c
+ $(OPENSSL_PATH)/crypto/conf/conf_lib.c
+ $(OPENSSL_PATH)/crypto/conf/conf_mall.c
+ $(OPENSSL_PATH)/crypto/conf/conf_mod.c
+ $(OPENSSL_PATH)/crypto/conf/conf_sap.c
+ $(OPENSSL_PATH)/crypto/conf/conf_ssl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_dl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_dlfcn.c
+ $(OPENSSL_PATH)/crypto/dso/dso_err.c
+ $(OPENSSL_PATH)/crypto/dso/dso_lib.c
+ $(OPENSSL_PATH)/crypto/dso/dso_openssl.c
+ $(OPENSSL_PATH)/crypto/dso/dso_vms.c
+ $(OPENSSL_PATH)/crypto/dso/dso_win32.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/arch_32/f_impl32.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/arch_64/f_impl64.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/curve448.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/curve448_tables.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/eddsa.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/f_generic.c
+ $(OPENSSL_PATH)/crypto/ec/curve448/scalar.c
+ $(OPENSSL_PATH)/crypto/ec/curve25519.c
+ $(OPENSSL_PATH)/crypto/ec/ec2_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ec2_smpl.c
+ $(OPENSSL_PATH)/crypto/ec/ec_ameth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_asn1.c
+ $(OPENSSL_PATH)/crypto/ec/ec_backend.c
+ $(OPENSSL_PATH)/crypto/ec/ec_check.c
+ $(OPENSSL_PATH)/crypto/ec/ec_curve.c
+ $(OPENSSL_PATH)/crypto/ec/ec_cvt.c
+ $(OPENSSL_PATH)/crypto/ec/ec_deprecated.c
+ $(OPENSSL_PATH)/crypto/ec/ec_err.c
+ $(OPENSSL_PATH)/crypto/ec/ec_key.c
+ $(OPENSSL_PATH)/crypto/ec/ec_kmeth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_lib.c
+ $(OPENSSL_PATH)/crypto/ec/ec_mult.c
+ $(OPENSSL_PATH)/crypto/ec/ec_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ec_pmeth.c
+ $(OPENSSL_PATH)/crypto/ec/ec_print.c
+ $(OPENSSL_PATH)/crypto/ec/ecdh_kdf.c
+ $(OPENSSL_PATH)/crypto/ec/ecdh_ossl.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_ossl.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_sign.c
+ $(OPENSSL_PATH)/crypto/ec/ecdsa_vrf.c
+ $(OPENSSL_PATH)/crypto/ec/eck_prn.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_mont.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_nist.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_oct.c
+ $(OPENSSL_PATH)/crypto/ec/ecp_smpl.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_backend.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_key.c
+ $(OPENSSL_PATH)/crypto/ec/ecx_meth.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_err.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_lib.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_meth.c
+ $(OPENSSL_PATH)/crypto/encode_decode/decoder_pkey.c
+ $(OPENSSL_PATH)/crypto/err/err.c
+ $(OPENSSL_PATH)/crypto/err/err_all.c
+ $(OPENSSL_PATH)/crypto/err/err_all_legacy.c
+ $(OPENSSL_PATH)/crypto/err/err_blocks.c
+ $(OPENSSL_PATH)/crypto/err/err_prn.c
+ $(OPENSSL_PATH)/crypto/ess/ess_asn1.c
+ $(OPENSSL_PATH)/crypto/ess/ess_err.c
+ $(OPENSSL_PATH)/crypto/ess/ess_lib.c
+ $(OPENSSL_PATH)/crypto/evp/asymcipher.c
+ $(OPENSSL_PATH)/crypto/evp/bio_b64.c
+ $(OPENSSL_PATH)/crypto/evp/bio_enc.c
+ $(OPENSSL_PATH)/crypto/evp/bio_md.c
+ $(OPENSSL_PATH)/crypto/evp/bio_ok.c
+ #$(OPENSSL_PATH)/crypto/evp/c_allc.c
+ $(OPENSSL_PATH)/crypto/evp/c_alld.c
+ $(OPENSSL_PATH)/crypto/evp/cmeth_lib.c
+ $(OPENSSL_PATH)/crypto/evp/ctrl_params_translate.c
+ $(OPENSSL_PATH)/crypto/evp/dh_ctrl.c
+ $(OPENSSL_PATH)/crypto/evp/dh_support.c
+ $(OPENSSL_PATH)/crypto/evp/digest.c
+ $(OPENSSL_PATH)/crypto/evp/dsa_ctrl.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha1.c
+ #$(OPENSSL_PATH)/crypto/evp/e_aes_cbc_hmac_sha256.c
+ $(OPENSSL_PATH)/crypto/evp/e_aria.c
+ $(OPENSSL_PATH)/crypto/evp/e_bf.c
+ $(OPENSSL_PATH)/crypto/evp/e_cast.c
+ $(OPENSSL_PATH)/crypto/evp/e_chacha20_poly1305.c
+ $(OPENSSL_PATH)/crypto/evp/e_des.c
+ $(OPENSSL_PATH)/crypto/evp/e_des3.c
+ $(OPENSSL_PATH)/crypto/evp/e_idea.c
+ $(OPENSSL_PATH)/crypto/evp/e_null.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc2.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc4.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc4_hmac_md5.c
+ $(OPENSSL_PATH)/crypto/evp/e_rc5.c
+ $(OPENSSL_PATH)/crypto/evp/e_sm4.c
+ $(OPENSSL_PATH)/crypto/evp/e_xcbc_d.c
+ $(OPENSSL_PATH)/crypto/evp/ec_ctrl.c
+ $(OPENSSL_PATH)/crypto/evp/ec_support.c
+ $(OPENSSL_PATH)/crypto/evp/encode.c
+ $(OPENSSL_PATH)/crypto/evp/evp_cnf.c
+ $(OPENSSL_PATH)/crypto/evp/evp_enc.c
+ $(OPENSSL_PATH)/crypto/evp/evp_err.c
+ $(OPENSSL_PATH)/crypto/evp/evp_fetch.c
+ $(OPENSSL_PATH)/crypto/evp/evp_key.c
+ $(OPENSSL_PATH)/crypto/evp/evp_lib.c
+ $(OPENSSL_PATH)/crypto/evp/evp_pbe.c
+ $(OPENSSL_PATH)/crypto/evp/evp_pkey.c
+ $(OPENSSL_PATH)/crypto/evp/evp_rand.c
+ $(OPENSSL_PATH)/crypto/evp/evp_utils.c
+ $(OPENSSL_PATH)/crypto/evp/exchange.c
+ $(OPENSSL_PATH)/crypto/evp/kdf_lib.c
+ $(OPENSSL_PATH)/crypto/evp/kdf_meth.c
+ $(OPENSSL_PATH)/crypto/evp/kem.c
+ $(OPENSSL_PATH)/crypto/evp/keymgmt_lib.c
+ $(OPENSSL_PATH)/crypto/evp/keymgmt_meth.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_md5.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_md5_sha1.c
+ $(OPENSSL_PATH)/crypto/evp/legacy_sha.c
+ $(OPENSSL_PATH)/crypto/evp/m_null.c
+ $(OPENSSL_PATH)/crypto/evp/m_sigver.c
+ $(OPENSSL_PATH)/crypto/evp/mac_lib.c
+ $(OPENSSL_PATH)/crypto/evp/mac_meth.c
+ $(OPENSSL_PATH)/crypto/evp/names.c
+ $(OPENSSL_PATH)/crypto/evp/p5_crpt.c
+ $(OPENSSL_PATH)/crypto/evp/p5_crpt2.c
+ $(OPENSSL_PATH)/crypto/evp/p_dec.c
+ $(OPENSSL_PATH)/crypto/evp/p_enc.c
+ $(OPENSSL_PATH)/crypto/evp/p_legacy.c
+ $(OPENSSL_PATH)/crypto/evp/p_lib.c
+ $(OPENSSL_PATH)/crypto/evp/p_open.c
+ $(OPENSSL_PATH)/crypto/evp/p_seal.c
+ $(OPENSSL_PATH)/crypto/evp/p_sign.c
+ $(OPENSSL_PATH)/crypto/evp/p_verify.c
+ $(OPENSSL_PATH)/crypto/evp/pbe_scrypt.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_check.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_gn.c
+ $(OPENSSL_PATH)/crypto/evp/pmeth_lib.c
+ $(OPENSSL_PATH)/crypto/evp/signature.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_backend.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_dh.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_key_generate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_key_validate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params_generate.c
+ $(OPENSSL_PATH)/crypto/ffc/ffc_params_validate.c
+ $(OPENSSL_PATH)/crypto/hmac/hmac.c
+ $(OPENSSL_PATH)/crypto/http/http_client.c
+ $(OPENSSL_PATH)/crypto/http/http_err.c
+ $(OPENSSL_PATH)/crypto/http/http_lib.c
+ $(OPENSSL_PATH)/crypto/kdf/kdf_err.c
+ $(OPENSSL_PATH)/crypto/lhash/lh_stats.c
+ $(OPENSSL_PATH)/crypto/lhash/lhash.c
+ $(OPENSSL_PATH)/crypto/asn1_dsa.c
+ $(OPENSSL_PATH)/crypto/bsearch.c
+ $(OPENSSL_PATH)/crypto/context.c
+ $(OPENSSL_PATH)/crypto/core_algorithm.c
+ $(OPENSSL_PATH)/crypto/core_fetch.c
+ $(OPENSSL_PATH)/crypto/core_namemap.c
+ $(OPENSSL_PATH)/crypto/cpt_err.c
+ $(OPENSSL_PATH)/crypto/cpuid.c
+ $(OPENSSL_PATH)/crypto/cryptlib.c
+ $(OPENSSL_PATH)/crypto/ctype.c
+ $(OPENSSL_PATH)/crypto/cversion.c
+ $(OPENSSL_PATH)/crypto/der_writer.c
+ $(OPENSSL_PATH)/crypto/ebcdic.c
+ $(OPENSSL_PATH)/crypto/ex_data.c
+ $(OPENSSL_PATH)/crypto/getenv.c
+ $(OPENSSL_PATH)/crypto/info.c
+ $(OPENSSL_PATH)/crypto/init.c
+ $(OPENSSL_PATH)/crypto/initthread.c
+ $(OPENSSL_PATH)/crypto/mem.c
+ $(OPENSSL_PATH)/crypto/mem_sec.c
+ $(OPENSSL_PATH)/crypto/o_dir.c
+ $(OPENSSL_PATH)/crypto/o_fopen.c
+ $(OPENSSL_PATH)/crypto/o_init.c
+ $(OPENSSL_PATH)/crypto/o_str.c
+ $(OPENSSL_PATH)/crypto/o_time.c
+ $(OPENSSL_PATH)/crypto/packet.c
+ $(OPENSSL_PATH)/crypto/param_build.c
+ $(OPENSSL_PATH)/crypto/param_build_set.c
+ $(OPENSSL_PATH)/crypto/params.c
+ $(OPENSSL_PATH)/crypto/params_dup.c
+ $(OPENSSL_PATH)/crypto/params_from_text.c
+ $(OPENSSL_PATH)/crypto/passphrase.c
+ $(OPENSSL_PATH)/crypto/provider.c
+ $(OPENSSL_PATH)/crypto/provider_child.c
+ $(OPENSSL_PATH)/crypto/provider_conf.c
+ $(OPENSSL_PATH)/crypto/provider_core.c
+ $(OPENSSL_PATH)/crypto/punycode.c
+ $(OPENSSL_PATH)/crypto/self_test_core.c
+ $(OPENSSL_PATH)/crypto/sparse_array.c
+ $(OPENSSL_PATH)/crypto/threads_lib.c
+ $(OPENSSL_PATH)/crypto/threads_none.c
+ $(OPENSSL_PATH)/crypto/threads_pthread.c
+ $(OPENSSL_PATH)/crypto/threads_win.c
+ $(OPENSSL_PATH)/crypto/trace.c
+ $(OPENSSL_PATH)/crypto/uid.c
+ $(OPENSSL_PATH)/crypto/md5/md5_dgst.c
+ $(OPENSSL_PATH)/crypto/md5/md5_one.c
+ $(OPENSSL_PATH)/crypto/md5/md5_sha1.c
+ $(OPENSSL_PATH)/crypto/modes/cbc128.c
+ $(OPENSSL_PATH)/crypto/modes/ccm128.c
+ $(OPENSSL_PATH)/crypto/modes/cfb128.c
+ $(OPENSSL_PATH)/crypto/modes/ctr128.c
+ $(OPENSSL_PATH)/crypto/modes/cts128.c
+ $(OPENSSL_PATH)/crypto/modes/gcm128.c
+ $(OPENSSL_PATH)/crypto/modes/ocb128.c
+ $(OPENSSL_PATH)/crypto/modes/ofb128.c
+ $(OPENSSL_PATH)/crypto/modes/siv128.c
+ $(OPENSSL_PATH)/crypto/modes/wrap128.c
+ $(OPENSSL_PATH)/crypto/modes/xts128.c
+ $(OPENSSL_PATH)/crypto/objects/o_names.c
+ $(OPENSSL_PATH)/crypto/objects/obj_dat.c
+ $(OPENSSL_PATH)/crypto/objects/obj_err.c
+ $(OPENSSL_PATH)/crypto/objects/obj_lib.c
+ $(OPENSSL_PATH)/crypto/objects/obj_xref.c
+ $(OPENSSL_PATH)/crypto/pem/pem_all.c
+ $(OPENSSL_PATH)/crypto/pem/pem_err.c
+ $(OPENSSL_PATH)/crypto/pem/pem_info.c
+ $(OPENSSL_PATH)/crypto/pem/pem_lib.c
+ $(OPENSSL_PATH)/crypto/pem/pem_oth.c
+ $(OPENSSL_PATH)/crypto/pem/pem_pk8.c
+ $(OPENSSL_PATH)/crypto/pem/pem_pkey.c
+ $(OPENSSL_PATH)/crypto/pem/pem_sign.c
+ $(OPENSSL_PATH)/crypto/pem/pem_x509.c
+ $(OPENSSL_PATH)/crypto/pem/pem_xaux.c
+ $(OPENSSL_PATH)/crypto/pem/pvkfmt.c
+ $(OPENSSL_PATH)/crypto/pkcs7/bio_pk7.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_asn1.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_attr.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_doit.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_lib.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_mime.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pk7_smime.c
+ $(OPENSSL_PATH)/crypto/pkcs7/pkcs7err.c
+ $(OPENSSL_PATH)/crypto/property/defn_cache.c
+ $(OPENSSL_PATH)/crypto/property/property.c
+ $(OPENSSL_PATH)/crypto/property/property_err.c
+ $(OPENSSL_PATH)/crypto/property/property_parse.c
+ $(OPENSSL_PATH)/crypto/property/property_query.c
+ $(OPENSSL_PATH)/crypto/property/property_string.c
+ $(OPENSSL_PATH)/crypto/rand/prov_seed.c
+ $(OPENSSL_PATH)/crypto/rand/rand_deprecated.c
+ $(OPENSSL_PATH)/crypto/rand/rand_err.c
+ $(OPENSSL_PATH)/crypto/rand/rand_lib.c
+ $(OPENSSL_PATH)/crypto/rand/rand_meth.c
+ $(OPENSSL_PATH)/crypto/rand/rand_pool.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_ameth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_asn1.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_backend.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_chk.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_crpt.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_err.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_gen.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_lib.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_meth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_mp.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_mp_names.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_none.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_oaep.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_ossl.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pk1.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pmeth.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_prn.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_pss.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_saos.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_schemes.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sign.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_check.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_sp800_56b_gen.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_x931.c
+ $(OPENSSL_PATH)/crypto/rsa/rsa_x931g.c
+ $(OPENSSL_PATH)/crypto/sha/sha1_one.c
+ $(OPENSSL_PATH)/crypto/sha/sha1dgst.c
+ $(OPENSSL_PATH)/crypto/sha/sha256.c
+ $(OPENSSL_PATH)/crypto/sha/sha3.c
+ $(OPENSSL_PATH)/crypto/sha/sha512.c
+ $(OPENSSL_PATH)/crypto/stack/stack.c
+ $(OPENSSL_PATH)/crypto/txt_db/txt_db.c
+ $(OPENSSL_PATH)/crypto/ui/ui_err.c
+ $(OPENSSL_PATH)/crypto/ui/ui_lib.c
+ $(OPENSSL_PATH)/crypto/ui/ui_null.c
+ $(OPENSSL_PATH)/crypto/ui/ui_openssl.c
+ $(OPENSSL_PATH)/crypto/ui/ui_util.c
+ $(OPENSSL_PATH)/crypto/x509/by_dir.c
+ $(OPENSSL_PATH)/crypto/x509/by_file.c
+ $(OPENSSL_PATH)/crypto/x509/by_store.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_cache.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_data.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_lib.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_map.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_node.c
+ $(OPENSSL_PATH)/crypto/x509/pcy_tree.c
+ $(OPENSSL_PATH)/crypto/x509/t_crl.c
+ $(OPENSSL_PATH)/crypto/x509/t_req.c
+ $(OPENSSL_PATH)/crypto/x509/t_x509.c
+ $(OPENSSL_PATH)/crypto/x509/v3_addr.c
+ $(OPENSSL_PATH)/crypto/x509/v3_admis.c
+ $(OPENSSL_PATH)/crypto/x509/v3_akeya.c
+ $(OPENSSL_PATH)/crypto/x509/v3_akid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_asid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_bcons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_bitst.c
+ $(OPENSSL_PATH)/crypto/x509/v3_conf.c
+ $(OPENSSL_PATH)/crypto/x509/v3_cpols.c
+ $(OPENSSL_PATH)/crypto/x509/v3_crld.c
+ $(OPENSSL_PATH)/crypto/x509/v3_enum.c
+ $(OPENSSL_PATH)/crypto/x509/v3_extku.c
+ $(OPENSSL_PATH)/crypto/x509/v3_genn.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ia5.c
+ $(OPENSSL_PATH)/crypto/x509/v3_info.c
+ $(OPENSSL_PATH)/crypto/x509/v3_int.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ist.c
+ $(OPENSSL_PATH)/crypto/x509/v3_lib.c
+ $(OPENSSL_PATH)/crypto/x509/v3_ncons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pci.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pcia.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pcons.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pku.c
+ $(OPENSSL_PATH)/crypto/x509/v3_pmaps.c
+ $(OPENSSL_PATH)/crypto/x509/v3_prn.c
+ $(OPENSSL_PATH)/crypto/x509/v3_purp.c
+ $(OPENSSL_PATH)/crypto/x509/v3_san.c
+ $(OPENSSL_PATH)/crypto/x509/v3_skid.c
+ $(OPENSSL_PATH)/crypto/x509/v3_sxnet.c
+ $(OPENSSL_PATH)/crypto/x509/v3_tlsf.c
+ $(OPENSSL_PATH)/crypto/x509/v3_utf8.c
+ $(OPENSSL_PATH)/crypto/x509/v3_utl.c
+ $(OPENSSL_PATH)/crypto/x509/v3err.c
+ $(OPENSSL_PATH)/crypto/x509/x509_att.c
+ $(OPENSSL_PATH)/crypto/x509/x509_cmp.c
+ $(OPENSSL_PATH)/crypto/x509/x509_d2.c
+ $(OPENSSL_PATH)/crypto/x509/x509_def.c
+ $(OPENSSL_PATH)/crypto/x509/x509_err.c
+ $(OPENSSL_PATH)/crypto/x509/x509_ext.c
+ $(OPENSSL_PATH)/crypto/x509/x509_lu.c
+ $(OPENSSL_PATH)/crypto/x509/x509_meth.c
+ $(OPENSSL_PATH)/crypto/x509/x509_obj.c
+ $(OPENSSL_PATH)/crypto/x509/x509_r2x.c
+ $(OPENSSL_PATH)/crypto/x509/x509_req.c
+ $(OPENSSL_PATH)/crypto/x509/x509_set.c
+ $(OPENSSL_PATH)/crypto/x509/x509_trust.c
+ $(OPENSSL_PATH)/crypto/x509/x509_txt.c
+ $(OPENSSL_PATH)/crypto/x509/x509_v3.c
+ $(OPENSSL_PATH)/crypto/x509/x509_vfy.c
+ $(OPENSSL_PATH)/crypto/x509/x509_vpm.c
+ $(OPENSSL_PATH)/crypto/x509/x509cset.c
+ $(OPENSSL_PATH)/crypto/x509/x509name.c
+ $(OPENSSL_PATH)/crypto/x509/x509rset.c
+ $(OPENSSL_PATH)/crypto/x509/x509spki.c
+ $(OPENSSL_PATH)/crypto/x509/x509type.c
+ $(OPENSSL_PATH)/crypto/x509/x_all.c
+ $(OPENSSL_PATH)/crypto/x509/x_attrib.c
+ $(OPENSSL_PATH)/crypto/x509/x_crl.c
+ $(OPENSSL_PATH)/crypto/x509/x_exten.c
+ $(OPENSSL_PATH)/crypto/x509/x_name.c
+ $(OPENSSL_PATH)/crypto/x509/x_pubkey.c
+ $(OPENSSL_PATH)/crypto/x509/x_req.c
+ $(OPENSSL_PATH)/crypto/x509/x_x509.c
+ $(OPENSSL_PATH)/crypto/x509/x_x509a.c
+ $(OPENSSL_PATH)/providers/nullprov.c
+ $(OPENSSL_PATH)/providers/prov_running.c
+ $(OPENSSL_PATH)/providers/common/der/der_rsa_sig.c
+ $(OPENSSL_PATH)/providers/common/bio_prov.c
+ $(OPENSSL_PATH)/providers/common/capabilities.c
+ $(OPENSSL_PATH)/providers/common/digest_to_nid.c
+ $(OPENSSL_PATH)/providers/common/provider_seeding.c
+ $(OPENSSL_PATH)/providers/common/provider_util.c
+ $(OPENSSL_PATH)/providers/common/securitycheck.c
+ $(OPENSSL_PATH)/providers/common/securitycheck_default.c
+ $(OPENSSL_PATH)/providers/implementations/asymciphers/rsa_enc.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha1_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_cbc_hmac_sha256_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_ccm_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_gcm_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_hw.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_wrp.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_fips.c
+ #$(OPENSSL_PATH)/providers/implementations/ciphers/cipher_aes_xts_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_cts.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/cipher_null.c
+ $(OPENSSL_PATH)/providers/implementations/digests/md5_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/md5_sha1_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/null_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/sha2_prov.c
+ $(OPENSSL_PATH)/providers/implementations/digests/sha3_prov.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_der2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_epki2pki.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_msblob2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pem2der.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_pvk2key.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/decode_spki2typespki.c
+ $(OPENSSL_PATH)/providers/implementations/encode_decode/endecoder_common.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/ecdh_exch.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/ecx_exch.c
+ $(OPENSSL_PATH)/providers/implementations/exchange/kdf_exch.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/hkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/kbkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/krb5kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pbkdf2_fips.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/pkcs12kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/scrypt.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/sshkdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/sskdf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/tls1_prf.c
+ $(OPENSSL_PATH)/providers/implementations/kdfs/x942kdf.c
+ $(OPENSSL_PATH)/providers/implementations/kem/rsa_kem.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/ec_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/ecx_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/kdf_legacy_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/mac_legacy_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/keymgmt/rsa_kmgmt.c
+ $(OPENSSL_PATH)/providers/implementations/macs/gmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/macs/hmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/macs/kmac_prov.c
+ $(OPENSSL_PATH)/providers/implementations/rands/crngt.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_ctr.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_hash.c
+ $(OPENSSL_PATH)/providers/implementations/rands/drbg_hmac.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seed_src.c
+ $(OPENSSL_PATH)/providers/implementations/rands/test_rng.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_cpu_x86.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_tsc.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_unix.c
+ $(OPENSSL_PATH)/providers/implementations/rands/seeding/rand_win.c
+ $(OPENSSL_PATH)/providers/implementations/signature/ecdsa_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/eddsa_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/mac_legacy_sig.c
+ $(OPENSSL_PATH)/providers/implementations/signature/rsa_sig.c
+ $(OPENSSL_PATH)/ssl/s3_cbc.c
+ $(OPENSSL_PATH)/providers/common/der/der_ec_key.c
+ $(OPENSSL_PATH)/providers/common/der/der_ec_sig.c
+ $(OPENSSL_PATH)/providers/common/der/der_ecx_key.c
+ $(OPENSSL_PATH)/providers/common/der/der_rsa_key.c
+ $(OPENSSL_PATH)/providers/common/provider_ctx.c
+ $(OPENSSL_PATH)/providers/common/provider_err.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_block.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_ccm_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_gcm_hw.c
+ $(OPENSSL_PATH)/providers/implementations/ciphers/ciphercommon_hw.c
+ $(OPENSSL_PATH)/providers/implementations/digests/digestcommon.c
+ $(OPENSSL_PATH)/ssl/record/tls_pad.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_digests_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_ec_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_ecx_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_rsa_gen.c
+ $(OPENSSL_GEN_PATH)/providers/common/der/der_wrap_gen.c
+ $(OPENSSL_PATH)/ssl/bio_ssl.c
+ $(OPENSSL_PATH)/ssl/d1_lib.c
+ $(OPENSSL_PATH)/ssl/d1_msg.c
+ $(OPENSSL_PATH)/ssl/d1_srtp.c
+ $(OPENSSL_PATH)/ssl/methods.c
+ $(OPENSSL_PATH)/ssl/pqueue.c
+ $(OPENSSL_PATH)/ssl/s3_enc.c
+ $(OPENSSL_PATH)/ssl/s3_lib.c
+ $(OPENSSL_PATH)/ssl/s3_msg.c
+ $(OPENSSL_PATH)/ssl/ssl_asn1.c
+ $(OPENSSL_PATH)/ssl/ssl_cert.c
+ $(OPENSSL_PATH)/ssl/ssl_ciph.c
+ $(OPENSSL_PATH)/ssl/ssl_conf.c
+ $(OPENSSL_PATH)/ssl/ssl_err.c
+ $(OPENSSL_PATH)/ssl/ssl_err_legacy.c
+ $(OPENSSL_PATH)/ssl/ssl_init.c
+ $(OPENSSL_PATH)/ssl/ssl_lib.c
+ $(OPENSSL_PATH)/ssl/ssl_mcnf.c
+ $(OPENSSL_PATH)/ssl/ssl_rsa.c
+ $(OPENSSL_PATH)/ssl/ssl_rsa_legacy.c
+ $(OPENSSL_PATH)/ssl/ssl_sess.c
+ $(OPENSSL_PATH)/ssl/ssl_stat.c
+ $(OPENSSL_PATH)/ssl/ssl_txt.c
+ $(OPENSSL_PATH)/ssl/ssl_utst.c
+ $(OPENSSL_PATH)/ssl/t1_enc.c
+ $(OPENSSL_PATH)/ssl/t1_lib.c
+ $(OPENSSL_PATH)/ssl/t1_trce.c
+ $(OPENSSL_PATH)/ssl/tls13_enc.c
+ $(OPENSSL_PATH)/ssl/tls_depr.c
+ $(OPENSSL_PATH)/ssl/tls_srp.c
+ $(OPENSSL_PATH)/ssl/record/dtls1_bitmap.c
+ $(OPENSSL_PATH)/ssl/record/rec_layer_d1.c
+ $(OPENSSL_PATH)/ssl/record/rec_layer_s3.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_buffer.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_record.c
+ $(OPENSSL_PATH)/ssl/record/ssl3_record_tls13.c
+ $(OPENSSL_PATH)/ssl/statem/extensions.c
+ $(OPENSSL_PATH)/ssl/statem/extensions_clnt.c
+ $(OPENSSL_PATH)/ssl/statem/extensions_cust.c
+ $(OPENSSL_PATH)/ssl/statem/statem.c
+ $(OPENSSL_PATH)/ssl/statem/statem_clnt.c
+ $(OPENSSL_PATH)/ssl/statem/statem_dtls.c
+ $(OPENSSL_PATH)/ssl/statem/statem_lib.c
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aes-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-mb-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-sha1-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-sha256-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/aesni-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/bsaes-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/aes/vpaes-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/x86_64cpuid.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/md5/md5-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/aesni-gcm-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/modes/ghash-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/keccak1600-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha1-mb-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha1-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha256-mb-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha256-x86_64.nasm | MSFT
+ $(OPENSSL_GEN_PATH)/X64-MSFT/crypto/sha/sha512-x86_64.nasm | MSFT
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aes-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-mb-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-sha1-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-sha256-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/aesni-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/bsaes-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/aes/vpaes-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/x86_64cpuid.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/md5/md5-x86_64.s | GCC
+ #$(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/aesni-gcm-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/modes/ghash-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/keccak1600-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha1-mb-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha1-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha256-mb-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha256-x86_64.s | GCC
+ $(OPENSSL_GEN_PATH)/X64-GCC/crypto/sha/sha512-x86_64.s | GCC
+# Autogenerated files list ends here
+
+[Packages]
+ MdePkg/MdePkg.dec
+ CryptoPkg/CryptoPkg.dec
+
+[LibraryClasses]
+ BaseLib
+ DebugLib
+ RngLib
+
+[BuildOptions]
+ #
+ # Disable the following Visual Studio compiler warnings introduced by the OpenSSL source
+ # so that we do not break the build with the /WX option:
+ # C4090: 'function' : different 'const' qualifiers
+ # C4132: 'object' : const object should be initialized (tls13_enc.c)
+ # C4210: nonstandard extension used: function given file scope
+ # C4244: conversion from type1 to type2, possible loss of data
+ # C4245: conversion from type1 to type2, signed/unsigned mismatch
+ # C4267: conversion from size_t to type, possible loss of data
+ # C4306: 'identifier' : conversion from 'type1' to 'type2' of greater size
+ # C4310: cast truncates constant value
+ # C4389: 'operator' : signed/unsigned mismatch (xxxx)
+ # C4700: uninitialized local variable 'name' used. (conf_sap.c(71))
+ # C4702: unreachable code
+ # C4706: assignment within conditional expression
+ # C4819: The file contains a character that cannot be represented in the current code page
+ # C4133: incompatible types - from 'ASN1_TYPE *' to 'const ASN1_STRING *' (v3_genn.c(101))
+ #
+ MSFT:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4310 /wd4389 /wd4700 /wd4702 /wd4706 /wd4819 /wd4133
+ MSFT:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /wd4090 /wd4132 /wd4210 /wd4244 /wd4245 /wd4267 /wd4306 /wd4310 /wd4700 /wd4389 /wd4702 /wd4706 /wd4819 /wd4133
+
+ #
+ # Disable the following Visual Studio 2015 compiler warnings introduced by the OpenSSL source
+ # so that we do not break the build with the /WX option:
+ # C4718: recursive call has no side effects, deleting
+ #
+ MSFT:*_VS2015x86_IA32_CC_FLAGS = /wd4718
+ MSFT:*_VS2015x86_X64_CC_FLAGS = /wd4718
+
+ INTEL:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) /w
+ INTEL:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 -U_MSC_VER -U__ICC $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) /w
+
+ #
+ # Suppress the following build warnings in openssl so we don't break the build with -Werror
+ # -Werror=maybe-uninitialized: warns when there is a path on which the variable may be used without being initialized.
+ # -Werror=format: Check calls to printf and scanf, etc., to make sure that the arguments supplied have
+ # types appropriate to the format string specified.
+ # -Werror=unused-but-set-variable: Warn whenever a local variable is assigned to, but otherwise unused (aside from its declaration).
+ #
+ GCC:*_*_IA32_CC_FLAGS = -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) -Wno-error=maybe-uninitialized -Wno-error=unused-but-set-variable
+ GCC:*_*_X64_CC_FLAGS = -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) -Wno-error=maybe-uninitialized -Wno-error=format -Wno-format -Wno-error=unused-but-set-variable -DNO_MSABI_VA_FUNCS
+ GCC:*_CLANGDWARF_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized -Wno-error=incompatible-pointer-types -Wno-error=pointer-sign -Wno-error=implicit-function-declaration -Wno-error=ignored-pragma-optimize
+ GCC:*_CLANG35_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized
+ GCC:*_CLANG38_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized
+ GCC:*_CLANGPDB_*_CC_FLAGS = -std=c99 -Wno-error=uninitialized -Wno-error=incompatible-pointer-types -Wno-error=pointer-sign -Wno-error=implicit-function-declaration -Wno-error=ignored-pragma-optimize
+ # Revisit after switching to 3.0 branch
+ GCC:*_GCC5_*_CC_FLAGS = -Wno-unused-but-set-variable
+
+ # Suppress the following warnings in openssl so we don't break the build with warnings-as-errors:
+ # 1295: Deprecated declaration - give arg types
+ # 550: was set but never used
+ # 1293: assignment in condition
+ # 111: statement is unreachable (invariably "break;" after "return X;" in case statement)
+ # 68: integer conversion resulted in a change of sign ("if (Status == -1)")
+ # 177: was declared but never referenced
+ # 223: function declared implicitly
+ # 144: a value of type cannot be used to initialize an entity of type
+ # 513: a value of type cannot be assigned to an entity of type
+ # 188: enumerated type mixed with another type (i.e. passing an integer as an enum without a cast)
+ # 1296: Extended constant initialiser used
+ # 128: loop is not reachable - may be emitted inappropriately if code follows a conditional return
+ # from the function that evaluates to true at compile time
+ # 546: transfer of control bypasses initialization - may be emitted inappropriately if the uninitialized
+ # variable is never referenced after the jump
+ # 1: ignore "#1-D: last line of file ends without a newline"
+ # 3017: may be used before being set (NOTE: This was fixed in OpenSSL 1.1 HEAD with
+ # commit d9b8b89bec4480de3a10bdaf9425db371c19145b, and can be dropped then.)
+ XCODE:*_*_IA32_CC_FLAGS = -mmmx -msse -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_IA32) -w -std=c99 -Wno-error=uninitialized
+ XCODE:*_*_X64_CC_FLAGS = -mmmx -msse -U_WIN32 -U_WIN64 $(OPENSSL_FLAGS) $(OPENSSL_FLAGS_X64) -w -std=c99 -Wno-error=uninitialized
diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c b/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c
new file mode 100644
index 00000000000..c305594c202
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslStub/CipherNull.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright 2023 Microsoft.
+ *
+ * A null implementation that avoids pulling in any OpenSSL ciphers,
+ * since they are not used in Project Mu.
+ *
+ */
+
+void openssl_add_all_ciphers_int(void)
+{
+ return;
+}
+
+
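
Illustration only, not part of the patch: with openssl_add_all_ciphers_int() stubbed out, no legacy cipher names are registered, so legacy lookups come back empty while provider fetches still succeed for the ciphers the UEFI provider keeps. A minimal caller-side sketch, assuming the public OpenSSL 3.x EVP APIs; the function name is invented for the example:

#include <openssl/evp.h>

/* Returns 1 if "name" resolves either through the legacy cipher table
 * (expected to be empty with the null stub) or through an active provider. */
int CipherIsAvailable(const char *name)
{
    if (EVP_get_cipherbyname(name) != NULL) {   /* legacy table lookup */
        return 1;
    }
    EVP_CIPHER *fetched = EVP_CIPHER_fetch(NULL, name, NULL);
    if (fetched != NULL) {                      /* provider path still works */
        EVP_CIPHER_free(fetched);
        return 1;
    }
    return 0;
}
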
diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c b/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c
new file mode 100644
index 00000000000..d88786d6a9d
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslStub/Md5Null.c
@@ -0,0 +1,73 @@
+/** @file
+ Null implementation of MD5 functions called by BaseCryptLib.
+
+ Copyright (c) 2022, Intel Corporation. All rights reserved.
+ SPDX-License-Identifier: BSD-2-Clause-Patent
+
+**/
+
+/*
+ * MD5 low level APIs are deprecated for public use, but still ok for
+ * internal use.
+ */
+
+#include "openssl/include/internal/deprecated.h"
+
+#include <openssl/md5.h>
+#include "crypto/evp.h"
+#include "openssl/crypto/evp/legacy_meth.h"
+
+static int init(EVP_MD_CTX *ctx)
+{
+ return 1;
+}
+
+static int update(EVP_MD_CTX *ctx, const void *data, size_t count)
+{
+ return 1;
+}
+
+static int final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+ return 1;
+}
+
+IMPLEMENT_LEGACY_EVP_MD_METH(md5, MD5)
+
+static const EVP_MD md5_md = {
+ NID_md5,
+ NID_md5WithRSAEncryption,
+ MD5_DIGEST_LENGTH,
+ 0,
+ EVP_ORIG_GLOBAL,
+ LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL, MD5_CBLOCK)
+};
+
+const EVP_MD *EVP_md5(void)
+{
+ return NULL;
+}
+
+// Taken from md5_sha1.h
+static const EVP_MD md5_sha1_md = {
+ NID_md5_sha1,
+ NID_md5_sha1,
+ MD5_DIGEST_LENGTH,
+ 0,
+ EVP_ORIG_GLOBAL,
+ LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL, MD5_CBLOCK),
+};
+
+const EVP_MD *EVP_md5_sha1(void)
+{
+ return NULL;
+}
+
+// Used by s3_cbc.c
+void MD5_Transform(MD5_CTX *c, const unsigned char *b) {
+ return;
+}
+
+int MD5_Init(MD5_CTX *c) {
+ return 1;
+}
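
Illustration only, not part of the patch: because EVP_md5() now returns NULL, any legacy digest setup that tries to use it fails up front. A hedged sketch of what a caller would observe, using standard EVP APIs; the probe function is invented for the example:

#include <openssl/evp.h>

/* Hypothetical probe: returns 1 only if a legacy MD5 digest can be set up.
 * With the null stub, EVP_md5() is NULL, so EVP_DigestInit_ex() fails. */
int LegacyMd5Works(void)
{
    int ok = 0;
    EVP_MD_CTX *ctx = EVP_MD_CTX_new();
    if (ctx != NULL) {
        ok = (EVP_DigestInit_ex(ctx, EVP_md5(), NULL) == 1);
        EVP_MD_CTX_free(ctx);
    }
    return ok;
}
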
diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c b/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c
new file mode 100644
index 00000000000..e24638b0681
--- /dev/null
+++ b/CryptoPkg/Library/OpensslLib/OpensslStub/Sm3Null.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2017-2021 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2017 Ribose Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+// Copied from sm3_legacy.c
+
+#include "crypto/evp.h"
+#include "openssl/crypto/evp/legacy_meth.h"
+#include "internal/sm3.h"
+
+static int init(EVP_MD_CTX *ctx)
+{
+ return 1;
+}
+
+static int update(EVP_MD_CTX *ctx, const void *data, size_t count)
+{
+ return 1;
+}
+
+static int final(EVP_MD_CTX *ctx, unsigned char *md)
+{
+ return 1;
+}
+
+IMPLEMENT_LEGACY_EVP_MD_METH_LC(sm3_int, ossl_sm3)
+
+static const EVP_MD sm3_md = {
+ NID_sm3,
+ NID_sm3WithRSAEncryption,
+ SM3_DIGEST_LENGTH,
+ 0,
+ EVP_ORIG_GLOBAL,
+ LEGACY_EVP_MD_METH_TABLE(init, update, final, NULL,
+ SM3_CBLOCK),
+};
+
+const EVP_MD *EVP_sm3(void)
+{
+ return NULL;
+}
diff --git a/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c b/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c
index 40ab7e937c6..09ec2c942a4 100644
--- a/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c
+++ b/CryptoPkg/Library/OpensslLib/OpensslStub/uefiprov.c
@@ -113,17 +113,21 @@ static const OSSL_ALGORITHM deflt_digests[] = {
{ PROV_NAMES_SHA2_512, "provider=default", ossl_sha512_functions },
#ifndef OPENSSL_NO_SM3
+ // MU_CHANGE START
{ PROV_NAMES_SM3, "provider=default", ossl_sm3_functions },
+ // MU_CHANGE END
#endif /* OPENSSL_NO_SM3 */
#ifndef OPENSSL_NO_MD5
+ // MU_CHANGE START
{ PROV_NAMES_MD5, "provider=default", ossl_md5_functions },
+ // MU_CHANGE END
#endif /* OPENSSL_NO_MD5 */
{ PROV_NAMES_NULL, "provider=default", ossl_nullmd_functions },
{ NULL, NULL, NULL }
};
-
+// MU_CHANGE START
static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
ALG(PROV_NAMES_NULL, ossl_null_functions),
ALG(PROV_NAMES_AES_256_ECB, ossl_aes256ecb_functions),
@@ -144,6 +148,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
{ { NULL, NULL, NULL }, NULL }
};
static OSSL_ALGORITHM exported_ciphers[OSSL_NELEM(deflt_ciphers)];
+// MU_CHANGE END
static const OSSL_ALGORITHM deflt_macs[] = {
{ PROV_NAMES_HMAC, "provider=default", ossl_hmac_functions },
@@ -161,10 +166,14 @@ static const OSSL_ALGORITHM deflt_kdfs[] = {
static const OSSL_ALGORITHM deflt_keyexch[] = {
#ifndef OPENSSL_NO_DH
+ // MU_CHANGE start - disable DH
{ PROV_NAMES_DH, "provider=default", ossl_dh_keyexch_functions },
+ // MU_CHANGE end - disable DH
#endif
#ifndef OPENSSL_NO_EC
+ // MU_CHANGE start - disable DH
{ PROV_NAMES_ECDH, "provider=default", ossl_ecdh_keyexch_functions },
+ // MU_CHANGE end - disable DH
#endif
{ PROV_NAMES_TLS1_PRF, "provider=default", ossl_kdf_tls1_prf_keyexch_functions },
{ PROV_NAMES_HKDF, "provider=default", ossl_kdf_hkdf_keyexch_functions },
@@ -193,10 +202,12 @@ static const OSSL_ALGORITHM deflt_asym_cipher[] = {
static const OSSL_ALGORITHM deflt_keymgmt[] = {
#ifndef OPENSSL_NO_DH
+ // MU_CHANGE start - disable DH
{ PROV_NAMES_DH, "provider=default", ossl_dh_keymgmt_functions,
PROV_DESCS_DH },
{ PROV_NAMES_DHX, "provider=default", ossl_dhx_keymgmt_functions,
PROV_DESCS_DHX },
+ // MU_CHANGE end - disable DH
#endif
{ PROV_NAMES_RSA, "provider=default", ossl_rsa_keymgmt_functions,
@@ -230,7 +241,10 @@ static const OSSL_ALGORITHM *deflt_query(void *provctx, int operation_id,
case OSSL_OP_DIGEST:
return deflt_digests;
case OSSL_OP_CIPHER:
+ // MU_CHANGE START
return exported_ciphers;
+ //return NULL;
+ // MU_CHANGE END
case OSSL_OP_MAC:
return deflt_macs;
case OSSL_OP_KDF:
@@ -322,7 +336,9 @@ int ossl_uefi_provider_init(const OSSL_CORE_HANDLE *handle,
ossl_prov_ctx_set0_core_bio_method(*provctx, corebiometh);
*out = deflt_dispatch_table;
+ // MU_CHANGE START
ossl_prov_cache_exported_algorithms(deflt_ciphers, exported_ciphers);
+ // MU_CHANGE END
return 1;
}
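
Illustration only, outside the patch: the UEFI provider exports only what remains in its algorithm tables, and anything compiled out (for example via the no-sm3 and no-dh configure options added below) never shows up there. An explicit fetch is a simple way to confirm what survived; a sketch using public OpenSSL 3.x APIs, with the helper name invented for the example:

#include <openssl/evp.h>

/* Hypothetical check: ask the loaded providers whether a digest exists.
 * With "no-sm3" configured, fetching "SM3" is expected to fail. */
int DigestIsAvailable(const char *name)
{
    EVP_MD *md = EVP_MD_fetch(NULL, name, NULL);
    if (md == NULL) {
        return 0;
    }
    EVP_MD_free(md);
    return 1;
}
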
diff --git a/CryptoPkg/Library/OpensslLib/configure.py b/CryptoPkg/Library/OpensslLib/configure.py
index 4243ca4c257..4d792fc9c8c 100755
--- a/CryptoPkg/Library/OpensslLib/configure.py
+++ b/CryptoPkg/Library/OpensslLib/configure.py
@@ -35,6 +35,7 @@ def openssl_configure(openssldir, target, ec = True):
'no-deprecated',
'no-des',
'no-dgram',
+ 'no-dh',
'no-dsa',
'no-dso',
'no-dtls',
@@ -73,6 +74,7 @@ def openssl_configure(openssldir, target, ec = True):
'no-siphash',
'no-siv',
'no-sm2',
+ 'no-sm3',
'no-sm4',
'no-sock',
'no-srp',