diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index d212d5fdc5..4da5ec9cab 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -80,8 +80,6 @@ $code.=<<___; .text -.extern OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -230,24 +228,12 @@ sub ROUND { # critical path is 24 cycles per round ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. $code.=<<___; -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function,5 +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r10 -___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ -$code.=<<___; - test \$`1<<(41-32)`,%r10d - jnz .LChaCha20_ssse3 - push %rbx .cfi_push rbx push %rbp @@ -419,7 +405,7 @@ sub ROUND { # critical path is 24 cycles per round .Lno_data: ret .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## @@ -454,19 +440,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; -.type ChaCha20_ssse3,\@function,5 +.globl ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,\@function,5 .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; - cmp \$128,$len # we might throw away some data, - ja .LChaCha20_4x # but overall it won't be slower - -.Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); @@ -576,7 +559,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round .Lssse3_epilogue: ret .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 ___ } @@ -714,29 +697,17 @@ sub SSSE3_lane_ROUND { my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_4x,\@function,5 +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 mov %r10,%r11 ___ -$code.=<<___ if ($avx>1); - shr \$32,%r10 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r10 # test AVX2 - jnz .LChaCha20_8x -___ $code.=<<___; - cmp \$192,$len - ja .Lproceed4x - - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_sse3_after_all # to detect Atom - -.Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1164,7 +1135,7 @@ sub SSSE3_lane_ROUND { .L4x_epilogue: ret .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } @@ -1293,11 +1264,12 @@ sub AVX2_lane_ROUND { my $xframe = $win64 ? 
0xa8 : 8; $code.=<<___; -.type ChaCha20_8x,\@function,5 +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp @@ -1809,7 +1781,7 @@ sub AVX2_lane_ROUND { .L8x_epilogue: ret .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } @@ -2719,22 +2691,22 @@ sub AVX512_lane_ROUND { .section .pdata .align 4 - .rva .LSEH_begin_ChaCha20_ctr32 - .rva .LSEH_end_ChaCha20_ctr32 - .rva .LSEH_info_ChaCha20_ctr32 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw - .rva .LSEH_begin_ChaCha20_ssse3 - .rva .LSEH_end_ChaCha20_ssse3 - .rva .LSEH_info_ChaCha20_ssse3 + .rva .LSEH_begin_ChaCha20_ctr32_ssse3 + .rva .LSEH_end_ChaCha20_ctr32_ssse3 + .rva .LSEH_info_ChaCha20_ctr32_ssse3 - .rva .LSEH_begin_ChaCha20_4x - .rva .LSEH_end_ChaCha20_4x - .rva .LSEH_info_ChaCha20_4x + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); - .rva .LSEH_begin_ChaCha20_8x - .rva .LSEH_end_ChaCha20_8x - .rva .LSEH_info_ChaCha20_8x + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 @@ -2748,22 +2720,22 @@ sub AVX512_lane_ROUND { $code.=<<___; .section .xdata .align 8 -.LSEH_info_ChaCha20_ctr32: +.LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler -.LSEH_info_ChaCha20_ssse3: +.LSEH_info_ChaCha20_ctr32_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lssse3_body,.Lssse3_epilogue -.LSEH_info_ChaCha20_4x: +.LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); -.LSEH_info_ChaCha20_8x: +.LSEH_info_ChaCha20_ctr32_avx2: .byte 
9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] diff --git a/crypto/chacha/chacha.c b/crypto/chacha/chacha.c index 8a6dbfb60c..8772729527 100644 --- a/crypto/chacha/chacha.c +++ b/crypto/chacha/chacha.c @@ -91,6 +91,24 @@ static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, ChaCha20_ctr32_neon(out, in, in_len, key, counter); return; } +#endif +#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + ChaCha20_ctr32_avx2(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + ChaCha20_ctr32_ssse3(out, in, in_len, key, counter); + return; + } #endif if (in_len > 0) { ChaCha20_ctr32_nohw(out, in, in_len, key, counter); diff --git a/crypto/chacha/chacha_test.cc b/crypto/chacha/chacha_test.cc index 4e656aa223..ebfd8f8aca 100644 --- a/crypto/chacha/chacha_test.cc +++ b/crypto/chacha/chacha_test.cc @@ -359,6 +359,21 @@ static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len, CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter); } #endif +#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter); + } +#endif #if defined(CHACHA20_ASM_NOHW) if (in_len > 0) { CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter); diff --git a/crypto/chacha/internal.h 
b/crypto/chacha/internal.h index 4c339c6fcd..ce0d08620b 100644 --- a/crypto/chacha/internal.h +++ b/crypto/chacha/internal.h @@ -30,8 +30,7 @@ extern "C" { void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], const uint8_t nonce[16]); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) #define CHACHA20_ASM @@ -46,6 +45,31 @@ OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) { } void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len, const uint32_t key[8], const uint32_t counter[4]); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_AVX2 +OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) { + return (len > 128) && CRYPTO_is_AVX2_capable(); +} +void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3_4X +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_4x_capable(size_t len) { + int capable = (len > 128) && CRYPTO_is_SSSE3_capable(); + int faster = (len > 192) || !CRYPTO_cpu_perf_is_like_silvermont(); + return capable && faster; +} +void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3 +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) { + return (len > 128) && CRYPTO_is_SSSE3_capable(); +} +void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); #endif #if defined(CHACHA20_ASM) diff --git a/crypto/fipsmodule/cpucap/cpu_intel.c b/crypto/fipsmodule/cpucap/cpu_intel.c index 19b6540d02..bbf8543c86 100644 --- a/crypto/fipsmodule/cpucap/cpu_intel.c +++ b/crypto/fipsmodule/cpucap/cpu_intel.c @@ -240,7 +240,8 @@ void OPENSSL_cpuid_setup(void) { // Clear the XSAVE bit on Knights Landing to mimic Silvermont. 
This enables // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. + // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and + // |CRYPTO_cpu_perf_is_like_silvermont|. if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { ecx &= ~(1u << 26); @@ -267,7 +268,8 @@ void OPENSSL_cpuid_setup(void) { // Clear AVX2 and AVX512* bits. // // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. + // doesn't clear those. See the comments in + // |CRYPTO_cpu_perf_is_like_silvermont|. extended_features[0] &= ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); } diff --git a/crypto/fipsmodule/cpucap/internal.h b/crypto/fipsmodule/cpucap/internal.h index 530e4e299e..b145129b82 100644 --- a/crypto/fipsmodule/cpucap/internal.h +++ b/crypto/fipsmodule/cpucap/internal.h @@ -34,7 +34,8 @@ void OPENSSL_cpuid_setup(void); // ECX for CPUID where EAX = 7 // // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. +// bits in XCR0, so it is not necessary to check those. (WARNING: See caveats +// in cpu_intel.c.) extern uint32_t OPENSSL_ia32cap_P[4]; #if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY) @@ -81,6 +82,9 @@ OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0; } +// We intentionally avoid defining a |CRYPTO_is_XSAVE_capable| function. See +// |CRYPTO_cpu_perf_is_like_silvermont|. + OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0; } @@ -131,6 +135,29 @@ OPENSSL_INLINE int CRYPTO_is_VBMI2_capable(void) { return (OPENSSL_ia32cap_get()[3] & (1 << 6)) != 0; } +// CRYPTO_cpu_perf_is_like_silvermont returns one if, based on a heuristic, the +// CPU has Silvermont-like performance characteristics. 
It is often faster to +// run different codepaths on these CPUs than the available instructions would +// otherwise select. See chacha-x86_64.pl. +// +// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by +// this. |OPENSSL_cpuid_setup| forces Knights Landing to also be matched by +// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it +// isn't matched by this. Various sources indicate AMD first implemented MOVBE +// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be +// matched by this. That seems to be the case for other x86(-64) CPUs. +OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { + // WARNING: This MUST NOT be used to guard the execution of the XSAVE + // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit + // that indicates whether we can safely execute XSAVE. This bit may be set + // even when XSAVE is disabled (by the operating system). See the comment in + // cpu_intel.c and check how the users of this bit use it. + // + // We do not use |__XSAVE__| for static detection because the hack in + // |OPENSSL_cpuid_setup| for Knights Landing CPUs needs to override it. + int hardware_supports_xsave = (OPENSSL_ia32cap_get()[1] & (1u << 26)) != 0; + return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); +} #endif // OPENSSL_X86 || OPENSSL_X86_64 @@ -146,6 +173,11 @@ OPENSSL_INLINE int CRYPTO_is_VBMI2_capable(void) { #define OPENSSL_STATIC_ARMCAP #endif +#include + +extern uint32_t OPENSSL_armcap_P; +extern uint8_t OPENSSL_cpucap_initialized; + // Normalize some older feature flags to their modern ACLE values. 
// https://developer.arm.com/architectures/system-architectures/software-standards/acle #if defined(__ARM_NEON__) && !defined(__ARM_NEON) @@ -160,11 +192,6 @@ OPENSSL_INLINE int CRYPTO_is_VBMI2_capable(void) { #endif #endif -#include - -extern uint32_t OPENSSL_armcap_P; -extern uint8_t OPENSSL_cpucap_initialized; - // CRYPTO_is_NEON_capable returns true if the current CPU has a NEON unit. // If this is known statically, it is a constant inline function. // Otherwise, the capability is checked at runtime by checking the corresponding @@ -183,6 +210,18 @@ OPENSSL_INLINE int CRYPTO_is_ARMv8_PMULL_capable(void) { return (OPENSSL_armcap_P & ARMV8_PMULL) != 0; } +OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA1_capable(void) { + return (OPENSSL_armcap_P & ARMV8_SHA1) != 0; +} + +OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA256_capable(void) { + return (OPENSSL_armcap_P & ARMV8_SHA256) != 0; +} + +OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA512_capable(void) { + return (OPENSSL_armcap_P & ARMV8_SHA512) != 0; +} + OPENSSL_INLINE int CRYPTO_is_ARMv8_GCM_8x_capable(void) { return ((OPENSSL_armcap_P & ARMV8_SHA3) != 0 && ((OPENSSL_armcap_P & ARMV8_NEOVERSE_V1) != 0 || @@ -196,6 +235,8 @@ OPENSSL_INLINE int CRYPTO_is_ARMv8_wide_multiplier_capable(void) { (OPENSSL_armcap_P & ARMV8_APPLE_M1) != 0; } + + #endif // OPENSSL_ARM || OPENSSL_AARCH64 #if defined(OPENSSL_PPC64LE) diff --git a/crypto/fipsmodule/sha/asm/sha1-armv8.pl b/crypto/fipsmodule/sha/asm/sha1-armv8.pl index b865a118b6..c88ffcdd33 100644 --- a/crypto/fipsmodule/sha/asm/sha1-armv8.pl +++ b/crypto/fipsmodule/sha/asm/sha1-armv8.pl @@ -178,22 +178,12 @@ sub BODY_20_39 { .text -.extern OPENSSL_armcap_P -.hidden OPENSSL_armcap_P -.globl sha1_block_data_order -.type sha1_block_data_order,%function +.globl sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,%function .align 6 -sha1_block_data_order: +sha1_block_data_order_nohw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 
AARCH64_VALID_CALL_TARGET -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,:pg_hi21:OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA1 - b.ne .Lv8_entry stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -242,7 +232,7 @@ sub BODY_20_39 { ldp x27,x28,[sp,#80] ldr x29,[sp],#96 ret -.size sha1_block_data_order,.-sha1_block_data_order +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw ___ {{{ my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); @@ -252,12 +242,12 @@ sub BODY_20_39 { my $ABCD_SAVE="v22.16b"; $code.=<<___; -.type sha1_block_armv8,%function +.globl sha1_block_data_order_hw +.type sha1_block_data_order_hw,%function .align 6 -sha1_block_armv8: +sha1_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -.Lv8_entry: stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -321,7 +311,7 @@ sub BODY_20_39 { ldr x29,[sp],#16 ret -.size sha1_block_armv8,.-sha1_block_armv8 +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw .section .rodata .align 6 .Lconst: diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl index e1a4f7f278..af8c1cf4ce 100755 --- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl +++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl @@ -241,42 +241,13 @@ sub BODY_40_59 { $code.=<<___; .text -.extern OPENSSL_ia32cap_P -.globl sha1_block_data_order -.type sha1_block_data_order,\@function,3 +.globl sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,\@function,3 .align 16 -sha1_block_data_order: +sha1_block_data_order_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%r10 - mov 0(%r10),%r9d - mov 4(%r10),%r8d - mov 8(%r10),%r10d - test \$`1<<9`,%r8d # check SSSE3 bit - jz .Lialu -___ -$code.=<<___ if ($shaext); - test \$`1<<29`,%r10d # check SHA bit - jnz _shaext_shortcut -___ -$code.=<<___ if ($avx>1); - and \$`1<<3|1<<5|1<<8`,%r10d # check 
AVX2+BMI1+BMI2 - cmp \$`1<<3|1<<5|1<<8`,%r10d - je _avx2_shortcut -___ -$code.=<<___ if ($avx); - and \$`1<<28`,%r8d # mask AVX bit - and \$`1<<30`,%r9d # mask "Intel CPU" bit - or %r9d,%r8d - cmp \$`1<<28|1<<30`,%r8d - je _avx_shortcut -___ -$code.=<<___; - jmp _ssse3_shortcut - -.align 16 -.Lialu: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx @@ -345,7 +316,7 @@ sub BODY_40_59 { .Lepilogue: ret .cfi_endproc -.size sha1_block_data_order,.-sha1_block_data_order +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw ___ if ($shaext) {{{ ###################################################################### @@ -356,11 +327,12 @@ sub BODY_40_59 { my @MSG=map("%xmm$_",(4..7)); $code.=<<___; -.type sha1_block_data_order_shaext,\@function,3 +.globl sha1_block_data_order_hw +.type sha1_block_data_order_hw,\@function,3 .align 32 -sha1_block_data_order_shaext: -_shaext_shortcut: +sha1_block_data_order_hw: .cfi_startproc + _CET_ENDBR ___ $code.=<<___ if ($win64); lea `-8-4*16`(%rsp),%rsp @@ -461,7 +433,7 @@ sub BODY_40_59 { $code.=<<___; ret .cfi_endproc -.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw ___ }}} {{{ @@ -491,11 +463,12 @@ () } $code.=<<___; +.globl sha1_block_data_order_ssse3 .type sha1_block_data_order_ssse3,\@function,3 .align 16 sha1_block_data_order_ssse3: -_ssse3_shortcut: .cfi_startproc + _CET_ENDBR mov %rsp,$fp # frame pointer .cfi_def_cfa_register $fp push %rbx @@ -965,11 +938,12 @@ () my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; +.globl sha1_block_data_order_avx .type sha1_block_data_order_avx,\@function,3 .align 16 sha1_block_data_order_avx: -_avx_shortcut: .cfi_startproc + _CET_ENDBR mov %rsp,$fp .cfi_def_cfa_register $fp push %rbx @@ -1344,11 +1318,12 @@ () my $frame="%r13"; $code.=<<___; +.globl sha1_block_data_order_avx2 .type sha1_block_data_order_avx2,\@function,3 .align 16 sha1_block_data_order_avx2: -_avx2_shortcut: .cfi_startproc + _CET_ENDBR mov 
%rsp,$fp .cfi_def_cfa_register $fp push %rbx @@ -2023,14 +1998,14 @@ () .section .pdata .align 4 - .rva .LSEH_begin_sha1_block_data_order - .rva .LSEH_end_sha1_block_data_order - .rva .LSEH_info_sha1_block_data_order + .rva .LSEH_begin_sha1_block_data_order_nohw + .rva .LSEH_end_sha1_block_data_order_nohw + .rva .LSEH_info_sha1_block_data_order_nohw ___ $code.=<<___ if ($shaext); - .rva .LSEH_begin_sha1_block_data_order_shaext - .rva .LSEH_end_sha1_block_data_order_shaext - .rva .LSEH_info_sha1_block_data_order_shaext + .rva .LSEH_begin_sha1_block_data_order_hw + .rva .LSEH_end_sha1_block_data_order_hw + .rva .LSEH_info_sha1_block_data_order_hw ___ $code.=<<___; .rva .LSEH_begin_sha1_block_data_order_ssse3 @@ -2050,12 +2025,12 @@ () $code.=<<___; .section .xdata .align 8 -.LSEH_info_sha1_block_data_order: +.LSEH_info_sha1_block_data_order_nohw: .byte 9,0,0,0 .rva se_handler ___ $code.=<<___ if ($shaext); -.LSEH_info_sha1_block_data_order_shaext: +.LSEH_info_sha1_block_data_order_hw: .byte 9,0,0,0 .rva shaext_handler ___ diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl index 95779e4c76..4f90a240e4 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl @@ -78,7 +78,7 @@ $reg_t="w"; } -$func="sha${BITS}_block_data_order"; +$func="sha${BITS}_block_data_order_nohw"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); @@ -182,31 +182,10 @@ sub BODY_00_xx { .text -.extern OPENSSL_armcap_P -.hidden OPENSSL_armcap_P .globl $func .type $func,%function .align 6 $func: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,:pg_hi21:OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] -___ -$code.=<<___ if ($SZ==4); - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry -___ -$code.=<<___ if ($SZ==8); - tst w16,#ARMV8_SHA512 - b.ne .Lv8_entry -___ -$code.=<<___; -#endif 
AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -358,10 +337,10 @@ sub BODY_00_xx { $code.=<<___; .text #ifndef __KERNEL__ -.type sha256_block_armv8,%function +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function .align 6 -sha256_block_armv8: -.Lv8_entry: +sha256_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit adrp x9,:pg_hi21:BORINGSSL_function_hit @@ -370,6 +349,7 @@ sub BODY_00_xx { strb w10, [x9,#6] // kFlag_sha256_hw #endif // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -434,7 +414,7 @@ sub BODY_00_xx { ldr x29,[sp],#16 ret -.size sha256_block_armv8,.-sha256_block_armv8 +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif ___ } @@ -451,10 +431,10 @@ sub BODY_00_xx { $code.=<<___; .text #ifndef __KERNEL__ -.type sha512_block_armv8,%function +.globl sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function .align 6 -sha512_block_armv8: -.Lv8_entry: +sha512_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit adrp x9,:pg_hi21:BORINGSSL_function_hit @@ -462,6 +442,8 @@ sub BODY_00_xx { mov w10, #1 strb w10, [x9,#8] // kFlag_sha512_hw #endif + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -545,7 +527,7 @@ sub BODY_00_xx { ldr x29,[sp],#16 ret -.size sha512_block_armv8,.-sha512_block_armv8 +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw #endif ___ } diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl index 48872e6d18..9aea64a4ab 100755 --- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl +++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl @@ -261,42 +261,12 @@ () $code=<<___; .text -.extern OPENSSL_ia32cap_P -.globl $func -.type $func,\@function,3 +.globl ${func}_nohw +.type ${func}_nohw,\@function,3 .align 16 -$func: +${func}_nohw: .cfi_startproc _CET_ENDBR -___ -$code.=<<___ if ($SZ==4 || $avx); - leaq OPENSSL_ia32cap_P(%rip),%r11 - mov 0(%r11),%r9d - mov 4(%r11),%r10d - mov 8(%r11),%r11d -___ -$code.=<<___ if ($SZ==4 && $shaext); - test \$`1<<29`,%r11d # check for SHA - jnz .Lshaext_shortcut -___ - # XOP codepath removed. -$code.=<<___ if ($avx>1); - and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 - cmp \$`1<<8|1<<5|1<<3`,%r11d - je .Lavx2_shortcut -___ -$code.=<<___ if ($avx); - and \$`1<<30`,%r9d # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits - or %r9d,%r10d - cmp \$`1<<28|1<<9|1<<30`,%r10d - je .Lavx_shortcut -___ -$code.=<<___ if ($SZ==4); - test \$`1<<9`,%r10d - jnz .Lssse3_shortcut -___ -$code.=<<___; mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx @@ -404,7 +374,7 @@ () .Lepilogue: ret .cfi_endproc -.size $func,.-$func +.size ${func}_nohw,.-${func}_nohw ___ if ($SZ==4) { @@ -562,15 +532,16 @@ () my @MSG=map("%xmm$_",(3..6)); $code.=<<___; -.type sha256_block_data_order_shaext,\@function,3 +.globl sha256_block_data_order_hw +.type sha256_block_data_order_hw,\@function,3 .align 64 -sha256_block_data_order_shaext: -.Lshaext_shortcut: +sha256_block_data_order_hw: .cfi_startproc #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit movb \$1,BORINGSSL_function_hit+6(%rip) #endif + _CET_ENDBR ___ $code.=<<___ if 
($win64); lea `-8-5*16`(%rsp),%rsp @@ -715,7 +686,7 @@ () $code.=<<___; ret .cfi_endproc -.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw ___ }}} {{{ @@ -780,11 +751,12 @@ () my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; +.globl ${func}_ssse3 .type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: .cfi_startproc -.Lssse3_shortcut: + _CET_ENDBR mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx @@ -1143,11 +1115,12 @@ () local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; +.globl ${func}_avx .type ${func}_avx,\@function,3 .align 64 ${func}_avx: .cfi_startproc -.Lavx_shortcut: + _CET_ENDBR mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx @@ -2013,14 +1986,14 @@ () $code.=<<___; .section .pdata .align 4 - .rva .LSEH_begin_$func - .rva .LSEH_end_$func - .rva .LSEH_info_$func + .rva .LSEH_begin_${func}_nohw + .rva .LSEH_end_${func}_nohw + .rva .LSEH_info_${func}_nohw ___ $code.=<<___ if ($SZ==4 && $shaext); - .rva .LSEH_begin_${func}_shaext - .rva .LSEH_end_${func}_shaext - .rva .LSEH_info_${func}_shaext + .rva .LSEH_begin_${func}_hw + .rva .LSEH_end_${func}_hw + .rva .LSEH_info_${func}_hw ___ $code.=<<___ if ($SZ==4); .rva .LSEH_begin_${func}_ssse3 @@ -2040,13 +2013,13 @@ () $code.=<<___; .section .xdata .align 8 -.LSEH_info_$func: +.LSEH_info_${func}_nohw: .byte 9,0,0,0 .rva se_handler .rva .Lprologue,.Lepilogue # HandlerData[] ___ $code.=<<___ if ($SZ==4 && $shaext); -.LSEH_info_${func}_shaext: +.LSEH_info_${func}_hw: .byte 9,0,0,0 .rva shaext_handler ___ diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h index f782a15edc..8c58b76a98 100644 --- a/crypto/fipsmodule/sha/internal.h +++ b/crypto/fipsmodule/sha/internal.h @@ -15,6 +15,11 @@ #ifndef OPENSSL_HEADER_SHA_INTERNAL_H #define OPENSSL_HEADER_SHA_INTERNAL_H +#include + +#include "../../internal.h" +#include "../cpucap/internal.h" + #if 
defined(__cplusplus) extern "C" { #endif @@ -77,28 +82,154 @@ struct keccak_st { uint8_t buf[SHA3_MAX_BLOCKSIZE]; // should have at least the max data block size bytes uint8_t pad; }; +// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is +// defined in assembly. -#if defined(OPENSSL_PPC64LE) || \ - (!defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))) -// POWER has an intrinsics-based implementation of SHA-1 and thus the functions -// normally defined in assembly are available even with |OPENSSL_NO_ASM| in -// this case. -#define SHA1_ASM -void sha1_block_data_order(uint32_t *state, const uint8_t *in, - size_t num_blocks); -#endif +#if defined(OPENSSL_PPC64LE) +#define SHA1_ALTIVEC -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +void sha1_block_data_order(uint32_t *state, const uint8_t *data, + size_t num_blocks); + +#elif !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM)) +#define SHA1_ASM #define SHA256_ASM #define SHA512_ASM -void sha256_block_data_order(uint32_t *state, const uint8_t *in, + +void sha1_block_data_order(uint32_t *state, const uint8_t *data, + size_t num_blocks); +void sha256_block_data_order(uint32_t *state, const uint8_t *data, size_t num_blocks); -void sha512_block_data_order(uint64_t *state, const uint8_t *in, +void sha512_block_data_order(uint64_t *state, const uint8_t *data, size_t num_blocks); + +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +OPENSSL_INLINE int sha1_hw_capable(void) { + return CRYPTO_is_ARMv8_SHA1_capable(); +} + +#define SHA256_ASM_HW +OPENSSL_INLINE int sha256_hw_capable(void) { + return CRYPTO_is_ARMv8_SHA256_capable(); +} + +#define SHA512_ASM_HW +OPENSSL_INLINE int 
sha512_hw_capable(void) { + return CRYPTO_is_ARMv8_SHA512_capable(); +} + +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +OPENSSL_INLINE int sha1_hw_capable(void) { + return CRYPTO_is_SHAEXT_capable() && CRYPTO_is_SSSE3_capable(); +} + +#define SHA1_ASM_AVX2 +OPENSSL_INLINE int sha1_avx2_capable(void) { + // TODO: Simplify this logic, which was extracted from the assembly: + // * Does AVX2 imply SSSE3? + // * sha1_block_data_order_avx2 does not seem to use SSSE3 instructions. + return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_SSSE3_capable(); +} +void sha1_block_data_order_avx2(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA1_ASM_AVX +OPENSSL_INLINE int sha1_avx_capable(void) { + // TODO: Simplify this logic, which was extracted from the assembly: + // * Does AVX imply SSSE3? + // * sha1_block_data_order_avx does not seem to use SSSE3 instructions. + // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the + // discussion in sha1-586.pl. + return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() && + CRYPTO_is_intel_cpu(); +} +void sha1_block_data_order_avx(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA1_ASM_SSSE3 +OPENSSL_INLINE int sha1_ssse3_capable(void) { + return CRYPTO_is_SSSE3_capable(); +} +void sha1_block_data_order_ssse3(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA256_ASM_HW +OPENSSL_INLINE int sha256_hw_capable(void) { + return CRYPTO_is_SHAEXT_capable(); +} + +#define SHA256_ASM_AVX +OPENSSL_INLINE int sha256_avx_capable(void) { + // TODO: Simplify this logic, which was extracted from the assembly: + // * Does AVX imply SSSE3? + // * sha256_block_data_order_avx does not seem to use SSSE3 instructions. 
+ // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the + // discussion in sha1-586.pl. + return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() && + CRYPTO_is_intel_cpu(); +} +void sha256_block_data_order_avx(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA256_ASM_SSSE3 +OPENSSL_INLINE int sha256_ssse3_capable(void) { + return CRYPTO_is_SSSE3_capable(); +} +void sha256_block_data_order_ssse3(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA512_ASM_AVX +OPENSSL_INLINE int sha512_avx_capable(void) { + // TODO: Simplify this logic, which was extracted from the assembly: + // * Does AVX imply SSSE3? + // * sha512_block_data_order_avx does not seem to use SSSE3 instructions. + // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the + // discussion in sha1-586.pl. + return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() && + CRYPTO_is_intel_cpu(); +} +void sha512_block_data_order_avx(uint64_t *state, const uint8_t *data, + size_t num); + +#endif + +#if defined(SHA1_ASM_HW) +void sha1_block_data_order_hw(uint32_t *state, const uint8_t *data, size_t num); +#endif +#if defined(SHA1_ASM_NOHW) +void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data, + size_t num); +#endif + +#if defined(SHA256_ASM_HW) +void sha256_block_data_order_hw(uint32_t *state, const uint8_t *data, + size_t num); +#endif +#if defined(SHA256_ASM_NOHW) +void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data, + size_t num); +#endif + +#if defined(SHA512_ASM_HW) +void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data, + size_t num); +#endif +#if defined(SHA512_ASM_NOHW) +void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data, + size_t num); #endif #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) @@ -153,38 +284,38 @@ OPENSSL_EXPORT int SHA512_256_get_state( SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENGTH], uint64_t *out_n); -// SHA3_224 
writes the digest of |len| bytes from |data| to |out| and returns |out|. +// SHA3_224 writes the digest of |len| bytes from |data| to |out| and returns |out|. // There must be at least |SHA3_224_DIGEST_LENGTH| bytes of space in |out|. // On failure |SHA3_224| returns NULL. OPENSSL_EXPORT uint8_t *SHA3_224(const uint8_t *data, size_t len, - uint8_t out[SHA3_224_DIGEST_LENGTH]); - -// SHA3_256 writes the digest of |len| bytes from |data| to |out| and returns |out|. + uint8_t out[SHA3_224_DIGEST_LENGTH]); + +// SHA3_256 writes the digest of |len| bytes from |data| to |out| and returns |out|. // There must be at least |SHA3_256_DIGEST_LENGTH| bytes of space in |out|. // On failure |SHA3_256| returns NULL. OPENSSL_EXPORT uint8_t *SHA3_256(const uint8_t *data, size_t len, - uint8_t out[SHA3_256_DIGEST_LENGTH]); + uint8_t out[SHA3_256_DIGEST_LENGTH]); -// SHA3_384 writes the digest of |len| bytes from |data| to |out| and returns |out|. +// SHA3_384 writes the digest of |len| bytes from |data| to |out| and returns |out|. // There must be at least |SHA3_384_DIGEST_LENGTH| bytes of space in |out|. // On failure |SHA3_384| returns NULL. OPENSSL_EXPORT uint8_t *SHA3_384(const uint8_t *data, size_t len, - uint8_t out[SHA3_384_DIGEST_LENGTH]); + uint8_t out[SHA3_384_DIGEST_LENGTH]); -// SHA3_512 writes the digest of |len| bytes from |data| to |out| and returns |out|. +// SHA3_512 writes the digest of |len| bytes from |data| to |out| and returns |out|. // There must be at least |SHA3_512_DIGEST_LENGTH| bytes of space in |out|. // On failure |SHA3_512| returns NULL. OPENSSL_EXPORT uint8_t *SHA3_512(const uint8_t *data, size_t len, uint8_t out[SHA3_512_DIGEST_LENGTH]); -// SHAKE128 writes the |out_len| bytes output from |in_len| bytes |data| -// to |out| and returns |out| on success and NULL on failure. 
-OPENSSL_EXPORT uint8_t *SHAKE128(const uint8_t *data, const size_t in_len, +// SHAKE128 writes the |out_len| bytes output from |in_len| bytes |data| +// to |out| and returns |out| on success and NULL on failure. +OPENSSL_EXPORT uint8_t *SHAKE128(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len); -// SHAKE256 writes |out_len| bytes output from |in_len| bytes |data| -// to |out| and returns |out| on success and NULL on failure. -OPENSSL_EXPORT uint8_t *SHAKE256(const uint8_t *data, const size_t in_len, +// SHAKE256 writes |out_len| bytes output from |in_len| bytes |data| +// to |out| and returns |out| on success and NULL on failure. +OPENSSL_EXPORT uint8_t *SHAKE256(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len); // SHAKE_Init initializes |ctx| with specified |block_size|, returns 1 on @@ -202,22 +333,22 @@ OPENSSL_EXPORT void SHA3_Reset(KECCAK1600_CTX *ctx); OPENSSL_EXPORT int SHA3_Init(KECCAK1600_CTX *ctx, uint8_t pad, size_t bitlen); -// SHA3_Update processes all data blocks that don't need pad through +// SHA3_Update processes all data blocks that don't need pad through // |SHA3_Absorb| and returns 1 and 0 on failure. OPENSSL_EXPORT int SHA3_Update(KECCAK1600_CTX *ctx, const void *data, size_t len); -// SHA3_Final pads the last data block and processes it through |SHA3_Absorb|. +// SHA3_Final pads the last data block and processes it through |SHA3_Absorb|. // It processes the data through |SHA3_Squeeze| and returns 1 and 0 on failure. OPENSSL_EXPORT int SHA3_Final(uint8_t *md, KECCAK1600_CTX *ctx); -// SHA3_Absorb processes the largest multiple of |r| out of |len| bytes and -// returns the remaining number of bytes. -OPENSSL_EXPORT size_t SHA3_Absorb(uint64_t A[SHA3_ROWS][SHA3_ROWS], +// SHA3_Absorb processes the largest multiple of |r| out of |len| bytes and +// returns the remaining number of bytes. 
+OPENSSL_EXPORT size_t SHA3_Absorb(uint64_t A[SHA3_ROWS][SHA3_ROWS], const uint8_t *data, size_t len, size_t r); // SHA3_Squeeze generate |out| hash value of |len| bytes. -OPENSSL_EXPORT void SHA3_Squeeze(uint64_t A[SHA3_ROWS][SHA3_ROWS], +OPENSSL_EXPORT void SHA3_Squeeze(uint64_t A[SHA3_ROWS][SHA3_ROWS], uint8_t *out, size_t len, size_t r); #if defined(__cplusplus) diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c index 5459e23427..681bff37d9 100644 --- a/crypto/fipsmodule/sha/sha1.c +++ b/crypto/fipsmodule/sha/sha1.c @@ -112,7 +112,7 @@ uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { return out; } -#if !defined(SHA1_ASM) +#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC) static void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num); #endif @@ -235,9 +235,11 @@ int SHA1_get_state(SHA_CTX *ctx, uint8_t out_h[SHA1_CHAINING_LENGTH], * */ #define X(i) XX##i -#if !defined(SHA1_ASM) -static void sha1_block_data_order(uint32_t *state, const uint8_t *data, - size_t num) { +#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC) + +#if !defined(SHA1_ASM_NOHW) +static void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data, + size_t num) { register uint32_t A, B, C, D, E, T; uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12, XX13, XX14, XX15; @@ -384,7 +386,38 @@ static void sha1_block_data_order(uint32_t *state, const uint8_t *data, E = state[4]; } } +#endif // !SHA1_ASM_NOHW + +static void sha1_block_data_order(uint32_t *state, const uint8_t *data, + size_t num) { +#if defined(SHA1_ASM_HW) + if (sha1_hw_capable()) { + sha1_block_data_order_hw(state, data, num); + return; + } #endif +#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha1_avx2_capable()) { + sha1_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha1_avx_capable()) { + 
sha1_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_SSSE3) + if (sha1_ssse3_capable()) { + sha1_block_data_order_ssse3(state, data, num); + return; + } +#endif + sha1_block_data_order_nohw(state, data, num); +} + +#endif // !SHA1_ASM && !SHA1_ALTIVEC #undef Xupdate #undef K_00_19 diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c index 1b97987af9..d2e7c7b334 100644 --- a/crypto/fipsmodule/sha/sha256.c +++ b/crypto/fipsmodule/sha/sha256.c @@ -168,7 +168,7 @@ uint8_t *SHA256(const uint8_t *data, size_t len, return out; } -#ifndef SHA256_ASM +#if !defined(SHA256_ASM) static void sha256_block_data_order(uint32_t *state, const uint8_t *in, size_t num); #endif @@ -244,7 +244,9 @@ int SHA256_get_state(SHA256_CTX *ctx, uint8_t out_h[SHA256_CHAINING_LENGTH], return sha256_get_state_impl(ctx, out_h, out_n); } -#ifndef SHA256_ASM +#if !defined(SHA256_ASM) + +#if !defined(SHA256_ASM_NOHW) static const uint32_t K256[64] = { 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, @@ -293,8 +295,8 @@ static const uint32_t K256[64] = { ROUND_00_15(i, a, b, c, d, e, f, g, h); \ } while (0) -static void sha256_block_data_order(uint32_t *state, const uint8_t *data, - size_t num) { +static void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data, + size_t num) { uint32_t a, b, c, d, e, f, g, h, s0, s1, T1; uint32_t X[16]; int i; @@ -380,7 +382,33 @@ static void sha256_block_data_order(uint32_t *state, const uint8_t *data, } } -#endif // !SHA256_ASM +#endif // !defined(SHA256_ASM_NOHW) + +static void sha256_block_data_order(uint32_t *state, const uint8_t *data, + size_t num) { +#if defined(SHA256_ASM_HW) + if (sha256_hw_capable()) { + sha256_block_data_order_hw(state, data, num); + return; + } +#endif +#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha256_avx_capable()) { + 
sha256_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA256_ASM_SSSE3) + if (sha256_ssse3_capable()) { + sha256_block_data_order_ssse3(state, data, num); + return; + } +#endif + sha256_block_data_order_nohw(state, data, num); +} + +#endif // !defined(SHA256_ASM) + void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data, size_t num_blocks) { diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c index ddc01dc16c..f9d9d88dc6 100644 --- a/crypto/fipsmodule/sha/sha512.c +++ b/crypto/fipsmodule/sha/sha512.c @@ -458,7 +458,9 @@ int SHA512_256_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENG return sha512_get_state_impl(ctx, out_h, out_n); } -#ifndef SHA512_ASM +#if !defined(SHA512_ASM) + +#if !defined(SHA512_ASM_NOHW) static const uint64_t K512[80] = { UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd), UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc), @@ -520,8 +522,8 @@ static const uint64_t K512[80] = { #if defined(__i386) || defined(__i386__) || defined(_M_IX86) // This code should give better results on 32-bit CPU with less than // ~24 registers, both size and performance wise... 
-static void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num) { +static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in, + size_t num) { uint64_t A, E, T; uint64_t X[9 + 80], *F; int i; @@ -593,8 +595,8 @@ static void sha512_block_data_order(uint64_t *state, const uint8_t *in, ROUND_00_15(i + j, a, b, c, d, e, f, g, h); \ } while (0) -static void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num) { +static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in, + size_t num) { uint64_t a, b, c, d, e, f, g, h, s0, s1, T1; uint64_t X[16]; int i; @@ -677,6 +679,25 @@ static void sha512_block_data_order(uint64_t *state, const uint8_t *in, #endif +#endif // !SHA512_ASM_NOHW + +static void sha512_block_data_order(uint64_t *state, const uint8_t *data, + size_t num) { +#if defined(SHA512_ASM_HW) + if (sha512_hw_capable()) { + sha512_block_data_order_hw(state, data, num); + return; + } +#endif +#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha512_avx_capable()) { + sha512_block_data_order_avx(state, data, num); + return; + } +#endif + sha512_block_data_order_nohw(state, data, num); +} + #endif // !SHA512_ASM #undef Sigma0 diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc index 28ddf92c57..c79e3912c7 100644 --- a/crypto/fipsmodule/sha/sha_test.cc +++ b/crypto/fipsmodule/sha/sha_test.cc @@ -21,41 +21,99 @@ #include "../../test/abi_test.h" #include "internal.h" -#if defined(SHA1_ASM) && defined(SUPPORTS_ABI_TEST) +#if defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC) TEST(SHATest, SHA1ABI) { SHA_CTX ctx; SHA1_Init(&ctx); static const uint8_t kBuf[SHA_CBLOCK * 8] = {0}; - CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 1); - CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 2); - CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 4); - CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 8); + for (size_t blocks : {1, 2, 4, 8}) { +#if 
defined(SHA1_ASM) + CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, blocks); +#endif +#if defined(SHA1_ASM_HW) + if (sha1_hw_capable()) { + CHECK_ABI(sha1_block_data_order_hw, ctx.h, kBuf, blocks); + } +#endif +#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha1_avx2_capable()) { + CHECK_ABI(sha1_block_data_order_avx2, ctx.h, kBuf, blocks); + } +#endif +#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha1_avx_capable()) { + CHECK_ABI(sha1_block_data_order_avx, ctx.h, kBuf, blocks); + return; + } +#endif +#if defined(SHA1_ASM_SSSE3) + if (sha1_ssse3_capable()) { + CHECK_ABI(sha1_block_data_order_ssse3, ctx.h, kBuf, blocks); + return; + } +#endif +#if defined(SHA1_ASM_NOHW) + CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks); +#endif + } } -#endif // SHA1_ASM && SUPPORTS_ABI_TEST -#if defined(SHA256_ASM) && defined(SUPPORTS_ABI_TEST) TEST(SHATest, SHA256ABI) { SHA256_CTX ctx; SHA256_Init(&ctx); static const uint8_t kBuf[SHA256_CBLOCK * 8] = {0}; - CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 1); - CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 2); - CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 4); - CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 8); + for (size_t blocks : {1, 2, 4, 8}) { +#if defined(SHA256_ASM) + CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, blocks); +#endif +#if defined(SHA256_ASM_HW) + if (sha256_hw_capable()) { + CHECK_ABI(sha256_block_data_order_hw, ctx.h, kBuf, blocks); + } +#endif +#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha256_avx_capable()) { + CHECK_ABI(sha256_block_data_order_avx, ctx.h, kBuf, blocks); + return; + } +#endif +#if defined(SHA256_ASM_SSSE3) + if (sha256_ssse3_capable()) { + CHECK_ABI(sha256_block_data_order_ssse3, ctx.h, kBuf, blocks); + return; + } +#endif +#if defined(SHA256_ASM_NOHW) + CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks); +#endif + } } -#endif // SHA256_ASM && SUPPORTS_ABI_TEST -#if 
defined(SHA512_ASM) && defined(SUPPORTS_ABI_TEST) TEST(SHATest, SHA512ABI) { SHA512_CTX ctx; SHA512_Init(&ctx); static const uint8_t kBuf[SHA512_CBLOCK * 4] = {0}; - CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 1); - CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 2); - CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 3); - CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 4); + for (size_t blocks : {1, 2, 3, 4}) { +#if defined(SHA512_ASM) + CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, blocks); +#endif +#if defined(SHA512_ASM_HW) + if (sha512_hw_capable()) { + CHECK_ABI(sha512_block_data_order_hw, ctx.h, kBuf, blocks); + } +#endif +#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + if (sha512_avx_capable()) { + CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks); + } +#endif +#if defined(SHA512_ASM_NOHW) + CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks); +#endif + } } -#endif // SHA512_ASM && SUPPORTS_ABI_TEST + +#endif // defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC) diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/sha1-armv8.S b/generated-src/ios-aarch64/crypto/fipsmodule/sha1-armv8.S index 744c630646..8f847749e3 100644 --- a/generated-src/ios-aarch64/crypto/fipsmodule/sha1-armv8.S +++ b/generated-src/ios-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -8,23 +8,13 @@ .text - -.private_extern _OPENSSL_armcap_P -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw .align 6 -_sha1_block_data_order: +_sha1_block_data_order_nohw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA1 - b.ne Lv8_entry stp x29,x30,[sp,#-96]! 
add x29,sp,#0 @@ -1081,12 +1071,13 @@ Loop: ldr x29,[sp],#96 ret +.globl _sha1_block_data_order_hw +.private_extern _sha1_block_data_order_hw .align 6 -sha1_block_armv8: +_sha1_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -Lv8_entry: stp x29,x30,[sp,#-16]! add x29,sp,#0 diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/sha256-armv8.S b/generated-src/ios-aarch64/crypto/fipsmodule/sha256-armv8.S index 05c5dc0834..0f886deb1e 100644 --- a/generated-src/ios-aarch64/crypto/fipsmodule/sha256-armv8.S +++ b/generated-src/ios-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -50,24 +50,11 @@ .text - -.private_extern _OPENSSL_armcap_P -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw .align 6 -_sha256_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA256 - b.ne Lv8_entry -#endif +_sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1060,10 +1047,11 @@ LK256: .align 2 .text #ifndef __KERNEL__ +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw .align 6 -sha256_block_armv8: -Lv8_entry: +_sha256_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,_BORINGSSL_function_hit@PAGE @@ -1072,6 +1060,7 @@ Lv8_entry: strb w10, [x9,#6] // kFlag_sha256_hw #endif // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 diff --git a/generated-src/ios-aarch64/crypto/fipsmodule/sha512-armv8.S b/generated-src/ios-aarch64/crypto/fipsmodule/sha512-armv8.S index 65dadcaf46..16e09c8c18 100644 --- a/generated-src/ios-aarch64/crypto/fipsmodule/sha512-armv8.S +++ b/generated-src/ios-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -50,24 +50,11 @@ .text - -.private_extern _OPENSSL_armcap_P -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw .align 6 -_sha512_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA512 - b.ne Lv8_entry -#endif +_sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1084,10 +1071,11 @@ LK512: .align 2 .text #ifndef __KERNEL__ +.globl _sha512_block_data_order_hw +.private_extern _sha512_block_data_order_hw .align 6 -sha512_block_armv8: -Lv8_entry: +_sha512_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,_BORINGSSL_function_hit@PAGE @@ -1095,6 +1083,8 @@ Lv8_entry: mov w10, #1 strb w10, [x9,#8] // kFlag_sha512_hw #endif + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/sha1-armv8.S b/generated-src/linux-aarch64/crypto/fipsmodule/sha1-armv8.S index a18a8a5578..f2df2dd30c 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/sha1-armv8.S +++ b/generated-src/linux-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -8,23 +8,13 @@ .text - -.hidden OPENSSL_armcap_P -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,%function +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,%function .align 6 -sha1_block_data_order: +sha1_block_data_order_nohw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA1 - b.ne .Lv8_entry stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -1080,13 +1070,14 @@ sha1_block_data_order: ldp x27,x28,[sp,#80] ldr x29,[sp],#96 ret -.size sha1_block_data_order,.-sha1_block_data_order -.type sha1_block_armv8,%function +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw +.globl sha1_block_data_order_hw +.hidden sha1_block_data_order_hw +.type sha1_block_data_order_hw,%function .align 6 -sha1_block_armv8: +sha1_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -.Lv8_entry: stp x29,x30,[sp,#-16]! 
add x29,sp,#0 @@ -1213,7 +1204,7 @@ sha1_block_armv8: ldr x29,[sp],#16 ret -.size sha1_block_armv8,.-sha1_block_armv8 +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw .section .rodata .align 6 .Lconst: diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/sha256-armv8.S b/generated-src/linux-aarch64/crypto/fipsmodule/sha256-armv8.S index 564f166f31..b66f6ae408 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/sha256-armv8.S +++ b/generated-src/linux-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -50,24 +50,11 @@ .text - -.hidden OPENSSL_armcap_P -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,%function +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function .align 6 -sha256_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry -#endif +sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1031,7 +1018,7 @@ sha256_block_data_order: ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret -.size sha256_block_data_order,.-sha256_block_data_order +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw .section .rodata .align 6 @@ -1060,10 +1047,11 @@ sha256_block_data_order: .align 2 .text #ifndef __KERNEL__ -.type sha256_block_armv8,%function +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function .align 6 -sha256_block_armv8: -.Lv8_entry: +sha256_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,BORINGSSL_function_hit @@ -1072,6 +1060,7 @@ sha256_block_armv8: strb w10, [x9,#6] // kFlag_sha256_hw #endif // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 
+ AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -1206,6 +1195,6 @@ sha256_block_armv8: ldr x29,[sp],#16 ret -.size sha256_block_armv8,.-sha256_block_armv8 +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/generated-src/linux-aarch64/crypto/fipsmodule/sha512-armv8.S b/generated-src/linux-aarch64/crypto/fipsmodule/sha512-armv8.S index 4e5563cd11..6aa688f8ae 100644 --- a/generated-src/linux-aarch64/crypto/fipsmodule/sha512-armv8.S +++ b/generated-src/linux-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -50,24 +50,11 @@ .text - -.hidden OPENSSL_armcap_P -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,%function +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function .align 6 -sha512_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA512 - b.ne .Lv8_entry -#endif +sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 @@ -1031,7 +1018,7 @@ sha512_block_data_order: ldp x29,x30,[sp],#128 AARCH64_VALIDATE_LINK_REGISTER ret -.size sha512_block_data_order,.-sha512_block_data_order +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw .section .rodata .align 6 @@ -1084,10 +1071,11 @@ sha512_block_data_order: .align 2 .text #ifndef __KERNEL__ -.type sha512_block_armv8,%function +.globl sha512_block_data_order_hw +.hidden sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function .align 6 -sha512_block_armv8: -.Lv8_entry: +sha512_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,BORINGSSL_function_hit @@ -1095,6 +1083,8 @@ sha512_block_armv8: mov w10, #1 strb w10, [x9,#8] // kFlag_sha512_hw #endif + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -1608,6 +1598,6 @@ sha512_block_armv8: ldr x29,[sp],#16 ret -.size sha512_block_armv8,.-sha512_block_armv8 +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/generated-src/linux-x86_64/crypto/chacha/chacha-x86_64.S b/generated-src/linux-x86_64/crypto/chacha/chacha-x86_64.S index eac7a00fbc..caea7a015b 100644 --- a/generated-src/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/generated-src/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -6,9 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -40,19 +37,13 @@ .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text -.globl ChaCha20_ctr32 -.hidden ChaCha20_ctr32 -.type ChaCha20_ctr32,@function +.globl ChaCha20_ctr32_nohw 
+.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,@function .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmpq $0,%rdx - je .Lno_data - movq OPENSSL_ia32cap_P+4(%rip),%r10 - testl $512,%r10d - jnz .LChaCha20_ssse3 - pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset rbx,-16 @@ -329,18 +320,16 @@ _CET_ENDBR .Lno_data: .byte 0xf3,0xc3 .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 -.type ChaCha20_ssse3,@function +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +.globl ChaCha20_ctr32_ssse3 +.hidden ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,@function .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc +_CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 - cmpq $128,%rdx - ja .LChaCha20_4x - -.Ldo_sse3_after_all: subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm0 movdqu (%rcx),%xmm1 @@ -466,26 +455,17 @@ ChaCha20_ssse3: .Lssse3_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 -.type ChaCha20_4x,@function +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 +.globl ChaCha20_ctr32_ssse3_4x +.hidden ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,@function .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc +_CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 movq %r10,%r11 - shrq $32,%r10 - testq $32,%r10 - jnz .LChaCha20_8x - cmpq $192,%rdx - ja .Lproceed4x - - andq $71303168,%r11 - cmpq $4194304,%r11 - je .Ldo_sse3_after_all - -.Lproceed4x: subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1018,12 +998,14 @@ ChaCha20_4x: .L4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x -.type ChaCha20_8x,@function +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +.globl ChaCha20_ctr32_avx2 +.hidden ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,@function .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc +_CET_ENDBR movq %rsp,%r9 .cfi_def_cfa_register r9 subq $0x280+8,%rsp @@ 
-1624,5 +1606,5 @@ ChaCha20_8x: .L8x_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 #endif diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/generated-src/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S index 1b64f02014..9596e29024 100644 --- a/generated-src/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/generated-src/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -5,36 +5,14 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,@function +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,@function .align 16 -sha1_block_data_order: +sha1_block_data_order_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 0(%r10),%r9d - movl 4(%r10),%r8d - movl 8(%r10),%r10d - testl $512,%r8d - jz .Lialu - testl $536870912,%r10d - jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut - jmp _ssse3_shortcut - -.align 16 -.Lialu: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -1264,12 +1242,14 @@ _CET_ENDBR .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc -.size sha1_block_data_order,.-sha1_block_data_order -.type sha1_block_data_order_shaext,@function +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw +.globl sha1_block_data_order_hw +.hidden sha1_block_data_order_hw +.type sha1_block_data_order_hw,@function .align 32 -sha1_block_data_order_shaext: -_shaext_shortcut: +sha1_block_data_order_hw: .cfi_startproc +_CET_ENDBR movdqu (%rdi),%xmm0 movd 16(%rdi),%xmm1 movdqa K_XX_XX+160(%rip),%xmm3 @@ -1434,12 +1414,14 @@ _shaext_shortcut: movd %xmm1,16(%rdi) .byte 0xf3,0xc3 .cfi_endproc -.size 
sha1_block_data_order_shaext,.-sha1_block_data_order_shaext +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw +.globl sha1_block_data_order_ssse3 +.hidden sha1_block_data_order_ssse3 .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: -_ssse3_shortcut: .cfi_startproc +_CET_ENDBR movq %rsp,%r11 .cfi_def_cfa_register %r11 pushq %rbx @@ -2623,11 +2605,13 @@ _ssse3_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.globl sha1_block_data_order_avx +.hidden sha1_block_data_order_avx .type sha1_block_data_order_avx,@function .align 16 sha1_block_data_order_avx: -_avx_shortcut: .cfi_startproc +_CET_ENDBR movq %rsp,%r11 .cfi_def_cfa_register %r11 pushq %rbx @@ -3751,11 +3735,13 @@ _avx_shortcut: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx +.globl sha1_block_data_order_avx2 +.hidden sha1_block_data_order_avx2 .type sha1_block_data_order_avx2,@function .align 16 sha1_block_data_order_avx2: -_avx2_shortcut: .cfi_startproc +_CET_ENDBR movq %rsp,%r11 .cfi_def_cfa_register %r11 pushq %rbx diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/generated-src/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 554d3e4d19..468c4e1346 100644 --- a/generated-src/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/generated-src/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -6,28 +6,13 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,@function +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,@function .align 16 -sha256_block_data_order: +sha256_block_data_order_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 
8(%r11),%r11d - testl $536870912,%r11d - jnz .Lshaext_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut - testl $512,%r10d - jnz .Lssse3_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -1733,7 +1718,7 @@ _CET_ENDBR .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc -.size sha256_block_data_order,.-sha256_block_data_order +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw .section .rodata .align 64 .type K256,@object @@ -1779,16 +1764,18 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text -.type sha256_block_data_order_shaext,@function +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,@function .align 64 -sha256_block_data_order_shaext: -.Lshaext_shortcut: +sha256_block_data_order_hw: .cfi_startproc #ifdef BORINGSSL_DISPATCH_TEST .extern BORINGSSL_function_hit .hidden BORINGSSL_function_hit movb $1,BORINGSSL_function_hit+6(%rip) #endif +_CET_ENDBR leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -1992,12 +1979,14 @@ sha256_block_data_order_shaext: movdqu %xmm2,16(%rdi) .byte 0xf3,0xc3 .cfi_endproc -.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +.globl sha256_block_data_order_ssse3 +.hidden sha256_block_data_order_ssse3 .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: .cfi_startproc -.Lssse3_shortcut: +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -3106,11 +3095,13 @@ sha256_block_data_order_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.globl 
sha256_block_data_order_avx +.hidden sha256_block_data_order_avx .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: .cfi_startproc -.Lavx_shortcut: +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/generated-src/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index 66a60d73d0..117d55eed4 100644 --- a/generated-src/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/generated-src/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -6,24 +6,13 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) .text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,@function +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function .align 16 -sha512_block_data_order: +sha512_block_data_order_nohw: .cfi_startproc _CET_ENDBR - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx @@ -1729,7 +1718,7 @@ _CET_ENDBR .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc -.size sha512_block_data_order,.-sha512_block_data_order +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw .section .rodata .align 64 .type K512,@object @@ -1819,11 +1808,13 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text +.globl sha512_block_data_order_avx +.hidden sha512_block_data_order_avx .type sha512_block_data_order_avx,@function .align 64 
sha512_block_data_order_avx: .cfi_startproc -.Lavx_shortcut: +_CET_ENDBR movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx diff --git a/generated-src/mac-x86_64/crypto/chacha/chacha-x86_64.S b/generated-src/mac-x86_64/crypto/chacha/chacha-x86_64.S index b80364b7a4..c2c9b4bf38 100644 --- a/generated-src/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/generated-src/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -6,8 +6,6 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - - .section __DATA,__const .p2align 6 L$zero: @@ -39,19 +37,13 @@ L$sixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text -.globl _ChaCha20_ctr32 -.private_extern _ChaCha20_ctr32 +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw .p2align 6 -_ChaCha20_ctr32: +_ChaCha20_ctr32_nohw: _CET_ENDBR - cmpq $0,%rdx - je L$no_data - movq _OPENSSL_ia32cap_P+4(%rip),%r10 - testl $512,%r10d - jnz L$ChaCha20_ssse3 - pushq %rbx pushq %rbp @@ -323,17 +315,15 @@ L$no_data: .byte 0xf3,0xc3 +.globl _ChaCha20_ctr32_ssse3 +.private_extern _ChaCha20_ctr32_ssse3 .p2align 5 -ChaCha20_ssse3: -L$ChaCha20_ssse3: +_ChaCha20_ctr32_ssse3: +_CET_ENDBR movq %rsp,%r9 - cmpq $128,%rdx - ja L$ChaCha20_4x - -L$do_sse3_after_all: subq $64+8,%rsp movdqa L$sigma(%rip),%xmm0 movdqu (%rcx),%xmm1 @@ -460,25 +450,16 @@ L$ssse3_epilogue: .byte 0xf3,0xc3 +.globl _ChaCha20_ctr32_ssse3_4x +.private_extern _ChaCha20_ctr32_ssse3_4x .p2align 5 -ChaCha20_4x: -L$ChaCha20_4x: +_ChaCha20_ctr32_ssse3_4x: +_CET_ENDBR movq %rsp,%r9 movq %r10,%r11 - shrq $32,%r10 - testq $32,%r10 - jnz L$ChaCha20_8x - cmpq $192,%rdx - ja L$proceed4x - - andq $71303168,%r11 - cmpq $4194304,%r11 - je L$do_sse3_after_all - -L$proceed4x: subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 @@ -1012,11 +993,13 
@@ L$4x_epilogue: .byte 0xf3,0xc3 +.globl _ChaCha20_ctr32_avx2 +.private_extern _ChaCha20_ctr32_avx2 .p2align 5 -ChaCha20_8x: -L$ChaCha20_8x: +_ChaCha20_ctr32_avx2: +_CET_ENDBR movq %rsp,%r9 subq $0x280+8,%rsp diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S b/generated-src/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S index 51bdebbf6f..c2da422bbe 100644 --- a/generated-src/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/generated-src/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -6,34 +6,13 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw .p2align 4 -_sha1_block_data_order: +_sha1_block_data_order_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 0(%r10),%r9d - movl 4(%r10),%r8d - movl 8(%r10),%r10d - testl $512,%r8d - jz L$ialu - testl $536870912,%r10d - jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut - jmp _ssse3_shortcut - -.p2align 4 -L$ialu: movq %rsp,%rax pushq %rbx @@ -1264,11 +1243,13 @@ L$epilogue: .byte 0xf3,0xc3 +.globl _sha1_block_data_order_hw +.private_extern _sha1_block_data_order_hw .p2align 5 -sha1_block_data_order_shaext: -_shaext_shortcut: +_sha1_block_data_order_hw: +_CET_ENDBR movdqu (%rdi),%xmm0 movd 16(%rdi),%xmm1 movdqa K_XX_XX+160(%rip),%xmm3 @@ -1434,11 +1415,13 @@ L$oop_shaext: .byte 0xf3,0xc3 +.globl _sha1_block_data_order_ssse3 +.private_extern _sha1_block_data_order_ssse3 .p2align 4 -sha1_block_data_order_ssse3: -_ssse3_shortcut: +_sha1_block_data_order_ssse3: +_CET_ENDBR movq %rsp,%r11 pushq %rbx @@ -2622,11 +2605,13 @@ L$epilogue_ssse3: .byte 0xf3,0xc3 +.globl _sha1_block_data_order_avx +.private_extern _sha1_block_data_order_avx .p2align 4 -sha1_block_data_order_avx: 
-_avx_shortcut: +_sha1_block_data_order_avx: +_CET_ENDBR movq %rsp,%r11 pushq %rbx @@ -3750,11 +3735,13 @@ L$epilogue_avx: .byte 0xf3,0xc3 +.globl _sha1_block_data_order_avx2 +.private_extern _sha1_block_data_order_avx2 .p2align 4 -sha1_block_data_order_avx2: -_avx2_shortcut: +_sha1_block_data_order_avx2: +_CET_ENDBR movq %rsp,%r11 pushq %rbx diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/generated-src/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S index a7307cf777..31e0cb3995 100644 --- a/generated-src/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/generated-src/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -6,27 +6,13 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw .p2align 4 -_sha256_block_data_order: +_sha256_block_data_order_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $536870912,%r11d - jnz L$shaext_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut - testl $512,%r10d - jnz L$ssse3_shortcut movq %rsp,%rax pushq %rbx @@ -1778,15 +1764,17 @@ K256: .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw .p2align 6 -sha256_block_data_order_shaext: -L$shaext_shortcut: +_sha256_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST movb $1,_BORINGSSL_function_hit+6(%rip) #endif +_CET_ENDBR leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -1991,11 +1979,13 @@ 
L$oop_shaext: .byte 0xf3,0xc3 +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 .p2align 6 -sha256_block_data_order_ssse3: +_sha256_block_data_order_ssse3: -L$ssse3_shortcut: +_CET_ENDBR movq %rsp,%rax pushq %rbx @@ -3104,11 +3094,13 @@ L$epilogue_ssse3: .byte 0xf3,0xc3 +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx .p2align 6 -sha256_block_data_order_avx: +_sha256_block_data_order_avx: -L$avx_shortcut: +_CET_ENDBR movq %rsp,%rax pushq %rbx diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/generated-src/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S index 9adc202914..5b873695c8 100644 --- a/generated-src/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/generated-src/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -6,23 +6,13 @@ #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) .text - -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw .p2align 4 -_sha512_block_data_order: +_sha512_block_data_order_nohw: _CET_ENDBR - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut movq %rsp,%rax pushq %rbx @@ -1818,11 +1808,13 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .text +.globl _sha512_block_data_order_avx +.private_extern _sha512_block_data_order_avx .p2align 6 -sha512_block_data_order_avx: +_sha512_block_data_order_avx: -L$avx_shortcut: +_CET_ENDBR movq %rsp,%rax pushq %rbx diff --git a/generated-src/win-aarch64/crypto/fipsmodule/sha1-armv8.S 
b/generated-src/win-aarch64/crypto/fipsmodule/sha1-armv8.S index f5082a0d1b..f8c8b86121 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/sha1-armv8.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -8,25 +8,15 @@ .text +.globl sha1_block_data_order_nohw - -.globl sha1_block_data_order - -.def sha1_block_data_order +.def sha1_block_data_order_nohw .type 32 .endef .align 6 -sha1_block_data_order: +sha1_block_data_order_nohw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA1 - b.ne Lv8_entry stp x29,x30,[sp,#-96]! add x29,sp,#0 @@ -1083,14 +1073,15 @@ Loop: ldr x29,[sp],#96 ret -.def sha1_block_armv8 +.globl sha1_block_data_order_hw + +.def sha1_block_data_order_hw .type 32 .endef .align 6 -sha1_block_armv8: +sha1_block_data_order_hw: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. AARCH64_VALID_CALL_TARGET -Lv8_entry: stp x29,x30,[sp,#-16]! 
add x29,sp,#0 diff --git a/generated-src/win-aarch64/crypto/fipsmodule/sha256-armv8.S b/generated-src/win-aarch64/crypto/fipsmodule/sha256-armv8.S index 493fc36bfa..b77f9e8346 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/sha256-armv8.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -50,26 +50,13 @@ .text +.globl sha256_block_data_order_nohw - -.globl sha256_block_data_order - -.def sha256_block_data_order +.def sha256_block_data_order_nohw .type 32 .endef .align 6 -sha256_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA256 - b.ne Lv8_entry -#endif +sha256_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1062,12 +1049,13 @@ LK256: .align 2 .text #ifndef __KERNEL__ -.def sha256_block_armv8 +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw .type 32 .endef .align 6 -sha256_block_armv8: -Lv8_entry: +sha256_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,BORINGSSL_function_hit @@ -1076,6 +1064,7 @@ Lv8_entry: strb w10, [x9,#6] // kFlag_sha256_hw #endif // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 diff --git a/generated-src/win-aarch64/crypto/fipsmodule/sha512-armv8.S b/generated-src/win-aarch64/crypto/fipsmodule/sha512-armv8.S index cbb173e68d..fcfefb0b29 100644 --- a/generated-src/win-aarch64/crypto/fipsmodule/sha512-armv8.S +++ b/generated-src/win-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -50,26 +50,13 @@ .text +.globl sha512_block_data_order_nohw - -.globl sha512_block_data_order - -.def sha512_block_data_order +.def sha512_block_data_order_nohw .type 32 .endef .align 6 -sha512_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA512 - b.ne Lv8_entry -#endif +sha512_block_data_order_nohw: AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -1086,12 +1073,13 @@ LK512: .align 2 .text #ifndef __KERNEL__ -.def sha512_block_armv8 +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw .type 32 .endef .align 6 -sha512_block_armv8: -Lv8_entry: +sha512_block_data_order_hw: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,BORINGSSL_function_hit @@ -1099,6 +1087,8 @@ Lv8_entry: mov w10, #1 strb w10, [x9,#8] // kFlag_sha512_hw #endif + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET stp x29,x30,[sp,#-16]! 
add x29,sp,#0 diff --git a/generated-src/win-x86_64/crypto/chacha/chacha-x86_64.asm b/generated-src/win-x86_64/crypto/chacha/chacha-x86_64.asm index 13e8f61f77..b2a66c7439 100644 --- a/generated-src/win-x86_64/crypto/chacha/chacha-x86_64.asm +++ b/generated-src/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -12,8 +12,6 @@ default rel section .text code align=64 -EXTERN OPENSSL_ia32cap_P - section .rdata rdata align=8 ALIGN 64 $L$zero: @@ -50,14 +48,14 @@ $L$sixteen: DB 108,46,111,114,103,62,0 section .text -global ChaCha20_ctr32 +global ChaCha20_ctr32_nohw ALIGN 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ChaCha20_ctr32: +$L$SEH_begin_ChaCha20_ctr32_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -67,12 +65,6 @@ $L$SEH_begin_ChaCha20_ctr32: _CET_ENDBR - cmp rdx,0 - je NEAR $L$no_data - mov r10,QWORD[((OPENSSL_ia32cap_P+4))] - test r10d,512 - jnz NEAR $L$ChaCha20_ssse3 - push rbx push rbp @@ -345,14 +337,15 @@ $L$no_data: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ChaCha20_ctr32: +$L$SEH_end_ChaCha20_ctr32_nohw: +global ChaCha20_ctr32_ssse3 ALIGN 32 -ChaCha20_ssse3: +ChaCha20_ctr32_ssse3: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ChaCha20_ssse3: +$L$SEH_begin_ChaCha20_ctr32_ssse3: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -360,14 +353,10 @@ $L$SEH_begin_ChaCha20_ssse3: mov r8,QWORD[40+rsp] -$L$ChaCha20_ssse3: +_CET_ENDBR mov r9,rsp - cmp rdx,128 - ja NEAR $L$ChaCha20_4x - -$L$do_sse3_after_all: sub rsp,64+40 movaps XMMWORD[(-40)+r9],xmm6 movaps XMMWORD[(-24)+r9],xmm7 @@ -500,14 +489,15 @@ $L$ssse3_epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ChaCha20_ssse3: +$L$SEH_end_ChaCha20_ctr32_ssse3: +global ChaCha20_ctr32_ssse3_4x ALIGN 32 -ChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ChaCha20_4x: +$L$SEH_begin_ChaCha20_ctr32_ssse3_4x: mov rdi,rcx mov 
rsi,rdx mov rdx,r8 @@ -515,22 +505,11 @@ $L$SEH_begin_ChaCha20_4x: mov r8,QWORD[40+rsp] -$L$ChaCha20_4x: +_CET_ENDBR mov r9,rsp mov r11,r10 - shr r10,32 - test r10,32 - jnz NEAR $L$ChaCha20_8x - cmp rdx,192 - ja NEAR $L$proceed4x - - and r11,71303168 - cmp r11,4194304 - je NEAR $L$do_sse3_after_all - -$L$proceed4x: sub rsp,0x140+168 movaps XMMWORD[(-168)+r9],xmm6 movaps XMMWORD[(-152)+r9],xmm7 @@ -1086,14 +1065,15 @@ $L$4x_epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ChaCha20_4x: +$L$SEH_end_ChaCha20_ctr32_ssse3_4x: +global ChaCha20_ctr32_avx2 ALIGN 32 -ChaCha20_8x: +ChaCha20_ctr32_avx2: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_ChaCha20_8x: +$L$SEH_begin_ChaCha20_ctr32_avx2: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -1101,8 +1081,8 @@ $L$SEH_begin_ChaCha20_8x: mov r8,QWORD[40+rsp] -$L$ChaCha20_8x: +_CET_ENDBR mov r9,rsp sub rsp,0x280+168 @@ -1726,7 +1706,7 @@ $L$8x_epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_ChaCha20_8x: +$L$SEH_end_ChaCha20_ctr32_avx2: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -1895,36 +1875,36 @@ full_handler: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase - DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase - DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase - - DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase - DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase - DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase - - DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase - DD $L$SEH_end_ChaCha20_4x wrt ..imagebase - DD $L$SEH_info_ChaCha20_4x wrt ..imagebase - DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase - DD $L$SEH_end_ChaCha20_8x wrt ..imagebase - DD $L$SEH_info_ChaCha20_8x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase + DD 
$L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_ChaCha20_ctr32: +$L$SEH_info_ChaCha20_ctr32_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase -$L$SEH_info_ChaCha20_ssse3: +$L$SEH_info_ChaCha20_ctr32_ssse3: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase -$L$SEH_info_ChaCha20_4x: +$L$SEH_info_ChaCha20_ctr32_ssse3_4x: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase -$L$SEH_info_ChaCha20_8x: +$L$SEH_info_ChaCha20_ctr32_avx2: DB 9,0,0,0 DD full_handler wrt ..imagebase DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase diff --git a/generated-src/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm b/generated-src/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm index bb9a775471..987fe122d6 100644 --- a/generated-src/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm +++ b/generated-src/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm @@ -11,16 +11,15 @@ default rel %include "openssl/boringssl_prefix_symbols_nasm.inc" section .text code align=64 -EXTERN OPENSSL_ia32cap_P -global sha1_block_data_order +global sha1_block_data_order_nohw ALIGN 16 -sha1_block_data_order: +sha1_block_data_order_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_sha1_block_data_order: +$L$SEH_begin_sha1_block_data_order_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -28,26 +27,6 @@ $L$SEH_begin_sha1_block_data_order: _CET_ENDBR - lea r10,[OPENSSL_ia32cap_P] - mov r9d,DWORD[r10] - mov r8d,DWORD[4+r10] - mov r10d,DWORD[8+r10] - test r8d,512 - jz NEAR $L$ialu - test r10d,536870912 - 
jnz NEAR _shaext_shortcut - and r10d,296 - cmp r10d,296 - je NEAR _avx2_shortcut - and r8d,268435456 - and r9d,1073741824 - or r8d,r9d - cmp r8d,1342177280 - je NEAR _avx_shortcut - jmp NEAR _ssse3_shortcut - -ALIGN 16 -$L$ialu: mov rax,rsp push rbx @@ -1279,21 +1258,22 @@ $L$epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_sha1_block_data_order: +$L$SEH_end_sha1_block_data_order_nohw: +global sha1_block_data_order_hw ALIGN 32 -sha1_block_data_order_shaext: +sha1_block_data_order_hw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_sha1_block_data_order_shaext: +$L$SEH_begin_sha1_block_data_order_hw: mov rdi,rcx mov rsi,rdx mov rdx,r8 -_shaext_shortcut: +_CET_ENDBR lea rsp,[((-72))+rsp] movaps XMMWORD[(-8-64)+rax],xmm6 movaps XMMWORD[(-8-48)+rax],xmm7 @@ -1472,7 +1452,8 @@ $L$epilogue_shaext: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_sha1_block_data_order_shaext: +$L$SEH_end_sha1_block_data_order_hw: +global sha1_block_data_order_ssse3 ALIGN 16 sha1_block_data_order_ssse3: @@ -1485,8 +1466,8 @@ $L$SEH_begin_sha1_block_data_order_ssse3: mov rdx,r8 -_ssse3_shortcut: +_CET_ENDBR mov r11,rsp push rbx @@ -2685,6 +2666,7 @@ $L$epilogue_ssse3: DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_ssse3: +global sha1_block_data_order_avx ALIGN 16 sha1_block_data_order_avx: @@ -2697,8 +2679,8 @@ $L$SEH_begin_sha1_block_data_order_avx: mov rdx,r8 -_avx_shortcut: +_CET_ENDBR mov r11,rsp push rbx @@ -3837,6 +3819,7 @@ $L$epilogue_avx: DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_avx: +global sha1_block_data_order_avx2 ALIGN 16 sha1_block_data_order_avx2: @@ -3849,8 +3832,8 @@ $L$SEH_begin_sha1_block_data_order_avx2: mov rdx,r8 -_avx2_shortcut: +_CET_ENDBR mov r11,rsp push rbx @@ -5742,12 +5725,12 @@ $L$common_seh_tail: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_sha1_block_data_order wrt ..imagebase - DD $L$SEH_end_sha1_block_data_order wrt ..imagebase - DD $L$SEH_info_sha1_block_data_order 
wrt ..imagebase - DD $L$SEH_begin_sha1_block_data_order_shaext wrt ..imagebase - DD $L$SEH_end_sha1_block_data_order_shaext wrt ..imagebase - DD $L$SEH_info_sha1_block_data_order_shaext wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_hw wrt ..imagebase DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase @@ -5759,10 +5742,10 @@ ALIGN 4 DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_sha1_block_data_order: +$L$SEH_info_sha1_block_data_order_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase -$L$SEH_info_sha1_block_data_order_shaext: +$L$SEH_info_sha1_block_data_order_hw: DB 9,0,0,0 DD shaext_handler wrt ..imagebase $L$SEH_info_sha1_block_data_order_ssse3: diff --git a/generated-src/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm b/generated-src/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm index e7d366a132..d43cd0094f 100644 --- a/generated-src/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm +++ b/generated-src/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm @@ -12,15 +12,14 @@ default rel section .text code align=64 -EXTERN OPENSSL_ia32cap_P -global sha256_block_data_order +global sha256_block_data_order_nohw ALIGN 16 -sha256_block_data_order: +sha256_block_data_order_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_sha256_block_data_order: +$L$SEH_begin_sha256_block_data_order_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -28,19 +27,6 @@ $L$SEH_begin_sha256_block_data_order: _CET_ENDBR - lea r11,[OPENSSL_ia32cap_P] - mov r9d,DWORD[r11] - mov 
r10d,DWORD[4+r11] - mov r11d,DWORD[8+r11] - test r11d,536870912 - jnz NEAR $L$shaext_shortcut - and r9d,1073741824 - and r10d,268435968 - or r10d,r9d - cmp r10d,1342177792 - je NEAR $L$avx_shortcut - test r10d,512 - jnz NEAR $L$ssse3_shortcut mov rax,rsp push rbx @@ -1748,7 +1734,7 @@ $L$epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_sha256_block_data_order: +$L$SEH_end_sha256_block_data_order_nohw: section .rdata rdata align=8 ALIGN 64 @@ -1799,24 +1785,25 @@ K256: DB 111,114,103,62,0 section .text +global sha256_block_data_order_hw ALIGN 64 -sha256_block_data_order_shaext: +sha256_block_data_order_hw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_sha256_block_data_order_shaext: +$L$SEH_begin_sha256_block_data_order_hw: mov rdi,rcx mov rsi,rdx mov rdx,r8 -$L$shaext_shortcut: %ifdef BORINGSSL_DISPATCH_TEST EXTERN BORINGSSL_function_hit mov BYTE[((BORINGSSL_function_hit+6))],1 %endif +_CET_ENDBR lea rsp,[((-88))+rsp] movaps XMMWORD[(-8-80)+rax],xmm6 movaps XMMWORD[(-8-64)+rax],xmm7 @@ -2036,7 +2023,8 @@ $L$epilogue_shaext: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_sha256_block_data_order_shaext: +$L$SEH_end_sha256_block_data_order_hw: +global sha256_block_data_order_ssse3 ALIGN 64 sha256_block_data_order_ssse3: @@ -2050,7 +2038,7 @@ $L$SEH_begin_sha256_block_data_order_ssse3: -$L$ssse3_shortcut: +_CET_ENDBR mov rax,rsp push rbx @@ -3169,6 +3157,7 @@ $L$epilogue_ssse3: DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order_ssse3: +global sha256_block_data_order_avx ALIGN 64 sha256_block_data_order_avx: @@ -3182,7 +3171,7 @@ $L$SEH_begin_sha256_block_data_order_avx: -$L$avx_shortcut: +_CET_ENDBR mov rax,rsp push rbx @@ -4393,12 +4382,12 @@ shaext_handler: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_sha256_block_data_order wrt ..imagebase - DD $L$SEH_end_sha256_block_data_order wrt ..imagebase - DD $L$SEH_info_sha256_block_data_order wrt ..imagebase - DD 
$L$SEH_begin_sha256_block_data_order_shaext wrt ..imagebase - DD $L$SEH_end_sha256_block_data_order_shaext wrt ..imagebase - DD $L$SEH_info_sha256_block_data_order_shaext wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase @@ -4407,11 +4396,11 @@ ALIGN 4 DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_sha256_block_data_order: +$L$SEH_info_sha256_block_data_order_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase -$L$SEH_info_sha256_block_data_order_shaext: +$L$SEH_info_sha256_block_data_order_hw: DB 9,0,0,0 DD shaext_handler wrt ..imagebase $L$SEH_info_sha256_block_data_order_ssse3: diff --git a/generated-src/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm b/generated-src/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm index 2cfb76b762..f81b4630bd 100644 --- a/generated-src/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm +++ b/generated-src/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm @@ -12,15 +12,14 @@ default rel section .text code align=64 -EXTERN OPENSSL_ia32cap_P -global sha512_block_data_order +global sha512_block_data_order_nohw ALIGN 16 -sha512_block_data_order: +sha512_block_data_order_nohw: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp -$L$SEH_begin_sha512_block_data_order: +$L$SEH_begin_sha512_block_data_order_nohw: mov rdi,rcx mov rsi,rdx mov rdx,r8 @@ -28,15 +27,6 @@ $L$SEH_begin_sha512_block_data_order: 
_CET_ENDBR - lea r11,[OPENSSL_ia32cap_P] - mov r9d,DWORD[r11] - mov r10d,DWORD[4+r11] - mov r11d,DWORD[8+r11] - and r9d,1073741824 - and r10d,268435968 - or r10d,r9d - cmp r10d,1342177792 - je NEAR $L$avx_shortcut mov rax,rsp push rbx @@ -1744,7 +1734,7 @@ $L$epilogue: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret -$L$SEH_end_sha512_block_data_order: +$L$SEH_end_sha512_block_data_order_nohw: section .rdata rdata align=8 ALIGN 64 @@ -1839,6 +1829,7 @@ K512: DB 111,114,103,62,0 section .text +global sha512_block_data_order_avx ALIGN 64 sha512_block_data_order_avx: @@ -1852,7 +1843,7 @@ $L$SEH_begin_sha512_block_data_order_avx: -$L$avx_shortcut: +_CET_ENDBR mov rax,rsp push rbx @@ -3125,15 +3116,15 @@ $L$in_prologue: section .pdata rdata align=4 ALIGN 4 - DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 -$L$SEH_info_sha512_block_data_order: +$L$SEH_info_sha512_block_data_order_nohw: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase diff --git a/tests/ci/run_cross_tests.sh b/tests/ci/run_cross_tests.sh index 56eead5f5a..29b7761685 100755 --- a/tests/ci/run_cross_tests.sh +++ b/tests/ci/run_cross_tests.sh @@ -37,7 +37,8 @@ tar Jxf ${TARGET_CPU}-x-tools.tar.xz --no-same-owner --no-same-permissions cat < ${TARGET_CPU}.cmake # Specify the target system set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR ${TARGET_CPU}) +# For "armv6" we need to strip off the "v6", so it's just "arm" +set(CMAKE_SYSTEM_PROCESSOR 
${TARGET_CPU/v6/}) # Specify the cross-compiler set(CMAKE_C_COMPILER ${SCRATCH_FOLDER}/${TARGET_PLATFORM}/bin/${TARGET_PLATFORM}-gcc)