diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index d212d5fdc51..4da5ec9cab6 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -80,8 +80,6 @@ $code.=<<___; .text -.extern OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -230,24 +228,12 @@ sub ROUND { # critical path is 24 cycles per round ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. $code.=<<___; -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function,5 +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r10 -___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ -$code.=<<___; - test \$`1<<(41-32)`,%r10d - jnz .LChaCha20_ssse3 - push %rbx .cfi_push rbx push %rbp @@ -419,7 +405,7 @@ sub ROUND { # critical path is 24 cycles per round .Lno_data: ret .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## @@ -454,19 +440,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; -.type ChaCha20_ssse3,\@function,5 +.globl ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,\@function,5 .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; - cmp \$128,$len # we might throw away some data, - ja .LChaCha20_4x # but overall it won't be slower - -.Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); @@ -576,7 +559,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round .Lssse3_epilogue: ret .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 ___ } @@ -714,29 +697,17 @@ sub SSSE3_lane_ROUND { my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_4x,\@function,5 +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 mov %r10,%r11 ___ -$code.=<<___ if ($avx>1); - shr \$32,%r10 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r10 # test AVX2 - jnz .LChaCha20_8x -___ $code.=<<___; - cmp \$192,$len - ja .Lproceed4x - - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_sse3_after_all # to detect Atom - -.Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1164,7 +1135,7 @@ sub SSSE3_lane_ROUND { .L4x_epilogue: ret .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } @@ -1293,11 +1264,12 @@ sub AVX2_lane_ROUND { my $xframe = $win64 ? 
0xa8 : 8; $code.=<<___; -.type ChaCha20_8x,\@function,5 +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp @@ -1809,7 +1781,7 @@ sub AVX2_lane_ROUND { .L8x_epilogue: ret .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } @@ -2719,22 +2691,22 @@ sub AVX512_lane_ROUND { .section .pdata .align 4 - .rva .LSEH_begin_ChaCha20_ctr32 - .rva .LSEH_end_ChaCha20_ctr32 - .rva .LSEH_info_ChaCha20_ctr32 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw - .rva .LSEH_begin_ChaCha20_ssse3 - .rva .LSEH_end_ChaCha20_ssse3 - .rva .LSEH_info_ChaCha20_ssse3 + .rva .LSEH_begin_ChaCha20_ctr32_ssse3 + .rva .LSEH_end_ChaCha20_ctr32_ssse3 + .rva .LSEH_info_ChaCha20_ctr32_ssse3 - .rva .LSEH_begin_ChaCha20_4x - .rva .LSEH_end_ChaCha20_4x - .rva .LSEH_info_ChaCha20_4x + .rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); - .rva .LSEH_begin_ChaCha20_8x - .rva .LSEH_end_ChaCha20_8x - .rva .LSEH_info_ChaCha20_8x + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 @@ -2748,22 +2720,22 @@ sub AVX512_lane_ROUND { $code.=<<___; .section .xdata .align 8 -.LSEH_info_ChaCha20_ctr32: +.LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler -.LSEH_info_ChaCha20_ssse3: +.LSEH_info_ChaCha20_ctr32_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lssse3_body,.Lssse3_epilogue -.LSEH_info_ChaCha20_4x: +.LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); -.LSEH_info_ChaCha20_8x: +.LSEH_info_ChaCha20_ctr32_avx2: .byte 
9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] diff --git a/crypto/chacha/chacha.c b/crypto/chacha/chacha.c index 8a6dbfb60c0..a4f250367bd 100644 --- a/crypto/chacha/chacha.c +++ b/crypto/chacha/chacha.c @@ -91,6 +91,24 @@ static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, ChaCha20_ctr32_neon(out, in, in_len, key, counter); return; } +#endif +#if defined(CHACHA20_ASM_AVX2) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + ChaCha20_ctr32_avx2(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + ChaCha20_ctr32_ssse3(out, in, in_len, key, counter); + return; + } #endif if (in_len > 0) { ChaCha20_ctr32_nohw(out, in, in_len, key, counter); diff --git a/crypto/chacha/chacha_test.cc b/crypto/chacha/chacha_test.cc index 4e656aa2237..9f3f1d7ed52 100644 --- a/crypto/chacha/chacha_test.cc +++ b/crypto/chacha/chacha_test.cc @@ -359,6 +359,21 @@ static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len, CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter); } #endif +#if defined(CHACHA20_ASM_AVX2) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter); + } +#endif #if defined(CHACHA20_ASM_NOHW) if (in_len > 0) { CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter); diff --git a/crypto/chacha/internal.h b/crypto/chacha/internal.h index 4c339c6fcd9..ce0d08620b7 100644 --- 
a/crypto/chacha/internal.h +++ b/crypto/chacha/internal.h @@ -30,8 +30,7 @@ extern "C" { void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], const uint8_t nonce[16]); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) #define CHACHA20_ASM @@ -46,6 +45,31 @@ OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) { } void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len, const uint32_t key[8], const uint32_t counter[4]); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_AVX2 +OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) { + return (len > 128) && CRYPTO_is_AVX2_capable(); +} +void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3_4X +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_4x_capable(size_t len) { + int capable = (len > 128) && CRYPTO_is_SSSE3_capable(); + int faster = (len > 192) || !CRYPTO_cpu_perf_is_like_silvermont(); + return capable && faster; +} +void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3 +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) { + return (len > 128) && CRYPTO_is_SSSE3_capable(); +} +void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); #endif #if defined(CHACHA20_ASM) diff --git a/crypto/fipsmodule/cpucap/cpu_intel.c b/crypto/fipsmodule/cpucap/cpu_intel.c index 19b6540d023..bbf8543c862 100644 --- a/crypto/fipsmodule/cpucap/cpu_intel.c +++ b/crypto/fipsmodule/cpucap/cpu_intel.c @@ -240,7 +240,8 @@ void OPENSSL_cpuid_setup(void) { // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables // some Silvermont-specific codepaths which perform better. 
See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. + // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and + // |CRYPTO_cpu_perf_is_like_silvermont|. if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { ecx &= ~(1u << 26); @@ -267,7 +268,8 @@ void OPENSSL_cpuid_setup(void) { // Clear AVX2 and AVX512* bits. // // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. + // doesn't clear those. See the comments in + // |CRYPTO_cpu_perf_is_like_silvermont|. extended_features[0] &= ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); } diff --git a/crypto/fipsmodule/cpucap/internal.h b/crypto/fipsmodule/cpucap/internal.h index d6c789aa7b4..dc703fef4c9 100644 --- a/crypto/fipsmodule/cpucap/internal.h +++ b/crypto/fipsmodule/cpucap/internal.h @@ -34,7 +34,8 @@ void OPENSSL_cpuid_setup(void); // ECX for CPUID where EAX = 7 // // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. +// bits in XCR0, so it is not necessary to check those. (WARNING: See caveats +// in cpu_intel.c.) extern uint32_t OPENSSL_ia32cap_P[4]; #if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY) @@ -81,6 +82,9 @@ OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0; } +// We intentionally avoid defining a |CRYPTO_is_XSAVE_capable| function. See +// |CRYPTO_cpu_perf_is_like_silvermont|. + OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0; } @@ -131,6 +135,29 @@ OPENSSL_INLINE int CRYPTO_is_VBMI2_capable(void) { return (OPENSSL_ia32cap_get()[3] & (1 << 6)) != 0; } +// CRYPTO_cpu_perf_is_like_silvermont returns one if, based on a heuristic, the +// CPU has Silvermont-like performance characteristics. 
It is often faster to +// run different codepaths on these CPUs than the available instructions would +// otherwise select. See chacha-x86_64.pl. +// +// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by +// this. |OPENSSL_cpuid_setup| forces Knights Landing to also be matched by +// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it +// isn't matched by this. Various sources indicate AMD first implemented MOVBE +// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be +// matched by this. That seems to be the case for other x86(-64) CPUs. +OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { + // WARNING: This MUST NOT be used to guard the execution of the XSAVE + // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit + // that indicates whether we can safely execute XSAVE. This bit may be set + // even when XSAVE is disabled (by the operating system). See the comment in + // cpu_intel.c and check how the users of this bit use it. + // + // We do not use |__XSAVE__| for static detection because the hack in + // |OPENSSL_cpuid_setup| for Knights Landing CPUs needs to override it. + int hardware_supports_xsave = (OPENSSL_ia32cap_get()[1] & (1u << 26)) != 0; + return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); +} #endif // OPENSSL_X86 || OPENSSL_X86_64