Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sha + chacha: Move AArch64/X86-64 dispatching to C. #1625

Merged
merged 4 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 35 additions & 63 deletions crypto/chacha/asm/chacha-x86_64.pl
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.section .rodata
.align 64
.Lzero:
Expand Down Expand Up @@ -230,24 +228,12 @@ sub ROUND { # critical path is 24 cycles per round
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,\@function,5
.globl ChaCha20_ctr32_nohw
.type ChaCha20_ctr32_nohw,\@function,5
.align 64
ChaCha20_ctr32:
ChaCha20_ctr32_nohw:
.cfi_startproc
_CET_ENDBR
cmp \$0,$len
je .Lno_data
mov OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___ if ($avx>2);
bt \$48,%r10 # check for AVX512F
jc .LChaCha20_avx512
___
$code.=<<___;
test \$`1<<(41-32)`,%r10d
jnz .LChaCha20_ssse3

push %rbx
.cfi_push rbx
push %rbp
Expand Down Expand Up @@ -419,7 +405,7 @@ sub ROUND { # critical path is 24 cycles per round
.Lno_data:
ret
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
___

########################################################################
Expand Down Expand Up @@ -454,19 +440,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type ChaCha20_ssse3,\@function,5
.globl ChaCha20_ctr32_ssse3
.type ChaCha20_ctr32_ssse3,\@function,5
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
ChaCha20_ctr32_ssse3:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register r9
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
ja .LChaCha20_4x # but overall it won't be slower

.Ldo_sse3_after_all:
sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
Expand Down Expand Up @@ -576,7 +559,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
.Lssse3_epilogue:
ret
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3
___
}

Expand Down Expand Up @@ -714,29 +697,17 @@ sub SSSE3_lane_ROUND {
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_4x,\@function,5
.globl ChaCha20_ctr32_ssse3_4x
.type ChaCha20_ctr32_ssse3_4x,\@function,5
.align 32
ChaCha20_4x:
.LChaCha20_4x:
ChaCha20_ctr32_ssse3_4x:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register r9
mov %r10,%r11
___
$code.=<<___ if ($avx>1);
shr \$32,%r10 # OPENSSL_ia32cap_P+8
test \$`1<<5`,%r10 # test AVX2
jnz .LChaCha20_8x
___
$code.=<<___;
cmp \$192,$len
ja .Lproceed4x

and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
je .Ldo_sse3_after_all # to detect Atom

.Lproceed4x:
sub \$0x140+$xframe,%rsp
___
################ stack layout
Expand Down Expand Up @@ -1164,7 +1135,7 @@ sub SSSE3_lane_ROUND {
.L4x_epilogue:
ret
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x
___
}

Expand Down Expand Up @@ -1293,11 +1264,12 @@ sub AVX2_lane_ROUND {
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type ChaCha20_8x,\@function,5
.globl ChaCha20_ctr32_avx2
.type ChaCha20_ctr32_avx2,\@function,5
.align 32
ChaCha20_8x:
.LChaCha20_8x:
ChaCha20_ctr32_avx2:
.cfi_startproc
_CET_ENDBR
mov %rsp,%r9 # frame register
.cfi_def_cfa_register r9
sub \$0x280+$xframe,%rsp
Expand Down Expand Up @@ -1809,7 +1781,7 @@ sub AVX2_lane_ROUND {
.L8x_epilogue:
ret
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2
___
}

Expand Down Expand Up @@ -2719,22 +2691,22 @@ sub AVX512_lane_ROUND {

.section .pdata
.align 4
.rva .LSEH_begin_ChaCha20_ctr32
.rva .LSEH_end_ChaCha20_ctr32
.rva .LSEH_info_ChaCha20_ctr32
.rva .LSEH_begin_ChaCha20_ctr32_nohw
.rva .LSEH_end_ChaCha20_ctr32_nohw
.rva .LSEH_info_ChaCha20_ctr32_nohw

.rva .LSEH_begin_ChaCha20_ssse3
.rva .LSEH_end_ChaCha20_ssse3
.rva .LSEH_info_ChaCha20_ssse3
.rva .LSEH_begin_ChaCha20_ctr32_ssse3
.rva .LSEH_end_ChaCha20_ctr32_ssse3
.rva .LSEH_info_ChaCha20_ctr32_ssse3

.rva .LSEH_begin_ChaCha20_4x
.rva .LSEH_end_ChaCha20_4x
.rva .LSEH_info_ChaCha20_4x
.rva .LSEH_begin_ChaCha20_ctr32_ssse3_4x
.rva .LSEH_end_ChaCha20_ctr32_ssse3_4x
.rva .LSEH_info_ChaCha20_ctr32_ssse3_4x
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_ChaCha20_8x
.rva .LSEH_end_ChaCha20_8x
.rva .LSEH_info_ChaCha20_8x
.rva .LSEH_begin_ChaCha20_ctr32_avx2
.rva .LSEH_end_ChaCha20_ctr32_avx2
.rva .LSEH_info_ChaCha20_ctr32_avx2
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_ChaCha20_avx512
Expand All @@ -2748,22 +2720,22 @@ sub AVX512_lane_ROUND {
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_ChaCha20_ctr32:
.LSEH_info_ChaCha20_ctr32_nohw:
.byte 9,0,0,0
.rva se_handler

.LSEH_info_ChaCha20_ssse3:
.LSEH_info_ChaCha20_ctr32_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
.rva .Lssse3_body,.Lssse3_epilogue

.LSEH_info_ChaCha20_4x:
.LSEH_info_ChaCha20_ctr32_ssse3_4x:
.byte 9,0,0,0
.rva full_handler
.rva .L4x_body,.L4x_epilogue
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
.LSEH_info_ChaCha20_ctr32_avx2:
.byte 9,0,0,0
.rva full_handler
.rva .L8x_body,.L8x_epilogue # HandlerData[]
Expand Down
18 changes: 18 additions & 0 deletions crypto/chacha/chacha.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,24 @@ static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
ChaCha20_ctr32_neon(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
if (ChaCha20_ctr32_avx2_capable(in_len)) {
ChaCha20_ctr32_avx2(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_SSSE3_4X)
if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) {
ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter);
return;
}
#endif
#if defined(CHACHA20_ASM_SSSE3)
if (ChaCha20_ctr32_ssse3_capable(in_len)) {
ChaCha20_ctr32_ssse3(out, in, in_len, key, counter);
return;
}
#endif
if (in_len > 0) {
ChaCha20_ctr32_nohw(out, in, in_len, key, counter);
Expand Down
15 changes: 15 additions & 0 deletions crypto/chacha/chacha_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,21 @@ static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len,
CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
if (ChaCha20_ctr32_avx2_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_SSSE3_4X)
if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_SSSE3)
if (ChaCha20_ctr32_ssse3_capable(in_len)) {
CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter);
}
#endif
#if defined(CHACHA20_ASM_NOHW)
if (in_len > 0) {
CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter);
Expand Down
28 changes: 26 additions & 2 deletions crypto/chacha/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ extern "C" {
void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32],
const uint8_t nonce[16]);

#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)

#define CHACHA20_ASM

Expand All @@ -46,6 +45,31 @@ OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) {
}
void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define CHACHA20_ASM_NOHW

#define CHACHA20_ASM_AVX2
// ChaCha20_ctr32_avx2_capable returns one if |ChaCha20_ctr32_avx2| should be
// used for an input of |len| bytes and zero otherwise. The AVX2 code is only
// selected for inputs longer than 128 bytes, and only when
// |CRYPTO_is_AVX2_capable| reports support. The CPU query is skipped entirely
// for shorter inputs.
OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) {
  if (len <= 128) {
    return 0;
  }
  return CRYPTO_is_AVX2_capable() ? 1 : 0;
}
void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);

#define CHACHA20_ASM_SSSE3_4X
// ChaCha20_ctr32_ssse3_4x_capable returns one if |ChaCha20_ctr32_ssse3_4x|
// should be used for an input of |len| bytes and zero otherwise.
//
// Two conditions must both hold:
//   - the input is longer than 128 bytes and |CRYPTO_is_SSSE3_capable|
//     reports support; and
//   - either the input is longer than 192 bytes, or
//     |CRYPTO_cpu_perf_is_like_silvermont| returns zero. For inputs of 129 to
//     192 bytes on CPUs where that function returns non-zero, this path is
//     declined.
OPENSSL_INLINE int ChaCha20_ctr32_ssse3_4x_capable(size_t len) {
  // Both queries are evaluated up front, in this order, before combining.
  const int usable = (len > 128) ? (CRYPTO_is_SSSE3_capable() != 0) : 0;
  const int worthwhile =
      (len > 192) ? 1 : (CRYPTO_cpu_perf_is_like_silvermont() == 0);
  if (!usable) {
    return 0;
  }
  return worthwhile;
}
void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);

#define CHACHA20_ASM_SSSE3
// ChaCha20_ctr32_ssse3_capable returns one if |ChaCha20_ctr32_ssse3| should be
// used for an input of |len| bytes and zero otherwise. The SSSE3 code is only
// selected for inputs longer than 128 bytes, and only when
// |CRYPTO_is_SSSE3_capable| reports support.
//
// NOTE(review): the assembly dispatch this replaces appears to have branched
// to the SSSE3 path for any non-zero length; confirm that the 128-byte
// threshold here is intended.
OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) {
  if (len <= 128) {
    return 0;
  }
  return CRYPTO_is_SSSE3_capable() ? 1 : 0;
}
void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len,
const uint32_t key[8], const uint32_t counter[4]);
#endif

#if defined(CHACHA20_ASM)
Expand Down
6 changes: 4 additions & 2 deletions crypto/fipsmodule/cpucap/cpu_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,8 @@ void OPENSSL_cpuid_setup(void) {

// Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
// some Silvermont-specific codepaths which perform better. See OpenSSL
// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
// commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and
// |CRYPTO_cpu_perf_is_like_silvermont|.
if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
(eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
ecx &= ~(1u << 26);
Expand All @@ -267,7 +268,8 @@ void OPENSSL_cpuid_setup(void) {
// Clear AVX2 and AVX512* bits.
//
// TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
// doesn't clear those.
// doesn't clear those. See the comments in
// |CRYPTO_hardware_supports_XSAVE|.
extended_features[0] &=
~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
}
Expand Down
Loading
Loading