Skip to content

Commit

Permalink
enable clang16 Arm/PPC runtime dispatch if opted-in. Refs #1782
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 579157014
  • Loading branch information
jan-wassenberg authored and copybara-github committed Nov 3, 2023
1 parent 183720e commit 3966eb0
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 35 deletions.
64 changes: 35 additions & 29 deletions hwy/contrib/sort/vqsort-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,40 @@
#include "hwy/contrib/sort/vqsort.h" // Fill16BytesSecure
#endif

namespace hwy {
namespace detail {

// Writes seed material for the sort's random generator to `bytes`, which is
// accessed as two uint64_t words (16 bytes). Unless VQSORT_ONLY_STATIC is
// set, Fill16BytesSecure() is tried first and its result is kept on success.
HWY_INLINE void Fill16BytesStatic(void* bytes) {
#if !VQSORT_ONLY_STATIC
  if (Fill16BytesSecure(bytes)) {
    return;
  }
#endif

  // "Nothing up my sleeve" constants mixed into the two output words.
  constexpr uint64_t kSalt0 = 0xFEDCBA98;
  constexpr uint64_t kSalt1 = 0x01234567;

  uint64_t* out = reinterpret_cast<uint64_t*>(bytes);

  // Static-only build, or Fill16BytesSecure failed: fall back to weaker
  // entropy taken from the address of a local variable (stack location), the
  // address of this function (code location), and the clock() timer.
  const uintptr_t stack_addr = reinterpret_cast<uintptr_t>(&out);
  void (*self)(void*) = &Fill16BytesStatic;
  const uintptr_t code_addr = reinterpret_cast<uintptr_t>(self);
  const uint64_t ticks = static_cast<uint64_t>(clock());

  out[0] = stack_addr ^ ticks ^ kSalt0;
  out[1] = code_addr ^ ticks ^ kSalt1;
}

// Returns a pointer to this thread's three-word generator state. The first
// call on each thread seeds the state via Fill16BytesStatic(); later calls
// return the same storage unchanged.
HWY_INLINE uint64_t* GetGeneratorStateStatic() {
  thread_local uint64_t tls_state[3] = {0};
  // The last word is a counter; it is zero only until the state is seeded.
  uint64_t& counter = tls_state[2];
  if (HWY_UNLIKELY(counter == 0)) {
    Fill16BytesStatic(tls_state);
    counter = 1;
  }
  return tls_state;
}

} // namespace detail
} // namespace hwy

#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

// Per-target
Expand Down Expand Up @@ -1754,34 +1788,6 @@ HWY_INLINE size_t CountAndReplaceNaN(D, Traits, T* HWY_RESTRICT, size_t) {
return 0;
}

// Writes seed material to `bytes`, which is accessed as two uint64_t words.
// Unless VQSORT_ONLY_STATIC is set, Fill16BytesSecure() is tried first and
// this function returns immediately if it reports success.
HWY_INLINE void Fill16BytesStatic(void* bytes) {
#if !VQSORT_ONLY_STATIC
if (Fill16BytesSecure(bytes)) return;
#endif

uint64_t* words = reinterpret_cast<uint64_t*>(bytes);

// Static-only, or Fill16BytesSecure failed. Get some entropy from the
// stack/code location, and the clock() timer.
// The stack location is the address of the local `words`; the code location
// is the address of this function.
uint64_t** seed_stack = &words;
void (*seed_code)(void*) = &Fill16BytesStatic;
const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
const uint64_t bits_time = static_cast<uint64_t>(clock());
// Both words mix in the same timer value but a different address and constant.
words[0] = bits_stack ^ bits_time ^ 0xFEDCBA98; // "Nothing up my sleeve"
words[1] = bits_code ^ bits_time ^ 0x01234567; // constants.
}

// Returns a pointer to this thread's three-word generator state, seeding it
// via Fill16BytesStatic() the first time it is called on a given thread.
HWY_INLINE uint64_t* GetGeneratorStateStatic() {
thread_local uint64_t state[3] = {0};
// This is a counter; zero indicates not yet initialized.
if (HWY_UNLIKELY(state[2] == 0)) {
Fill16BytesStatic(state);
// Mark as initialized so subsequent calls skip the seeding above.
state[2] = 1;
}
return state;
}

} // namespace detail

// Old interface with user-specified buffer, retained for compatibility. Called
Expand All @@ -1806,7 +1812,7 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,

#if VQSORT_ENABLED || HWY_IDE
if (!detail::HandleSpecialCases(d, st, keys, num, buf)) {
uint64_t* HWY_RESTRICT state = detail::GetGeneratorStateStatic();
uint64_t* HWY_RESTRICT state = hwy::detail::GetGeneratorStateStatic();
// Introspection: switch to worst-case N*logN heapsort after this many.
// Should never be reached, so computing log2 exactly does not help.
const size_t max_levels = 50;
Expand Down
4 changes: 1 addition & 3 deletions hwy/contrib/sort/vqsort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,6 @@ void Sorter::Fill24Bytes(const void*, size_t, void*) {}
// Forwards to the free function hwy::HaveFloat64().
bool Sorter::HaveFloat64() { return hwy::HaveFloat64(); }
// Empty constructor and Delete(). NOTE(review): presumably stubs for a build
// configuration in which Sorter owns no resources - confirm against the
// enclosing preprocessor condition, which is not visible here.
Sorter::Sorter() {}
void Sorter::Delete() {}
// Returns the pointer produced by detail::GetGeneratorStateStatic(), invoked
// through the HWY_STATIC_DISPATCH macro.
uint64_t* GetGeneratorState() {
return HWY_STATIC_DISPATCH(detail::GetGeneratorStateStatic());
}
// Returns the pointer produced by hwy::detail::GetGeneratorStateStatic(),
// which is called directly rather than through a dispatch macro.
uint64_t* GetGeneratorState() {
  return hwy::detail::GetGeneratorStateStatic();
}

} // namespace hwy
8 changes: 5 additions & 3 deletions hwy/detect_targets.h
Original file line number Diff line number Diff line change
Expand Up @@ -537,9 +537,11 @@
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm/PPC, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_COMPILER_GCC_ACTUAL && \
// On Arm/PPC, GCC and Clang 16+ do, and we require Linux to detect CPU
// capabilities. Clang additionally requires opting in by defining
// HWY_ENABLE_CLANG_ARM_DISPATCH, because its support is still experimental.
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && \
(HWY_COMPILER_GCC_ACTUAL || (HWY_COMPILER_CLANG >= 1600 && \
defined(HWY_ENABLE_CLANG_ARM_DISPATCH))) && \
HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
Expand Down

0 comments on commit 3966eb0

Please sign in to comment.