From 89d0997ce9812cdf05261144718965325c0159eb Mon Sep 17 00:00:00 2001 From: Craig Edwards Date: Tue, 19 Sep 2023 21:23:05 +0000 Subject: [PATCH] refactor: improve AVX, make detection a little nicer split out isa into four separate files, rather than one long snakepit of #ifdef pass flags properly to caller, and have the caller set the compile flags and define set the define as a fixed name with a value 0, 1, 2 or 512, instead of four different defines --- cmake/DetectArchitecture.cmake | 30 +-- include/dpp/isa/avx.h | 136 ++++++++++++ include/dpp/isa/avx2.h | 151 +++++++++++++ include/dpp/isa/avx512.h | 129 +++++++++++ include/dpp/isa/fallback.h | 81 +++++++ include/dpp/isa_detection.h | 389 +-------------------------------- include/dpp/utility.h | 30 +++ library-vcpkg/CMakeLists.txt | 8 +- library/CMakeLists.txt | 8 +- src/dpp/utility.cpp | 13 ++ 10 files changed, 576 insertions(+), 399 deletions(-) create mode 100644 include/dpp/isa/avx.h create mode 100644 include/dpp/isa/avx2.h create mode 100644 include/dpp/isa/avx512.h create mode 100644 include/dpp/isa/fallback.h diff --git a/cmake/DetectArchitecture.cmake b/cmake/DetectArchitecture.cmake index 45a4f13ac2..1815324630 100644 --- a/cmake/DetectArchitecture.cmake +++ b/cmake/DetectArchitecture.cmake @@ -17,7 +17,6 @@ function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUC if(${INSTRUCTION_SET_NAME}) set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE) set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE) - set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE) else() return() endif() @@ -25,21 +24,23 @@ endfunction() if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") set(INSTRUCTION_SETS - "T_AVX?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)" - "T_AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)" - "T_AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)" + "AVX1?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)" + "AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)" + "AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)" ) else() set(INSTRUCTION_SETS - "T_AVX?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)" - "T_AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)" - "T_AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)" + "AVX1?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)" + "AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)" + "AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)" ) endif() set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}") -set(AVX_NAME "T_fallback") +set(AVX_TYPE "AVX0") +set(AVX_TYPE "AVX0" PARENT_SCOPE) +set(AVX_FLAGS "" PARENT_SCOPE) # This is only supported on x86/x64, it is completely skipped and forced to T_fallback anywhere else if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64")) @@ -54,11 +55,14 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} M check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}") endforeach() - string(REPLACE "T_" "" AVX_DISPLAY ${AVX_NAME}) - message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} SSE type: ${AVX_DISPLAY}") + message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} AVX type: ${AVX_TYPE} (FLAGS: ${AVX_FLAG})") + set(AVX_TYPE ${AVX_TYPE}) + set(AVX_TYPE ${AVX_TYPE} PARENT_SCOPE) + set(AVX_FLAG ${AVX_FLAG} PARENT_SCOPE) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}") else() - message(STATUS "SSE not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_NAME}") - set(AVX_NAME "T_fallback") - set(AVX_TYPE "T_fallback") + message(STATUS "AVX not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_TYPE}") + set(AVX_TYPE "AVX0") + set(AVX_FLAG "" PARENT_SCOPE) + set(AVX_TYPE "AVX0" PARENT_SCOPE) endif() diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h new file mode 100644 index 0000000000..9f1d8b025e --- /dev/null +++ b/include/dpp/isa/avx.h @@ -0,0 +1,136 @@ +/************************************************************************************ + * + * D++, A Lightweight C++ library for Discord + * + * Copyright 2021 Craig Edwards and D++ contributors + * (https://github.com/brainboxdotcc/DPP/graphs/contributors) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ************************************************************************************/ +#pragma once + +#if defined _MSC_VER || defined __GNUC__ || defined __clang__ + +#include + +#ifdef max + #undef max +#endif +#ifdef min + #undef min +#endif + +namespace dpp { + + using avx_float = __m128; + using avx_int = __m128i; + + /* + * @brief Extracts a 32-bit integer from a 128-bit AVX register. + * @param value The AVX register containing packed 32-bit integers. + * @param index The index of the 32-bit integer to extract (0-3). + * @return The extracted 32-bit integer. + */ + inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) { + switch (index) { + case 0: { + return _mm_extract_epi32(value, 0); + } + case 1: { + return _mm_extract_epi32(value, 1); + } + case 2: { + return _mm_extract_epi32(value, 2); + } + case 3: { + return _mm_extract_epi32(value, 3); + } + default: { + return _mm_extract_epi32(value, 0); + } + } + } + + /** + * @brief A class for audio mixing operations using AVX instructions. + */ + class audio_mixer { + public: + /* + * @brief The number of 32-bit values per CPU register. + */ + inline static constexpr int32_t byte_blocks_per_register{ 4 }; + + /* + * @brief Stores values from a 128-bit AVX vector to a storage location. + * @tparam value_type The target value type for storage. + * @param values_to_store The 128-bit AVX vector containing values to store. + * @param storage_location Pointer to the storage location. + */ + template inline static void store_values(const avx_int& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_int32_from_avx(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX register containing gathered values. + */ + template inline static avx_float gather_values(value_type* values) { + alignas(16) float new_array[byte_blocks_per_register]{}; + for (size_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm_load_ps(new_array); + } + + /** + * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. + * This version uses AVX instructions. + * + * @param data_in Pointer to the input array of int32_t values. + * @param data_out Pointer to the output array of int16_t values. + * @param current_gain The gain to be applied to the elements. + * @param increment The increment value to be added to each element. + */ + inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in), + _mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) }; + + current_samples_new = _mm_blendv_ps(_mm_max_ps(current_samples_new, _mm_set1_ps(static_cast(std::numeric_limits::min()))), + _mm_min_ps(current_samples_new, _mm_set1_ps(static_cast(std::numeric_limits::max()))), + _mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ)); + + store_values(_mm_cvtps_epi32(current_samples_new), data_out); + } + + /** + * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. + * This version uses AVX instructions. + * + * @param up_sampled_vector Pointer to the array of int32_t values. + * @param decoded_data Pointer to the array of int16_t values. + */ + inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + store_values(newValues, up_sampled_vector); + } + }; + +} // namespace dpp + +#endif \ No newline at end of file diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h new file mode 100644 index 0000000000..8f89cb9509 --- /dev/null +++ b/include/dpp/isa/avx2.h @@ -0,0 +1,151 @@ +/************************************************************************************ + * + * D++, A Lightweight C++ library for Discord + * + * Copyright 2021 Craig Edwards and D++ contributors + * (https://github.com/brainboxdotcc/DPP/graphs/contributors) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ************************************************************************************/ +#pragma once + +#if defined _MSC_VER || defined __GNUC__ || defined __clang__ + +#include + +#ifdef max + #undef max +#endif +#ifdef min + #undef min +#endif + +namespace dpp { + + using avx_2_float = __m256; + using avx_2_int = __m256i; + + /* + * @brief Extracts a 32-bit integer from a 256-bit AVX2 register. + * @param value The AVX2 register containing packed 32-bit integers. + * @param index The index of the 32bit integer to extract (0-7). + * @return The extracted 32-bit integer. + */ + inline int32_t extract_int32_from_avx2(const avx_2_int& value, int64_t index) { + switch (index) { + case 0: { + return _mm256_extract_epi32(value, 0); + } + case 1: { + return _mm256_extract_epi32(value, 1); + } + case 2: { + return _mm256_extract_epi32(value, 2); + } + case 3: { + return _mm256_extract_epi32(value, 3); + } + case 4: { + return _mm256_extract_epi32(value, 4); + } + case 5: { + return _mm256_extract_epi32(value, 5); + } + case 6: { + return _mm256_extract_epi32(value, 6); + } + case 7: { + return _mm256_extract_epi32(value, 7); + } + default: { + return _mm256_extract_epi32(value, 0); + } + } + } + + /** + * @brief A class for audio mixing operations using AVX2 instructions. + */ + class audio_mixer { + public: + /* + * @brief The number of 32-bit values per CPU register. + */ + inline static constexpr int32_t byte_blocks_per_register{ 8 }; + + /* + * @brief Stores values from a 256-bit AVX2 vector to a storage location. + * @tparam value_type The target value type for storage. + * @param values_to_store The 256-bit AVX2 vector containing values to store. + * @param storage_location Pointer to the storage location. + */ + template inline static void store_values(const avx_2_int& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_int32_from_avx2(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX2 register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX2 register containing gathered values. + */ + template inline static avx_2_float gather_values(value_type* values) { + alignas(32) float new_array[byte_blocks_per_register]{}; + for (size_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm256_load_ps(new_array); + } + + /** + * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. + * This version uses AVX2 instructions. + * + * @param data_in Pointer to the input array of int32_t values. + * @param data_out Pointer to the output array of int16_t values. + * @param current_gain The gain to be applied to the elements. + * @param increment The increment value to be added to each element. + */ + inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in), + _mm256_add_ps(_mm256_set1_ps(current_gain), + _mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) }; + + current_samples_new = + _mm256_blendv_ps(_mm256_max_ps(current_samples_new, _mm256_set1_ps(static_cast(std::numeric_limits::min()))), + _mm256_min_ps(current_samples_new, _mm256_set1_ps(static_cast(std::numeric_limits::max()))), + _mm256_cmp_ps(current_samples_new, _mm256_set1_ps(0.0f), _CMP_GE_OQ)); + + store_values(_mm256_cvtps_epi32(current_samples_new), data_out); + } + + /** + * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. + * This version uses AVX2 instructions. + * + * @param up_sampled_vector Pointer to the array of int32_t values. + * @param decoded_data Pointer to the array of int16_t values. + * @param x Index to select a specific set of elements to combine. + */ + inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + auto newValues{ _mm256_cvtps_epi32(_mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + store_values(newValues, up_sampled_vector); + } + }; + +} // namespace dpp + +#endif \ No newline at end of file diff --git a/include/dpp/isa/avx512.h b/include/dpp/isa/avx512.h new file mode 100644 index 0000000000..b691242b9f --- /dev/null +++ b/include/dpp/isa/avx512.h @@ -0,0 +1,129 @@ +/************************************************************************************ + * + * D++, A Lightweight C++ library for Discord + * + * Copyright 2021 Craig Edwards and D++ contributors + * (https://github.com/brainboxdotcc/DPP/graphs/contributors) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ************************************************************************************/ +#pragma once + +#if defined _MSC_VER || defined __GNUC__ || defined __clang__ + +#include + +#ifdef max + #undef max +#endif +#ifdef min + #undef min +#endif + + +namespace dpp { + + using avx_512_float = __m512; + using avx_512_int = __m512i; + + /* + * @brief Extracts a 32-bit integer from a 512-bit AVX-512 register. + * @param value The AVX-512 register containing packed 32-bit integers. + * @param index The index of the 32-bit integer to extract (0-15). + * @return The extracted 32-bit integer. + */ + inline int32_t extract_int32_from_avx512(const avx_512_int& value, int64_t index) { + alignas(64) int32_t result[32]; + _mm512_store_si512(result, value); + return result[index]; + } + + /** + * @brief A class for audio mixing operations using AVX512 instructions. + */ + class audio_mixer { + public: + /* + * @brief The number of 32-bit values per CPU register. + */ + inline static constexpr int32_t byte_blocks_per_register{ 16 }; + + /* + * @brief Stores values from a 512-bit AVX512 vector to a storage location. + * @tparam value_type The target value type for storage. + * @param values_to_store The 512-bit AVX512 vector containing values to store. + * @param storage_location Pointer to the storage location. + */ + template inline static void store_values(const avx_512_int& values_to_store, value_type* storage_location) { + for (int64_t x = 0; x < byte_blocks_per_register; ++x) { + storage_location[x] = static_cast(extract_int32_from_avx512(values_to_store, x)); + } + } + + /** + * @brief Specialization for gathering non-float values into an AVX512 register. + * @tparam value_type The type of values being gathered. + * @tparam Indices Parameter pack of indices for gathering values. + * @return An AVX512 register containing gathered values. + */ + template inline static avx_512_float gather_values(value_type* values) { + alignas(64) float new_array[byte_blocks_per_register]{}; + for (size_t x = 0; x < byte_blocks_per_register; ++x) { + new_array[x] = static_cast(values[x]); + } + return _mm512_load_ps(new_array); + } + + /** + * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. + * This version uses AVX512 instructions. + * + * @param data_in Pointer to the input array of int32_t values. + * @param data_out Pointer to the output array of int16_t values. + * @param current_gain The gain to be applied to the elements. + * @param increment The increment value to be added to each element. + */ + inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in), + _mm512_add_ps(_mm512_set1_ps(current_gain), + _mm512_mul_ps(_mm512_set1_ps(increment), + _mm512_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)))) }; + + __m512 lower_limit = _mm512_set1_ps(static_cast(std::numeric_limits::min())); + __m512 upper_limit = _mm512_set1_ps(static_cast(std::numeric_limits::max())); + + __mmask16 mask_ge = _mm512_cmp_ps_mask(current_samples_new, _mm512_set1_ps(0.0f), _CMP_GE_OQ); + + current_samples_new = _mm512_mask_max_ps(current_samples_new, mask_ge, current_samples_new, lower_limit); + current_samples_new = _mm512_mask_min_ps(current_samples_new, ~mask_ge, current_samples_new, upper_limit); + + store_values(_mm512_cvtps_epi32(current_samples_new), data_out); + } + + /** + * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. + * This version uses AVX512 instructions. + * + * @param up_sampled_vector Pointer to the array of int32_t values. + * @param decoded_data Pointer to the array of int16_t values. + */ + inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + auto newValues{ _mm512_cvtps_epi32(_mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; + store_values(newValues, up_sampled_vector); + } + }; + +} // namespace dpp + +#endif \ No newline at end of file diff --git a/include/dpp/isa/fallback.h b/include/dpp/isa/fallback.h new file mode 100644 index 0000000000..2ce44c4464 --- /dev/null +++ b/include/dpp/isa/fallback.h @@ -0,0 +1,81 @@ +/************************************************************************************ + * + * D++, A Lightweight C++ library for Discord + * + * Copyright 2021 Craig Edwards and D++ contributors + * (https://github.com/brainboxdotcc/DPP/graphs/contributors) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ************************************************************************************/ +#pragma once + +#ifdef max + #undef max +#endif +#ifdef min + #undef min +#endif + +namespace dpp { + + /** + * @brief A class for audio mixing operations using x64 instructions. + */ + class audio_mixer { + public: + /* + * @brief The number of 32-bit values per CPU register. + */ + inline static constexpr int32_t byte_blocks_per_register{ 2 }; + + /** + * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. + * This version uses x64 instructions. + * + * @param data_in Pointer to the input array of int32_t values. + * @param data_out Pointer to the output array of int16_t values. + * @param current_gain The gain to be applied to the elements. + * @param increment The increment value to be added to each element. + */ + inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { + auto increment_new = increment * x; + auto current_gain_new = current_gain + increment_new; + auto current_sample_new = data_in[x] * current_gain_new; + if (current_sample_new >= std::numeric_limits::max()) { + current_sample_new = std::numeric_limits::max(); + } + else if (current_sample_new <= std::numeric_limits::min()) { + current_sample_new = std::numeric_limits::min(); + } + data_out[x] = static_cast(current_sample_new); + } + } + + /** + * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. + * This version uses x64 instructions. + * + * @param up_sampled_vector Pointer to the array of int32_t values. + * @param decoded_data Pointer to the array of int16_t values. + */ + inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { + up_sampled_vector[x] += static_cast(decoded_data[x]); + } + + } + }; + +} // namespace dpp diff --git a/include/dpp/isa_detection.h b/include/dpp/isa_detection.h index 0c5fe7b398..2f7925efc5 100644 --- a/include/dpp/isa_detection.h +++ b/include/dpp/isa_detection.h @@ -20,385 +20,12 @@ ************************************************************************************/ #pragma once -#if defined _MSC_VER || defined __GNUC__ || defined __clang__ - - /* Sanity check for cases of broken detection */ - #if !defined(__i386__) && !defined(__x86_64__) && !defined(T_fallback) - #define T_fallback 1 - #endif - - #ifndef T_fallback - #include - - using avx_512_float = __m512; - using avx_512_int = __m512i; - using avx_2_float = __m256; - using avx_2_int = __m256i; - using avx_float = __m128; - using avx_int = __m128i; - - /* - * @brief Extracts a 32-bit integer from a 128-bit AVX register. - * @param value The AVX register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-3). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) { - switch (index) { - case 0: { - return _mm_extract_epi32(value, 0); - } - case 1: { - return _mm_extract_epi32(value, 1); - } - case 2: { - return _mm_extract_epi32(value, 2); - } - case 3: { - return _mm_extract_epi32(value, 3); - } - default: { - return _mm_extract_epi32(value, 0); - } - } - } - - /* - * @brief Extracts a 32-bit integer from a 256-bit AVX2 register. - * @param value The AVX2 register containing packed 32-bit integers. - * @param index The index of the 32bit integer to extract (0-7). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx2(const avx_2_int& value, int64_t index) { - switch (index) { - case 0: { - return _mm256_extract_epi32(value, 0); - } - case 1: { - return _mm256_extract_epi32(value, 1); - } - case 2: { - return _mm256_extract_epi32(value, 2); - } - case 3: { - return _mm256_extract_epi32(value, 3); - } - case 4: { - return _mm256_extract_epi32(value, 4); - } - case 5: { - return _mm256_extract_epi32(value, 5); - } - case 6: { - return _mm256_extract_epi32(value, 6); - } - case 7: { - return _mm256_extract_epi32(value, 7); - } - default: { - return _mm256_extract_epi32(value, 0); - } - } - } - - /* - * @brief Extracts a 32-bit integer from a 512-bit AVX-512 register. - * @param value The AVX-512 register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-15). - * @return The extracted 32-bit integer. - */ - inline int32_t extract_int32_from_avx512(const avx_512_int& value, int64_t index) { - alignas(64) int32_t result[32]; - _mm512_store_si512(result, value); - return result[index]; - } - #endif +#if AVX_TYPE == 512 + #include "isa/avx512.h" +#elif AVX_TYPE == 2 + #include "isa/avx2.h" +#elif AVX_TYPE == 1 + #include "isa/avx.h" +#else + #include "isa/fallback.h" #endif - -#ifdef max - #undef max -#endif -#ifdef min - #undef min -#endif - -namespace dpp { - -#ifdef T_AVX512 - - /** - * @brief A class for audio mixing operations using AVX512 instructions. - */ - class audio_mixer { - public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 16 }; - - /* - * @brief Stores values from a 512-bit AVX512 vector to a storage location. - * @tparam value_type The target value type for storage. - * @param values_to_store The 512-bit AVX512 vector containing values to store. - * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_512_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx512(values_to_store, x)); - } - } - - /** - * @brief Specialization for gathering non-float values into an AVX512 register. - * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX512 register containing gathered values. - */ - template inline static avx_512_float gather_values(value_type* values) { - alignas(64) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm512_load_ps(new_array); - } - - /** - * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. - * This version uses AVX512 instructions. - * - * @param data_in Pointer to the input array of int32_t values. - * @param data_out Pointer to the output array of int16_t values. - * @param current_gain The gain to be applied to the elements. - * @param increment The increment value to be added to each element. - */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { - avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in), - _mm512_add_ps(_mm512_set1_ps(current_gain), - _mm512_mul_ps(_mm512_set1_ps(increment), - _mm512_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)))) }; - - __m512 lower_limit = _mm512_set1_ps(static_cast(std::numeric_limits::min())); - __m512 upper_limit = _mm512_set1_ps(static_cast(std::numeric_limits::max())); - - __mmask16 mask_ge = _mm512_cmp_ps_mask(current_samples_new, _mm512_set1_ps(0.0f), _CMP_GE_OQ); - - current_samples_new = _mm512_mask_max_ps(current_samples_new, mask_ge, current_samples_new, lower_limit); - current_samples_new = _mm512_mask_min_ps(current_samples_new, ~mask_ge, current_samples_new, upper_limit); - - store_values(_mm512_cvtps_epi32(current_samples_new), data_out); - } - - /** - * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. - * This version uses AVX512 instructions. - * - * @param up_sampled_vector Pointer to the array of int32_t values. - * @param decoded_data Pointer to the array of int16_t values. - */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm512_cvtps_epi32(_mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; - store_values(newValues, up_sampled_vector); - } - }; - -#elif T_AVX2 - - /** - * @brief A class for audio mixing operations using AVX2 instructions. - */ - class audio_mixer { - public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 8 }; - - /* - * @brief Stores values from a 256-bit AVX2 vector to a storage location. - * @tparam value_type The target value type for storage. - * @param values_to_store The 256-bit AVX2 vector containing values to store. - * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_2_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx2(values_to_store, x)); - } - } - - /** - * @brief Specialization for gathering non-float values into an AVX2 register. - * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX2 register containing gathered values. - */ - template inline static avx_2_float gather_values(value_type* values) { - alignas(32) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm256_load_ps(new_array); - } - - /** - * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. - * This version uses AVX2 instructions. - * - * @param data_in Pointer to the input array of int32_t values. - * @param data_out Pointer to the output array of int16_t values. - * @param current_gain The gain to be applied to the elements. - * @param increment The increment value to be added to each element. - */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { - avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in), - _mm256_add_ps(_mm256_set1_ps(current_gain), - _mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) }; - - current_samples_new = - _mm256_blendv_ps(_mm256_max_ps(current_samples_new, _mm256_set1_ps(static_cast(std::numeric_limits::min()))), - _mm256_min_ps(current_samples_new, _mm256_set1_ps(static_cast(std::numeric_limits::max()))), - _mm256_cmp_ps(current_samples_new, _mm256_set1_ps(0.0f), _CMP_GE_OQ)); - - store_values(_mm256_cvtps_epi32(current_samples_new), data_out); - } - - /** - * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. - * This version uses AVX2 instructions. - * - * @param up_sampled_vector Pointer to the array of int32_t values. - * @param decoded_data Pointer to the array of int16_t values. - * @param x Index to select a specific set of elements to combine. - */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm256_cvtps_epi32(_mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; - store_values(newValues, up_sampled_vector); - } - }; - -#elif T_AVX - - /** - * @brief A class for audio mixing operations using AVX instructions. - */ - class audio_mixer { - public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 4 }; - - /* - * @brief Stores values from a 128-bit AVX vector to a storage location. - * @tparam value_type The target value type for storage. - * @param values_to_store The 128-bit AVX vector containing values to store. - * @param storage_location Pointer to the storage location. - */ - template inline static void store_values(const avx_int& values_to_store, value_type* storage_location) { - for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_int32_from_avx(values_to_store, x)); - } - } - - /** - * @brief Specialization for gathering non-float values into an AVX register. - * @tparam value_type The type of values being gathered. - * @tparam Indices Parameter pack of indices for gathering values. - * @return An AVX register containing gathered values. - */ - template inline static avx_float gather_values(value_type* values) { - alignas(16) float new_array[byte_blocks_per_register]{}; - for (size_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); - } - return _mm_load_ps(new_array); - } - - /** - * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. - * This version uses AVX instructions. - * - * @param data_in Pointer to the input array of int32_t values. - * @param data_out Pointer to the output array of int16_t values. - * @param current_gain The gain to be applied to the elements. - * @param increment The increment value to be added to each element. - */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { - avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in), - _mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) }; - - current_samples_new = _mm_blendv_ps(_mm_max_ps(current_samples_new, _mm_set1_ps(static_cast(std::numeric_limits::min()))), - _mm_min_ps(current_samples_new, _mm_set1_ps(static_cast(std::numeric_limits::max()))), - _mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ)); - - store_values(_mm_cvtps_epi32(current_samples_new), data_out); - } - - /** - * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. - * This version uses AVX instructions. - * - * @param up_sampled_vector Pointer to the array of int32_t values. - * @param decoded_data Pointer to the array of int16_t values. - */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) }; - store_values(newValues, up_sampled_vector); - } - }; - -#else - - /** - * @brief A class for audio mixing operations using x64 instructions. - */ - class audio_mixer { - public: - /* - * @brief The number of 32-bit values per CPU register. - */ - inline static constexpr int32_t byte_blocks_per_register{ 2 }; - - /** - * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out. - * This version uses x64 instructions. - * - * @param data_in Pointer to the input array of int32_t values. - * @param data_out Pointer to the output array of int16_t values. - * @param current_gain The gain to be applied to the elements. - * @param increment The increment value to be added to each element. - */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { - for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { - auto increment_new = increment * x; - auto current_gain_new = current_gain + increment_new; - auto current_sample_new = data_in[x] * current_gain_new; - if (current_sample_new >= std::numeric_limits::max()) { - current_sample_new = std::numeric_limits::max(); - } - else if (current_sample_new <= std::numeric_limits::min()) { - current_sample_new = std::numeric_limits::min(); - } - data_out[x] = static_cast(current_sample_new); - } - } - - /** - * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector. - * This version uses x64 instructions. - * - * @param up_sampled_vector Pointer to the array of int32_t values. - * @param decoded_data Pointer to the array of int16_t values. - */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { - for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { - up_sampled_vector[x] += static_cast(decoded_data[x]); - } - - } - }; - -/**@}*/ - -#endif - -} // namespace dpp diff --git a/include/dpp/utility.h b/include/dpp/utility.h index 487a90d1de..ad2e632da0 100644 --- a/include/dpp/utility.h +++ b/include/dpp/utility.h @@ -89,6 +89,28 @@ namespace dpp { */ std::string DPP_EXPORT cdn_endpoint_url_sticker(snowflake sticker_id, sticker_format format); + /** + * @brief Supported AVX instruction set type for audio mixing + */ + enum avx_type_t : uint8_t { + /** + * @brief No AVX Support + */ + avx_none, + /** + * @brief AVX support + */ + avx_1, + /** + * @brief AVX2 support + */ + avx_2, + /** + * @brief AVX512 support + */ + avx_512, + }; + /** * @brief Timestamp formats for dpp::utility::timestamp() * @@ -281,6 +303,14 @@ namespace dpp { */ bool DPP_EXPORT has_voice(); + /** + * @brief Returns an enum value indicating which AVX instruction + * set is used for mixing received voice data, if any + * + * @return avx_type_t AVX type + */ + avx_type_t DPP_EXPORT voice_avx(); + /** * @brief Returns true if D++ was built with coroutine support * diff --git a/library-vcpkg/CMakeLists.txt b/library-vcpkg/CMakeLists.txt index a06ecf8163..ca6c43ebea 100644 --- a/library-vcpkg/CMakeLists.txt +++ b/library-vcpkg/CMakeLists.txt @@ -11,9 +11,15 @@ endif() add_library("${PROJECT_NAME}::${LIB_NAME}" ALIAS "${LIB_NAME}") -if(NOT DEFINED AVX_TYPE) +if(${AVX_TYPE} STREQUAL "OFF") include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/DetectArchitecture.cmake") + message("--- AVX type: ${AVX_TYPE}") +else() + message("-- AVX type overridden by configuration: ${AVX_TYPE}") endif() +STRING(REPLACE "AVX" "" AVX_TYPE ${AVX_TYPE}) +add_compile_definitions(AVX_TYPE=${AVX_TYPE}) +add_compile_options(${AVX_FLAG}) target_compile_definitions( "${LIB_NAME}" PUBLIC diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index a8ce71b0fd..00839b0e15 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -24,13 +24,13 @@ add_compile_definitions(DPP_OS=${CMAKE_SYSTEM_NAME}) if(${AVX_TYPE} STREQUAL "OFF") include("${CMAKE_CURRENT_SOURCE_DIR}/../cmake/DetectArchitecture.cmake") - add_compile_options("${AVX_FLAG}") - add_compile_definitions("${AVX_NAME}") - message("--- AVX type: ${AVX_NAME}") + message("--- AVX type: ${AVX_TYPE}") else() message("-- AVX type overridden by configuration: ${AVX_TYPE}") - add_compile_definitions("${AVX_TYPE}") endif() +STRING(REPLACE "AVX" "" AVX_TYPE ${AVX_TYPE}) +add_compile_definitions(AVX_TYPE=${AVX_TYPE}) +add_compile_options(${AVX_FLAG}) if(WIN32 AND NOT MINGW) if (NOT WINDOWS_32_BIT) diff --git a/src/dpp/utility.cpp b/src/dpp/utility.cpp index cd922b1c46..8a28201c33 100644 --- a/src/dpp/utility.cpp +++ b/src/dpp/utility.cpp @@ -108,6 +108,19 @@ namespace dpp { #endif } + avx_type_t voice_avx() { +#if AVX_TYPE == 512 + return avx_512; +#elif AVX_TYPE == 2 + return avx_2; +#elif AVX_TYPE == 1 + return avx_1; +#else + return avx_none; +#endif + + } + bool is_coro_enabled() { #ifdef DPP_CORO return true;