From 61945c1bf963739ebaa3992742b5efacd26b926b Mon Sep 17 00:00:00 2001 From: RealTimeChris <40668522+RealTimeChris@users.noreply.github.com> Date: Sat, 23 Sep 2023 21:44:54 -0400 Subject: [PATCH] Enhancement: Making AudioMixer a member instead of invoking it statically to cut down CPU usage by roughly 50%. Storing it along with the array of floats that is used as an intermediary for converting to/from AVX registers, as opposed to reallocating the array for every function call, seems to cut down CPU usage while streaming audio in my implementation by roughly 50%. --- include/dpp/discordvoiceclient.h | 6 ++++++ include/dpp/isa/avx.h | 29 +++++++++-------------------- include/dpp/isa/avx2.h | 29 +++++++++-------------------- include/dpp/isa/avx512.h | 27 ++++++++------------------- src/dpp/discordvoiceclient.cpp | 21 ++++++++++++--------- 5 files changed, 44 insertions(+), 68 deletions(-) diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h index b9d6d51637..192b221830 100644 --- a/include/dpp/discordvoiceclient.h +++ b/include/dpp/discordvoiceclient.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -139,6 +140,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client */ time_t connect_time; + /** + * @brief For mixing outgoing voice data. + */ + audio_mixer mixer; + /** * @brief IP of UDP/RTP endpoint */ diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h index 1ba794d712..1700ca7808 100644 --- a/include/dpp/isa/avx.h +++ b/include/dpp/isa/avx.h @@ -49,7 +49,7 @@ namespace dpp { * @param current_gain The gain to be applied to the elements. * @param increment The increment value to be added to each element. 
*/ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in), _mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) }; @@ -67,12 +67,13 @@ namespace dpp { * @param up_sampled_vector Pointer to the array of int32_t values. * @param decoded_data Pointer to the array of int16_t values. */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } protected: + alignas(16) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored. /** * @brief Stores values from a 128-bit AVX vector to a storage location. @@ -80,9 +81,10 @@ namespace dpp { * @param values_to_store The 128-bit AVX vector containing values to store. * @param storage_location Pointer to the storage location. */ - template inline static void store_values(const avx_float& values_to_store, value_type* storage_location) { + template inline void store_values(const avx_float& values_to_store, value_type* storage_location) { + _mm_store_ps(values, values_to_store); for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_float_from_avx(values_to_store, x)); + storage_location[x] = static_cast(values[x]); } } @@ -92,24 +94,11 @@ namespace dpp { * @tparam Indices Parameter pack of indices for gathering values. * @return An AVX register containing gathered values. 
*/ - template inline static avx_float gather_values(value_type* values) { - alignas(16) float new_array[byte_blocks_per_register]{}; + template inline avx_float gather_values(value_type* values_new) { for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); + values[x] = static_cast(values_new[x]); } - return _mm_load_ps(new_array); - } - - /** - * @brief Extracts a 32-bit integer from a 128-bit AVX register. - * @param value The AVX register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-3). - * @return The extracted 32-bit integer. - */ - inline static float extract_float_from_avx(const avx_float& value, int64_t index) { - alignas(16) float new_array[4]{}; - _mm_store_ps(new_array, value); - return new_array[index]; + return _mm_load_ps(values); } }; diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h index de53274293..579025f246 100644 --- a/include/dpp/isa/avx2.h +++ b/include/dpp/isa/avx2.h @@ -49,7 +49,7 @@ namespace dpp { * @param current_gain The gain to be applied to the elements. * @param increment The increment value to be added to each element. */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in), _mm256_add_ps(_mm256_set1_ps(current_gain), _mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) }; @@ -70,12 +70,13 @@ namespace dpp { * @param decoded_data Pointer to the array of int16_t values. * @param x Index to select a specific set of elements to combine. 
*/ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } protected: + alignas(32) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored. /** * @brief Stores values from a 256-bit AVX2 vector to a storage location. @@ -83,9 +84,10 @@ namespace dpp { * @param values_to_store The 256-bit AVX2 vector containing values to store. * @param storage_location Pointer to the storage location. */ - template inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) { + template inline void store_values(const avx_2_float& values_to_store, value_type* storage_location) { + _mm256_store_ps(values, values_to_store); for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_float_from_avx_2(values_to_store, x)); + storage_location[x] = static_cast(values[x]); } } @@ -95,24 +97,11 @@ namespace dpp { * @tparam Indices Parameter pack of indices for gathering values. * @return An AVX2 register containing gathered values. */ - template inline static avx_2_float gather_values(value_type* values) { - alignas(32) float new_array[byte_blocks_per_register]{}; + template inline avx_2_float gather_values(value_type* values_new) { for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); + values[x] = static_cast(values_new[x]); } - return _mm256_load_ps(new_array); - } - - /** - * @brief Extracts a 32-bit integer from a 256-bit AVX2 register. - * @param value The AVX2 register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-7). - * @return The extracted 32-bit integer. 
- */ - inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) { - alignas(32) float new_array[byte_blocks_per_register]{}; - _mm256_store_ps(new_array, value); - return new_array[index]; + return _mm256_load_ps(values); } }; diff --git a/include/dpp/isa/avx512.h b/include/dpp/isa/avx512.h index 3fa9b31096..2bdc3344e6 100644 --- a/include/dpp/isa/avx512.h +++ b/include/dpp/isa/avx512.h @@ -49,7 +49,7 @@ namespace dpp { * @param current_gain The gain to be applied to the elements. * @param increment The increment value to be added to each element. */ - inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { + inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) { avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in), _mm512_add_ps(_mm512_set1_ps(current_gain), _mm512_mul_ps(_mm512_set1_ps(increment), @@ -73,12 +73,13 @@ namespace dpp { * @param up_sampled_vector Pointer to the array of int32_t values. * @param decoded_data Pointer to the array of int16_t values. */ - inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { + inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) { auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) }; store_values(newValues, up_sampled_vector); } protected: + alignas(64) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored. /** * @brief Stores values from a 512-bit AVX512 vector to a storage location. @@ -87,8 +88,9 @@ namespace dpp { * @param storage_location Pointer to the storage location. 
*/ - template inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) { + template inline void store_values(const avx_512_float& values_to_store, value_type* storage_location) { + _mm512_store_ps(values, values_to_store); for (int64_t x = 0; x < byte_blocks_per_register; ++x) { - storage_location[x] = static_cast(extract_float_from_avx_512(values_to_store, x)); + storage_location[x] = static_cast(values[x]); } } @@ -98,24 +100,11 @@ namespace dpp { * @tparam Indices Parameter pack of indices for gathering values. * @return An AVX512 register containing gathered values. */ - template inline static avx_512_float gather_values(value_type* values) { - alignas(64) float new_array[byte_blocks_per_register]{}; + template inline avx_512_float gather_values(value_type* values_new) { for (uint64_t x = 0; x < byte_blocks_per_register; ++x) { - new_array[x] = static_cast(values[x]); + values[x] = static_cast(values_new[x]); } - return _mm512_load_ps(new_array); - } - - /** - * @brief Extracts a 32-bit integer from a 512-bit AVX512 register. - * @param value The AVX512 register containing packed 32-bit integers. - * @param index The index of the 32-bit integer to extract (0-15). - * @return The extracted 32-bit integer. 
- */ - inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) { - alignas(64) float new_array[byte_blocks_per_register]{}; - _mm512_store_ps(new_array, value); - return new_array[index]; + return _mm512_load_ps(values); } }; diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp index 2d0d766349..4cb14cfb1d 100644 --- a/src/dpp/discordvoiceclient.cpp +++ b/src/dpp/discordvoiceclient.cpp @@ -142,14 +142,17 @@ bool discord_voice_client::voice_payload::operator<(const voice_payload& other) } #ifdef HAVE_VOICE -size_t audio_mix(discord_voice_client& client, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) { +size_t audio_mix(discord_voice_client& client, audio_mixer& mixer, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) { /* Mix the combined stream if combined audio is bound */ if (client.creator->on_voice_receive_combined.empty()) { return 0; } + /* We must upsample the data to 32 bits wide, otherwise we could overflow */ - for (opus_int32 v = 0; v < samples * opus_channel_count / 16; ++v) { - audio_mixer::combine_samples(pcm_mix, pcm); + for (opus_int32 v = 0; v < samples * opus_channel_count / mixer.byte_blocks_per_register; ++v) { + mixer.combine_samples(pcm_mix, pcm); + pcm += mixer.byte_blocks_per_register; + pcm_mix += mixer.byte_blocks_per_register; } client.moving_average += park_count; max_samples = (std::max)(samples, max_samples); @@ -252,7 +255,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour voice_receive_t vr(nullptr, "", &client, d.user_id, reinterpret_cast(pcm), samples * opus_channel_count * sizeof(opus_int16)); - park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples); + park_count = audio_mix(client, client.mixer, pcm_mix, pcm, park_count, samples, max_samples); client.creator->on_voice_receive.call(vr); } } else { @@ -266,7 +269,7 @@ void 
discord_voice_client::voice_courier_loop(discord_voice_client& client, cour vr.reassign(&client, d.user_id, reinterpret_cast(pcm), samples * opus_channel_count * sizeof(opus_int16)); client.end_gain = 1.0f / client.moving_average; - park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples); + park_count = audio_mix(client, client.mixer, pcm_mix, pcm, park_count, samples, max_samples); client.creator->on_voice_receive.call(vr); } @@ -281,10 +284,10 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour /* Downsample the 32 bit samples back to 16 bit */ opus_int16 pcm_downsample[23040] = { 0 }; client.increment = (client.end_gain - client.current_gain) / static_cast(samples); - for (int64_t x = 0; x < samples / audio_mixer::byte_blocks_per_register; ++x) { - audio_mixer::collect_single_register(pcm_mix + (x * audio_mixer::byte_blocks_per_register), - pcm_downsample + (x * audio_mixer::byte_blocks_per_register), client.current_gain, client.increment); - client.current_gain += client.increment * static_cast(audio_mixer::byte_blocks_per_register); + for (int64_t x = 0; x < samples / client.mixer.byte_blocks_per_register; ++x) { + client.mixer.collect_single_register(pcm_mix + (x * client.mixer.byte_blocks_per_register), + pcm_downsample + (x * client.mixer.byte_blocks_per_register), client.current_gain, client.increment); + client.current_gain += client.increment * static_cast(client.mixer.byte_blocks_per_register); } voice_receive_t vr(nullptr, "", &client, 0, reinterpret_cast(pcm_downsample),