From 61945c1bf963739ebaa3992742b5efacd26b926b Mon Sep 17 00:00:00 2001
From: RealTimeChris <40668522+RealTimeChris@users.noreply.github.com>
Date: Sat, 23 Sep 2023 21:44:54 -0400
Subject: [PATCH] Enhancement: Making AudioMixer a member instead of invoking
 it statically to cut down CPU usage by roughly 50%.

Storing the mixer as a member, together with the array of floats used as an intermediary when converting to/from AVX registers, seems to cut CPU usage while streaming audio in my implementation by roughly 50%, because the array is no longer re-created on every function call.
---
 include/dpp/discordvoiceclient.h |  6 ++++++
 include/dpp/isa/avx.h            | 29 +++++++++--------------------
 include/dpp/isa/avx2.h           | 29 +++++++++--------------------
 include/dpp/isa/avx512.h         | 27 ++++++++-------------------
 src/dpp/discordvoiceclient.cpp   | 21 ++++++++++++---------
 5 files changed, 44 insertions(+), 68 deletions(-)
diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
index b9d6d51637..192b221830 100644
--- a/include/dpp/discordvoiceclient.h
+++ b/include/dpp/discordvoiceclient.h
@@ -38,6 +38,7 @@
 #include <dpp/dispatcher.h>
 #include <dpp/cluster.h>
 #include <dpp/discordevents.h>
+#include <dpp/isa_detection.h>
 #include <dpp/socket.h>
 #include <queue>
 #include <thread>
@@ -139,6 +140,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	time_t connect_time;
 
+	/**
+	 * @brief Mixer used to combine received voice data into the combined audio stream.
+	 */
+	audio_mixer mixer;
+
 	/**
 	 * @brief IP of UDP/RTP endpoint
 	 */
diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h
index 1ba794d712..1700ca7808 100644
--- a/include/dpp/isa/avx.h
+++ b/include/dpp/isa/avx.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
 				_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };
 
@@ -67,12 +67,13 @@ namespace dpp {
 		 * @param up_sampled_vector Pointer to the array of int32_t values.
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(16) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 128-bit AVX vector to a storage location.
@@ -80,9 +81,10 @@ namespace dpp {
 		 * @param values_to_store The 128-bit AVX vector containing values to store.
 		 * @param storage_location Pointer to the storage location.
 		 */
-		template<typename value_type> inline static void store_values(const avx_float& values_to_store, value_type* storage_location) {
+		template<typename value_type> inline void store_values(const avx_float& values_to_store, value_type* storage_location) {
+			_mm_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -92,24 +94,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_float gather_values(value_type* values) {
-			alignas(16) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 128-bit AVX register.
-		 * @param value The AVX register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-3).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx(const avx_float& value, int64_t index) {
-			alignas(16) float new_array[4]{};
-			_mm_store_ps(new_array, value);
-			return new_array[index];
+			return _mm_load_ps(values);
 		}
 	};
 
diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h
index de53274293..579025f246 100644
--- a/include/dpp/isa/avx2.h
+++ b/include/dpp/isa/avx2.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in),
 				_mm256_add_ps(_mm256_set1_ps(current_gain),
 					_mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) };
@@ -70,12 +70,13 @@ namespace dpp {
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 * @param x Index to select a specific set of elements to combine.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(32) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 256-bit AVX2 vector to a storage location.
@@ -83,9 +84,10 @@ namespace dpp {
 		 * @param values_to_store The 256-bit AVX2 vector containing values to store.
 		 * @param storage_location Pointer to the storage location.
 		 */
-		template<typename value_type> inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
+		template<typename value_type> inline void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
+			_mm256_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx_2(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -95,24 +97,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX2 register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
-			alignas(32) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_2_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm256_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 256-bit AVX2 register.
-		 * @param value The AVX2 register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-7).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) {
-			alignas(32) float new_array[byte_blocks_per_register]{};
-			_mm256_store_ps(new_array, value);
-			return new_array[index];
+			return _mm256_load_ps(values);
 		}
 	};
 
diff --git a/include/dpp/isa/avx512.h b/include/dpp/isa/avx512.h
index 3fa9b31096..2bdc3344e6 100644
--- a/include/dpp/isa/avx512.h
+++ b/include/dpp/isa/avx512.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in),
 				_mm512_add_ps(_mm512_set1_ps(current_gain),
 					_mm512_mul_ps(_mm512_set1_ps(increment),
@@ -73,12 +73,13 @@ namespace dpp {
 		 * @param up_sampled_vector Pointer to the array of int32_t values.
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(64) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 512-bit AVX512 vector to a storage location.
@@ -87,8 +88,9 @@ namespace dpp {
 		 * @param storage_location Pointer to the storage location.
 		 */
 		template<typename value_type> inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) {
+			_mm256_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx_512(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -98,24 +100,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX512 register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_512_float gather_values(value_type* values) {
-			alignas(64) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_512_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm512_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 512-bit AVX512 register.
-		 * @param value The AVX512 register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-15).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) {
-			alignas(64) float new_array[byte_blocks_per_register]{};
-			_mm512_store_ps(new_array, value);
-			return new_array[index];
+			return _mm512_load_ps(values);
 		}
 	};
 
diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp
index 2d0d766349..4cb14cfb1d 100644
--- a/src/dpp/discordvoiceclient.cpp
+++ b/src/dpp/discordvoiceclient.cpp
@@ -142,14 +142,17 @@ bool discord_voice_client::voice_payload::operator<(const voice_payload& other)
 }
 
 #ifdef HAVE_VOICE
-size_t audio_mix(discord_voice_client& client, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
+size_t audio_mix(discord_voice_client& client, audio_mixer& mixer, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
 	/* Mix the combined stream if combined audio is bound */
 	if (client.creator->on_voice_receive_combined.empty()) {
 		return 0;
 	}
+
 	/* We must upsample the data to 32 bits wide, otherwise we could overflow */
-	for (opus_int32 v = 0; v < samples * opus_channel_count / 16; ++v) {
-		audio_mixer::combine_samples(pcm_mix, pcm);
+	for (opus_int32 v = 0; v < samples * 2 / mixer.byte_blocks_per_register; ++v) {
+		mixer.combine_samples(pcm_mix, pcm);
+		pcm += mixer.byte_blocks_per_register;
+		pcm_mix += mixer.byte_blocks_per_register;
 	}
 	client.moving_average += park_count;
 	max_samples = (std::max)(samples, max_samples);
@@ -252,7 +255,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 						voice_receive_t vr(nullptr, "", &client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
 							samples * opus_channel_count * sizeof(opus_int16));
 
-						park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
+						park_count = audio_mix(client, client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
 						client.creator->on_voice_receive.call(vr);
 					}
 				} else {
@@ -266,7 +269,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 						vr.reassign(&client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
 							samples * opus_channel_count * sizeof(opus_int16));
 						client.end_gain = 1.0f / client.moving_average;
-						park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
+						park_count = audio_mix(client, client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
 						client.creator->on_voice_receive.call(vr);
 					}
 
@@ -281,10 +284,10 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 			/* Downsample the 32 bit samples back to 16 bit */
 			opus_int16 pcm_downsample[23040] = { 0 };
 			client.increment = (client.end_gain - client.current_gain) / static_cast<float>(samples);
-			for (int64_t x = 0; x < samples / audio_mixer::byte_blocks_per_register; ++x) {
-				audio_mixer::collect_single_register(pcm_mix + (x * audio_mixer::byte_blocks_per_register),
-					pcm_downsample + (x * audio_mixer::byte_blocks_per_register), client.current_gain, client.increment);
-				client.current_gain += client.increment * static_cast<float>(audio_mixer::byte_blocks_per_register);
+			for (int64_t x = 0; x < samples / client.mixer.byte_blocks_per_register; ++x) {
+				client.mixer.collect_single_register(pcm_mix + (x * client.mixer.byte_blocks_per_register),
+					pcm_downsample + (x * client.mixer.byte_blocks_per_register), client.current_gain, client.increment);
+				client.current_gain += client.increment * static_cast<float>(client.mixer.byte_blocks_per_register);
 			}
 
 			voice_receive_t vr(nullptr, "", &client, 0, reinterpret_cast<uint8_t*>(pcm_downsample),