Enhancement: Making AudioMixer a member instead of invoking it static…

…ally to cut down CPU usage by roughly 50%. Storing it along with the array of floats that is used as an intermediary for converting to/from AVX registers, as opposed to reallocating the array for every function call, seems to cut down CPU usage while streaming audio in my implementation by roughly 50%.
brainboxdotcc · Sep 24, 2023 · ef81cf5 · ef81cf5
1 parent aa34a0e
commit ef81cf5
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 68 deletions.
diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
@@ -55,6 +55,8 @@ struct OpusRepacketizer;
 
 namespace dpp {
 
+class audio_mixer;
+
 // !TODO: change these to constexpr and rename every occurrence across the codebase
 #define AUDIO_TRACK_MARKER (uint16_t)0xFFFF
 
@@ -139,6 +141,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	time_t connect_time;
 
+	/**
+	 * @brief Audio mixer owned by this client; the voice courier loop uses it to mix received voice data into the combined stream.
+	 */
+	std::unique_ptr<audio_mixer> mixer;
+
 	/**
 	 * @brief IP of UDP/RTP endpoint
 	 */

diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
 				_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };
 
@@ -67,22 +67,24 @@ namespace dpp {
 		 * @param up_sampled_vector Pointer to the array of int32_t values.
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(16) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 128-bit AVX vector to a storage location.
 		 * @tparam value_type The target value type for storage.
 		 * @param values_to_store The 128-bit AVX vector containing values to store.
 		 * @param storage_location Pointer to the storage location.
 		 */
-		template<typename value_type> inline static void store_values(const avx_float& values_to_store, value_type* storage_location) {
+		template<typename value_type> inline void store_values(const avx_float& values_to_store, value_type* storage_location) {
+			_mm_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -92,24 +94,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_float gather_values(value_type* values) {
-			alignas(16) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 128-bit AVX register.
-		 * @param value The AVX register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-3).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx(const avx_float& value, int64_t index) {
-			alignas(16) float new_array[4]{};
-			_mm_store_ps(new_array, value);
-			return new_array[index];
+			return _mm_load_ps(values);
 		}
 	};
 

diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in),
 				_mm256_add_ps(_mm256_set1_ps(current_gain),
 					_mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) };
@@ -70,22 +70,24 @@ namespace dpp {
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 * @param x Index to select a specific set of elements to combine.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(32) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 256-bit AVX2 vector to a storage location.
 		 * @tparam value_type The target value type for storage.
 		 * @param values_to_store The 256-bit AVX2 vector containing values to store.
 		 * @param storage_location Pointer to the storage location.
 		 */
-		template<typename value_type> inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
+		template<typename value_type> inline void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
+			_mm256_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx_2(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -95,24 +97,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX2 register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
-			alignas(32) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_2_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm256_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 256-bit AVX2 register.
-		 * @param value The AVX2 register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-7).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) {
-			alignas(32) float new_array[byte_blocks_per_register]{};
-			_mm256_store_ps(new_array, value);
-			return new_array[index];
+			return _mm256_load_ps(values);
 		}
 	};
 

diff --git a/include/dpp/isa/avx512.h b/include/dpp/isa/avx512.h
@@ -49,7 +49,7 @@ namespace dpp {
 		 * @param current_gain The gain to be applied to the elements.
 		 * @param increment The increment value to be added to each element.
 		 */
-		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
 			avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in),
 				_mm512_add_ps(_mm512_set1_ps(current_gain),
 					_mm512_mul_ps(_mm512_set1_ps(increment),
@@ -73,12 +73,13 @@ namespace dpp {
 		 * @param up_sampled_vector Pointer to the array of int32_t values.
 		 * @param decoded_data Pointer to the array of int16_t values.
 		 */
-		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
 			auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
 			store_values(newValues, up_sampled_vector);
 		}
 
 	protected:
+		alignas(64) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.
 
 		/**
 		 * @brief Stores values from a 512-bit AVX512 vector to a storage location.
@@ -87,8 +88,9 @@ namespace dpp {
 		 * @param storage_location Pointer to the storage location.
 		 */
 		template<typename value_type> inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) {
+			_mm256_store_ps(values, values_to_store);
 			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
-				storage_location[x] = static_cast<value_type>(extract_float_from_avx_512(values_to_store, x));
+				storage_location[x] = static_cast<value_type>(values[x]);
 			}
 		}
 
@@ -98,24 +100,11 @@ namespace dpp {
 		 * @tparam Indices Parameter pack of indices for gathering values.
 		 * @return An AVX512 register containing gathered values.
 		 */
-		template<typename value_type> inline static avx_512_float gather_values(value_type* values) {
-			alignas(64) float new_array[byte_blocks_per_register]{};
+		template<typename value_type> inline avx_512_float gather_values(value_type* values_new) {
 			for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
-				new_array[x] = static_cast<float>(values[x]);
+				values[x] = static_cast<float>(values_new[x]);
 			}
-			return _mm512_load_ps(new_array);
-		}
-
-		/**
-		 * @brief Extracts a 32-bit integer from a 512-bit AVX512 register.
-		 * @param value The AVX512 register containing packed 32-bit integers.
-		 * @param index The index of the 32-bit integer to extract (0-15).
-		 * @return The extracted 32-bit integer.
-		 */
-		inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) {
-			alignas(64) float new_array[byte_blocks_per_register]{};
-			_mm512_store_ps(new_array, value);
-			return new_array[index];
+			return _mm512_load_ps(values);
 		}
 	};
 

diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp
@@ -142,14 +142,17 @@ bool discord_voice_client::voice_payload::operator<(const voice_payload& other)
 }
 
 #ifdef HAVE_VOICE
-size_t audio_mix(discord_voice_client& client, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
+size_t audio_mix(discord_voice_client& client, audio_mixer& mixer, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
 	/* Mix the combined stream if combined audio is bound */
 	if (client.creator->on_voice_receive_combined.empty()) {
 		return 0;
 	}
+
 	/* We must upsample the data to 32 bits wide, otherwise we could overflow */
-	for (opus_int32 v = 0; v < samples * opus_channel_count / 16; ++v) {
-		audio_mixer::combine_samples(pcm_mix, pcm);
+	for (opus_int32 v = 0; v < (samples * opus_channel_count) / mixer.byte_blocks_per_register; ++v) {
+		mixer.combine_samples(pcm_mix, pcm);
+		pcm += mixer.byte_blocks_per_register;
+		pcm_mix += mixer.byte_blocks_per_register;
 	}
 	client.moving_average += park_count;
 	max_samples = (std::max)(samples, max_samples);
@@ -252,7 +255,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 						voice_receive_t vr(nullptr, "", &client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
 							samples * opus_channel_count * sizeof(opus_int16));
 
-						park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
+						park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
 						client.creator->on_voice_receive.call(vr);
 					}
 				} else {
@@ -266,7 +269,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 						vr.reassign(&client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
 							samples * opus_channel_count * sizeof(opus_int16));
 						client.end_gain = 1.0f / client.moving_average;
-						park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
+						park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
 						client.creator->on_voice_receive.call(vr);
 					}
 
@@ -280,11 +283,14 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
 
 			/* Downsample the 32 bit samples back to 16 bit */
 			opus_int16 pcm_downsample[23040] = { 0 };
+			opus_int16* pcm_downsample_ptr = pcm_downsample;
+			opus_int32* pcm_mix_ptr = pcm_mix;
 			client.increment = (client.end_gain - client.current_gain) / static_cast<float>(samples);
-			for (int64_t x = 0; x < samples / audio_mixer::byte_blocks_per_register; ++x) {
-				audio_mixer::collect_single_register(pcm_mix + (x * audio_mixer::byte_blocks_per_register),
-					pcm_downsample + (x * audio_mixer::byte_blocks_per_register), client.current_gain, client.increment);
-				client.current_gain += client.increment * static_cast<float>(audio_mixer::byte_blocks_per_register);
+			for (int64_t x = 0; x < (samples * opus_channel_count) / client.mixer->byte_blocks_per_register; ++x) {
+				client.mixer->collect_single_register(pcm_mix_ptr, pcm_downsample_ptr, client.current_gain, client.increment);
+				client.current_gain += client.increment * static_cast<float>(client.mixer->byte_blocks_per_register);
+				pcm_mix_ptr += client.mixer->byte_blocks_per_register;
+				pcm_downsample_ptr += client.mixer->byte_blocks_per_register;
 			}
 
 			voice_receive_t vr(nullptr, "", &client, 0, reinterpret_cast<uint8_t*>(pcm_downsample),
@@ -301,6 +307,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
 	runner(nullptr),
 	connect_time(0),
 	port(0),
+	mixer(std::make_unique<audio_mixer>()),
 	ssrc(0),
 	timescale(1000000),
 	paused(false),