Skip to content

Commit

Permalink
Enhancement: Making AudioMixer a member instead of invoking it static…
Browse files Browse the repository at this point in the history
…ally to cut down CPU usage by roughly 50%.

Storing the mixer as a member, along with the array of floats that is used as an intermediary for converting to/from AVX registers, instead of reallocating that array on every function call, seems to cut down CPU usage while streaming audio in my implementation by roughly 50%.
  • Loading branch information
RealTimeChris committed Sep 24, 2023
1 parent aa34a0e commit ef81cf5
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 68 deletions.
7 changes: 7 additions & 0 deletions include/dpp/discordvoiceclient.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct OpusRepacketizer;

namespace dpp {

class audio_mixer;

// !TODO: change these to constexpr and rename every occurrence across the codebase
#define AUDIO_TRACK_MARKER (uint16_t)0xFFFF

Expand Down Expand Up @@ -139,6 +141,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
time_t connect_time;

/**
* @brief Audio mixer instance; used by the voice courier loop to mix received voice data into the combined stream.
*/
std::unique_ptr<audio_mixer> mixer;

/**
* @brief IP of UDP/RTP endpoint
*/
Expand Down
29 changes: 9 additions & 20 deletions include/dpp/isa/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };

Expand All @@ -67,22 +67,24 @@ namespace dpp {
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(16) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 128-bit AVX vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 128-bit AVX vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_float& values_to_store, value_type* storage_location) {
template<typename value_type> inline void store_values(const avx_float& values_to_store, value_type* storage_location) {
_mm_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -92,24 +94,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX register containing gathered values.
*/
template<typename value_type> inline static avx_float gather_values(value_type* values) {
alignas(16) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit float from a 128-bit AVX register.
* @param value The AVX register containing packed 32-bit floats.
* @param index The index of the 32-bit float to extract (0-3).
* @return The extracted 32-bit float.
*/
inline static float extract_float_from_avx(const avx_float& value, int64_t index) {
alignas(16) float new_array[4]{};
_mm_store_ps(new_array, value);
return new_array[index];
return _mm_load_ps(values);
}
};

Expand Down
29 changes: 9 additions & 20 deletions include/dpp/isa/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in),
_mm256_add_ps(_mm256_set1_ps(current_gain),
_mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) };
Expand All @@ -70,22 +70,24 @@ namespace dpp {
* @param decoded_data Pointer to the array of int16_t values.
* @param x Index to select a specific set of elements to combine.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(32) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 256-bit AVX2 vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 256-bit AVX2 vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
template<typename value_type> inline void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
_mm256_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx_2(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -95,24 +97,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX2 register containing gathered values.
*/
template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
alignas(32) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_2_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm256_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit float from a 256-bit AVX2 register.
* @param value The AVX2 register containing packed 32-bit floats.
* @param index The index of the 32-bit float to extract (0-7).
* @return The extracted 32-bit float.
*/
inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) {
alignas(32) float new_array[byte_blocks_per_register]{};
_mm256_store_ps(new_array, value);
return new_array[index];
return _mm256_load_ps(values);
}
};

Expand Down
27 changes: 8 additions & 19 deletions include/dpp/isa/avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in),
_mm512_add_ps(_mm512_set1_ps(current_gain),
_mm512_mul_ps(_mm512_set1_ps(increment),
Expand All @@ -73,12 +73,13 @@ namespace dpp {
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(64) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 512-bit AVX512 vector to a storage location.
Expand All @@ -87,8 +88,9 @@ namespace dpp {
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) {
_mm256_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx_512(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -98,24 +100,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX512 register containing gathered values.
*/
template<typename value_type> inline static avx_512_float gather_values(value_type* values) {
alignas(64) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_512_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm512_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit float from a 512-bit AVX512 register.
* @param value The AVX512 register containing packed 32-bit floats.
* @param index The index of the 32-bit float to extract (0-15).
* @return The extracted 32-bit float.
*/
inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) {
alignas(64) float new_array[byte_blocks_per_register]{};
_mm512_store_ps(new_array, value);
return new_array[index];
return _mm512_load_ps(values);
}
};

Expand Down
25 changes: 16 additions & 9 deletions src/dpp/discordvoiceclient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,17 @@ bool discord_voice_client::voice_payload::operator<(const voice_payload& other)
}

#ifdef HAVE_VOICE
size_t audio_mix(discord_voice_client& client, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
size_t audio_mix(discord_voice_client& client, audio_mixer& mixer, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
/* Mix the combined stream if combined audio is bound */
if (client.creator->on_voice_receive_combined.empty()) {
return 0;
}

/* We must upsample the data to 32 bits wide, otherwise we could overflow */
for (opus_int32 v = 0; v < samples * opus_channel_count / 16; ++v) {
audio_mixer::combine_samples(pcm_mix, pcm);
for (opus_int32 v = 0; v < (samples * opus_channel_count) / mixer.byte_blocks_per_register; ++v) {
mixer.combine_samples(pcm_mix, pcm);
pcm += mixer.byte_blocks_per_register;
pcm_mix += mixer.byte_blocks_per_register;
}
client.moving_average += park_count;
max_samples = (std::max)(samples, max_samples);
Expand Down Expand Up @@ -252,7 +255,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
voice_receive_t vr(nullptr, "", &client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
samples * opus_channel_count * sizeof(opus_int16));

park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
client.creator->on_voice_receive.call(vr);
}
} else {
Expand All @@ -266,7 +269,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
vr.reassign(&client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
samples * opus_channel_count * sizeof(opus_int16));
client.end_gain = 1.0f / client.moving_average;
park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
client.creator->on_voice_receive.call(vr);
}

Expand All @@ -280,11 +283,14 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour

/* Downsample the 32 bit samples back to 16 bit */
opus_int16 pcm_downsample[23040] = { 0 };
opus_int16* pcm_downsample_ptr = pcm_downsample;
opus_int32* pcm_mix_ptr = pcm_mix;
client.increment = (client.end_gain - client.current_gain) / static_cast<float>(samples);
for (int64_t x = 0; x < samples / audio_mixer::byte_blocks_per_register; ++x) {
audio_mixer::collect_single_register(pcm_mix + (x * audio_mixer::byte_blocks_per_register),
pcm_downsample + (x * audio_mixer::byte_blocks_per_register), client.current_gain, client.increment);
client.current_gain += client.increment * static_cast<float>(audio_mixer::byte_blocks_per_register);
for (int64_t x = 0; x < (samples * opus_channel_count) / client.mixer->byte_blocks_per_register; ++x) {
client.mixer->collect_single_register(pcm_mix_ptr, pcm_downsample_ptr, client.current_gain, client.increment);
client.current_gain += client.increment * static_cast<float>(client.mixer->byte_blocks_per_register);
pcm_mix_ptr += client.mixer->byte_blocks_per_register;
pcm_downsample_ptr += client.mixer->byte_blocks_per_register;
}

voice_receive_t vr(nullptr, "", &client, 0, reinterpret_cast<uint8_t*>(pcm_downsample),
Expand All @@ -301,6 +307,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
runner(nullptr),
connect_time(0),
port(0),
mixer(std::make_unique<audio_mixer>()),
ssrc(0),
timescale(1000000),
paused(false),
Expand Down

0 comments on commit ef81cf5

Please sign in to comment.