Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement: Making AudioMixer a member instead of invoking it statically to cut down CPU usage by roughly 50% #884

Merged
merged 1 commit into from
Sep 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions include/dpp/discordvoiceclient.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct OpusRepacketizer;

namespace dpp {

class audio_mixer;

// !TODO: change these to constexpr and rename every occurrence across the codebase
#define AUDIO_TRACK_MARKER (uint16_t)0xFFFF

Expand Down Expand Up @@ -139,6 +141,11 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
time_t connect_time;

/**
* @brief For mixing outgoing voice data.
*/
std::unique_ptr<audio_mixer> mixer;

/**
* @brief IP of UDP/RTP endpoint
*/
Expand Down
29 changes: 9 additions & 20 deletions include/dpp/isa/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };

Expand All @@ -67,22 +67,24 @@ namespace dpp {
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(16) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 128-bit AVX vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 128-bit AVX vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_float& values_to_store, value_type* storage_location) {
template<typename value_type> inline void store_values(const avx_float& values_to_store, value_type* storage_location) {
_mm_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -92,24 +94,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX register containing gathered values.
*/
template<typename value_type> inline static avx_float gather_values(value_type* values) {
alignas(16) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit integer from a 128-bit AVX register.
* @param value The AVX register containing packed 32-bit integers.
* @param index The index of the 32-bit integer to extract (0-3).
* @return The extracted 32-bit integer.
*/
inline static float extract_float_from_avx(const avx_float& value, int64_t index) {
alignas(16) float new_array[4]{};
_mm_store_ps(new_array, value);
return new_array[index];
return _mm_load_ps(values);
}
};

Expand Down
29 changes: 9 additions & 20 deletions include/dpp/isa/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in),
_mm256_add_ps(_mm256_set1_ps(current_gain),
_mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) };
Expand All @@ -70,22 +70,24 @@ namespace dpp {
* @param decoded_data Pointer to the array of int16_t values.
* @param x Index to select a specific set of elements to combine.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(32) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 256-bit AVX2 vector to a storage location.
* @tparam value_type The target value type for storage.
* @param values_to_store The 256-bit AVX2 vector containing values to store.
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
template<typename value_type> inline void store_values(const avx_2_float& values_to_store, value_type* storage_location) {
_mm256_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx_2(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -95,24 +97,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX2 register containing gathered values.
*/
template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
alignas(32) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_2_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm256_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit integer from a 256-bit AVX2 register.
* @param value The AVX2 register containing packed 32-bit integers.
* @param index The index of the 32-bit integer to extract (0-7).
* @return The extracted 32-bit integer.
*/
inline static float extract_float_from_avx_2(const avx_2_float& value, int64_t index) {
alignas(32) float new_array[byte_blocks_per_register]{};
_mm256_store_ps(new_array, value);
return new_array[index];
return _mm256_load_ps(values);
}
};

Expand Down
27 changes: 8 additions & 19 deletions include/dpp/isa/avx512.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ namespace dpp {
* @param current_gain The gain to be applied to the elements.
* @param increment The increment value to be added to each element.
*/
inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
avx_512_float current_samples_new{ _mm512_mul_ps(gather_values(data_in),
_mm512_add_ps(_mm512_set1_ps(current_gain),
_mm512_mul_ps(_mm512_set1_ps(increment),
Expand All @@ -73,12 +73,13 @@ namespace dpp {
* @param up_sampled_vector Pointer to the array of int32_t values.
* @param decoded_data Pointer to the array of int16_t values.
*/
inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
auto newValues{ _mm512_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data)) };
store_values(newValues, up_sampled_vector);
}

protected:
alignas(64) float values[byte_blocks_per_register]{};///< Array for storing the values to be loaded/stored.

/**
* @brief Stores values from a 512-bit AVX512 vector to a storage location.
Expand All @@ -87,8 +88,9 @@ namespace dpp {
* @param storage_location Pointer to the storage location.
*/
template<typename value_type> inline static void store_values(const avx_512_float& values_to_store, value_type* storage_location) {
_mm256_store_ps(values, values_to_store);
for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
storage_location[x] = static_cast<value_type>(extract_float_from_avx_512(values_to_store, x));
storage_location[x] = static_cast<value_type>(values[x]);
}
}

Expand All @@ -98,24 +100,11 @@ namespace dpp {
* @tparam Indices Parameter pack of indices for gathering values.
* @return An AVX512 register containing gathered values.
*/
template<typename value_type> inline static avx_512_float gather_values(value_type* values) {
alignas(64) float new_array[byte_blocks_per_register]{};
template<typename value_type> inline avx_512_float gather_values(value_type* values_new) {
for (uint64_t x = 0; x < byte_blocks_per_register; ++x) {
new_array[x] = static_cast<float>(values[x]);
values[x] = static_cast<float>(values_new[x]);
}
return _mm512_load_ps(new_array);
}

/**
* @brief Extracts a 32-bit integer from a 512-bit AVX512 register.
* @param value The AVX512 register containing packed 32-bit integers.
* @param index The index of the 32-bit integer to extract (0-15).
* @return The extracted 32-bit integer.
*/
inline static float extract_float_from_avx_512(const avx_512_float& value, int64_t index) {
alignas(64) float new_array[byte_blocks_per_register]{};
_mm512_store_ps(new_array, value);
return new_array[index];
return _mm512_load_ps(values);
}
};

Expand Down
25 changes: 16 additions & 9 deletions src/dpp/discordvoiceclient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,17 @@ bool discord_voice_client::voice_payload::operator<(const voice_payload& other)
}

#ifdef HAVE_VOICE
size_t audio_mix(discord_voice_client& client, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
size_t audio_mix(discord_voice_client& client, audio_mixer& mixer, opus_int32* pcm_mix, const opus_int16* pcm, size_t park_count, int samples, int& max_samples) {
/* Mix the combined stream if combined audio is bound */
if (client.creator->on_voice_receive_combined.empty()) {
return 0;
}

/* We must upsample the data to 32 bits wide, otherwise we could overflow */
for (opus_int32 v = 0; v < samples * opus_channel_count / 16; ++v) {
audio_mixer::combine_samples(pcm_mix, pcm);
for (opus_int32 v = 0; v < (samples * opus_channel_count) / mixer.byte_blocks_per_register; ++v) {
mixer.combine_samples(pcm_mix, pcm);
pcm += mixer.byte_blocks_per_register;
pcm_mix += mixer.byte_blocks_per_register;
}
client.moving_average += park_count;
max_samples = (std::max)(samples, max_samples);
Expand Down Expand Up @@ -252,7 +255,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
voice_receive_t vr(nullptr, "", &client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
samples * opus_channel_count * sizeof(opus_int16));

park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
client.creator->on_voice_receive.call(vr);
}
} else {
Expand All @@ -266,7 +269,7 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour
vr.reassign(&client, d.user_id, reinterpret_cast<uint8_t*>(pcm),
samples * opus_channel_count * sizeof(opus_int16));
client.end_gain = 1.0f / client.moving_average;
park_count = audio_mix(client, pcm_mix, pcm, park_count, samples, max_samples);
park_count = audio_mix(client, *client.mixer, pcm_mix, pcm, park_count, samples, max_samples);
client.creator->on_voice_receive.call(vr);
}

Expand All @@ -280,11 +283,14 @@ void discord_voice_client::voice_courier_loop(discord_voice_client& client, cour

/* Downsample the 32 bit samples back to 16 bit */
opus_int16 pcm_downsample[23040] = { 0 };
opus_int16* pcm_downsample_ptr = pcm_downsample;
opus_int32* pcm_mix_ptr = pcm_mix;
client.increment = (client.end_gain - client.current_gain) / static_cast<float>(samples);
for (int64_t x = 0; x < samples / audio_mixer::byte_blocks_per_register; ++x) {
audio_mixer::collect_single_register(pcm_mix + (x * audio_mixer::byte_blocks_per_register),
pcm_downsample + (x * audio_mixer::byte_blocks_per_register), client.current_gain, client.increment);
client.current_gain += client.increment * static_cast<float>(audio_mixer::byte_blocks_per_register);
for (int64_t x = 0; x < (samples * opus_channel_count) / client.mixer->byte_blocks_per_register; ++x) {
client.mixer->collect_single_register(pcm_mix_ptr, pcm_downsample_ptr, client.current_gain, client.increment);
client.current_gain += client.increment * static_cast<float>(client.mixer->byte_blocks_per_register);
pcm_mix_ptr += client.mixer->byte_blocks_per_register;
pcm_downsample_ptr += client.mixer->byte_blocks_per_register;
}

voice_receive_t vr(nullptr, "", &client, 0, reinterpret_cast<uint8_t*>(pcm_downsample),
Expand All @@ -301,6 +307,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
runner(nullptr),
connect_time(0),
port(0),
mixer(std::make_unique<audio_mixer>()),
ssrc(0),
timescale(1000000),
paused(false),
Expand Down