diff --git a/Dockerfile b/Dockerfile
index 103c9f7115..bd37c2e9ef 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:noble@sha256:2e863c44b718727c860746568e1d54afd13b2fa71b160f5cd9058fc436217b30
+FROM ubuntu:noble@sha256:8a37d68f4f73ebf3d4efafbcf66379bf3728902a8038616808f04e34a9ab63ee
 
 RUN apt-get update && apt-get install --no-install-recommends -y libssl-dev zlib1g-dev libsodium-dev libopus-dev cmake pkg-config g++ gcc git make && apt-get clean && rm -rf /var/lib/apt/lists/*
 
diff --git a/cmake/DetectArchitecture.cmake b/cmake/DetectArchitecture.cmake
index 4c3a2030b5..af4f84a5b7 100644
--- a/cmake/DetectArchitecture.cmake
+++ b/cmake/DetectArchitecture.cmake
@@ -3,8 +3,13 @@ include(CheckCXXSourceRuns)
 function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUCTION_SET_INTRINSIC)
 
 	set(INSTRUCTION_SET_CODE "
+	#if defined(__arm__) || defined(__aarch64__)
+	#include <arm_neon.h>
+	#else
 	#include <immintrin.h>
 	#include <stdint.h>
+	#endif
+
 	int main()
 	{
 		${INSTRUCTION_SET_INTRINSIC};
@@ -27,12 +32,14 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
 		"AVX1?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
 		"AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_add_epi32(__m256i{}, __m256i{})"
 		"AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
+		"AVX1024??uint8x16_t mask{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }#vandq_u8(mask, mask)"
 	)
 else()
 	set(INSTRUCTION_SETS
 		"AVX1?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
 		"AVX2?-mavx2?__m256i value{}#auto result = _mm256_add_epi32(__m256i{}, __m256i{})"
 		"AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
+		"AVX1024??uint8x16_t mask{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }#vandq_u8(mask, mask)"
 	)
 endif()
 
@@ -43,7 +50,7 @@ set(AVX_TYPE "AVX0" PARENT_SCOPE)
 set(AVX_FLAGS "" PARENT_SCOPE)
 
-# This is only supported on x86/x64, it is completely skipped and forced to T_fallback anywhere else
-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64"))
+# This is only supported on x86/x64 and ARM (NEON) hosts, it is completely skipped and forced to T_fallback anywhere else
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64") OR (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm64") OR (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "aarch64") OR (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "armv7l"))
 
 	foreach(INSTRUCTION_SET IN LISTS INSTRUCTION_SETS)
 		string(REPLACE "?" ";" CURRENT_LIST "${INSTRUCTION_SET}")
diff --git a/docpages/example_code/commandhandler.cpp b/docpages/example_code/commandhandler.cpp
index ea23e87529..898a1cf63a 100644
--- a/docpages/example_code/commandhandler.cpp
+++ b/docpages/example_code/commandhandler.cpp
@@ -35,7 +35,7 @@ int main() {
 			/* Command description */
 			"A test ping command",
 
-			/* Guild id (omit for a guild command) */
+			/* Guild id (omit for a global command) */
 			819556414099554344
 		);
 
diff --git a/doxygen-awesome-css b/doxygen-awesome-css
index 28ed396de1..00c7339531 160000
--- a/doxygen-awesome-css
+++ b/doxygen-awesome-css
@@ -1 +1 @@
-Subproject commit 28ed396de19cd3d803bcb483dceefdb6d03b1b2b
+Subproject commit 00c73395317fa2cc80bd2dbe6b5a568939b81f3e
diff --git a/include/dpp/isa/neon.h b/include/dpp/isa/neon.h
new file mode 100644
index 0000000000..8ee33459f8
--- /dev/null
+++ b/include/dpp/isa/neon.h
@@ -0,0 +1,122 @@
+/************************************************************************************
+ *
+ * D++, A Lightweight C++ library for Discord
+ *
+ * Copyright 2021 Craig Edwards and D++ contributors
+ * (https://github.com/brainboxdotcc/DPP/graphs/contributors)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ************************************************************************************/
+#pragma once
+
+#if defined _MSC_VER || defined __GNUC__ || defined __clang__
+
+#include <arm_neon.h>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+
+namespace dpp {
+
+	using neon_float = float32x4_t;
+
+	/**
+	 * @brief A class for audio mixing operations using ARM NEON instructions.
+	 */
+	class audio_mixer {
+	public:
+
+		/**
+		 * @brief The number of 32-bit values per CPU register.
+		 */
+		inline static constexpr int32_t byte_blocks_per_register{ 4 };
+
+		/**
+		 * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out.
+		 * This version uses ARM NEON instructions.
+		 *
+		 * @param data_in Pointer to the input array of int32_t values.
+		 * @param data_out Pointer to the output array of int16_t values.
+		 * @param current_gain The gain to be applied to the elements.
+		 * @param increment The increment value to be added to each element.
+		 */
+		inline void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+			neon_float gathered_values = gather_values(data_in);
+			neon_float gain_vector = vdupq_n_f32(current_gain);
+			// Lane i is scaled by (current_gain + i * increment)
+			static constexpr float data[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+			neon_float floats = vld1q_f32(data);
+			neon_float increment_vector = vmulq_f32(vdupq_n_f32(increment), floats);
+			neon_float current_samples_new = vmulq_f32(gathered_values, vaddq_f32(gain_vector, increment_vector));
+
+			// Clamping the values between int16_t min and max
+			neon_float min_val = vdupq_n_f32(static_cast<float>(std::numeric_limits<int16_t>::min()));
+			neon_float max_val = vdupq_n_f32(static_cast<float>(std::numeric_limits<int16_t>::max()));
+
+			current_samples_new = vmaxq_f32(current_samples_new, min_val);
+			current_samples_new = vminq_f32(current_samples_new, max_val);
+
+			store_values(current_samples_new, data_out);
+		}
+
+		/**
+		 * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector.
+		 * This version uses ARM NEON instructions.
+		 *
+		 * @param up_sampled_vector Pointer to the array of int32_t values.
+		 * @param decoded_data Pointer to the array of int16_t values.
+		 */
+		inline void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+			neon_float up_sampled = gather_values(up_sampled_vector);
+			neon_float decoded = gather_values(decoded_data);
+			neon_float newValues = vaddq_f32(up_sampled, decoded);
+			store_values(newValues, up_sampled_vector);
+		}
+
+	protected:
+		/**
+		 * @brief Array for storing the values to be loaded/stored.
+		 */
+		alignas(16) float values[byte_blocks_per_register]{};
+
+		/**
+		 * @brief Stores values from a 128-bit NEON vector to a storage location.
+		 * @tparam value_type The target value type for storage.
+		 * @param values_to_store The 128-bit NEON vector containing values to store.
+		 * @param storage_location Pointer to the storage location.
+		 */
+		template<typename value_type> inline void store_values(const neon_float& values_to_store, value_type* storage_location) {
+			vst1q_f32(values, values_to_store);
+			for (int32_t x = 0; x < byte_blocks_per_register; ++x) {
+				storage_location[x] = static_cast<value_type>(values[x]);
+			}
+		}
+
+		/**
+		 * @brief Specialization for gathering non-float values into a NEON register.
+		 * @tparam value_type The type of values being gathered.
+		 * @param values_new Pointer to the values to convert to float and load.
+		 * @return A NEON register containing gathered values.
+		 */
+		template<typename value_type> inline neon_float gather_values(value_type* values_new) {
+			for (int32_t x = 0; x < byte_blocks_per_register; ++x) {
+				values[x] = static_cast<float>(values_new[x]);
+			}
+			return vld1q_f32(values);
+		}
+	};
+
+} // namespace dpp
+
+#endif
diff --git a/include/dpp/isa_detection.h b/include/dpp/isa_detection.h
index 2f7925efc5..54b641a7f0 100644
--- a/include/dpp/isa_detection.h
+++ b/include/dpp/isa_detection.h
@@ -20,7 +20,9 @@
  ************************************************************************************/
 #pragma once
 
-#if AVX_TYPE == 512
+#if AVX_TYPE == 1024
+	#include "isa/neon.h"
+#elif AVX_TYPE == 512
 	#include "isa/avx512.h"
 #elif AVX_TYPE == 2
 	#include "isa/avx2.h"
diff --git a/src/dpp/voicestate.cpp b/src/dpp/voicestate.cpp
index e421cfbd1f..f0bcbd6e9f 100644
--- a/src/dpp/voicestate.cpp
+++ b/src/dpp/voicestate.cpp
@@ -91,3 +91,4 @@ bool voicestate::is_suppressed() const {
 }
 
 } // namespace dpp
+