refactor: improve AVX, make detection a little nicer

split out isa into four separate files, rather than one long snakepit of #ifdef pass flags properly to caller, and have the caller set the compile flags and define set the define as a fixed name with a value 0, 1, 2 or 512, instead of four different defines
brainboxdotcc · Sep 19, 2023 · 89d0997 · 89d0997
1 parent c79da97
commit 89d0997
Show file tree

Hide file tree

Showing 10 changed files with 576 additions and 399 deletions.
diff --git a/cmake/DetectArchitecture.cmake b/cmake/DetectArchitecture.cmake
@@ -17,29 +17,30 @@ function(check_instruction_set INSTRUCTION_SET_NAME INSTRUCTION_SET_FLAG INSTRUC
     if(${INSTRUCTION_SET_NAME})
         set(AVX_TYPE "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
         set(AVX_FLAG "${INSTRUCTION_SET_FLAG}" PARENT_SCOPE)
-        set(AVX_NAME "${INSTRUCTION_SET_NAME}" PARENT_SCOPE)
     else()
         return()
     endif()
 endfunction()
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     set(INSTRUCTION_SETS
-        "T_AVX?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
-        "T_AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
-        "T_AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
+        "AVX1?/arch:AVX?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
+        "AVX2?/arch:AVX2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
+        "AVX512?/arch:AVX512?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
     )
 else()
     set(INSTRUCTION_SETS
-        "T_AVX?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
-        "T_AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
-        "T_AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
+        "AVX1?-mavx?__m128i value{}#auto result = _mm_extract_epi32(value, 0)"
+        "AVX2?-mavx2?__m256i value{}#auto result = _mm256_extract_epi32(value, 0)"
+        "AVX512?-mavx512f?int32_t result[16]#const _mm512i& value{}#_mm512_store_si512(result, value)"
 )
 endif()
 
 set(CMAKE_REQUIRED_FLAGS_SAVE "${CMAKE_REQUIRED_FLAGS}")
 
-set(AVX_NAME "T_fallback")
+set(AVX_TYPE "AVX0")
+set(AVX_TYPE "AVX0" PARENT_SCOPE)
+set(AVX_FLAGS "" PARENT_SCOPE)
 
 # This is only supported on x86/x64, it is completely skipped and forced to T_fallback anywhere else
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "i386") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "AMD64"))
@@ -54,11 +55,14 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") OR (${CMAKE_SYSTEM_PROCESSOR} M
 		check_instruction_set("${INSTRUCTION_SET_NAME}" "${INSTRUCTION_SET_FLAG}" "${INSTRUCTION_SET_INTRINSIC}")
 	endforeach()
 
-	string(REPLACE "T_" "" AVX_DISPLAY ${AVX_NAME})
-	message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} SSE type: ${AVX_DISPLAY}")
+	message(STATUS "Detected ${CMAKE_SYSTEM_PROCESSOR} AVX type: ${AVX_TYPE} (FLAGS: ${AVX_FLAG})")
+	set(AVX_TYPE ${AVX_TYPE})
+	set(AVX_TYPE ${AVX_TYPE} PARENT_SCOPE)
+	set(AVX_FLAG ${AVX_FLAG} PARENT_SCOPE)
 	set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE}")
 else()
-	message(STATUS "SSE not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_NAME}")
-	set(AVX_NAME "T_fallback")
-	set(AVX_TYPE "T_fallback")
+	message(STATUS "AVX not supported by architecture ${CMAKE_SYSTEM_PROCESSOR} ${AVX_TYPE}")
+	set(AVX_TYPE "AVX0")
+	set(AVX_FLAG "" PARENT_SCOPE)
+	set(AVX_TYPE "AVX0" PARENT_SCOPE)
 endif()
diff --git a/include/dpp/isa/avx.h b/include/dpp/isa/avx.h
@@ -0,0 +1,136 @@
+/************************************************************************************
+ *
+ * D++, A Lightweight C++ library for Discord
+ *
+ * Copyright 2021 Craig Edwards and D++ contributors
+ * (https://github.com/brainboxdotcc/DPP/graphs/contributors)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ************************************************************************************/
+#pragma once
+
+#if defined _MSC_VER || defined __GNUC__ || defined __clang__
+
+#include <immintrin.h>
+
+#ifdef max
+	#undef max
+#endif
+#ifdef min
+	#undef min
+#endif
+
+namespace dpp {
+
+	using avx_float = __m128;
+	using avx_int = __m128i;
+
+	/*
+	* @brief Extracts a 32-bit integer from a 128-bit AVX register.
+	* @param value The AVX register containing packed 32-bit integers.
+	* @param index The index of the 32-bit integer to extract (0-3).
+	* @return The extracted 32-bit integer.
+	*/
+	inline int32_t extract_int32_from_avx(const avx_int& value, int64_t index) {
+		switch (index) {
+			case 0: {
+				return _mm_extract_epi32(value, 0);
+			}
+			case 1: {
+				return _mm_extract_epi32(value, 1);
+			}
+			case 2: {
+				return _mm_extract_epi32(value, 2);
+			}
+			case 3: {
+				return _mm_extract_epi32(value, 3);
+			}
+			default: {
+				return _mm_extract_epi32(value, 0);
+			}
+		}
+	}
+
+	/**
+	 * @brief A class for audio mixing operations using AVX instructions.
+	 */
+	class audio_mixer {
+	public:
+		/*
+		 * @brief The number of 32-bit values per CPU register.
+		 */
+		inline static constexpr int32_t byte_blocks_per_register{ 4 };
+
+		/*
+		 * @brief Stores values from a 128-bit AVX vector to a storage location.
+		 * @tparam value_type The target value type for storage.
+		 * @param values_to_store The 128-bit AVX vector containing values to store.
+		 * @param storage_location Pointer to the storage location.
+		 */
+		template<typename value_type> inline static void store_values(const avx_int& values_to_store, value_type* storage_location) {
+			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
+				storage_location[x] = static_cast<value_type>(extract_int32_from_avx(values_to_store, x));
+			}
+		}
+
+		/**
+		 * @brief Specialization for gathering non-float values into an AVX register.
+		 * @tparam value_type The type of values being gathered.
+		 * @tparam Indices Parameter pack of indices for gathering values.
+		 * @return An AVX register containing gathered values.
+		 */
+		template<typename value_type> inline static avx_float gather_values(value_type* values) {
+			alignas(16) float new_array[byte_blocks_per_register]{};
+			for (size_t x = 0; x < byte_blocks_per_register; ++x) {
+				new_array[x] = static_cast<float>(values[x]);
+			}
+			return _mm_load_ps(new_array);
+		}
+
+		/**
+		 * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out.
+		 *        This version uses AVX instructions.
+		 *
+		 * @param data_in Pointer to the input array of int32_t values.
+		 * @param data_out Pointer to the output array of int16_t values.
+		 * @param current_gain The gain to be applied to the elements.
+		 * @param increment The increment value to be added to each element.
+		 */
+		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+			avx_float current_samples_new{ _mm_mul_ps(gather_values(data_in),
+				_mm_add_ps(_mm_set1_ps(current_gain), _mm_mul_ps(_mm_set1_ps(increment), _mm_set_ps(0.0f, 1.0f, 2.0f, 3.0f)))) };
+
+			current_samples_new = _mm_blendv_ps(_mm_max_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::min()))),
+				_mm_min_ps(current_samples_new, _mm_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::max()))),
+				_mm_cmp_ps(current_samples_new, _mm_set1_ps(0.0f), _CMP_GE_OQ));
+
+			store_values(_mm_cvtps_epi32(current_samples_new), data_out);
+		}
+
+		/**
+		 * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector.
+		 *        This version uses AVX instructions.
+		 *
+		 * @param up_sampled_vector Pointer to the array of int32_t values.
+		 * @param decoded_data Pointer to the array of int16_t values.
+		 */
+		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+			auto newValues{ _mm_cvtps_epi32(_mm_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) };
+			store_values(newValues, up_sampled_vector);
+		}
+	};
+
+} // namespace dpp
+
+#endif
diff --git a/include/dpp/isa/avx2.h b/include/dpp/isa/avx2.h
@@ -0,0 +1,151 @@
+/************************************************************************************
+ *
+ * D++, A Lightweight C++ library for Discord
+ *
+ * Copyright 2021 Craig Edwards and D++ contributors
+ * (https://github.com/brainboxdotcc/DPP/graphs/contributors)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ ************************************************************************************/
+#pragma once
+
+#if defined _MSC_VER || defined __GNUC__ || defined __clang__
+
+#include <immintrin.h>
+
+#ifdef max
+	#undef max
+#endif
+#ifdef min
+	#undef min
+#endif
+
+namespace dpp {
+
+	using avx_2_float = __m256;
+	using avx_2_int = __m256i;
+
+	/*
+ 	 * @brief Extracts a 32-bit integer from a 256-bit AVX2 register.
+	 * @param value The AVX2 register containing packed 32-bit integers.
+	 * @param index The index of the 32bit integer to extract (0-7).
+	 * @return The extracted 32-bit integer.
+	 */
+	inline int32_t extract_int32_from_avx2(const avx_2_int& value, int64_t index) {
+		switch (index) {
+			case 0: {
+				return _mm256_extract_epi32(value, 0);
+			}
+			case 1: {
+				return _mm256_extract_epi32(value, 1);
+			}
+			case 2: {
+				return _mm256_extract_epi32(value, 2);
+			}
+			case 3: {
+				return _mm256_extract_epi32(value, 3);
+			}
+			case 4: {
+				return _mm256_extract_epi32(value, 4);
+			}
+			case 5: {
+				return _mm256_extract_epi32(value, 5);
+			}
+			case 6: {
+				return _mm256_extract_epi32(value, 6);
+			}
+			case 7: {
+				return _mm256_extract_epi32(value, 7);
+			}
+			default: {
+				return _mm256_extract_epi32(value, 0);
+			}
+		}
+	}
+
+	/**
+	 * @brief A class for audio mixing operations using AVX2 instructions.
+	 */
+	class audio_mixer {
+	public:
+		/*
+		 * @brief The number of 32-bit values per CPU register.
+		 */
+		inline static constexpr int32_t byte_blocks_per_register{ 8 };
+
+		/*
+		 * @brief Stores values from a 256-bit AVX2 vector to a storage location.
+		 * @tparam value_type The target value type for storage.
+		 * @param values_to_store The 256-bit AVX2 vector containing values to store.
+		 * @param storage_location Pointer to the storage location.
+		 */
+		template<typename value_type> inline static void store_values(const avx_2_int& values_to_store, value_type* storage_location) {
+			for (int64_t x = 0; x < byte_blocks_per_register; ++x) {
+				storage_location[x] = static_cast<value_type>(extract_int32_from_avx2(values_to_store, x));
+			}
+		}
+
+		/**
+		 * @brief Specialization for gathering non-float values into an AVX2 register.
+		 * @tparam value_type The type of values being gathered.
+		 * @tparam Indices Parameter pack of indices for gathering values.
+		 * @return An AVX2 register containing gathered values.
+		 */
+		template<typename value_type> inline static avx_2_float gather_values(value_type* values) {
+			alignas(32) float new_array[byte_blocks_per_register]{};
+			for (size_t x = 0; x < byte_blocks_per_register; ++x) {
+				new_array[x] = static_cast<float>(values[x]);
+			}
+			return _mm256_load_ps(new_array);
+		}
+
+		/**
+		 * @brief Collect a single register worth of data from data_in, apply gain and increment, and store the result in data_out.
+		 *        This version uses AVX2 instructions.
+		 *
+		 * @param data_in Pointer to the input array of int32_t values.
+		 * @param data_out Pointer to the output array of int16_t values.
+		 * @param current_gain The gain to be applied to the elements.
+		 * @param increment The increment value to be added to each element.
+		 */
+		inline static void collect_single_register(int32_t* data_in, int16_t* data_out, float current_gain, float increment) {
+			avx_2_float current_samples_new{ _mm256_mul_ps(gather_values(data_in),
+				_mm256_add_ps(_mm256_set1_ps(current_gain),
+					_mm256_mul_ps(_mm256_set1_ps(increment), _mm256_set_ps(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f)))) };
+
+			current_samples_new =
+				_mm256_blendv_ps(_mm256_max_ps(current_samples_new, _mm256_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::min()))),
+					_mm256_min_ps(current_samples_new, _mm256_set1_ps(static_cast<float>(std::numeric_limits<int16_t>::max()))),
+					_mm256_cmp_ps(current_samples_new, _mm256_set1_ps(0.0f), _CMP_GE_OQ));
+
+			store_values(_mm256_cvtps_epi32(current_samples_new), data_out);
+		}
+
+		/**
+		 * @brief Combine a register worth of elements from decoded_data and store the result in up_sampled_vector.
+		 *        This version uses AVX2 instructions.
+		 *
+		 * @param up_sampled_vector Pointer to the array of int32_t values.
+		 * @param decoded_data Pointer to the array of int16_t values.
+		 * @param x Index to select a specific set of elements to combine.
+		 */
+		inline static void combine_samples(int32_t* up_sampled_vector, const int16_t* decoded_data) {
+			auto newValues{ _mm256_cvtps_epi32(_mm256_add_ps(gather_values(up_sampled_vector), gather_values(decoded_data))) };
+			store_values(newValues, up_sampled_vector);
+		}
+	};
+
+} // namespace dpp
+
+#endif