diff --git a/src/moonlight-protocol/CMakeLists.txt b/src/moonlight-protocol/CMakeLists.txt
index 5e3ebc32..57f08cb3 100644
--- a/src/moonlight-protocol/CMakeLists.txt
+++ b/src/moonlight-protocol/CMakeLists.txt
@@ -45,18 +45,22 @@ FetchContent_GetProperties(nanors)
 if (NOT nanors_POPULATED)
     FetchContent_Populate(nanors)
 
-    add_library(nanors STATIC ${nanors_SOURCE_DIR}/rs.c)
+    # rswrapper.c #includes the fetched rs.c once per ISA variant, so the nanors tree
+    # (and deps/obl, for autoshim.h) stays on the include path but is not compiled directly.
+    add_library(nanors STATIC)
     add_library(nanors::nanors ALIAS nanors)
-    target_include_directories(nanors PUBLIC ${nanors_SOURCE_DIR} ${nanors_SOURCE_DIR}/deps/obl/)
+    target_include_directories(nanors
+            PUBLIC
+            ./nanors
+            ${nanors_SOURCE_DIR}
+            ${nanors_SOURCE_DIR}/deps/obl/)
 
     target_sources(nanors
-            PRIVATE ${nanors_SOURCE_DIR}/rs.c
-            PUBLIC ${nanors_SOURCE_DIR}/rs.h)
+            PRIVATE ./nanors/rswrapper.c
+            PUBLIC ./nanors/rswrapper.h)
 
-    # TODO: There's a more advanced version of this with proper support for SSSE3,
-    #   see: https://github.com/LizardByte/Sunshine/pull/2828
-    set_source_files_properties(${nanors_SOURCE_DIR}/rs.c
-            PROPERTIES COMPILE_FLAGS "-include deps/obl/autoshim.h -ftree-vectorize")
+    set_source_files_properties(./nanors/rswrapper.c
+            PROPERTIES COMPILE_FLAGS "-ftree-vectorize -funroll-loops")
 
     target_link_libraries_system(moonlight PUBLIC nanors::nanors)
 endif ()
diff --git a/src/moonlight-protocol/moonlight/fec.hpp b/src/moonlight-protocol/moonlight/fec.hpp
index ec15a67c..66338963 100644
--- a/src/moonlight-protocol/moonlight/fec.hpp
+++ b/src/moonlight-protocol/moonlight/fec.hpp
@@ -3,7 +3,7 @@
 
 #include <memory>
 extern "C" {
-#include <rs.h>
+#include "rswrapper.h"
 }
 
 /**
@@ -32,7 +32,7 @@ inline void init() {
 /**
  * A smart pointer to the reed_solomon data structure, it will release the memory when going out of scope
  */
-using rs_ptr = std::unique_ptr<reed_solomon, decltype(&reed_solomon_release)>;
+using rs_ptr = std::shared_ptr<reed_solomon>;
 
 /**
  * Creates and allocates the required Reed Solomon data structure.
@@ -43,8 +43,8 @@ using rs_ptr = std::unique_ptr<reed_solomon, decltype(&reed_solomon_release)>;
  * @return A smart pointer, it will release the memory when going out of scope
  */
 inline rs_ptr create(int data_shards, int parity_shards) {
-  auto rs = reed_solomon_new(data_shards, parity_shards);
-  return {rs, ::reed_solomon_release};
+  auto rs = reed_solomon_new_fn(data_shards, parity_shards);
+  return std::shared_ptr<reed_solomon>(rs, reed_solomon_release_fn);
 }
 
 /**
@@ -63,7 +63,7 @@ inline rs_ptr create(int data_shards, int parity_shards) {
  * @return zero on success or an error code if failing.
  */
 inline int encode(reed_solomon *rs, uint8_t **shards, int nr_shards, int block_size) {
-  return reed_solomon_encode(rs, shards, nr_shards, block_size);
+  return reed_solomon_encode_fn(rs, shards, nr_shards, block_size);
 }
 
 /**
@@ -82,7 +82,7 @@ inline int encode(reed_solomon *rs, uint8_t **shards, int nr_shards, int block_s
  * @return zero on success or an error code if failing
  */
 inline int decode(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int block_size) {
-  return reed_solomon_decode(rs, shards, marks, nr_shards, block_size);
+  return reed_solomon_decode_fn(rs, shards, marks, nr_shards, block_size);
 }
 
 } // namespace moonlight::fec
\ No newline at end of file
diff --git a/src/moonlight-protocol/nanors/rswrapper.c b/src/moonlight-protocol/nanors/rswrapper.c
new file mode 100644
index 00000000..dd3c08e5
--- /dev/null
+++ b/src/moonlight-protocol/nanors/rswrapper.c
@@ -0,0 +1,153 @@
+/**
+ * @file src/moonlight-protocol/nanors/rswrapper.c
+ * @brief Wrappers for nanors vectorization with different ISA options
+ */
+
+// _FORTIFY_SOURCE can cause some versions of GCC to try to inline
+// memset() with incompatible target options when compiling rs.c
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+
+// The assert() function is decorated with __cold on macOS which
+// is incompatible with Clang's target multiversioning feature
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+#define DECORATE_FUNC_I(a, b) a##b
+#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b)
+
+// Append an ISA suffix to the public RS API
+#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX)
+#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX)
+#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX)
+#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX)
+#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX)
+#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX)
+
+// Append an ISA suffix to internal functions to prevent multiple definition errors
+#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX)
+#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX)
+#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX)
+#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX)
+#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX)
+#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX)
+#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX)
+#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX)
+#define scal DECORATE_FUNC(scal, ISA_SUFFIX)
+#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX)
+#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX)
+
+#if defined(__x86_64__) || defined(__i386__)
+
+// Compile a variant for SSSE3
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function)
+#else
+#pragma GCC push_options
+#pragma GCC target("ssse3")
+#endif
+#define ISA_SUFFIX _ssse3
+#define OBLAS_SSE3
+#include "./rs.c"
+#undef OBLAS_SSE3
+#undef ISA_SUFFIX
+#if defined(__clang__)
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+
+// Compile a variant for AVX2
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
+#else
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#endif
+#define ISA_SUFFIX _avx2
+#define OBLAS_AVX2
+#include "./rs.c"
+#undef OBLAS_AVX2
+#undef ISA_SUFFIX
+#if defined(__clang__)
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+
+// Compile a variant for AVX512BW
+#if defined(__clang__)
+#pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function)
+#else
+#pragma GCC push_options
+#pragma GCC target("avx512f,avx512bw")
+#endif
+#define ISA_SUFFIX _avx512
+#define OBLAS_AVX512
+#include "./rs.c"
+#undef OBLAS_AVX512
+#undef ISA_SUFFIX
+#if defined(__clang__)
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+
+#endif
+
+// Compile a default variant
+#define ISA_SUFFIX _def
+#include "./autoshim.h"
+#include "./rs.c"
+#undef ISA_SUFFIX
+
+#undef reed_solomon_init
+#undef reed_solomon_new
+#undef reed_solomon_new_static
+#undef reed_solomon_release
+#undef reed_solomon_decode
+#undef reed_solomon_encode
+
+#include "rswrapper.h"
+
+reed_solomon_new_t reed_solomon_new_fn;
+reed_solomon_release_t reed_solomon_release_fn;
+reed_solomon_encode_t reed_solomon_encode_fn;
+reed_solomon_decode_t reed_solomon_decode_fn;
+
+/**
+ * @brief Points the reed_solomon_*_fn pointers at the best vectorized variant the running CPU supports.
+ * @details Callers invoke the pointers directly, so this must run first; they stay NULL (zero-initialized) until it does.
+ */
+void reed_solomon_init(void) {
+#if defined(__x86_64__) || defined(__i386__)
+  if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
+    reed_solomon_new_fn = reed_solomon_new_avx512;
+    reed_solomon_release_fn = reed_solomon_release_avx512;
+    reed_solomon_encode_fn = reed_solomon_encode_avx512;
+    reed_solomon_decode_fn = reed_solomon_decode_avx512;
+    reed_solomon_init_avx512();
+  } else if (__builtin_cpu_supports("avx2")) {
+    reed_solomon_new_fn = reed_solomon_new_avx2;
+    reed_solomon_release_fn = reed_solomon_release_avx2;
+    reed_solomon_encode_fn = reed_solomon_encode_avx2;
+    reed_solomon_decode_fn = reed_solomon_decode_avx2;
+    reed_solomon_init_avx2();
+  } else if (__builtin_cpu_supports("ssse3")) {
+    reed_solomon_new_fn = reed_solomon_new_ssse3;
+    reed_solomon_release_fn = reed_solomon_release_ssse3;
+    reed_solomon_encode_fn = reed_solomon_encode_ssse3;
+    reed_solomon_decode_fn = reed_solomon_decode_ssse3;
+    reed_solomon_init_ssse3();
+  } else
+#endif
+  {
+    reed_solomon_new_fn = reed_solomon_new_def;
+    reed_solomon_release_fn = reed_solomon_release_def;
+    reed_solomon_encode_fn = reed_solomon_encode_def;
+    reed_solomon_decode_fn = reed_solomon_decode_def;
+    reed_solomon_init_def();
+  }
+}
diff --git a/src/moonlight-protocol/nanors/rswrapper.h b/src/moonlight-protocol/nanors/rswrapper.h
new file mode 100644
index 00000000..e705e3b3
--- /dev/null
+++ b/src/moonlight-protocol/nanors/rswrapper.h
@@ -0,0 +1,21 @@
+/**
+ * @file src/moonlight-protocol/nanors/rswrapper.h
+ * @brief Wrappers for nanors vectorization
+ * @details Include this instead of nanors rs.h and call the reed_solomon_*_fn pointers after reed_solomon_init()
+ */
+#pragma once
+
+#include "rs.h"
+#include <stdint.h>
+
+typedef struct _reed_solomon reed_solomon;
+
+typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards);
+typedef void (*reed_solomon_release_t)(reed_solomon *rs);
+typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs);
+typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs);
+
+extern reed_solomon_new_t reed_solomon_new_fn;
+extern reed_solomon_release_t reed_solomon_release_fn;
+extern reed_solomon_encode_t reed_solomon_encode_fn;
+extern reed_solomon_decode_t reed_solomon_decode_fn;