From a2f34cc11c7865570db58364c3124b05f81d8a00 Mon Sep 17 00:00:00 2001 From: Jeongseok Lee Date: Mon, 30 Sep 2024 23:54:49 -0700 Subject: [PATCH] Fix memory alignment checks in SIMD error functions (#90) Summary: Previously, a static alignment value of 32 was used for AVX in SIMD implementations. This diff modifies this approach to dynamically determine the correct alignment value via DrJit at compile-time, ensuring compatibility across different SIMD instruction sets. Detailed changes: - Add constants for packet sizes and alignment sizes for both DrJit and AVX versions, rather than using magic numbers - Centralize `checkAlignment()` - Fix `getJacobianSize()` for both DrJit and AVX versions, correctly utilizing the package size - Update tests to check around the number of constraints around the package size to ensure the new `getJacobianSize()` works - Update GitHub CI to test both SIMD-enabled and disabled NOTE: This Diff the second trial of D47103616 Differential Revision: D63658278 --- .github/workflows/ci_ubuntu.yml | 20 +++- cmake/mt_defs.cmake | 3 + .../simd_normal_error_function.cpp | 62 +++++----- .../simd_normal_error_function.h | 6 +- .../simd_plane_error_function.cpp | 90 +++++++------- .../simd_plane_error_function.h | 4 +- .../simd_position_error_function.cpp | 111 ++++++++++-------- .../simd_position_error_function.h | 7 +- momentum/simd/simd.h | 19 ++- .../error_function_helpers.cpp | 6 +- .../character_solver/simd_functions_test.cpp | 51 +++++++- pixi.toml | 47 ++++++-- 12 files changed, 286 insertions(+), 140 deletions(-) diff --git a/.github/workflows/ci_ubuntu.yml b/.github/workflows/ci_ubuntu.yml index 009c3623..c57a9e3f 100644 --- a/.github/workflows/ci_ubuntu.yml +++ b/.github/workflows/ci_ubuntu.yml @@ -14,8 +14,18 @@ on: jobs: momentum: - name: momentum-ubuntu + name: momentum-simd:${{ matrix.simd }}-ubuntu runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - simd: "ON" + pymomentum: "ON" + - simd: "OFF" + pymomentum: "OFF" + env: + MOMENTUM_ENABLE_SIMD: ${{ matrix.simd }} steps: - name: Checkout uses: actions/checkout@v4 @@ -25,10 +35,14 @@ jobs: cache: true - name: Build and test Momentum run: | - pixi run test + MOMENTUM_BUILD_PYMOMENTUM=$MOMENTUM_BUILD_PYMOMENTUM \ + MOMENTUM_ENABLE_SIMD=$MOMENTUM_ENABLE_SIMD \ + pixi run test - name: Install Momentum and Build hello_world run: | - pixi run install + MOMENTUM_BUILD_PYMOMENTUM=$MOMENTUM_BUILD_PYMOMENTUM \ + MOMENTUM_ENABLE_SIMD=$MOMENTUM_ENABLE_SIMD \ + pixi run install pixi run cmake \ -S momentum/examples/hello_world \ -B momentum/examples/hello_world/build \ diff --git a/cmake/mt_defs.cmake b/cmake/mt_defs.cmake index 8464a69e..d9f337ed 100644 --- a/cmake/mt_defs.cmake +++ b/cmake/mt_defs.cmake @@ -251,6 +251,9 @@ function(mt_library) else() target_compile_options(${_ARG_NAME} ${type} -march=native) endif() + target_compile_definitions(${_ARG_NAME} + ${type} -DMOMENTUM_ENABLE_SIMD=1 + ) endif() target_compile_options(${_ARG_NAME} ${type} ${_ARG_PUBLIC_COMPILE_OPTIONS} diff --git a/momentum/character_solver/simd_normal_error_function.cpp b/momentum/character_solver/simd_normal_error_function.cpp index 3c7d9b49..18cea271 100644 --- a/momentum/character_solver/simd_normal_error_function.cpp +++ b/momentum/character_solver/simd_normal_error_function.cpp @@ -27,28 +27,15 @@ namespace momentum { -namespace { - -template -void ensureAligned(const Eigen::MatrixBase& mat, size_t& addressOffset) { - if (addressOffset == 8) - addressOffset = 0; - MT_CHECK(((uintptr_t)(&mat(addressOffset, 0))) % 32 == 0); -} - -} // namespace - SimdNormalConstraints::SimdNormalConstraints(const Skeleton* skel) { - constexpr uintptr_t kAlignment = 32; - // resize all arrays to the number of joints MT_CHECK(skel->joints.size() <= kMaxJoints); numJoints = static_cast(skel->joints.size()); const auto dataSize = kMaxConstraints * numJoints; - MT_CHECK(dataSize % kAlignment == 0); + MT_CHECK(dataSize % kSimdAlignment == 0); // create memory for all floats - data = makeAlignedUnique(dataSize * 10); + data = makeAlignedUnique(dataSize * 10); float* dataPtr = data.get(); offsetX = dataPtr + dataSize * 0; @@ -169,7 +156,7 @@ double SimdNormalErrorFunction::getError( } } - // sum the avx error register for final result + // sum the SIMD error register for final result return static_cast(drjit::sum(error) * kPlaneWeight * weight_); } @@ -310,9 +297,8 @@ double SimdNormalErrorFunction::getJacobian( // storage for joint errors std::vector jointErrors(constraints_->numJoints); - // need to make sure we're actually at a 32 byte data offset at the first offset for avx access - size_t addressOffset = 8 - (((size_t)jacobian.data() % 32) / 4); - ensureAligned(jacobian, addressOffset); + // need to make sure we're actually at a 32 byte data offset at the first offset for SIMD access + checkAlignment(jacobian); // loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -413,7 +399,7 @@ double SimdNormalErrorFunction::getJacobian( usedRows = gsl::narrow_cast(jacobian.rows()); - // sum the avx error register for final result + // sum the SIMD error register for final result return static_cast(error); } @@ -426,11 +412,11 @@ size_t SimdNormalErrorFunction::getJacobianSize() const { size_t num = 0; for (int i = 0; i < constraints_->numJoints; i++) { jacobianOffset_[i] = num; - const auto numPackets = - (constraints_->constraintCount[i] + kSimdPacketSize - 1) / kSimdPacketSize; + const size_t count = constraints_->constraintCount[i]; + const auto numPackets = (count + kSimdPacketSize - 1) / kSimdPacketSize; num += numPackets * kSimdPacketSize; } - return num + kSimdPacketSize; + return num; } #ifdef MOMENTUM_ENABLE_AVX @@ -468,9 +454,8 @@ double SimdNormalErrorFunctionAVX::getJacobian( // storage for joint errors std::vector ets_error; - // need to make sure we're actually at a 32 byte data offset at the first offset for avx access - size_t addressOffset = 8 - (((size_t)jacobian.data() % 32) / 4); - ensureAligned(jacobian, addressOffset); + // need to make sure we're actually at a 32 byte data offset at the first offset for AVX access + checkAlignment(jacobian); // loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -482,7 +467,7 @@ double SimdNormalErrorFunctionAVX::getJacobian( constraints_->numJoints, [&](double& error_local, const size_t jointId) { // get initial offset - const auto offset = jacobianOffset_[jointId] + addressOffset; + const auto offset = jacobianOffset_[jointId]; // pre-load some joint specific values const auto& transformation = state.jointState[jointId].transformation; @@ -497,9 +482,9 @@ double SimdNormalErrorFunctionAVX::getJacobian( const auto jointOffset = jointId * SimdNormalConstraints::kMaxConstraints; - // loop over all constraints in increments of 8 + // loop over all constraints in increments of kAvxPacketSize const uint32_t count = constraints_->constraintCount[jointId]; - for (uint32_t index = 0; index < count; index += 8) { + for (uint32_t index = 0; index < count; index += kAvxPacketSize) { // transform offset by joint transformation : pos = transform * offset // also transform normal by transformation : nml = (transform.linear() * // normal).normalized() @@ -687,9 +672,26 @@ double SimdNormalErrorFunctionAVX::getJacobian( usedRows = gsl::narrow_cast(jacobian.rows()); - // sum the avx error register for final result + // sum the AVX error register for final result return static_cast(error); } + +size_t SimdNormalErrorFunctionAVX::getJacobianSize() const { + if (constraints_ == nullptr) { + return 0; + } + + jacobianOffset_.resize(constraints_->numJoints); + size_t num = 0; + for (int i = 0; i < constraints_->numJoints; i++) { + jacobianOffset_[i] = num; + const size_t count = constraints_->constraintCount[i]; + const auto numPackets = (count + kAvxPacketSize - 1) / kAvxPacketSize; + num += numPackets * kAvxPacketSize; + } + return num; +} + #endif // MOMENTUM_ENABLE_AVX } // namespace momentum diff --git a/momentum/character_solver/simd_normal_error_function.h b/momentum/character_solver/simd_normal_error_function.h index 1416da43..0366f669 100644 --- a/momentum/character_solver/simd_normal_error_function.h +++ b/momentum/character_solver/simd_normal_error_function.h @@ -101,7 +101,7 @@ class SimdNormalErrorFunction : public SkeletonErrorFunction { Ref residual, int& usedRows) override; - [[nodiscard]] size_t getJacobianSize() const final; + [[nodiscard]] size_t getJacobianSize() const override; void setConstraints(const SimdNormalConstraints* cstrs) { constraints_ = cstrs; @@ -120,6 +120,7 @@ class SimdNormalErrorFunction : public SkeletonErrorFunction { }; #ifdef MOMENTUM_ENABLE_AVX + // A version of SimdNormalErrorFunction where the Jacobian has been hand-unrolled using // AVX instructions. On some platforms this performs better than the generic SIMD version // but it only works on Intel platforms that support AVX. @@ -141,7 +142,10 @@ class SimdNormalErrorFunctionAVX : public SimdNormalErrorFunction { Ref jacobian, Ref residual, int& usedRows) final; + + [[nodiscard]] size_t getJacobianSize() const final; }; + #endif // MOMENTUM_ENABLE_AVX } // namespace momentum diff --git a/momentum/character_solver/simd_plane_error_function.cpp b/momentum/character_solver/simd_plane_error_function.cpp index 430e31b2..2388dc29 100644 --- a/momentum/character_solver/simd_plane_error_function.cpp +++ b/momentum/character_solver/simd_plane_error_function.cpp @@ -26,28 +26,15 @@ namespace momentum { -namespace { - -template -void ensureAligned(const Eigen::MatrixBase& mat, size_t& addressOffset) { - if (addressOffset == 8) - addressOffset = 0; - MT_CHECK(((uintptr_t)(&mat(addressOffset, 0))) % 32 == 0); -} - -} // namespace - SimdPlaneConstraints::SimdPlaneConstraints(const Skeleton* skel) { - constexpr uintptr_t kAlignment = 32; - // resize all arrays to the number of joints MT_CHECK(skel->joints.size() <= kMaxJoints); numJoints = static_cast(skel->joints.size()); const auto dataSize = kMaxConstraints * numJoints; - MT_CHECK(dataSize % kAlignment == 0); + MT_CHECK(dataSize % kSimdAlignment == 0); // create memory for all floats - data = makeAlignedUnique(dataSize * 8); + data = makeAlignedUnique(dataSize * 8); float* dataPtr = data.get(); offsetX = dataPtr + dataSize * 0; @@ -161,7 +148,7 @@ double SimdPlaneErrorFunction::getError( } } - // Sum the AVX error register for final result + // Sum the SIMD error register for final result return static_cast(drjit::sum(error) * kPlaneWeight * weight_); } @@ -176,9 +163,8 @@ double SimdPlaneErrorFunction::getGradient( // Storage for joint errors std::vector jointErrors(constraints_->numJoints); - // Need to make sure we're actually at a 32 byte data offset at the first offset for AVX access - size_t addressOffset = 8 - (((size_t)gradient.data() % 32) / 4); - ensureAligned(gradient, addressOffset); + // Need to make sure we're actually at a 32 byte data offset at the first offset for SIMD access + checkAlignment(gradient); // Loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -331,9 +317,8 @@ double SimdPlaneErrorFunction::getJacobian( // Storage for joint errors std::vector jointErrors(constraints_->numJoints); - // Need to make sure we're actually at a 32 byte data offset at the first offset for avx access - size_t addressOffset = 8 - (((size_t)jacobian.data() % 32) / 4); - ensureAligned(jacobian, addressOffset); + // Need to make sure we're actually at a 32 byte data offset at the first offset for SIMD access + checkAlignment(jacobian); // Loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -455,11 +440,11 @@ size_t SimdPlaneErrorFunction::getJacobianSize() const { size_t num = 0; for (int i = 0; i < constraints_->numJoints; i++) { jacobianOffset_[i] = num; - const auto numPackets = - (constraints_->constraintCount[i] + kSimdPacketSize - 1) / kSimdPacketSize; + const size_t count = constraints_->constraintCount[i]; + const auto numPackets = (count + kSimdPacketSize - 1) / kSimdPacketSize; num += numPackets * kSimdPacketSize; } - return num + kSimdPacketSize; + return num; } void SimdPlaneErrorFunction::setConstraints(const SimdPlaneConstraints* cstrs) { @@ -517,9 +502,9 @@ double SimdPlaneErrorFunctionAVX::getError( const auto jointOffset = jointId * SimdPlaneConstraints::kMaxConstraints; - // loop over all constraints in increments of 8 + // loop over all constraints in increments of kAvxPacketSize const uint32_t count = constraints_->constraintCount[jointId]; - for (uint32_t index = 0; index < count; index += 8) { + for (uint32_t index = 0; index < count; index += kAvxPacketSize) { // transform offset by joint transformation : pos = transform * offset const __m256 valx = _mm256_loadu_ps(&constraints_->offsetX[jointOffset + index]); posx = _mm256_mul_ps(valx, _mm256_broadcast_ss(&transformation.data()[0])); @@ -557,7 +542,7 @@ double SimdPlaneErrorFunctionAVX::getError( } } - // sum the avx error register for final result + // sum the AVX error register for final result return static_cast(error * kPlaneWeight * weight_); } @@ -596,9 +581,9 @@ double SimdPlaneErrorFunctionAVX::getGradient( const auto jointOffset = jointId * SimdPlaneConstraints::kMaxConstraints; - // loop over all constraints in increments of 8 + // loop over all constraints in increments of kAvxPacketSize const uint32_t count = constraints_->constraintCount[jointId]; - for (uint32_t index = 0; index < count; index += 8) { + for (uint32_t index = 0; index < count; index += kAvxPacketSize) { // transform offset by joint transformation : pos = transform * offset const __m256 valx = _mm256_loadu_ps(&constraints_->offsetX[jointOffset + index]); posx = _mm256_mul_ps(valx, _mm256_broadcast_ss(&transformation.data()[0])); @@ -753,7 +738,7 @@ double SimdPlaneErrorFunctionAVX::getGradient( // finalize the gradient gradient += std::get<1>(ets_error_grad[0]).cast() * weight_; - // sum the avx error register for final result + // sum the AVX error register for final result error = std::get<0>(ets_error_grad[0]); } @@ -776,18 +761,17 @@ double SimdPlaneErrorFunctionAVX::getJacobian( // storage for joint errors std::vector ets_error; - // need to make sure we're actually at a 32 byte data offset at the first offset for avx access - size_t addressOffset = 8 - (((size_t)jacobian.data() % 32) / 4); - ensureAligned(jacobian, addressOffset); + // need to make sure we're actually at a 32 byte data offset at the first offset for AVX access + checkAlignment(jacobian); // calculate actually used number of rows const size_t maxRows = gsl::narrow_cast(jacobian.rows()); size_t num = 0; for (int i = 0; i < constraints_->numJoints; i++) { jacobianOffset_[i] = num; - const size_t res = constraints_->constraintCount[i] % 8; + const size_t res = constraints_->constraintCount[i] % kAvxPacketSize; if (res != 0) { - num += constraints_->constraintCount[i] + 8 - res; + num += constraints_->constraintCount[i] + kAvxPacketSize - res; } else { num += constraints_->constraintCount[i]; } @@ -803,7 +787,7 @@ double SimdPlaneErrorFunctionAVX::getJacobian( constraints_->numJoints, [&](double& error_local, const size_t jointId) { // get initial offset - const auto offset = jacobianOffset_[jointId] + addressOffset; + const auto offset = jacobianOffset_[jointId]; // pre-load some joint specific values const auto& transformation = state.jointState[jointId].transformation; @@ -815,12 +799,13 @@ double SimdPlaneErrorFunctionAVX::getJacobian( const auto jointOffset = jointId * SimdPlaneConstraints::kMaxConstraints; - // loop over all constraints in increments of 8 + // loop over all constraints in increments of kAvxPacketSize const uint32_t count = constraints_->constraintCount[jointId]; - for (uint32_t index = 0; index < count; index += 8) { + for (uint32_t index = 0; index < count; index += kAvxPacketSize) { // check if we're too much - if (offset + index + 8 >= maxRows) + if (offset + index + kAvxPacketSize >= maxRows) { continue; + } // transform offset by joint transformation: pos = transform * offset const __m256 valx = _mm256_loadu_ps(&constraints_->offsetX[jointOffset + index]); @@ -978,15 +963,32 @@ double SimdPlaneErrorFunctionAVX::getJacobian( double error = std::accumulate(ets_error.begin(), ets_error.end(), 0.0); - if (addressOffset + num >= maxRows) + if (num >= maxRows) { usedRows = gsl::narrow_cast(maxRows - 1); - else - usedRows = gsl::narrow_cast(addressOffset + num); + } else { + usedRows = gsl::narrow_cast(num); + } - // sum the avx error register for final result + // sum the AVX error register for final result return static_cast(error); } +size_t SimdPlaneErrorFunctionAVX::getJacobianSize() const { + if (constraints_ == nullptr) { + return 0; + } + + jacobianOffset_.resize(constraints_->numJoints); + size_t num = 0; + for (int i = 0; i < constraints_->numJoints; i++) { + jacobianOffset_[i] = num; + const size_t count = constraints_->constraintCount[i]; + const auto numPackets = (count + kAvxPacketSize - 1) / kAvxPacketSize; + num += numPackets * kAvxPacketSize; + } + return num; +} + #endif // MOMENTUM_ENABLE_AVX } // namespace momentum diff --git a/momentum/character_solver/simd_plane_error_function.h b/momentum/character_solver/simd_plane_error_function.h index cae35ad7..2d841082 100644 --- a/momentum/character_solver/simd_plane_error_function.h +++ b/momentum/character_solver/simd_plane_error_function.h @@ -100,7 +100,7 @@ class SimdPlaneErrorFunction : public SkeletonErrorFunction { Ref residual, int& usedRows) override; - [[nodiscard]] size_t getJacobianSize() const final; + [[nodiscard]] size_t getJacobianSize() const override; void setConstraints(const SimdPlaneConstraints* cstrs); @@ -145,6 +145,8 @@ class SimdPlaneErrorFunctionAVX : public SimdPlaneErrorFunction { Ref jacobian, Ref residual, int& usedRows) final; + + [[nodiscard]] size_t getJacobianSize() const final; }; #endif // MOMENTUM_ENABLE_AVX diff --git a/momentum/character_solver/simd_position_error_function.cpp b/momentum/character_solver/simd_position_error_function.cpp index 370d3cd0..218e282a 100644 --- a/momentum/character_solver/simd_position_error_function.cpp +++ b/momentum/character_solver/simd_position_error_function.cpp @@ -26,28 +26,15 @@ namespace momentum { -namespace { - -template -void ensureAligned(const Eigen::MatrixBase& mat, size_t& addressOffset) { - if (addressOffset == 8) - addressOffset = 0; - MT_CHECK(((uintptr_t)(&mat(addressOffset, 0))) % 32 == 0); -} - -} // namespace - SimdPositionConstraints::SimdPositionConstraints(const Skeleton* skel) { - constexpr uintptr_t kAlignment = 32; - // resize all arrays to the number of joints MT_CHECK(skel->joints.size() <= kMaxJoints); numJoints = static_cast(skel->joints.size()); const auto dataSize = kMaxConstraints * numJoints; - MT_CHECK(dataSize % kAlignment == 0); + MT_CHECK(dataSize % kSimdAlignment == 0); // create memory for all floats - data = makeAlignedUnique(dataSize * 7); + data = makeAlignedUnique(dataSize * 7); float* dataPtr = data.get(); offsetX = dataPtr + dataSize * 0; @@ -158,7 +145,7 @@ double SimdPositionErrorFunction::getError( } } - // Sum the AVX error register for final result + // Sum the SIMD error register for final result return static_cast(drjit::sum(error) * kPositionWeight * weight_); } @@ -173,9 +160,9 @@ double SimdPositionErrorFunction::getGradient( // Storage for joint errors std::vector jointErrors(constraints_->numJoints); - // need to make sure we're actually at a 32 byte data offset at the first offset for AVX access - size_t addressOffset = 8 - (((size_t)gradient.data() % 32) / 4); - ensureAligned(gradient, addressOffset); + // Need to make sure we're actually at a kSimdAlignment byte data offset at the first offset for + // SIMD access + checkAlignment(gradient); // Loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -284,7 +271,7 @@ double SimdPositionErrorFunction::getGradient( }, dispensoOptions); - // Sum the AVX error register for final result + // Sum the SIMD error register for final result const double error = kPositionWeight * weight_ * std::accumulate(jointErrors.begin(), jointErrors.end(), 0.0); return static_cast(error); @@ -325,9 +312,9 @@ double SimdPositionErrorFunction::getJacobian( // Storage for joint errors std::vector jointErrors(constraints_->numJoints); - // Need to make sure we're actually at a 32 byte data offset at the first offset for avx access - size_t addressOffset = 8 - (((size_t)jacobian.data() % 32) / 4); - ensureAligned(jacobian, addressOffset); + // Need to make sure we're actually at a kSimdAlignment byte data offset at the first offset for + // SIMD access + checkAlignment(jacobian); // Loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -342,7 +329,7 @@ double SimdPositionErrorFunction::getJacobian( // Loop over all constraints in increments of kSimdPacketSize for (uint32_t index = 0, jindex = 0; index < constraintCount; - index += kSimdPacketSize, jindex += kSimdPacketSize * 3) { + index += kSimdPacketSize, jindex += kSimdPacketSize * kConstraintDim) { const auto constraintOffsetIndex = jointId * SimdPositionConstraints::kMaxConstraints + index; const auto jacobianOffsetCur = jacobianOffset_[jointId] + jindex; @@ -480,7 +467,7 @@ double SimdPositionErrorFunction::getJacobian( usedRows = gsl::narrow_cast(jacobian.rows()); - // Sum the AVX error register for final result + // Sum the SIMD error register for final result const double error = kPositionWeight * weight_ * std::accumulate(jointErrors.begin(), jointErrors.end(), 0.0); return static_cast(error); @@ -498,12 +485,12 @@ size_t SimdPositionErrorFunction::getJacobianSize() const { const size_t count = constraints_->constraintCount[i]; const size_t residual = count % kSimdPacketSize; if (residual != 0) { - num += (count + kSimdPacketSize - residual) * 3; + num += (count + kSimdPacketSize - residual) * kConstraintDim; } else { - num += count * kSimdPacketSize; + num += count * kConstraintDim; } } - return num + kSimdPacketSize; + return num; } void SimdPositionErrorFunction::setConstraints(const SimdPositionConstraints* cstrs) { @@ -792,7 +779,7 @@ double SimdPositionErrorFunctionAVX::getGradient( // finalize the gradient gradient += std::get<1>(ets_error_grad[0]).cast() * weight_; - // sum the avx error register for final result + // sum the AVX error register for final result error = std::get<0>(ets_error_grad[0]); } @@ -814,8 +801,8 @@ double SimdPositionErrorFunctionAVX::getJacobian( // storage for joint errors std::vector ets_error; - // need to make sure we're actually at a 32 byte data offset at the first offset for avx access - const size_t addressOffset = 8 - ((size_t)jacobian.data() % 32) / 4; + // need to make sure we're actually at a 32 byte data offset at the first offset for AVX access + checkAlignment(jacobian); // loop over all joints, as these are our base units auto dispensoOptions = dispenso::ParForOptions(); @@ -827,7 +814,7 @@ double SimdPositionErrorFunctionAVX::getJacobian( constraints_->numJoints, [&](double& error_local, const size_t jointId) { // get initial offset - const auto offset = jacobianOffset_[jointId] + addressOffset; + const auto offset = jacobianOffset_[jointId]; // pre-load some joint specific values const auto& transformation = state.jointState[jointId].transformation; @@ -838,9 +825,10 @@ double SimdPositionErrorFunctionAVX::getJacobian( const auto jointOffset = jointId * SimdPositionConstraints::kMaxConstraints; - // loop over all constraints in increments of 8 + // loop over all constraints in increments of kAvxPacketSize const uint32_t count = constraints_->constraintCount[jointId]; - for (uint32_t index = 0, jindex = 0; index < count; index += 8, jindex += 24) { + for (uint32_t index = 0, jindex = 0; index < count; + index += kAvxPacketSize, jindex += kAvxPacketSize * kConstraintDim) { // transform offset by joint transformation : pos = transform * offset const __m256 valx = _mm256_loadu_ps(&constraints_->offsetX[jointOffset + index]); posx = _mm256_mul_ps(valx, _mm256_broadcast_ss(&transformation.data()[0])); @@ -920,19 +908,22 @@ double SimdPositionErrorFunctionAVX::getJacobian( const __m256 pjacx = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacx); auto* const addressx = &jacobian( - offset + jindex + 0, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 0, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevx = _mm256_loadu_ps(addressx); _mm256_storeu_ps(addressx, _mm256_add_ps(prevx, pjacx)); const __m256 pjacy = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacy); auto* const addressy = &jacobian( - offset + jindex + 8, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 1, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevy = _mm256_loadu_ps(addressy); _mm256_storeu_ps(addressy, _mm256_add_ps(prevy, pjacy)); const __m256 pjacz = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacz); auto* const addressz = &jacobian( - offset + jindex + 16, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 2, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevz = _mm256_loadu_ps(addressz); _mm256_storeu_ps(addressz, _mm256_add_ps(prevz, pjacz)); } @@ -960,19 +951,22 @@ double SimdPositionErrorFunctionAVX::getJacobian( const __m256 pjacx = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacx); auto* const addressx = &jacobian( - offset + jindex + 0, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 0, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevx = _mm256_loadu_ps(addressx); _mm256_storeu_ps(addressx, _mm256_add_ps(prevx, pjacx)); const __m256 pjacy = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacy); auto* const addressy = &jacobian( - offset + jindex + 8, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 1, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevy = _mm256_loadu_ps(addressy); _mm256_storeu_ps(addressy, _mm256_add_ps(prevy, pjacy)); const __m256 pjacz = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacz); auto* const addressz = &jacobian( - offset + jindex + 16, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 2, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevz = _mm256_loadu_ps(addressz); _mm256_storeu_ps(addressz, _mm256_add_ps(prevz, pjacz)); } @@ -995,19 +989,22 @@ double SimdPositionErrorFunctionAVX::getJacobian( const __m256 pjacx = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacx); auto* const addressx = &jacobian( - offset + jindex + 0, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 0, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevx = _mm256_loadu_ps(addressx); _mm256_storeu_ps(addressx, _mm256_add_ps(prevx, pjacx)); const __m256 pjacy = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacy); auto* const addressy = &jacobian( - offset + jindex + 8, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 1, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevy = _mm256_loadu_ps(addressy); _mm256_storeu_ps(addressy, _mm256_add_ps(prevy, pjacy)); const __m256 pjacz = _mm256_mul_ps(_mm256_broadcast_ss(&pw), jacz); auto* const addressz = &jacobian( - offset + jindex + 16, parameterTransform_.transform.innerIndexPtr()[pIndex]); + offset + jindex + kAvxPacketSize * 2, + parameterTransform_.transform.innerIndexPtr()[pIndex]); const __m256 prevz = _mm256_loadu_ps(addressz); _mm256_storeu_ps(addressz, _mm256_add_ps(prevz, pjacz)); } @@ -1018,9 +1015,9 @@ double SimdPositionErrorFunctionAVX::getJacobian( } // store the residual - _mm256_storeu_ps(&residual(offset + jindex + 0), resx); - _mm256_storeu_ps(&residual(offset + jindex + 8), resy); - _mm256_storeu_ps(&residual(offset + jindex + 16), resz); + _mm256_storeu_ps(&residual(offset + jindex + kAvxPacketSize * 0), resx); + _mm256_storeu_ps(&residual(offset + jindex + kAvxPacketSize * 1), resy); + _mm256_storeu_ps(&residual(offset + jindex + kAvxPacketSize * 2), resz); } }, dispensoOptions); @@ -1029,10 +1026,30 @@ double SimdPositionErrorFunctionAVX::getJacobian( usedRows = gsl::narrow_cast(jacobian.rows()); - // sum the avx error register for final result + // sum the AVX error register for final result return static_cast(error); } +size_t SimdPositionErrorFunctionAVX::getJacobianSize() const { + if (constraints_ == nullptr) { + return 0; + } + + jacobianOffset_.resize(constraints_->numJoints); + size_t num = 0; + for (int i = 0; i < constraints_->numJoints; ++i) { + jacobianOffset_[i] = num; + const size_t count = constraints_->constraintCount[i]; + const size_t residual = count % kAvxPacketSize; + if (residual != 0) { + num += (count + kAvxPacketSize - residual) * kConstraintDim; + } else { + num += count * kConstraintDim; + } + } + return num; +} + #endif // MOMENTUM_ENABLE_AVX } // namespace momentum diff --git a/momentum/character_solver/simd_position_error_function.h b/momentum/character_solver/simd_position_error_function.h index 540cd07f..6fc38440 100644 --- a/momentum/character_solver/simd_position_error_function.h +++ b/momentum/character_solver/simd_position_error_function.h @@ -67,6 +67,9 @@ struct SimdPositionConstraints final { /// results may be produced on multiple calls with the same data. class SimdPositionErrorFunction : public SkeletonErrorFunction { public: + /// The dimensionality of each constraint. + static constexpr size_t kConstraintDim = 3; + /// @param maxThreads An optional parameter that specifies the maximum number of threads to be /// used with dispenso::parallel_for. If this parameter is set to zero, the function will run in /// serial mode, i.e., it will not use any additional threads. By default, the value is set to the @@ -98,7 +101,7 @@ class SimdPositionErrorFunction : public SkeletonErrorFunction { Ref residual, int& usedRows) override; - [[nodiscard]] size_t getJacobianSize() const final; + [[nodiscard]] size_t getJacobianSize() const override; void setConstraints(const SimdPositionConstraints* cstrs); @@ -143,6 +146,8 @@ class SimdPositionErrorFunctionAVX : public SimdPositionErrorFunction { Ref jacobian, Ref residual, int& usedRows) final; + + [[nodiscard]] size_t getJacobianSize() const final; }; #endif // MOMENTUM_ENABLE_AVX diff --git a/momentum/simd/simd.h b/momentum/simd/simd.h index 578cc60a..fd49ecc9 100644 --- a/momentum/simd/simd.h +++ b/momentum/simd/simd.h @@ -7,6 +7,8 @@ #pragma once +#include + #include #include #include @@ -19,8 +21,11 @@ namespace momentum { +inline constexpr size_t kAvxPacketSize = 8; +inline constexpr size_t kAvxAlignment = kAvxPacketSize * sizeof(float); + inline constexpr size_t kSimdPacketSize = drjit::DefaultSize; -inline constexpr size_t kSimdAlignment = kSimdPacketSize * 4; +inline constexpr size_t kSimdAlignment = kSimdPacketSize * sizeof(float); template using Packet = drjit::Packet; @@ -44,6 +49,18 @@ using Vector3fP = Vector3P; using Vector2dP = Vector2P; using Vector3dP = Vector3P; +/// Checks if the data of the matrix is aligned correctly. +template +void checkAlignment(const Eigen::Ref& mat) { + MT_THROW_IF( + (uintptr_t(mat.data())) % Alignment != 0, + "Matrix ({}x{}, ptr: {}) is not aligned ({}) correctly.", + mat.rows(), + mat.cols(), + uintptr_t(mat.data()), + Alignment); +} + /// Calculates dot product of Eigen::Vector3f and 3-vector of packets. /// /// @tparam Derived A derived class of Eigen::MatrixBase, which should be compatible with diff --git a/momentum/test/character_solver/error_function_helpers.cpp b/momentum/test/character_solver/error_function_helpers.cpp index ac4b0c4b..6f4b299a 100644 --- a/momentum/test/character_solver/error_function_helpers.cpp +++ b/momentum/test/character_solver/error_function_helpers.cpp @@ -14,6 +14,7 @@ #include "momentum/character_solver/skeleton_error_function.h" #include "momentum/common/log.h" #include "momentum/math/fmt_eigen.h" +#include "momentum/simd/simd.h" namespace momentum { @@ -224,7 +225,7 @@ void validateIdentical( const auto n = transform.numAllModelParameters(); const size_t m1 = err1.getJacobianSize(); const size_t m2 = err2.getJacobianSize(); - EXPECT_LE(m1, m2); // SIMD pads out to a multiple of 8. + EXPECT_LE(m1, m2); // SIMD pads out to a multiple of kSimdPacketSize. Eigen::MatrixX j1 = Eigen::MatrixX::Zero(m1, n); Eigen::MatrixX j2 = Eigen::MatrixX::Zero(m2, n); @@ -267,7 +268,8 @@ void timeJacobian( const ModelParameters& modelParams, const char* type) { const size_t jacobianSize = errorFunction.getJacobianSize(); - const size_t paddedJacobianSize = jacobianSize + 8 - (jacobianSize % 8); + const size_t paddedJacobianSize = + jacobianSize + kSimdPacketSize - (jacobianSize % kSimdPacketSize); Eigen::MatrixXf jacobian = Eigen::MatrixXf::Zero( paddedJacobianSize, character.parameterTransform.numAllModelParameters()); Eigen::VectorXf residual = Eigen::VectorXf::Zero(paddedJacobianSize); diff --git a/momentum/test/character_solver/simd_functions_test.cpp b/momentum/test/character_solver/simd_functions_test.cpp index 612abb70..813643fb 100644 --- a/momentum/test/character_solver/simd_functions_test.cpp +++ b/momentum/test/character_solver/simd_functions_test.cpp @@ -46,7 +46,19 @@ TEST(Momentum_ErrorFunctions, SimdNormalFunctionIsSame) { const ParameterTransform& transform = character.parameterTransform; SimdNormalConstraints cl_simd(&skeleton); - for (size_t nConstraints : {0, 1, 2, 5, 15}) { + const std::initializer_list constraints = { + 0ul, + 1ul, + kAvxPacketSize - 1, + kAvxPacketSize, + kAvxPacketSize + 1, + kAvxPacketSize * 2, + kSimdPacketSize - 1, + kSimdPacketSize, + kSimdPacketSize + 1, + kSimdPacketSize * 2, + 100ul}; + for (size_t nConstraints : constraints) { const auto parameters = uniform(transform.numAllModelParameters(), -0.5, 0.5); std::vector cl; @@ -96,7 +108,19 @@ TEST(Momentum_ErrorFunctions, SimdPositionFunctionIsSame) { const ParameterTransform& transform = character.parameterTransform; SimdPositionConstraints cl_simd(&skeleton); - for (size_t nConstraints : {0, 1, 2, 5, 15}) { + const std::initializer_list constraints = { + 0ul, + 1ul, + kAvxPacketSize - 1, + kAvxPacketSize, + kAvxPacketSize + 1, + kAvxPacketSize * 2, + kSimdPacketSize - 1, + kSimdPacketSize, + kSimdPacketSize + 1, + kSimdPacketSize * 2, + 100ul}; + for (size_t nConstraints : constraints) { if (verbose) { fmt::print("nConstraints: {}\n", nConstraints); } @@ -150,7 +174,19 @@ TEST(Momentum_ErrorFunctions, SimdPlaneFunctionIsSame) { const ParameterTransform& transform = character.parameterTransform; SimdPlaneConstraints cl_simd(&skeleton); - for (size_t nConstraints : {0, 1, 2, 5, 15, 100}) { + const std::initializer_list constraints = { + 0ul, + 1ul, + kAvxPacketSize - 1, + kAvxPacketSize, + kAvxPacketSize + 1, + kAvxPacketSize * 2, + kSimdPacketSize - 1, + kSimdPacketSize, + kSimdPacketSize + 1, + kSimdPacketSize * 2, + 100ul}; + for (size_t nConstraints : constraints) { const auto parameters = uniform(transform.numAllModelParameters(), -0.5, 0.5); std::vector cl; @@ -202,7 +238,10 @@ TYPED_TEST(Momentum_ErrorFunctionsTest, SimdCollisionErrorFunctionIsSame) { CollisionErrorFunctionT errf_base(character); CollisionErrorFunctionStatelessT errf_stateless(character); + // TODO: Fix this test for SIMD when MOMENTUM_ENABLE_SIMD=OFF +#ifdef MOMENTUM_ENABLE_SIMD SimdCollisionErrorFunctionT errf_simd(character); +#endif const size_t nBendTests = 8; for (size_t iBendAmount = 4; iBendAmount < nBendTests; ++iBendAmount) { @@ -221,11 +260,17 @@ TYPED_TEST(Momentum_ErrorFunctionsTest, SimdCollisionErrorFunctionIsSame) { const double err_base = errf_base.getError(mp, skelState); const double err_stateless = errf_stateless.getError(mp, skelState); +#ifdef MOMENTUM_ENABLE_SIMD const double err_simd = errf_simd.getError(mp, skelState); +#endif ASSERT_NEAR(err_base, err_stateless, 0.0001 * err_base); +#ifdef MOMENTUM_ENABLE_SIMD ASSERT_NEAR(err_base, err_simd, 0.0001 * err_base); +#endif VALIDATE_IDENTICAL(T, errf_base, errf_stateless, skeleton, transform, mp.v); +#ifdef MOMENTUM_ENABLE_SIMD VALIDATE_IDENTICAL(T, errf_base, errf_simd, skeleton, transform, mp.v); +#endif } MT_LOGI("done."); diff --git a/pixi.toml b/pixi.toml index 84cd79e8..f1dcff76 100644 --- a/pixi.toml +++ b/pixi.toml @@ -43,9 +43,21 @@ spdlog = ">=1.12.0,<1.13" [tasks] clean = { cmd = "rm -rf build && rm -rf .pixi && rm pixi.lock" } -configure = { cmd = "cmake -S . -B build -G Ninja -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_BUILD_TYPE=Release -DMOMENTUM_USE_SYSTEM_GOOGLETEST=ON -DMOMENTUM_USE_SYSTEM_PYBIND11=ON -DMOMENTUM_USE_SYSTEM_RERUN_CPP_SDK=ON -DMOMENTUM_BUILD_TESTING=ON -DMOMENTUM_BUILD_EXAMPLES=ON -DMOMENTUM_BUILD_PYMOMENTUM=ON", inputs = [ - "CMakeLists.txt", -] } +configure = { cmd = """ + cmake \ + -S . \ + -B build \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DMOMENTUM_BUILD_TESTING=ON \ + -DMOMENTUM_BUILD_EXAMPLES=ON \ + -DMOMENTUM_BUILD_PYMOMENTUM=$MOMENTUM_BUILD_PYMOMENTUM \ + -DMOMENTUM_ENABLE_SIMD=$MOMENTUM_ENABLE_SIMD \ + -DMOMENTUM_USE_SYSTEM_GOOGLETEST=ON \ + -DMOMENTUM_USE_SYSTEM_PYBIND11=ON \ + -DMOMENTUM_USE_SYSTEM_RERUN_CPP_SDK=ON + """, env = { MOMENTUM_ENABLE_SIMD = "ON", MOMENTUM_BUILD_PYMOMENTUM = "ON" } } build = { cmd = "cmake --build build -j --target all", depends_on = [ "configure", ] } @@ -76,7 +88,15 @@ sysroot_linux-64 = ">=2.28" [target.linux-64.tasks] lint = { cmd = "clang-format-18 -i axel/**/*.h axel/**/*.cpp momentum/**/*.h momentum/**/*.cpp pymomentum/**/*.h pymomentum/**/*.cpp" } build_pymomentum = { cmd = "pip install -e ." } -test_pymomentum = { cmd = "pytest pymomentum/test/test_closest_points.py pymomentum/test/test_fbx.py pymomentum/test/test_parameter_transform.py pymomentum/test/test_quaternion.py pymomentum/test/test_skel_state.py pymomentum/test/test_skeleton.py", env = { MOMENTUM_MODELS_PATH = "momentum/" }, depends_on = [ +test_pymomentum = { cmd = """ + pytest \ + pymomentum/test/test_closest_points.py \ + pymomentum/test/test_fbx.py \ + pymomentum/test/test_parameter_transform.py \ + pymomentum/test/test_quaternion.py \ + pymomentum/test/test_skel_state.py \ + pymomentum/test/test_skeleton.py + """, env = { MOMENTUM_MODELS_PATH = "momentum/" }, depends_on = [ "build_pymomentum", ] } @@ -123,9 +143,22 @@ build_pymomentum = { cmd = "pip install -e ." } [target.win-64.dependencies] [target.win-64.tasks] -configure = { cmd = "cmake -S . -B build -G 'Visual Studio 17 2022' -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBUILD_SHARED_LIBS=OFF -DMOMENTUM_BUILD_IO_FBX=ON -DMOMENTUM_USE_SYSTEM_GOOGLETEST=ON -DMOMENTUM_USE_SYSTEM_PYBIND11=ON -DMOMENTUM_USE_SYSTEM_RERUN_CPP_SDK=ON -DMOMENTUM_BUILD_TESTING=ON -DMOMENTUM_BUILD_EXAMPLES=ON -DMOMENTUM_BUILD_PYMOMENTUM=OFF", inputs = [ - "CMakeLists.txt", -] } +configure = { cmd = """ + cmake \ + -S . \ + -B build \ + -G 'Visual Studio 17 2022' \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DBUILD_SHARED_LIBS=OFF \ + -DMOMENTUM_BUILD_EXAMPLES=ON \ + -DMOMENTUM_BUILD_IO_FBX=ON \ + -DMOMENTUM_BUILD_PYMOMENTUM=$MOMENTUM_BUILD_PYMOMENTUM \ + -DMOMENTUM_BUILD_TESTING=ON \ + -DMOMENTUM_ENABLE_SIMD=%MOMENTUM_ENABLE_SIMD% \ + -DMOMENTUM_USE_SYSTEM_GOOGLETEST=ON \ + -DMOMENTUM_USE_SYSTEM_PYBIND11=ON \ + -DMOMENTUM_USE_SYSTEM_RERUN_CPP_SDK=ON + """, env = { MOMENTUM_ENABLE_SIMD = "ON", MOMENTUM_BUILD_PYMOMENTUM = "OFF" } } open_vs = { cmd = "cmd /c start build\\momentum.sln", depends_on = [ "configure", ] }