From 9256856279d6d02aa55b430b40693bf258831099 Mon Sep 17 00:00:00 2001
From: Bogdan Vlahov
Date: Thu, 22 Aug 2024 20:50:37 -0400
Subject: [PATCH 1/4] Add Linear Dynamics Class

- Added unit tests for the linear dynamics
---
 include/mppi/dynamics/linear/linear.cu        |  99 ++++++++++++
 include/mppi/dynamics/linear/linear.cuh       | 137 ++++++++++++++++
 tests/dynamics/linear_dynamics_tests.cu       | 152 ++++++++++++++++++
 .../dynamics/dynamics_generic_kernel_tests.cu |  35 +++-
 .../dynamics_generic_kernel_tests.cuh         |   9 ++
 5 files changed, 424 insertions(+), 8 deletions(-)
 create mode 100644 include/mppi/dynamics/linear/linear.cu
 create mode 100644 include/mppi/dynamics/linear/linear.cuh
 create mode 100644 tests/dynamics/linear_dynamics_tests.cu

diff --git a/include/mppi/dynamics/linear/linear.cu b/include/mppi/dynamics/linear/linear.cu
new file mode 100644
index 0000000..be548ba
--- /dev/null
+++ b/include/mppi/dynamics/linear/linear.cu
@@ -0,0 +1,99 @@
+/**
+ * @file linear.cu
+ * @brief Linear Dynamics
+ * @author Bogdan Vlahov
+ * @version
+ * @date 2024-08-16
+ */
+#include <mppi/dynamics/linear/linear.cuh>
+#include <mppi/utils/math_utils.h>
+#include <map>
+
+namespace mm = mppi::matrix_multiplication;
+namespace mp1 = mppi::p1;
+
+#define LINEAR_TEMPLATE template <class CLASS_T, class PARAMS_T>
+#define LINEAR_DYNAMICS LinearDynamicsImpl<CLASS_T, PARAMS_T>
+
+LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(cudaStream_t stream) : PARENT_CLASS(stream)
+{
+  this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T);
+}
+
+LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream)
+  : PARENT_CLASS(params, stream)
+{
+  this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T);
+}
+
+LINEAR_TEMPLATE
+bool LINEAR_DYNAMICS::computeGrad(const Eigen::Ref<const state_array>& state,
+                                  const Eigen::Ref<const control_array>& control, Eigen::Ref<dfdx> A,
+                                  Eigen::Ref<dfdu> B)
+{
+  A = this->params_.getA();
+  B = this->params_.getB();
+  return true;
+}
+
+LINEAR_TEMPLATE
+typename LINEAR_DYNAMICS::state_array LINEAR_DYNAMICS::stateFromMap(const std::map<std::string, float>& map)
+{
+  state_array x = this->getZeroState();
+  for (int i = 0; i < this->STATE_DIM; i++)
+  {
+    std::string state_name = "x_" + std::to_string(i);
+    x(i, 0) = map.at(state_name);
+  }
+  return x;
+}
+
+LINEAR_TEMPLATE
+void LINEAR_DYNAMICS::step(Eigen::Ref<state_array> state, Eigen::Ref<state_array> next_state,
+                           Eigen::Ref<state_array> state_der, const Eigen::Ref<const control_array>& control,
+                           Eigen::Ref<output_array> output, const float t, const float dt)
+{
+  state_der = this->params_.getA() * state + this->params_.getB() * control;
+  next_state = state + state_der * dt;
+  this->stateToOutput(next_state, output);
+}
+
+LINEAR_TEMPLATE
+__device__ inline void LINEAR_DYNAMICS::step(float* state, float* next_state, float* state_der, float* control,
+                                             float* output, float* theta_s, const float t, const float dt)
+{
+  PARAMS_T* params_p = &(this->params_);
+  if (this->getGrdSharedSizeBytes() != 0)
+  {
+    params_p = (PARAMS_T*)theta_s;
+  }
+  float* A = params_p->A;
+  float* B = params_p->B;
+
+  const mp1::Parallel1Dir P_DIR = mp1::Parallel1Dir::THREAD_Y;
+  mm::gemm1<this->STATE_DIM, this->STATE_DIM, 1, P_DIR>(A, state, state_der);
+  mm::gemm1<this->STATE_DIM, this->CONTROL_DIM, 1, P_DIR>(B, control, state_der, 1.0f, 1.0f);
+  // __syncthreads();
+  int index, step;
+  mp1::getParallel1DIndex<P_DIR>(index, step);
+  for (int i = index; i < this->STATE_DIM; i += step)
+  {
+    next_state[i] = state[i] + state_der[i] * dt;
+    output[i] = next_state[i];
+  }
+}
+
+LINEAR_TEMPLATE
+__device__ void LINEAR_DYNAMICS::initializeDynamics(float* state, float* control, float* output, float* theta_s,
+                                                    float t_0, float dt)
+{
+  PARENT_CLASS::initializeDynamics(state, control, output, theta_s, t_0, dt);
+  if (this->getGrdSharedSizeBytes() != 0)
+  {
+    PARAMS_T* shared = (PARAMS_T*)theta_s;
+    *shared = this->params_;
+  }
+}
+
+#undef LINEAR_TEMPLATE
+#undef LINEAR_DYNAMICS
diff --git a/include/mppi/dynamics/linear/linear.cuh b/include/mppi/dynamics/linear/linear.cuh
new file mode 100644
index 0000000..2a857d9
--- /dev/null
+++ b/include/mppi/dynamics/linear/linear.cuh
@@ -0,0 +1,137 @@
+/**
+ * @file linear.cuh
+ * @brief Linear Dynamics
+ * @author Bogdan Vlahov
+ * @version
+ * @date 2024-08-16
+ */
+#pragma once
+
+#include <mppi/dynamics/dynamics.cuh>
+
+template <int STATE_DIM, int CONTROL_DIM>
+struct LinearDynamicsParams : public DynamicsParams
+{
+public:
+  using state_matrix = Eigen::Matrix<float, STATE_DIM, STATE_DIM>;
+  using control_matrix = Eigen::Matrix<float, STATE_DIM, CONTROL_DIM>;
+  enum class StateIndex : int
+  {
+    NUM_STATES = STATE_DIM,
+  };
+
+  enum class ControlIndex : int
+  {
+    NUM_CONTROLS = CONTROL_DIM,
+  };
+  enum class OutputIndex : int
+  {
+    NUM_OUTPUTS = STATE_DIM,
+  };
+  float A[STATE_DIM * STATE_DIM] = { 0.0f };
+  float B[STATE_DIM * CONTROL_DIM] = { 0.0f };
+
+  LinearDynamicsParams() = default;
+  ~LinearDynamicsParams() = default;
+
+  void setA(const Eigen::Ref<const state_matrix>& A_eigen)
+  {
+    memcpy(A, A_eigen.data(), sizeof(float) * STATE_DIM * STATE_DIM);
+  }
+
+  void setB(const Eigen::Ref<const control_matrix>& B_eigen)
+  {
+    memcpy(B, B_eigen.data(), sizeof(float) * STATE_DIM * CONTROL_DIM);
+  }
+
+  const float* getA() const
+  {
+    return A;
+  }
+  const float* getB() const
+  {
+    return B;
+  }
+
+  Eigen::Map<state_matrix> getA()
+  {
+    Eigen::Map<state_matrix> A_eigen(A);
+    return A_eigen;
+  }
+
+  Eigen::Map<control_matrix> getB()
+  {
+    Eigen::Map<control_matrix> B_eigen(B);
+    return B_eigen;
+  }
+};
+
+using namespace MPPI_internal;
+
+template <class CLASS_T, class PARAMS_T = LinearDynamicsParams<1, 1>>
+class LinearDynamicsImpl : public Dynamics<CLASS_T, PARAMS_T>
+{
+public:
+  using PARENT_CLASS = Dynamics<CLASS_T, PARAMS_T>;
+  using state_array = typename PARENT_CLASS::state_array;
+  using control_array = typename PARENT_CLASS::control_array;
+  using output_array = typename PARENT_CLASS::output_array;
+  using dfdx = typename PARENT_CLASS::dfdx;
+  using dfdu = typename PARENT_CLASS::dfdu;
+  using PARENT_CLASS::initializeDynamics;
+
+  LinearDynamicsImpl(cudaStream_t stream = nullptr);
+  LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream = nullptr);
+
+  std::string getDynamicsModelName() const override
+  {
+    return "Linear Dynamics";
+  }
+
+  bool computeGrad(const Eigen::Ref<const state_array>& state, const Eigen::Ref<const control_array>& control,
+                   Eigen::Ref<dfdx> A, Eigen::Ref<dfdu> B);
+
+  state_array stateFromMap(const std::map<std::string, float>& map);
+
+  void step(Eigen::Ref<state_array> state, Eigen::Ref<state_array> next_state, Eigen::Ref<state_array> state_der,
+            const Eigen::Ref<const control_array>& control, Eigen::Ref<output_array> output, const float t,
+            const float dt);
+
+  void setA(const Eigen::Ref<const dfdx>& A_eigen, bool synchronize = false)
+  {
+    this->params_.setA(A_eigen);
+    this->paramsToDevice(synchronize);
+  }
+
+  void setB(const Eigen::Ref<const dfdu>& B_eigen, bool synchronize = false)
+  {
+    this->params_.setB(B_eigen);
+    this->paramsToDevice(synchronize);
+  }
+
+  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
+                              float* theta_s, const float t, const float dt);
+
+  __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt);
+};
+
+template <int STATE_DIM, int CONTROL_DIM>
+class LinearDynamics
+  : public LinearDynamicsImpl<LinearDynamics<STATE_DIM, CONTROL_DIM>, LinearDynamicsParams<STATE_DIM, CONTROL_DIM>>
+{
+public:
+  using PARENT_CLASS =
+      LinearDynamicsImpl<LinearDynamics<STATE_DIM, CONTROL_DIM>, LinearDynamicsParams<STATE_DIM, CONTROL_DIM>>;
+  using DYN_PARAMS_T = typename PARENT_CLASS::DYN_PARAMS_T;
+
+  LinearDynamics(cudaStream_t stream = nullptr) : PARENT_CLASS(stream)
+  {
+  }
+  LinearDynamics(DYN_PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(params, stream)
+  {
+  }
+};
+
+#ifdef __CUDACC__
+#include "linear.cu"
+#endif
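
For reference, the host-side usage pattern this class enables looks like the following minimal sketch. It is illustrative only: the 1D double-integrator A and B values are assumptions chosen for the example, not values taken from this patch.

#include <mppi/dynamics/linear/linear.cuh>

int main()
{
  using DYN_T = LinearDynamics<2, 1>;
  DYN_T dynamics;

  // Hypothetical 1D double integrator: position integrates velocity,
  // velocity integrates the control input (not from the patch).
  DYN_T::dfdx A = DYN_T::dfdx::Zero();
  A(0, 1) = 1.0f;
  DYN_T::dfdu B = DYN_T::dfdu::Zero();
  B(1, 0) = 1.0f;
  dynamics.setA(A);  // per the patch, setA()/setB() also call paramsToDevice()
  dynamics.setB(B);

  DYN_T::state_array x = dynamics.getZeroState();
  DYN_T::state_array x_next, x_der;
  DYN_T::output_array y;
  DYN_T::control_array u;
  u << 1.0f;

  // One explicit-Euler step of x' = A x + B u, matching the CPU-side step() above.
  dynamics.step(x, x_next, x_der, u, y, 0.0f, 0.01f);
  return 0;
}

The GPU path mirrors this through the __device__ step(), which reads A and B out of shared memory whenever SHARED_MEM_REQUEST_GRD_BYTES is nonzero.
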
diff --git a/tests/dynamics/linear_dynamics_tests.cu b/tests/dynamics/linear_dynamics_tests.cu
new file mode 100644
index 0000000..1d17304
--- /dev/null
+++ b/tests/dynamics/linear_dynamics_tests.cu
@@ -0,0 +1,152 @@
+#include <gtest/gtest.h>
+#include <mppi/dynamics/linear/linear.cuh>
+#include <kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh>
+
+TEST(Linear, Dimensionality)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  ASSERT_EQ(4, DYN_T::STATE_DIM);
+  ASSERT_EQ(4, DYN_T::OUTPUT_DIM);
+  ASSERT_EQ(12, DYN_T::CONTROL_DIM);
+}
+
+TEST(Linear, SetParamsA)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  using DFDX = typename DYN_T::dfdx;
+  // Set A from params
+  DFDX A = DFDX::Random();
+  typename DYN_T::DYN_PARAMS_T params;
+  params.setA(A);
+  auto dynamics = DYN_T(params);
+  auto dyn_params = dynamics.getParams();
+  Eigen::Map<DFDX> A_params_result(dyn_params.A);
+  float diff = (A - A_params_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+  // Set A from dynamics
+  A = DFDX::Random();
+  dynamics.setA(A);
+  dyn_params = dynamics.getParams();
+  Eigen::Map<DFDX> A_class_result(dyn_params.A);
+  diff = (A - A_class_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+}
+
+TEST(Linear, SetParamsB)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  using DFDU = typename DYN_T::dfdu;
+  // Set B from params
+  DFDU B = DFDU::Random();
+  typename DYN_T::DYN_PARAMS_T params;
+  params.setB(B);
+  auto dynamics = DYN_T(params);
+  auto dyn_params = dynamics.getParams();
+  Eigen::Map<DFDU> B_result(dyn_params.B);
+  float diff = (B - B_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+  // Set B from dynamics
+  B = DFDU::Random();
+  dynamics.setB(B);
+  dyn_params = dynamics.getParams();
+  Eigen::Map<DFDU> B_class_result(dyn_params.B);
+  diff = (B - B_class_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+}
+
+TEST(Linear, CheckSharedMemorySizes)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  auto dynamics = DYN_T();
+  dynamics.GPUSetup();
+  int output_gpu[2] = { 0 };
+  int output_cpu[2] = { 0 };
+  launchGetSharedMemorySizesKernel(dynamics, output_gpu);
+  output_cpu[0] = dynamics.getGrdSharedSizeBytes();
+  output_cpu[1] = dynamics.getBlkSharedSizeBytes();
+  ASSERT_EQ(output_cpu[0], output_gpu[0]);
+  ASSERT_EQ(output_cpu[1], output_gpu[1]);
+}
+
+TEST(Linear, StepCPUGPUComparison)
+{
+  using DYN_T = LinearDynamics<10, 12>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  DFDU B = DFDU::Random();
+  DFDX A = DFDX::Random();
+  auto dynamics = DYN_T();
+  dynamics.setA(A);
+  dynamics.setB(B);
+  typename DYN_T::buffer_trajectory buffer;
+
+  std::vector<int> x_sizes = { 1, 2, 4, 8, 16, 32 };
+  for (const auto& x_dim : x_sizes)
+  {
+    checkGPUComputationStep(dynamics, 0.01, 32, x_dim, buffer);
+  }
+}
+
+TEST(Linear, JacobianCheck)
+{
+  using DYN_T = LinearDynamics<10, 6>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  using state_array = typename DYN_T::state_array;
+  using control_array = typename DYN_T::control_array;
+  auto dynamics = DYN_T();
+  DFDX A = DFDX::Identity();
+  DFDU B = DFDU::Random();
+  dynamics.setA(A);
+  dynamics.setB(B);
+
+  DFDX Jacobian_A;
+  DFDU Jacobian_B;
+  state_array x = dynamics.getZeroState();
+  control_array u = dynamics.getZeroControl();
+  dynamics.computeGrad(x, u, Jacobian_A, Jacobian_B);
+  float a_diff = (Jacobian_A - A).array().abs().sum();
+  float b_diff = (Jacobian_B - B).array().abs().sum();
+  ASSERT_EQ(a_diff, 0);
+  ASSERT_EQ(b_diff, 0);
+}
+
+TEST(Linear, HardCodeCPUTest)
+{
+  using DYN_T = LinearDynamics<3, 1>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  using state_array = typename DYN_T::state_array;
+  using control_array = typename DYN_T::control_array;
+  using output_array = typename DYN_T::output_array;
+
+  auto dynamics = DYN_T();
+  DFDX A = DFDX::Identity();
+  DFDU B = DFDU::Zero();
+  B(2, 0) = 0.5f;
+  dynamics.setA(A);
+  dynamics.setB(B);
+
+  state_array x = dynamics.getZeroState();
+  control_array u = dynamics.getZeroControl();
+  x << 1, 5, 10;
+  u << 3;
+  float dt = 0.1;
+  output_array y;
+  state_array x_der, x_next;
+  dynamics.step(x, x_next, x_der, u, y, 0, dt);
+  // Check derivative
+  ASSERT_FLOAT_EQ(x_der[0], 1.0f);
+  ASSERT_FLOAT_EQ(x_der[1], 5.0f);
+  ASSERT_FLOAT_EQ(x_der[2], 10.0f + 1.5f);
+
+  // Check next state
+  ASSERT_FLOAT_EQ(x_next[0], 1.0f + 1.0f * 0.1f);
+  ASSERT_FLOAT_EQ(x_next[1], 5.0f + 5.0f * 0.1f);
+  ASSERT_FLOAT_EQ(x_next[2], 10.0f + (10.0f + 1.5f) * 0.1f);
+
+  // Check output
+  ASSERT_FLOAT_EQ(y[0], 1.0f + 1.0f * 0.1f);
+  ASSERT_FLOAT_EQ(y[1], 5.0f + 5.0f * 0.1f);
+  ASSERT_FLOAT_EQ(y[2], 10.0f + (10.0f + 1.5f) * 0.1f);
+}
diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
index 738f04b..b844b85 100644
--- a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
+++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
@@ -26,6 +26,28 @@ void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params)
   cudaFree(params_d);
 }
 
+template <class DYN_T>
+__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d)
+{
+  output_d[0] = dynamics->getGrdSharedSizeBytes();
+  output_d[1] = dynamics->getBlkSharedSizeBytes();
+}
+
+template <class DYNAMICS_T>
+void launchGetSharedMemorySizesKernel(DYNAMICS_T& dynamics, int shared_mem_sizes[2])
+{
+  int* shared_mem_sizes_d;
+  HANDLE_ERROR(cudaMalloc((void**)&shared_mem_sizes_d, sizeof(int) * 2));
+
+  getSharedMemorySizesKernel<<<1, 1>>>(dynamics.model_d_, shared_mem_sizes_d);
+  HANDLE_ERROR(cudaGetLastError());
+
+  // Copy the memory back to the host
+  HANDLE_ERROR(cudaMemcpy(shared_mem_sizes, shared_mem_sizes_d, sizeof(int) * 2, cudaMemcpyDeviceToHost));
+
+  HANDLE_ERROR(cudaFree(shared_mem_sizes_d));
+}
+
 template <class DYNAMICS_T>
 __global__ void controlRangesTestKernel(DYNAMICS_T* dynamics, float2* control_rngs)
 {
@@ -305,17 +327,14 @@ __global__ void stepTestKernel(DYNAMICS_T* dynamics, float* state, float* contro
   float* u = control + (tid * DYNAMICS_T::CONTROL_DIM);
   float* y = output + (tid * DYNAMICS_T::OUTPUT_DIM);
 
-  dynamics->initializeDynamics(state, control, output, theta, 0.0f, dt);
+  if (tid < num)
+  {
+    dynamics->initializeDynamics(state, control, output, theta, 0.0f, dt);
+  }
   __syncthreads();
   if (tid < num)
   {
-    float* x = state + (tid * DYNAMICS_T::STATE_DIM);
-    float* x_dot = state_der + (tid * DYNAMICS_T::STATE_DIM);
-    float* x_next = next_state + (tid * DYNAMICS_T::STATE_DIM);
-    float* u = control + (tid * DYNAMICS_T::CONTROL_DIM);
-    float* y = output + (tid * DYNAMICS_T::OUTPUT_DIM);
-
     dynamics->enforceConstraints(x, u);
     dynamics->step(x, x_next, x_dot, u, y, theta, t, dt);
   }
@@ -408,7 +427,7 @@ void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, DY
 
 template <class DYN_T>
 void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim,
-                             typename DYN_T::buffer_trajectory buffer, double tol = 1.0e-5)
+                             typename DYN_T::buffer_trajectory buffer, double tol)
 {
   CudaCheckError();
   dynamics.GPUSetup();
diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh
index 41dea8f..aa24b05 100644
---
a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh +++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh @@ -48,6 +48,15 @@ void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector>& control, std::vector>& state_der, int dim_y); +template +__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d); +template +void launchGetSharedMemorySizesKernel(DYN_T& dynamics, int shared_mem_sizes[2]); + +template +void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim, + typename DYN_T::buffer_trajectory buffer, double tol = 1.0e-5); + #if __CUDACC__ #include "dynamics_generic_kernel_tests.cu" #endif From 64daeec49ecb4866f2b4a741ff1ff9c78f3c374f Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Wed, 28 Aug 2024 09:50:14 -0400 Subject: [PATCH 2/4] Ensure test kernels for dynamics check CUDA errors --- .../dynamics/dynamics_generic_kernel_tests.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu index b844b85..89ce963 100644 --- a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu +++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu @@ -17,7 +17,7 @@ void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params) HANDLE_ERROR(cudaMalloc((void**)¶ms_d, sizeof(PARAMS_T))) parameterTestKernel<<<1, 1>>>(static_cast(class_t.model_d_), *params_d); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(¶ms, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost)); @@ -70,7 +70,7 @@ void launchControlRangesTestKernel(DYNAMICS_T& dynamics, std::array<<<1, 1>>>(static_cast(dynamics.model_d_), ranges_d); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(control_rngs.data(), ranges_d, sizeof(float2) * control_rngs.size(), cudaMemcpyDeviceToHost)); @@ -106,7 +106,7 @@ void launchEnforceConstraintTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, control_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -144,7 +144,7 @@ void launchUpdateStateTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, state_der_d, dt, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -183,7 +183,7 @@ void launchComputeKinematicsTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -241,7 +241,7 @@ void launchComputeDynamicsTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(dynamics.model_d_, state_d, control_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost)); 
HANDLE_ERROR(cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost)); @@ -297,7 +297,7 @@ void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector<<>>( static_cast(dynamics.model_d_), state_d, control_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -403,7 +403,7 @@ void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<<>>( dynamics.model_d_, state_d, control_d, state_der_d, next_state_d, output_d, t, dt, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR( From 34693b56f900b1071af4e1384906c15cbe47fc2c Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Fri, 30 Aug 2024 09:56:02 -0400 Subject: [PATCH 3/4] Add unit tests for floatX math operations --- tests/math_utils/cuda_math_utils_tests.cu | 685 ++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 tests/math_utils/cuda_math_utils_tests.cu diff --git a/tests/math_utils/cuda_math_utils_tests.cu b/tests/math_utils/cuda_math_utils_tests.cu new file mode 100644 index 0000000..9039479 --- /dev/null +++ b/tests/math_utils/cuda_math_utils_tests.cu @@ -0,0 +1,685 @@ +#include +#include +#include + +#include + +template +__global__ void VectorVectorAddTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 + *input2; +} + +template +__global__ void VectorVectorSubTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 - *input2; +} + +template +__global__ void VectorVectorMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) * (*input2); +} + +template +__global__ void VectorVectorDivTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) / (*input2); +} + +template +__global__ void VectorScalarAddTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 + *input2; +} + +template +__global__ void VectorScalarMultTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) * (*input2); +} + +template +__global__ void VectorScalarSubTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 - *input2; +} + +template +__global__ void VectorScalarDivTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) / (*input2); +} + +template +__global__ void VectorVectorScalarAddMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + const S* __restrict__ scalar, T* __restrict__ output2) +{ + *output2 = *input1 + (*input2) * (*scalar); +} + +template +class CudaFloatStructsTests : public ::testing::Test +{ +public: + T input1_cpu; + T input2_cpu; + T output_cpu; + T output_gpu; + T* input1_d; + T* input2_d; + T* output_d; + + float scalar; + float* scalar_d; + + void SetUp() override + { + HANDLE_ERROR(cudaMalloc((void**)&scalar_d, sizeof(float))); + HANDLE_ERROR(cudaMalloc((void**)&input1_d, sizeof(T))); + HANDLE_ERROR(cudaMalloc((void**)&input2_d, sizeof(T))); + 
HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(T))); + + // Setup random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(0.0f, 10.f); + + initializeStruct(input1_cpu, gen, dist); + initializeStruct(input2_cpu, gen, dist); + HANDLE_ERROR(cudaMemcpy(input1_d, &input1_cpu, sizeof(T), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(input2_d, &input2_cpu, sizeof(T), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(scalar_d, &scalar, sizeof(float), cudaMemcpyHostToDevice)); + } + + void initializeStruct(T& val, std::mt19937& gen, std::uniform_real_distribution& dist) + { + std::cout << "Did not specialize!" << std::endl; + } + + T addVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong addVec!" << std::endl; + return a; + } + + T addScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong addScalar!" << std::endl; + return a; + } + + T subVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong subVec!" << std::endl; + return a; + } + + T subScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong subScalar!" << std::endl; + return a; + } + + T multVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong multVec!" << std::endl; + return a; + } + + T multScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong multScalar!" << std::endl; + return a; + } + + T divVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong divVec!" << std::endl; + return a; + } + + T divScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong divScalar!" << std::endl; + return a; + } + + bool assert_same(const T& val1, const T& val2, const std::string& str = ""); + + void TearDown() override + { + HANDLE_ERROR(cudaFree(scalar_d)); + HANDLE_ERROR(cudaFree(input1_d)); + HANDLE_ERROR(cudaFree(input2_d)); + HANDLE_ERROR(cudaFree(output_d)); + } +}; + +template <> +void CudaFloatStructsTests::initializeStruct(float2& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float2(dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float2& val1, const float2& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = result && (val1.y == val2.y); + if (!result) + { + printf("(%f, %f) != (%f, %f)", val1.x, val1.y, val2.x, val2.y); + std::cout << str << std::endl; + } + return result; +} + +template <> +float2 CudaFloatStructsTests::addVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::multVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::addScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::multScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::subVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::divVec(const float2& val1, const float2& val2) +{ + float2 output; + 
output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::subScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::divScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + return output; +} + +template <> +void CudaFloatStructsTests::initializeStruct(float4& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float4(dist(gen), dist(gen), dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float4& val1, const float4& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = result && (val1.y == val2.y); + result = result && (val1.z == val2.z); + result = result && (val1.w == val2.w); + if (!result) + { + printf("(%f, %f, %f, %f) != (%f, %f, %f, %f)", val1.x, val1.y, val1.z, val1.w, val2.x, val2.y, val2.z, val2.w); + std::cout << str << std::endl; + } + return result; +} + +template <> +float4 CudaFloatStructsTests::addVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + output.z = val1.z + val2.z; + output.w = val1.w + val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::multVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + output.z = val1.z * val2.z; + output.w = val1.w * val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::addScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + output.z = val1.z + val2; + output.w = val1.w + val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::multScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + output.z = val1.z * val2; + output.w = val1.w * val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::subVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + output.z = val1.z - val2.z; + output.w = val1.w - val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::divVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + output.z = val1.z / val2.z; + output.w = val1.w / val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::subScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + output.z = val1.z - val2; + output.w = val1.w - val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::divScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + output.z = val1.z / val2; + output.w = val1.w / val2; + return output; +} + +template <> +void CudaFloatStructsTests::initializeStruct(float3& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float3(dist(gen), dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float3& val1, const float3& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = 
result && (val1.y == val2.y); + result = result && (val1.z == val2.z); + if (!result) + { + printf("(%f, %f, %f) != (%f, %f, %f)", val1.x, val1.y, val1.z, val2.x, val2.y, val2.z); + std::cout << str << std::endl; + } + return result; +} + +template <> +float3 CudaFloatStructsTests::addVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + output.z = val1.z + val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::multVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + output.z = val1.z * val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::addScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + output.z = val1.z + val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::multScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + output.z = val1.z * val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::subVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + output.z = val1.z - val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::divVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + output.z = val1.z / val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::subScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + output.z = val1.z - val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::divScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + output.z = val1.z / val2; + return output; +} + +using TYPE_TESTS = ::testing::Types; + +TYPED_TEST_SUITE(CudaFloatStructsTests, TYPE_TESTS); + +TYPED_TEST(CudaFloatStructsTests, VecAddScalar) +{ + using T = TypeParam; + T ground_truth_output = this->addScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu + this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarAddTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecSubScalar) +{ + using T = TypeParam; + T ground_truth_output = this->subScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu - this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { 
+ for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarSubTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecMultScalar) +{ + using T = TypeParam; + T ground_truth_output = this->multScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu * this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarMultTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecDivScalar) +{ + using T = TypeParam; + T ground_truth_output = this->divScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu / this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarDivTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecAddVec) +{ + using T = TypeParam; + T ground_truth_output = this->addVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu + this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorAddTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} 
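
These typed tests exercise element-wise __host__ __device__ operator overloads on float2/float3/float4 supplied by the library's CUDA math utilities. As a reviewer aid, here is a representative sketch of the shape of the float2 overloads under test; it is an assumption about their form, not a copy of the library source:

// Illustrative only: element-wise overloads of the kind these tests compare
// against the hand-rolled addVec()/multScalar() reference implementations.
__host__ __device__ inline float2 operator+(const float2& a, const float2& b)
{
  return make_float2(a.x + b.x, a.y + b.y);  // vector-vector add, per component
}

__host__ __device__ inline float2 operator*(const float2& a, const float s)
{
  return make_float2(a.x * s, a.y * s);  // vector-scalar multiply, per component
}

Each test then launches a trivial one-element kernel across many block/thread shapes to confirm the same overload produces identical results on host and device.
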
+ +TYPED_TEST(CudaFloatStructsTests, VecSubVec) +{ + using T = TypeParam; + T ground_truth_output = this->subVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu - this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorSubTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecDivVec) +{ + using T = TypeParam; + T ground_truth_output = this->divVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu / this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorDivTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecAddVecMultScalar) +{ + using T = TypeParam; + T ground_truth_output = this->addVec(this->input1_cpu, this->multScalar(this->input2_cpu, this->scalar)); + + this->output_cpu = this->input1_cpu + this->input2_cpu * this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorScalarAddMultTestKernel + <<>>(this->input1_d, this->input2_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} From 6d1d0609f9299e102a97fc4eb21c64159ca99228 Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Tue, 3 Sep 2024 13:18:42 -0400 Subject: [PATCH 4/4] Add tests for sampling distributions and bugfixes - Make computeLikelihoodRatioCost with Eigen match device version of the method in GaussianDistribution - Add an Eigen version of computeFeedbackCost() in base sampling distribution class - Add Gaussian-specific unit tests to ensure Gaussian distributions are 
Gaussian as well as ensure that the number of distributions asked
  for does not go over the max allowed in the GaussianParams struct
- Add generic unit tests for sampling distributions that check that
  they construct correctly, give access to the same controls on host
  and device, calculate the same costs between host and device, and
  ensure that the number of distributions, timesteps and samples can
  be adjusted.
---
 .../colored_noise/colored_noise.cu            |   2 +-
 .../gaussian/gaussian.cu                      |  48 +-
 .../gaussian/gaussian.cuh                     |  18 +-
 .../mppi/sampling_distributions/nln/nln.cu    |   2 +-
 .../sampling_distribution.cuh                 |  18 +-
 .../cost_generic_kernel_tests.cu              |   2 +-
 .../gaussian_noise_tests.cu                   | 245 +++++++
 .../generic_sampling_distribution_tests.cu    | 637 ++++++++++++++++++
 8 files changed, 948 insertions(+), 24 deletions(-)
 create mode 100644 tests/sampling_distributions/gaussian_noise_tests.cu
 create mode 100644 tests/sampling_distributions/generic_sampling_distribution_tests.cu

diff --git a/include/mppi/sampling_distributions/colored_noise/colored_noise.cu b/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
index 51b0876..0e10905 100644
--- a/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
+++ b/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
@@ -298,7 +298,7 @@ __host__ void COLORED_NOISE::generateSamples(const int& optimization_stride, con
   const int freq_size = sample_freq.size();
   int smaller_index = 0;
-  const int local_control_dim = this->CONTROL_DIM; // Needed for methods which use pass by reference
+  const int local_control_dim = this->CONTROL_DIM;  // Needed for methods which use pass by reference
   Eigen::MatrixXf sample_freqs(freq_size, local_control_dim);
 
   // Adjust the weighting of each frequency by the exponents
diff --git a/include/mppi/sampling_distributions/gaussian/gaussian.cu b/include/mppi/sampling_distributions/gaussian/gaussian.cu
index 09a6cde..9841906 100644
--- a/include/mppi/sampling_distributions/gaussian/gaussian.cu
+++ b/include/mppi/sampling_distributions/gaussian/gaussian.cu
@@ -495,13 +495,8 @@ __host__ __device__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const float
   float* control_cost_coeff = params_p->control_cost_coeff;
 
   float cost = 0.0f;
-#ifdef __CUDA_ARCH__
-  int i = threadIdx.y;
-  int step = blockDim.y;
-#else
-  int i = 0;
-  int step = 1;
-#endif
+  int i, step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(i, step);
 
   if (CONTROL_DIM % 4 == 0)
   {
@@ -584,13 +579,8 @@ __host__ __device__ float GAUSSIAN_CLASS::computeFeedbackCost(const float* __res
   float* control_cost_coeff = params_p->control_cost_coeff;
 
   float cost = 0.0f;
-#ifdef __CUDA_ARCH__
-  int i = threadIdx.y;
-  int step = blockDim.y;
-#else
-  int i = 0;
-  int step = 1;
-#endif
+  int i, step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(i, step);
 
   if (CONTROL_DIM % 4 == 0)
   {
@@ -629,7 +619,26 @@ __host__ __device__ float GAUSSIAN_CLASS::computeFeedbackCost(const float* __res
 }
 
 GAUSSIAN_TEMPLATE
-__host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::Ref<const control_array>& u, const int t,
+__host__ float GAUSSIAN_CLASS::computeFeedbackCost(const Eigen::Ref<const control_array>& u_fb, const int t,
+                                                   const int distribution_idx, const float lambda, const float alpha)
+{
+  float cost = 0.0f;
+  const int distribution_i = distribution_idx >= this->params_.num_distributions ?
0 : distribution_idx; + float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); + if (this->params_.time_specific_std_dev) + { + std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); + } + for (int i = 0; i < CONTROL_DIM; i++) + { + cost += this->params_.control_cost_coeff[i] * u_fb(i) * u_fb(i) / (std_dev[i] * std_dev[i]); + } + return 0.5f * lambda * (1.0f - alpha) * cost; +} + +GAUSSIAN_TEMPLATE +__host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::Ref& u, + const int sample_index, const int t, const int distribution_idx, const float lambda, const float alpha) { @@ -638,16 +647,21 @@ __host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::RefgetNumTimesteps() + t) * CONTROL_DIM; float* mean = &(this->means_[mean_index]); float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); + control_array zero_mean = control_array::Zero(); + if (sample_index >= (1.0f - this->params_.pure_noise_trajectories_percentage) * this->params_.num_rollouts) + { + mean = zero_mean.data(); + } if (this->params_.time_specific_std_dev) { std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); } for (int i = 0; i < CONTROL_DIM; i++) { - cost += this->params_.control_cost_coeff[i] * mean[i] * (mean[i] + 2.0f * u(i)) / + cost += this->params_.control_cost_coeff[i] * mean[i] * (mean[i] - 2.0f * u(i)) / (std_dev[i] * std_dev[i]); // Proper way } - return cost; + return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE diff --git a/include/mppi/sampling_distributions/gaussian/gaussian.cuh b/include/mppi/sampling_distributions/gaussian/gaussian.cuh index f2ce267..3becc20 100644 --- a/include/mppi/sampling_distributions/gaussian/gaussian.cuh +++ b/include/mppi/sampling_distributions/gaussian/gaussian.cuh @@ -125,12 +125,15 @@ public: const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, + const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + __host__ __device__ float computeLikelihoodRatioCost(const float* __restrict__ u, float* __restrict__ theta_d, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); - __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int t, - const int distribution_idx, const float lambda = 1.0, + __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, + const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ void copyImportanceSamplerToDevice(const float* importance_sampler, const int& distribution_idx, @@ -146,6 +149,17 @@ public: __host__ void setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_idx, bool synchronize = true); + __host__ void setNumDistributions(const int num_distributions, bool synchronize = false) + { + if (num_distributions > SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS) + { + this->logger_->error("GaussianParams can't handle more than %d distributions but %d were requested\n", + SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS, num_distributions); + throw std::out_of_range("Can't set num distributions higher than allowed in params"); + } + PARENT_CLASS::setNumDistributions(num_distributions, synchronize); + } + __host__ void updateDistributionParamsFromDevice(const float* trajectory_weights_d, float 
normalizer, const int& distribution_i, bool synchronize = false) override; diff --git a/include/mppi/sampling_distributions/nln/nln.cu b/include/mppi/sampling_distributions/nln/nln.cu index 179a1e5..4ce40e8 100644 --- a/include/mppi/sampling_distributions/nln/nln.cu +++ b/include/mppi/sampling_distributions/nln/nln.cu @@ -74,7 +74,7 @@ void NLN_NOISE::setParams(const SAMPLING_PARAMS_T& params, bool synchronize) bool adjusted_variance = false; for (int i = 0; i < this->CONTROL_DIM * this->getNumDistributions(); i++) { - if (this->params.std_dev[i] != params.std_dev[i]) + if (this->params_.std_dev[i] != params.std_dev[i]) { adjusted_variance = true; break; diff --git a/include/mppi/sampling_distributions/sampling_distribution.cuh b/include/mppi/sampling_distributions/sampling_distribution.cuh index 1c156b8..eb591ff 100644 --- a/include/mppi/sampling_distributions/sampling_distribution.cuh +++ b/include/mppi/sampling_distributions/sampling_distribution.cuh @@ -325,6 +325,20 @@ public: const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + /** + * @brief host method to calculate the cost for feedback based on the sampling distribution in RMPPI + * + * @param u_fb - feedback control + * @param t - timestep + * @param distribution_idx - distribution index + * @param lambda - MPPI temperature parameter + * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood + * + * @return feedback cost + */ + __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, + const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + /** * @brief Device method to calculate the likelihood ratio cost for a given sample u * @@ -352,8 +366,8 @@ public: * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood * cost) */ - __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int t, - const int distribution_idx, const float lambda = 1.0, + __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, + const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** diff --git a/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu b/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu index 8a0b382..3204926 100644 --- a/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu +++ b/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu @@ -442,7 +442,7 @@ void checkGPURolloutCost(COST_T& cost, float dt) int crash = 0; float cpu_cost = cost.computeRunningCost(y_index, u_index, t, &crash); #ifdef USE_ROLLOUT_COST_KERNEL - cpu_cost += sampler.computeLikelihoodRatioCost(u_index, t, 0); + cpu_cost += sampler.computeLikelihoodRatioCost(u_index, n, t, 0); total_cost += cpu_cost; #else ASSERT_NEAR(cpu_cost, output_cost_gpu[index], cpu_cost * 0.0015) diff --git a/tests/sampling_distributions/gaussian_noise_tests.cu b/tests/sampling_distributions/gaussian_noise_tests.cu new file mode 100644 index 0000000..eb64728 --- /dev/null +++ b/tests/sampling_distributions/gaussian_noise_tests.cu @@ -0,0 +1,245 @@ +#include +#include +#include +#include +#include + +namespace ms = mppi::sampling_distributions; + +template +class GaussianTests : public ::testing::Test +{ +public: + using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T; + static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS); 
+ + typedef Eigen::Matrix output_array; + int num_samples_ = 1000; + int num_timesteps_ = 100; + int num_distributions_ = 1; + int num_verifications_ = 0; + int* sampling_indices_d_ = nullptr; + int* times_d_ = nullptr; + int* distribution_indices_d_ = nullptr; + float* outputs_d_ = nullptr; + float* controls_d_ = nullptr; + float* costs_d_ = nullptr; + float lambda_ = 1.0f; + float alpha_ = 0.0f; + + std::shared_ptr sampler_ = nullptr; + cudaStream_t stream_ = 0; + curandGenerator_t gen_; + dim3 thread_block_ = dim3(32, 1, 1); + + std::vector sampled_indices_; + std::vector sampled_times_; + std::vector sampled_distributions_; + std::vector means_cpu_; + + void SetUp() override + { + cudaStreamCreate(&stream_); + sampler_ = std::make_shared(stream_); + updateSamplerSizes(); + sampler_->GPUSetup(); + HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT)); + HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42)); + } + + void TearDown() override + { + } + + void updateSamplerSizes() + { + sampler_->setNumTimesteps(num_timesteps_); + sampler_->setNumRollouts(num_samples_); + sampler_->setNumDistributions(num_distributions_); + means_cpu_.resize(num_distributions_); + } + + void setRandomMeans() + { + for (int i = 0; i < num_distributions_; i++) + { + Eigen::MatrixXf mean_i = 5 * Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_); + sampler_->copyImportanceSamplerToDevice(mean_i.data(), i, false); + means_cpu_[i] = mean_i; + } + } +}; + +template +using GaussianParams3 = ms::GaussianParamsImpl; + +template +class TestGaussianDistribution + : public ms::GaussianDistributionImpl, GaussianParams3, DYN_PARAMS_T> +{ +public: + using PARENT_CLASS = ms::GaussianDistributionImpl; + using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; + + TestGaussianDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) + { + } + TestGaussianDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) + { + } +}; + +using TYPE_TESTS = ::testing::Types>, + ms::GaussianDistribution>, + TestGaussianDistribution>>; + +TYPED_TEST_SUITE(GaussianTests, TYPE_TESTS); + +TYPED_TEST(GaussianTests, SetNumDistributions) +{ + using T = TypeParam; + this->num_distributions_ = 3; + if (this->num_distributions_ > T::SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS) + { + EXPECT_THROW(this->sampler_->setNumDistributions(this->num_distributions_), std::out_of_range); + } + else + { + this->sampler_->setNumDistributions(this->num_distributions_); + auto params = this->sampler_->getParams(); + EXPECT_EQ(this->num_distributions_, params.num_distributions); + } +} + +TYPED_TEST(GaussianTests, CheckSamplesAreGaussian) +{ + using T = TypeParam; + float std_dev = 2.3f; + auto params = this->sampler_->getParams(); + params.pure_noise_trajectories_percentage = 0.0f; + for (int i = 0; i < this->num_distributions_; i++) + { + for (int j = 0; j < T::CONTROL_DIM; j++) + { + params.std_dev[j + i * T::CONTROL_DIM] = std_dev; + } + } + this->sampler_->setParams(params); + this->setRandomMeans(); + this->setRandomMeans(); + + this->sampler_->generateSamples(0, 0, this->gen_, false); + std::vector sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ * + this->num_distributions_); + HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0), + sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ * + T::CONTROL_DIM, + cudaMemcpyDeviceToHost, this->stream_)); + 
HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  int k_bins = 3;
+  std::vector<int> binned_counts(k_bins, 0);
+  for (int d = 0; d < this->num_distributions_; d++)
+  {
+    for (int s = 0; s < this->num_samples_; s++)
+    {
+      for (int t = 0; t < this->num_timesteps_; t++)
+      {
+        for (int i = 0; i < T::CONTROL_DIM; i++)
+        {
+          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
+          float z_score = (sampled_controls[index] - this->means_cpu_[d](i, t)) / std_dev;
+          for (int j = k_bins - 1; j >= 0; j--)
+          {
+            if (fabsf(z_score) < j + 1)
+            {
+              binned_counts[j]++;
+            }
+            else
+            {
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check how many samples are properly sampled
+  std::vector<float> normalized_bins(k_bins, 0.0f);
+  std::vector<double> sigma_rules(k_bins);
+  for (int i = 0; i < k_bins; i++)
+  {
+    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
+    normalized_bins[i] = (double)binned_counts[i] /
+                         (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
+    double diff = fabs(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
+    EXPECT_NEAR(diff, 0.0, 1e-3);
+  }
+}
+
+TYPED_TEST(GaussianTests, CheckZeroMeanSamplesAreGaussian)
+{
+  using T = TypeParam;
+  auto params = this->sampler_->getParams();
+  params.pure_noise_trajectories_percentage = 1.0f;
+  float std_dev = 2.3f;
+  for (int i = 0; i < this->num_distributions_; i++)
+  {
+    for (int j = 0; j < T::CONTROL_DIM; j++)
+    {
+      params.std_dev[j + i * T::CONTROL_DIM] = std_dev;
+    }
+  }
+  this->sampler_->setParams(params);
+  this->setRandomMeans();  // should end up not affecting the samples
+
+  this->sampler_->generateSamples(0, 0, this->gen_, false);
+  std::vector<float> sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ *
+                                      this->num_distributions_);
+  HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0),
+                               sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ *
+                                   T::CONTROL_DIM,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  int k_bins = 3;
+  std::vector<int> binned_counts(k_bins, 0);
+  for (int d = 0; d < this->num_distributions_; d++)
+  {
+    for (int s = 0; s < this->num_samples_; s++)
+    {
+      for (int t = 0; t < this->num_timesteps_; t++)
+      {
+        for (int i = 0; i < T::CONTROL_DIM; i++)
+        {
+          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
+          float z_score = (sampled_controls[index]) / std_dev;
+          for (int j = k_bins - 1; j >= 0; j--)
+          {
+            if (fabsf(z_score) < j + 1)
+            {
+              binned_counts[j]++;
+            }
+            else
+            {
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check how many samples are properly sampled
+  std::vector<float> normalized_bins(k_bins, 0.0f);
+  std::vector<double> sigma_rules(k_bins);
+  for (int i = 0; i < k_bins; i++)
+  {
+    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
+    normalized_bins[i] = (double)binned_counts[i] /
+                         (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
+    double diff = fabs(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
+    EXPECT_NEAR(diff, 0.0, 1e-3);
+  }
+}
diff --git a/tests/sampling_distributions/generic_sampling_distribution_tests.cu b/tests/sampling_distributions/generic_sampling_distribution_tests.cu
new file mode 100644
index 0000000..03e8abc
--- /dev/null
+++ b/tests/sampling_distributions/generic_sampling_distribution_tests.cu
@@ -0,0 +1,637 @@
+#include
+#include +#include +#include + +#if __cplusplus < 201703L // std::void_t is a C++17 feature, used for inherited_from_gaussian struct +namespace std +{ +template +struct make_void +{ + typedef void type; +}; + +template +using void_t = typename make_void::type; +} // namespace std +#endif + +namespace ms = mppi::sampling_distributions; + +template +__global__ void getControlSamplesKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, + const int* __restrict__ sample_indexes_d, const int* __restrict__ times_d, + const int* __restrict__ distribution_indexes_d, + const float* __restrict__ outputs_d, float* __restrict__ control_samples_d) +{ + const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim); + using OutputIndex = typename SAMPLER_T::OutputIndex; + const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS); + extern __shared__ float entire_buffer[]; + float* theta_d_shared = &entire_buffer[0 / sizeof(float)]; + float* outputs_shared = &entire_buffer[size_of_theta_d_bytes / sizeof(float)]; // THREAD_BLOCK_X * OUTPUT_DIM in size + + int x_index, x_step; + mppi::p1::getParallel1DIndex(x_index, x_step); + int y_index, y_step; + mppi::p1::getParallel1DIndex(y_index, y_step); + int test_index = threadIdx.x + blockDim.x * blockIdx.x; + // load output into shared memory + if (test_index < num_control_samples) + { + for (int j = y_index; j < OUTPUT_DIM; j += y_step) + { + outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM]; + } + } + __syncthreads(); + float* output = &outputs_shared[x_index * OUTPUT_DIM]; + // initialize sampling distributions + sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared); + __syncthreads(); + // get control samples + if (test_index < num_control_samples) + { + int sample_index = sample_indexes_d[test_index]; + int t = times_d[test_index]; + int distribution_idx = distribution_indexes_d[test_index]; + float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM]; + sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index, + output); + } +} + +template +__global__ void getLikelihoodCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda, + float alpha, const int* __restrict__ sample_indexes_d, + const int* __restrict__ times_d, const int* __restrict__ distribution_indexes_d, + const float* __restrict__ outputs_d, float* __restrict__ control_samples_d, + float* __restrict__ costs_d) +{ + const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim); + using OutputIndex = typename SAMPLER_T::OutputIndex; + const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS); + extern __shared__ float entire_buffer[]; + float* theta_d_shared = &entire_buffer[0 / sizeof(float)]; + float* outputs_shared = + &theta_d_shared[size_of_theta_d_bytes / sizeof(float)]; // THREAD_BLOCK_X * OUTPUT_DIM in size + float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM]; + int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); + float* running_cost = &running_cost_shared[running_cost_index]; + running_cost[0] = 0.0f; + + int x_index, x_step; + mppi::p1::getParallel1DIndex(x_index, x_step); + int y_index, y_step; + mppi::p1::getParallel1DIndex(y_index, y_step); + int test_index = threadIdx.x + blockDim.x * blockIdx.x; + // load output into shared memory + if (test_index < num_control_samples) + { + for (int j = y_index; j < 
+  __syncthreads();
+  float* output = &outputs_shared[x_index * OUTPUT_DIM];
+  // initialize sampling distributions
+  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
+  __syncthreads();
+  // get control samples
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
+                               output);
+  }
+  __syncthreads();
+  // Calculate likelihood ratio cost
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    running_cost[0] =
+        sampler->computeLikelihoodRatioCost(u, theta_d_shared, sample_index, t, distribution_idx, lambda, alpha);
+  }
+  // __syncthreads();
+  // if (threadIdx.x == 1 && threadIdx.y + threadIdx.z == 0 && blockIdx.x == 0)
+  // {
+  //   printf("Running costs:\n");
+  //   for (int i = 0; i < blockDim.x; i++)
+  //   {
+  //     float cost_sum = 0.0f;
+  //     for (int j = 0; j < blockDim.y; j++)
+  //     {
+  //       cost_sum += running_cost_shared[i + j * blockDim.x];
+  //       printf("(%2d, %2d): %6.3f\n", i, j, running_cost_shared[i + j * blockDim.x]);
+  //     }
+  //     printf("Sum of y dim for %2d: %8.5f\n", i, cost_sum);
+  //   }
+  // }
+  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
+  __syncthreads();
+  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
+                                    blockDim.x);
+  if (test_index < num_control_samples)
+  {
+    costs_d[test_index] = running_cost[x_index];
+  }
+}
+
+template <class SAMPLER_T>
+__global__ void getFeedbackCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda,
+                                      float alpha, const int* __restrict__ sample_indexes_d,
+                                      const int* __restrict__ times_d, const int* __restrict__ distribution_indexes_d,
+                                      const float* __restrict__ outputs_d, float* __restrict__ control_samples_d,
+                                      float* __restrict__ costs_d)
+{
+  const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim);
+  using OutputIndex = typename SAMPLER_T::OutputIndex;
+  const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS);
+  extern __shared__ float entire_buffer[];
+  float* theta_d_shared = &entire_buffer[0 / sizeof(float)];
+  float* outputs_shared =
+      &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];  // THREAD_BLOCK_X * OUTPUT_DIM in size
+  float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM];
+  int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+  float* running_cost = &running_cost_shared[running_cost_index];
+  running_cost[0] = 0.0f;
+
+  int x_index, x_step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_X>(x_index, x_step);
+  int y_index, y_step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(y_index, y_step);
+  int test_index = threadIdx.x + blockDim.x * blockIdx.x;
+  // load output into shared memory
+  if (test_index < num_control_samples)
+  {
+    for (int j = y_index; j < OUTPUT_DIM; j += y_step)
+    {
+      outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM];
+    }
+  }
+  __syncthreads();
+  float* output = &outputs_shared[x_index * OUTPUT_DIM];
+  // initialize sampling distributions
+  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
+  __syncthreads();
+  // get control samples
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
+                               output);
+  }
+  __syncthreads();
+  // Calculate feedback control cost using random control samples from the distribution
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    running_cost[0] = sampler->computeFeedbackCost(u, theta_d_shared, t, distribution_idx, lambda, alpha);
+  }
+  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
+  __syncthreads();
+  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
+                                    blockDim.x);
+  if (test_index < num_control_samples)
+  {
+    costs_d[test_index] = running_cost[x_index];
+  }
+}
+
+template <class SAMPLER_T>
+class SamplingDistributionTests : public ::testing::Test
+{
+public:
+  using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T;
+  static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS);
+  typedef Eigen::Matrix<float, OUTPUT_DIM, 1> output_array;
+  int num_samples_ = 1000;
+  int num_timesteps_ = 100;
+  int num_distributions_ = 1;
+  int num_verifications_ = 0;
+  int* sampling_indices_d_ = nullptr;
+  int* times_d_ = nullptr;
+  int* distribution_indices_d_ = nullptr;
+  float* outputs_d_ = nullptr;
+  float* controls_d_ = nullptr;
+  float* costs_d_ = nullptr;
+  float lambda_ = 1.0f;
+  float alpha_ = 0.0f;
+
+  std::shared_ptr<SAMPLER_T> sampler_ = nullptr;
+  cudaStream_t stream_ = 0;
+  curandGenerator_t gen_;
+  dim3 thread_block_ = dim3(32, 2, 1);
+
+  std::vector<int> sampled_indices_;
+  std::vector<int> sampled_times_;
+  std::vector<int> sampled_distributions_;
+  util::EigenAlignedVector<float, OUTPUT_DIM, 1> sampled_outputs_;
+  util::EigenAlignedVector<float, SAMPLER_T::CONTROL_DIM, 1> sampled_controls_cpu_;
+  util::EigenAlignedVector<float, SAMPLER_T::CONTROL_DIM, 1> sampled_controls_gpu_;
+
+  void SetUp() override
+  {
+    cudaStreamCreate(&stream_);
+    sampler_ = std::make_shared<SAMPLER_T>(stream_);
+    sampler_->GPUSetup();
+    setUpCudaMemory(3000);
+    HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
+    HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42));
+  }
+
+  template <class T>
+  void freeCudaPtr(T*& ptr)
+  {
+    if (ptr != nullptr)
+    {
+      cudaFree(ptr);
+    }
+    ptr = nullptr;
+  }
+
+  void setUpCudaMemory(const int num_verifications)
+  {
+    if (num_verifications > num_verifications_)
+    {
+      freeCudaPtr(outputs_d_);
+      freeCudaPtr(sampling_indices_d_);
+      freeCudaPtr(times_d_);
+      freeCudaPtr(distribution_indices_d_);
+      freeCudaPtr(controls_d_);
+      freeCudaPtr(costs_d_);
+
+      // Allocate GPU memory
+      HANDLE_ERROR(cudaMalloc((void**)&sampling_indices_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&times_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&distribution_indices_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&outputs_d_, sizeof(float) * OUTPUT_DIM * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&controls_d_, sizeof(float) * SAMPLER_T::CONTROL_DIM * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&costs_d_, sizeof(float) * num_verifications));
+
+      num_verifications_ = num_verifications;
+    }
+  }
+
+  void generateNewSamples()
+  {
+    // Create sample index
+    std::vector<int> samples =
+        mppi::math::sample_without_replacement(num_verifications_, num_timesteps_ * num_samples_ * num_distributions_);
+
+    // Fill in sampled indices and copy to GPU
+    sampled_indices_.resize(num_verifications_);
+    sampled_times_.resize(num_verifications_);
+    sampled_distributions_.resize(num_verifications_);
+    sampled_outputs_.resize(num_verifications_);
+    for (int i = 0; i < num_verifications_; i++)
+    {
+      const int sample = samples[i];
+      sampled_indices_[i] = (sample / (num_timesteps_ * num_distributions_)) % num_samples_;
+      sampled_times_[i] = (sample / num_distributions_) % num_timesteps_;
+      sampled_distributions_[i] = sample % num_distributions_;
+      sampled_outputs_[i] = output_array::Random();
+      HANDLE_ERROR(cudaMemcpyAsync(outputs_d_ + i * OUTPUT_DIM, sampled_outputs_[i].data(), sizeof(float) * OUTPUT_DIM,
+                                   cudaMemcpyHostToDevice, stream_));
+    }
+    HANDLE_ERROR(cudaMemcpyAsync(sampling_indices_d_, sampled_indices_.data(), sizeof(int) * num_verifications_,
+                                 cudaMemcpyHostToDevice, stream_));
+    HANDLE_ERROR(cudaMemcpyAsync(times_d_, sampled_times_.data(), sizeof(int) * num_verifications_,
+                                 cudaMemcpyHostToDevice, stream_));
+    HANDLE_ERROR(cudaMemcpyAsync(distribution_indices_d_, sampled_distributions_.data(),
+                                 sizeof(int) * num_verifications_, cudaMemcpyHostToDevice, stream_));
+
+    // Create non-zero mean
+    Eigen::MatrixXf random_mean = Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_);
+    for (int i = 0; i < num_distributions_; i++)
+    {
+      sampler_->copyImportanceSamplerToDevice(random_mean.data(), i, false);
+    }
+    float mean_update = 1.0f;
+    sampled_controls_cpu_.resize(num_verifications_);
+    sampled_controls_gpu_.resize(num_verifications_);
+    sampler_->generateSamples(0, 0, gen_, false);
+    HANDLE_ERROR(cudaMemcpyAsync(costs_d_, &mean_update, sizeof(float), cudaMemcpyHostToDevice, stream_));
+    for (int i = 0; i < num_distributions_; i++)
+    {
+      sampler_->updateDistributionParamsFromDevice(costs_d_, 1.0f, i, false);
+    }
+    HANDLE_ERROR(cudaStreamSynchronize(stream_));
+  }
+
+  void updateSampler()
+  {
+    sampler_->setNumTimesteps(num_timesteps_);
+    sampler_->setNumRollouts(num_samples_);
+    sampler_->setNumDistributions(num_distributions_);
+  }
+
+  void TearDown() override
+  {
+    HANDLE_ERROR(cudaStreamSynchronize(stream_));
+    freeCudaPtr(sampling_indices_d_);
+    freeCudaPtr(times_d_);
+    freeCudaPtr(distribution_indices_d_);
+    freeCudaPtr(outputs_d_);
+    freeCudaPtr(controls_d_);
+    freeCudaPtr(costs_d_);
+  }
+};
+
+/**
+ * Special setup for Gaussian-based distributions
+ **/
+template <class T, class Enable = void>
+struct inherited_from_gaussian : std::false_type
+{
+public:
+  void operator()(std::shared_ptr<T> sampler) const
+  {  // Empty setup
+  }
+};
+
+template <class T>
+struct inherited_from_gaussian<
+    T, std::void_t<typename std::enable_if<
+           std::is_base_of<ms::GaussianParamsImpl<T::CONTROL_DIM, T::MAX_DISTRIBUTIONS>,
+                           typename T::SAMPLING_PARAMS_T>::value,
+           bool>::type>> : std::true_type
+{
+public:
+  void operator()(std::shared_ptr<T> sampler) const
+  {  // Setup std dev and cost coefficients
+    auto params = sampler->getParams();
+    for (int i = 0; i < T::CONTROL_DIM; i++)
+    {
+      params.control_cost_coeff[i] = 1.0f;
+    }
+    for (int dist_i = 0; dist_i < params.num_distributions; dist_i++)
+    {
+      for (int std_dev_i = 0; std_dev_i < T::CONTROL_DIM; std_dev_i++)
+      {
+        params.std_dev[std_dev_i + dist_i * T::CONTROL_DIM] = 2.0f;
+      }
+    }
+    sampler->setParams(params);
+  }
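+  // The std::void_t/std::enable_if detection above selects this specialization
+  // only when T::SAMPLING_PARAMS_T derives from the Gaussian parameter struct
+  // (assumed here to be ms::GaussianParamsImpl; the original template arguments
+  // were garbled). That way the std_dev / control_cost_coeff setup runs only
+  // for Gaussian-family samplers, while everything else falls through to the
+  // empty primary template.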
+};
+
+using TYPE_TESTS = ::testing::Types<
+    ms::GaussianDistribution>, ms::GaussianDistribution>,
+    ms::GaussianDistribution>, ms::GaussianDistribution>,
+    ms::GaussianDistribution>, ms::ColoredNoiseDistribution>,
+    ms::ColoredNoiseDistribution>, ms::ColoredNoiseDistribution>,
+    ms::ColoredNoiseDistribution>, ms::NLNDistribution>,
+    ms::NLNDistribution>, ms::NLNDistribution>,
+    ms::NLNDistribution>>;
+
+TYPED_TEST_SUITE(SamplingDistributionTests, TYPE_TESTS);
+
+TYPED_TEST(SamplingDistributionTests, TestCreation)
+{
+  using T = TypeParam;
+  EXPECT_TRUE(true);
+  inherited_from_gaussian<T>()(this->sampler_);
+}
+
+TYPED_TEST(SamplingDistributionTests, SetNumDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  this->updateSampler();
+  this->generateNewSamples();
+  EXPECT_TRUE(true);
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPU)
+{
+  using T = TypeParam;
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPUMoreDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPUDifferentNoiseForDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, CompareLikelihoodRatioCostsCPUvsGPU)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  inherited_from_gaussian<T>()(this->sampler_);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes =
+      mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+      sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x +
+                       this->thread_block_.x * this->thread_block_.y * this->thread_block_.z);
+
+  getLikelihoodCostKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_,
+      this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_);
+  HANDLE_ERROR(cudaGetLastError());
+
+  std::vector<float> costs_gpu(this->num_verifications_);
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float cost = this->sampler_->computeLikelihoodRatioCost(this->sampled_controls_cpu_[i], this->sampled_indices_[i],
+                                                            this->sampled_times_[i], this->sampled_distributions_[i],
+                                                            this->lambda_, this->alpha_);
+    ASSERT_NEAR(cost, costs_gpu[i], fabsf(cost) * 5e-5)
+        << " failed on sample " << this->sampled_indices_[i] << ", time " << this->sampled_times_[i]
+        << ", dist: " << this->sampled_distributions_[i];
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, CompareFeedbackCostsCPUvsGPU)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  inherited_from_gaussian<T>()(this->sampler_);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes =
+      mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+      sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x +
+                       this->thread_block_.x * this->thread_block_.y * this->thread_block_.z);
+
+  getFeedbackCostKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_,
+      this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_);
+  HANDLE_ERROR(cudaGetLastError());
+
+  std::vector<float> costs_gpu(this->num_verifications_);
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float cost = this->sampler_->computeFeedbackCost(this->sampled_controls_cpu_[i], this->sampled_times_[i],
+                                                     this->sampled_distributions_[i], this->lambda_, this->alpha_);
+    ASSERT_FLOAT_EQ(cost, costs_gpu[i]) << " failed on sample " << this->sampled_indices_[i] << ", time "
+                                        << this->sampled_times_[i] << ", dist: " << this->sampled_distributions_[i];
+  }
+}
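+
+// For reference, the shared-memory layout assumed by every kernel launch in
+// this file, gathered into one helper. This is a sketch and is not called by
+// the tests above (they compute the size inline); `with_running_cost`
+// distinguishes the two cost kernels from the plain sample read-back kernel.
+template <class SAMPLER_T>
+std::size_t testKernelSharedMemSize(SAMPLER_T* sampler, const dim3& block, const int output_dim,
+                                    const bool with_running_cost)
+{
+  // theta_d_shared: per-class shared memory requested by the sampler
+  std::size_t bytes = mppi::kernels::calcClassSharedMemSize(sampler, block);
+  // outputs_shared: one output vector per thread in the x dimension
+  bytes += sizeof(float) * output_dim * block.x;
+  if (with_running_cost)
+  {
+    // running_cost_shared: one float per thread for the cost reduction
+    bytes += sizeof(float) * block.x * block.y * block.z;
+  }
+  return bytes;
+}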