From 9256856279d6d02aa55b430b40693bf258831099 Mon Sep 17 00:00:00 2001
From: Bogdan Vlahov
Date: Thu, 22 Aug 2024 20:50:37 -0400
Subject: [PATCH 1/4] Add Linear Dynamics Class

- Added unit tests for the linear dynamics
---
 include/mppi/dynamics/linear/linear.cu        |  99 ++++++++++++
 include/mppi/dynamics/linear/linear.cuh       | 137 ++++++++++++++++
 tests/dynamics/linear_dynamics_tests.cu       | 152 ++++++++++++++++++
 .../dynamics/dynamics_generic_kernel_tests.cu |  35 +++-
 .../dynamics_generic_kernel_tests.cuh         |   9 ++
 5 files changed, 424 insertions(+), 8 deletions(-)
 create mode 100644 include/mppi/dynamics/linear/linear.cu
 create mode 100644 include/mppi/dynamics/linear/linear.cuh
 create mode 100644 tests/dynamics/linear_dynamics_tests.cu

diff --git a/include/mppi/dynamics/linear/linear.cu b/include/mppi/dynamics/linear/linear.cu
new file mode 100644
index 0000000..be548ba
--- /dev/null
+++ b/include/mppi/dynamics/linear/linear.cu
@@ -0,0 +1,99 @@
+/**
+ * @file linear.cu
+ * @brief Linear Dynamics
+ * @author Bogdan Vlahov
+ * @version
+ * @date 2024-08-16
+ */
+#include <mppi/dynamics/linear/linear.cuh>
+#include <mppi/utils/math_utils.h>
+#include <map>
+
+namespace mm = mppi::matrix_multiplication;
+namespace mp1 = mppi::p1;
+
+#define LINEAR_TEMPLATE template <class CLASS_T, class PARAMS_T>
+#define LINEAR_DYNAMICS LinearDynamicsImpl<CLASS_T, PARAMS_T>
+
+LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(cudaStream_t stream) : PARENT_CLASS(stream)
+{
+  this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T);
+}
+
+LINEAR_TEMPLATE LINEAR_DYNAMICS::LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream)
+  : PARENT_CLASS(params, stream)
+{
+  this->SHARED_MEM_REQUEST_GRD_BYTES = sizeof(PARAMS_T);
+}
+
+LINEAR_TEMPLATE
+bool LINEAR_DYNAMICS::computeGrad(const Eigen::Ref<const state_array>& state,
+                                  const Eigen::Ref<const control_array>& control, Eigen::Ref<dfdx> A,
+                                  Eigen::Ref<dfdu> B)
+{
+  A = this->params_.getA();
+  B = this->params_.getB();
+  return true;
+}
+
+LINEAR_TEMPLATE
+typename LINEAR_DYNAMICS::state_array LINEAR_DYNAMICS::stateFromMap(const std::map<std::string, float>& map)
+{
+  state_array x = this->getZeroState();
+  for (int i = 0; i < this->STATE_DIM; i++)
+  {
+    std::string state_name = "x_" + std::to_string(i);
+    x(i, 0) = map.at(state_name);
+  }
+  return x;
+}
+
+LINEAR_TEMPLATE
+void LINEAR_DYNAMICS::step(Eigen::Ref<state_array> state, Eigen::Ref<state_array> next_state,
+                           Eigen::Ref<state_array> state_der, const Eigen::Ref<const control_array>& control,
+                           Eigen::Ref<output_array> output, const float t, const float dt)
+{
+  state_der = this->params_.getA() * state + this->params_.getB() * control;
+  next_state = state + state_der * dt;
+  this->stateToOutput(next_state, output);
+}
+
+LINEAR_TEMPLATE
+__device__ inline void LINEAR_DYNAMICS::step(float* state, float* next_state, float* state_der, float* control,
+                                             float* output, float* theta_s, const float t, const float dt)
+{
+  PARAMS_T* params_p = &(this->params_);
+  if (this->getGrdSharedSizeBytes() != 0)
+  {
+    params_p = (PARAMS_T*)theta_s;
+  }
+  float* A = params_p->A;
+  float* B = params_p->B;
+
+  const mp1::Parallel1Dir P_DIR = mp1::Parallel1Dir::THREAD_Y;
+  mm::gemm1<this->STATE_DIM, this->STATE_DIM, 1, P_DIR>(A, state, state_der);
+  mm::gemm1<this->STATE_DIM, this->CONTROL_DIM, 1, P_DIR>(B, control, state_der, 1.0f, 1.0f);
+  // __syncthreads();
+  int index, step;
+  mp1::getParallel1DIndex<P_DIR>(index, step);
+  for (int i = index; i < this->STATE_DIM; i += step)
+  {
+    next_state[i] = state[i] + state_der[i] * dt;
+    output[i] = next_state[i];
+  }
+}
+
+LINEAR_TEMPLATE
+__device__ void LINEAR_DYNAMICS::initializeDynamics(float* state, float* control, float* output, float* theta_s,
+                                                    float t_0, float dt)
+{
+  PARENT_CLASS::initializeDynamics(state, control, output, theta_s, t_0, dt);
+  if (this->getGrdSharedSizeBytes() != 0)
+  {
+    PARAMS_T* shared = (PARAMS_T*)theta_s;
+    *shared = this->params_;
+  }
+}
+
+#undef LINEAR_TEMPLATE
+#undef LINEAR_DYNAMICS
diff --git a/include/mppi/dynamics/linear/linear.cuh b/include/mppi/dynamics/linear/linear.cuh
new file mode 100644
index 0000000..2a857d9
--- /dev/null
+++ b/include/mppi/dynamics/linear/linear.cuh
@@ -0,0 +1,137 @@
+/**
+ * @file linear.cuh
+ * @brief Linear Dynamics
+ * @author Bogdan Vlahov
+ * @version
+ * @date 2024-08-16
+ */
+#pragma once
+
+#include <mppi/dynamics/dynamics.cuh>
+
+template <int STATE_DIM, int CONTROL_DIM>
+struct LinearDynamicsParams : public DynamicsParams
+{
+public:
+  using state_matrix = Eigen::Matrix<float, STATE_DIM, STATE_DIM>;
+  using control_matrix = Eigen::Matrix<float, STATE_DIM, CONTROL_DIM>;
+  enum class StateIndex : int
+  {
+    NUM_STATES = STATE_DIM,
+  };
+
+  enum class ControlIndex : int
+  {
+    NUM_CONTROLS = CONTROL_DIM,
+  };
+  enum class OutputIndex : int
+  {
+    NUM_OUTPUTS = STATE_DIM,
+  };
+  float A[STATE_DIM * STATE_DIM] = { 0.0f };
+  float B[STATE_DIM * CONTROL_DIM] = { 0.0f };
+
+  LinearDynamicsParams() = default;
+  ~LinearDynamicsParams() = default;
+
+  void setA(const Eigen::Ref<const state_matrix>& A_eigen)
+  {
+    memcpy(A, A_eigen.data(), sizeof(float) * STATE_DIM * STATE_DIM);
+  }
+
+  void setB(const Eigen::Ref<const control_matrix>& B_eigen)
+  {
+    memcpy(B, B_eigen.data(), sizeof(float) * STATE_DIM * CONTROL_DIM);
+  }
+
+  const float* getA() const
+  {
+    return A;
+  }
+  const float* getB() const
+  {
+    return B;
+  }
+
+  Eigen::Map<state_matrix> getA()
+  {
+    Eigen::Map<state_matrix> A_eigen(A);
+    return A_eigen;
+  }
+
+  Eigen::Map<control_matrix> getB()
+  {
+    Eigen::Map<control_matrix> B_eigen(B);
+    return B_eigen;
+  }
+};
+
+using namespace MPPI_internal;
+
+template <class CLASS_T, class PARAMS_T = LinearDynamicsParams<1, 1>>
+class LinearDynamicsImpl : public Dynamics<CLASS_T, PARAMS_T>
+{
+public:
+  using PARENT_CLASS = Dynamics<CLASS_T, PARAMS_T>;
+  using state_array = typename PARENT_CLASS::state_array;
+  using control_array = typename PARENT_CLASS::control_array;
+  using output_array = typename PARENT_CLASS::output_array;
+  using dfdx = typename PARENT_CLASS::dfdx;
+  using dfdu = typename PARENT_CLASS::dfdu;
+  using PARENT_CLASS::initializeDynamics;
+
+  LinearDynamicsImpl(cudaStream_t stream = nullptr);
+  LinearDynamicsImpl(PARAMS_T& params, cudaStream_t stream = nullptr);
+
+  std::string getDynamicsModelName() const override
+  {
+    return "Linear Dynamics";
+  }
+
+  bool computeGrad(const Eigen::Ref<const state_array>& state, const Eigen::Ref<const control_array>& control,
+                   Eigen::Ref<dfdx> A, Eigen::Ref<dfdu> B);
+
+  state_array stateFromMap(const std::map<std::string, float>& map);
+
+  void step(Eigen::Ref<state_array> state, Eigen::Ref<state_array> next_state, Eigen::Ref<state_array> state_der,
+            const Eigen::Ref<const control_array>& control, Eigen::Ref<output_array> output, const float t,
+            const float dt);
+
+  void setA(const Eigen::Ref<const dfdx>& A_eigen, bool synchronize = false)
+  {
+    this->params_.setA(A_eigen);
+    this->paramsToDevice(synchronize);
+  }
+
+  void setB(const Eigen::Ref<const dfdu>& B_eigen, bool synchronize = false)
+  {
+    this->params_.setB(B_eigen);
+    this->paramsToDevice(synchronize);
+  }
+
+  __device__ inline void step(float* state, float* next_state, float* state_der, float* control, float* output,
+                              float* theta_s, const float t, const float dt);
+
+  __device__ void initializeDynamics(float* state, float* control, float* output, float* theta_s, float t_0, float dt);
+};
+
+template <int STATE_DIM, int CONTROL_DIM>
+class LinearDynamics
+  : public LinearDynamicsImpl<LinearDynamics<STATE_DIM, CONTROL_DIM>, LinearDynamicsParams<STATE_DIM, CONTROL_DIM>>
+{
+public:
+  using PARENT_CLASS =
+      LinearDynamicsImpl<LinearDynamics<STATE_DIM, CONTROL_DIM>, LinearDynamicsParams<STATE_DIM, CONTROL_DIM>>;
+  using DYN_PARAMS_T = typename PARENT_CLASS::DYN_PARAMS_T;
+
+  LinearDynamics(cudaStream_t stream = nullptr) : PARENT_CLASS(stream)
+  {
+  }
+  LinearDynamics(DYN_PARAMS_T& params, cudaStream_t stream = nullptr) : PARENT_CLASS(params, stream)
+  {
+  }
+};
+
+#ifdef __CUDACC__
+#include "linear.cu"
+#endif
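
For reference, the host-side usage pattern this class enables looks like the following minimal sketch. It is illustrative only: the 1D double-integrator A and B values are assumptions chosen for the example, not values taken from this patch.

#include <mppi/dynamics/linear/linear.cuh>

int main()
{
  using DYN_T = LinearDynamics<2, 1>;
  DYN_T dynamics;

  // Hypothetical 1D double integrator: position integrates velocity,
  // velocity integrates the control input (not from the patch).
  DYN_T::dfdx A = DYN_T::dfdx::Zero();
  A(0, 1) = 1.0f;
  DYN_T::dfdu B = DYN_T::dfdu::Zero();
  B(1, 0) = 1.0f;
  dynamics.setA(A);  // per the patch, setA()/setB() also call paramsToDevice()
  dynamics.setB(B);

  DYN_T::state_array x = dynamics.getZeroState();
  DYN_T::state_array x_next, x_der;
  DYN_T::output_array y;
  DYN_T::control_array u;
  u << 1.0f;

  // One explicit-Euler step of x' = A x + B u, matching the CPU-side step() above.
  dynamics.step(x, x_next, x_der, u, y, 0.0f, 0.01f);
  return 0;
}

The GPU path mirrors this through the __device__ step(), which reads A and B out of shared memory whenever SHARED_MEM_REQUEST_GRD_BYTES is nonzero.
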
diff --git a/tests/dynamics/linear_dynamics_tests.cu b/tests/dynamics/linear_dynamics_tests.cu
new file mode 100644
index 0000000..1d17304
--- /dev/null
+++ b/tests/dynamics/linear_dynamics_tests.cu
@@ -0,0 +1,152 @@
+#include <gtest/gtest.h>
+#include <mppi/dynamics/linear/linear.cuh>
+#include <kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh>
+
+TEST(Linear, Dimensionality)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  ASSERT_EQ(4, DYN_T::STATE_DIM);
+  ASSERT_EQ(4, DYN_T::OUTPUT_DIM);
+  ASSERT_EQ(12, DYN_T::CONTROL_DIM);
+}
+
+TEST(Linear, SetParamsA)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  using DFDX = typename DYN_T::dfdx;
+  // Set A from params
+  DFDX A = DFDX::Random();
+  typename DYN_T::DYN_PARAMS_T params;
+  params.setA(A);
+  auto dynamics = DYN_T(params);
+  auto dyn_params = dynamics.getParams();
+  Eigen::Map<DFDX> A_params_result(dyn_params.A);
+  float diff = (A - A_params_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+  // Set A from dynamics
+  A = DFDX::Random();
+  dynamics.setA(A);
+  dyn_params = dynamics.getParams();
+  Eigen::Map<DFDX> A_class_result(dyn_params.A);
+  diff = (A - A_class_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+}
+
+TEST(Linear, SetParamsB)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  using DFDU = typename DYN_T::dfdu;
+  // Set B from params
+  DFDU B = DFDU::Random();
+  typename DYN_T::DYN_PARAMS_T params;
+  params.setB(B);
+  auto dynamics = DYN_T(params);
+  auto dyn_params = dynamics.getParams();
+  Eigen::Map<DFDU> B_result(dyn_params.B);
+  float diff = (B - B_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+  // Set B from dynamics
+  B = DFDU::Random();
+  dynamics.setB(B);
+  dyn_params = dynamics.getParams();
+  Eigen::Map<DFDU> B_class_result(dyn_params.B);
+  diff = (B - B_class_result).sum();
+  EXPECT_FLOAT_EQ(diff, 0);
+}
+
+TEST(Linear, CheckSharedMemorySizes)
+{
+  using DYN_T = LinearDynamics<4, 12>;
+  auto dynamics = DYN_T();
+  dynamics.GPUSetup();
+  int output_gpu[2] = { 0 };
+  int output_cpu[2] = { 0 };
+  launchGetSharedMemorySizesKernel(dynamics, output_gpu);
+  output_cpu[0] = dynamics.getGrdSharedSizeBytes();
+  output_cpu[1] = dynamics.getBlkSharedSizeBytes();
+  ASSERT_EQ(output_cpu[0], output_gpu[0]);
+  ASSERT_EQ(output_cpu[1], output_gpu[1]);
+}
+
+TEST(Linear, StepCPUGPUComparison)
+{
+  using DYN_T = LinearDynamics<10, 12>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  DFDU B = DFDU::Random();
+  DFDX A = DFDX::Random();
+  auto dynamics = DYN_T();
+  dynamics.setA(A);
+  dynamics.setB(B);
+  typename DYN_T::buffer_trajectory buffer;
+
+  std::vector<int> x_sizes = { 1, 2, 4, 8, 16, 32 };
+  for (const auto& x_dim : x_sizes)
+  {
+    checkGPUComputationStep(dynamics, 0.01, 32, x_dim, buffer);
+  }
+}
+
+TEST(Linear, JacobianCheck)
+{
+  using DYN_T = LinearDynamics<10, 6>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  using state_array = typename DYN_T::state_array;
+  using control_array = typename DYN_T::control_array;
+  auto dynamics = DYN_T();
+  DFDX A = DFDX::Identity();
+  DFDU B = DFDU::Random();
+  dynamics.setA(A);
+  dynamics.setB(B);
+
+  DFDX Jacobian_A;
+  DFDU Jacobian_B;
+  state_array x = dynamics.getZeroState();
+  control_array u = dynamics.getZeroControl();
+  dynamics.computeGrad(x, u, Jacobian_A, Jacobian_B);
+  float a_diff = (Jacobian_A - A).array().abs().sum();
+  float b_diff = (Jacobian_B - B).array().abs().sum();
+  ASSERT_EQ(a_diff, 0);
+  ASSERT_EQ(b_diff, 0);
+}
+
+TEST(Linear, HardCodeCPUTest)
+{
+  using DYN_T = LinearDynamics<3, 1>;
+  using DFDU = typename DYN_T::dfdu;
+  using DFDX = typename DYN_T::dfdx;
+  using state_array = typename DYN_T::state_array;
+  using control_array = typename DYN_T::control_array;
+  using output_array = typename DYN_T::output_array;
+
+  auto dynamics = DYN_T();
+  DFDX A = DFDX::Identity();
+  DFDU B = DFDU::Zero();
+  B(2, 0) = 0.5f;
+  dynamics.setA(A);
+  dynamics.setB(B);
+
+  state_array x = dynamics.getZeroState();
+  control_array u = dynamics.getZeroControl();
+  x << 1, 5, 10;
+  u << 3;
+  float dt = 0.1;
+  output_array y;
+  state_array x_der, x_next;
+  dynamics.step(x, x_next, x_der, u, y, 0, dt);
+  // Check derivative
+  ASSERT_FLOAT_EQ(x_der[0], 1.0f);
+  ASSERT_FLOAT_EQ(x_der[1], 5.0f);
+  ASSERT_FLOAT_EQ(x_der[2], 10.0f + 1.5f);
+
+  // Check next state
+  ASSERT_FLOAT_EQ(x_next[0], 1.0f + 1.0f * 0.1f);
+  ASSERT_FLOAT_EQ(x_next[1], 5.0f + 5.0f * 0.1f);
+  ASSERT_FLOAT_EQ(x_next[2], 10.0f + (10.0f + 1.5f) * 0.1f);
+
+  // Check output
+  ASSERT_FLOAT_EQ(y[0], 1.0f + 1.0f * 0.1f);
+  ASSERT_FLOAT_EQ(y[1], 5.0f + 5.0f * 0.1f);
+  ASSERT_FLOAT_EQ(y[2], 10.0f + (10.0f + 1.5f) * 0.1f);
+}
diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
index 738f04b..b844b85 100644
--- a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
+++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu
@@ -26,6 +26,28 @@ void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params)
   cudaFree(params_d);
 }
 
+template <class DYN_T>
+__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d)
+{
+  output_d[0] = dynamics->getGrdSharedSizeBytes();
+  output_d[1] = dynamics->getBlkSharedSizeBytes();
+}
+
+template <class DYNAMICS_T>
+void launchGetSharedMemorySizesKernel(DYNAMICS_T& dynamics, int shared_mem_sizes[2])
+{
+  int* shared_mem_sizes_d;
+  HANDLE_ERROR(cudaMalloc((void**)&shared_mem_sizes_d, sizeof(int) * 2));
+
+  getSharedMemorySizesKernel<<<1, 1>>>(dynamics.model_d_, shared_mem_sizes_d);
+  HANDLE_ERROR(cudaGetLastError());
+
+  // Copy the memory back to the host
+  HANDLE_ERROR(cudaMemcpy(shared_mem_sizes, shared_mem_sizes_d, sizeof(int) * 2, cudaMemcpyDeviceToHost));
+
+  HANDLE_ERROR(cudaFree(shared_mem_sizes_d));
+}
+
 template <class DYNAMICS_T>
 __global__ void controlRangesTestKernel(DYNAMICS_T* dynamics, float2* control_rngs)
 {
@@ -305,17 +327,14 @@ __global__ void stepTestKernel(DYNAMICS_T* dynamics, float* state, float* contro
   float* u = control + (tid * DYNAMICS_T::CONTROL_DIM);
   float* y = output + (tid * DYNAMICS_T::OUTPUT_DIM);
 
-  dynamics->initializeDynamics(state, control, output, theta, 0.0f, dt);
+  if (tid < num)
+  {
+    dynamics->initializeDynamics(state, control, output, theta, 0.0f, dt);
+  }
   __syncthreads();
   if (tid < num)
   {
-    float* x = state + (tid * DYNAMICS_T::STATE_DIM);
-    float* x_dot = state_der + (tid * DYNAMICS_T::STATE_DIM);
-    float* x_next = next_state + (tid * DYNAMICS_T::STATE_DIM);
-    float* u = control + (tid * DYNAMICS_T::CONTROL_DIM);
-    float* y = output + (tid * DYNAMICS_T::OUTPUT_DIM);
-
     dynamics->enforceConstraints(x, u);
     dynamics->step(x, x_next, x_dot, u, y, theta, t, dt);
   }
@@ -408,7 +427,7 @@ void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<std::array<float, DY
 
 template <class DYN_T>
 void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim,
-                             typename DYN_T::buffer_trajectory buffer, double tol = 1.0e-5)
+                             typename DYN_T::buffer_trajectory buffer, double tol)
 {
   CudaCheckError();
   dynamics.GPUSetup();
diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh
index 41dea8f..aa24b05 100644
---
a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh +++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cuh @@ -48,6 +48,15 @@ void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector>& control, std::vector>& state_der, int dim_y); +template +__global__ void getSharedMemorySizesKernel(DYN_T* __restrict__ dynamics, int* __restrict__ output_d); +template +void launchGetSharedMemorySizesKernel(DYN_T& dynamics, int shared_mem_sizes[2]); + +template +void checkGPUComputationStep(DYN_T& dynamics, float dt, int max_y_dim, int x_dim, + typename DYN_T::buffer_trajectory buffer, double tol = 1.0e-5); + #if __CUDACC__ #include "dynamics_generic_kernel_tests.cu" #endif From 64daeec49ecb4866f2b4a741ff1ff9c78f3c374f Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Wed, 28 Aug 2024 09:50:14 -0400 Subject: [PATCH 2/4] Ensure test kernels for dynamics check CUDA errors --- .../dynamics/dynamics_generic_kernel_tests.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu index b844b85..89ce963 100644 --- a/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu +++ b/tests/include/kernel_tests/dynamics/dynamics_generic_kernel_tests.cu @@ -17,7 +17,7 @@ void launchParameterTestKernel(CLASS_T& class_t, PARAMS_T& params) HANDLE_ERROR(cudaMalloc((void**)¶ms_d, sizeof(PARAMS_T))) parameterTestKernel<<<1, 1>>>(static_cast(class_t.model_d_), *params_d); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(¶ms, params_d, sizeof(PARAMS_T), cudaMemcpyDeviceToHost)); @@ -70,7 +70,7 @@ void launchControlRangesTestKernel(DYNAMICS_T& dynamics, std::array<<<1, 1>>>(static_cast(dynamics.model_d_), ranges_d); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(control_rngs.data(), ranges_d, sizeof(float2) * control_rngs.size(), cudaMemcpyDeviceToHost)); @@ -106,7 +106,7 @@ void launchEnforceConstraintTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, control_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -144,7 +144,7 @@ void launchUpdateStateTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, state_der_d, dt, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -183,7 +183,7 @@ void launchComputeKinematicsTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(static_cast(dynamics.model_d_), state_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -241,7 +241,7 @@ void launchComputeDynamicsTestKernel(DYNAMICS_T& dynamics, std::vector <<>>(dynamics.model_d_, state_d, control_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost)); 
HANDLE_ERROR(cudaMemcpy(state_der.data(), state_der_d, sizeof(float) * S_DIM * count, cudaMemcpyDeviceToHost)); @@ -297,7 +297,7 @@ void launchComputeStateDerivTestKernel(DYNAMICS_T& dynamics, std::vector<<>>( static_cast(dynamics.model_d_), state_d, control_d, state_der_d, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR(cudaMemcpy(state.data(), state_d, sizeof(float) * S_DIM * state.size(), cudaMemcpyDeviceToHost)); @@ -403,7 +403,7 @@ void launchStepTestKernel(DYNAMICS_T& dynamics, std::vector<<>>( dynamics.model_d_, state_d, control_d, state_der_d, next_state_d, output_d, t, dt, count); - CudaCheckError(); + HANDLE_ERROR(cudaGetLastError()); // Copy the memory back to the host HANDLE_ERROR( From 34693b56f900b1071af4e1384906c15cbe47fc2c Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Fri, 30 Aug 2024 09:56:02 -0400 Subject: [PATCH 3/4] Add unit tests for floatX math operations --- tests/math_utils/cuda_math_utils_tests.cu | 685 ++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 tests/math_utils/cuda_math_utils_tests.cu diff --git a/tests/math_utils/cuda_math_utils_tests.cu b/tests/math_utils/cuda_math_utils_tests.cu new file mode 100644 index 0000000..9039479 --- /dev/null +++ b/tests/math_utils/cuda_math_utils_tests.cu @@ -0,0 +1,685 @@ +#include +#include +#include + +#include + +template +__global__ void VectorVectorAddTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 + *input2; +} + +template +__global__ void VectorVectorSubTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 - *input2; +} + +template +__global__ void VectorVectorMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) * (*input2); +} + +template +__global__ void VectorVectorDivTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) / (*input2); +} + +template +__global__ void VectorScalarAddTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 + *input2; +} + +template +__global__ void VectorScalarMultTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) * (*input2); +} + +template +__global__ void VectorScalarSubTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = *input1 - *input2; +} + +template +__global__ void VectorScalarDivTestKernel(const T* __restrict__ input1, const S* __restrict__ input2, + T* __restrict__ output2) +{ + *output2 = (*input1) / (*input2); +} + +template +__global__ void VectorVectorScalarAddMultTestKernel(const T* __restrict__ input1, const T* __restrict__ input2, + const S* __restrict__ scalar, T* __restrict__ output2) +{ + *output2 = *input1 + (*input2) * (*scalar); +} + +template +class CudaFloatStructsTests : public ::testing::Test +{ +public: + T input1_cpu; + T input2_cpu; + T output_cpu; + T output_gpu; + T* input1_d; + T* input2_d; + T* output_d; + + float scalar; + float* scalar_d; + + void SetUp() override + { + HANDLE_ERROR(cudaMalloc((void**)&scalar_d, sizeof(float))); + HANDLE_ERROR(cudaMalloc((void**)&input1_d, sizeof(T))); + HANDLE_ERROR(cudaMalloc((void**)&input2_d, sizeof(T))); + 
HANDLE_ERROR(cudaMalloc((void**)&output_d, sizeof(T))); + + // Setup random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(0.0f, 10.f); + + initializeStruct(input1_cpu, gen, dist); + initializeStruct(input2_cpu, gen, dist); + HANDLE_ERROR(cudaMemcpy(input1_d, &input1_cpu, sizeof(T), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(input2_d, &input2_cpu, sizeof(T), cudaMemcpyHostToDevice)); + HANDLE_ERROR(cudaMemcpy(scalar_d, &scalar, sizeof(float), cudaMemcpyHostToDevice)); + } + + void initializeStruct(T& val, std::mt19937& gen, std::uniform_real_distribution& dist) + { + std::cout << "Did not specialize!" << std::endl; + } + + T addVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong addVec!" << std::endl; + return a; + } + + T addScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong addScalar!" << std::endl; + return a; + } + + T subVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong subVec!" << std::endl; + return a; + } + + T subScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong subScalar!" << std::endl; + return a; + } + + T multVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong multVec!" << std::endl; + return a; + } + + T multScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong multScalar!" << std::endl; + return a; + } + + T divVec(const T& val1, const T& val2) + { + T a; + std::cout << "Wrong divVec!" << std::endl; + return a; + } + + T divScalar(const T& val1, const float& val2) + { + T a; + std::cout << "Wrong divScalar!" << std::endl; + return a; + } + + bool assert_same(const T& val1, const T& val2, const std::string& str = ""); + + void TearDown() override + { + HANDLE_ERROR(cudaFree(scalar_d)); + HANDLE_ERROR(cudaFree(input1_d)); + HANDLE_ERROR(cudaFree(input2_d)); + HANDLE_ERROR(cudaFree(output_d)); + } +}; + +template <> +void CudaFloatStructsTests::initializeStruct(float2& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float2(dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float2& val1, const float2& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = result && (val1.y == val2.y); + if (!result) + { + printf("(%f, %f) != (%f, %f)", val1.x, val1.y, val2.x, val2.y); + std::cout << str << std::endl; + } + return result; +} + +template <> +float2 CudaFloatStructsTests::addVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::multVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::addScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::multScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::subVec(const float2& val1, const float2& val2) +{ + float2 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::divVec(const float2& val1, const float2& val2) +{ + float2 output; + 
output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + return output; +} + +template <> +float2 CudaFloatStructsTests::subScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + return output; +} + +template <> +float2 CudaFloatStructsTests::divScalar(const float2& val1, const float& val2) +{ + float2 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + return output; +} + +template <> +void CudaFloatStructsTests::initializeStruct(float4& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float4(dist(gen), dist(gen), dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float4& val1, const float4& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = result && (val1.y == val2.y); + result = result && (val1.z == val2.z); + result = result && (val1.w == val2.w); + if (!result) + { + printf("(%f, %f, %f, %f) != (%f, %f, %f, %f)", val1.x, val1.y, val1.z, val1.w, val2.x, val2.y, val2.z, val2.w); + std::cout << str << std::endl; + } + return result; +} + +template <> +float4 CudaFloatStructsTests::addVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + output.z = val1.z + val2.z; + output.w = val1.w + val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::multVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + output.z = val1.z * val2.z; + output.w = val1.w * val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::addScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + output.z = val1.z + val2; + output.w = val1.w + val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::multScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + output.z = val1.z * val2; + output.w = val1.w * val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::subVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + output.z = val1.z - val2.z; + output.w = val1.w - val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::divVec(const float4& val1, const float4& val2) +{ + float4 output; + output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + output.z = val1.z / val2.z; + output.w = val1.w / val2.w; + return output; +} + +template <> +float4 CudaFloatStructsTests::subScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + output.z = val1.z - val2; + output.w = val1.w - val2; + return output; +} + +template <> +float4 CudaFloatStructsTests::divScalar(const float4& val1, const float& val2) +{ + float4 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + output.z = val1.z / val2; + output.w = val1.w / val2; + return output; +} + +template <> +void CudaFloatStructsTests::initializeStruct(float3& val, std::mt19937& gen, + std::uniform_real_distribution& dist) +{ + val = make_float3(dist(gen), dist(gen), dist(gen)); +} + +template <> +bool CudaFloatStructsTests::assert_same(const float3& val1, const float3& val2, const std::string& str) +{ + bool result = true; + result = result && (val1.x == val2.x); + result = 
result && (val1.y == val2.y); + result = result && (val1.z == val2.z); + if (!result) + { + printf("(%f, %f, %f) != (%f, %f, %f)", val1.x, val1.y, val1.z, val2.x, val2.y, val2.z); + std::cout << str << std::endl; + } + return result; +} + +template <> +float3 CudaFloatStructsTests::addVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x + val2.x; + output.y = val1.y + val2.y; + output.z = val1.z + val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::multVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x * val2.x; + output.y = val1.y * val2.y; + output.z = val1.z * val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::addScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x + val2; + output.y = val1.y + val2; + output.z = val1.z + val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::multScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x * val2; + output.y = val1.y * val2; + output.z = val1.z * val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::subVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x - val2.x; + output.y = val1.y - val2.y; + output.z = val1.z - val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::divVec(const float3& val1, const float3& val2) +{ + float3 output; + output.x = val1.x / val2.x; + output.y = val1.y / val2.y; + output.z = val1.z / val2.z; + return output; +} + +template <> +float3 CudaFloatStructsTests::subScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x - val2; + output.y = val1.y - val2; + output.z = val1.z - val2; + return output; +} + +template <> +float3 CudaFloatStructsTests::divScalar(const float3& val1, const float& val2) +{ + float3 output; + output.x = val1.x / val2; + output.y = val1.y / val2; + output.z = val1.z / val2; + return output; +} + +using TYPE_TESTS = ::testing::Types; + +TYPED_TEST_SUITE(CudaFloatStructsTests, TYPE_TESTS); + +TYPED_TEST(CudaFloatStructsTests, VecAddScalar) +{ + using T = TypeParam; + T ground_truth_output = this->addScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu + this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarAddTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecSubScalar) +{ + using T = TypeParam; + T ground_truth_output = this->subScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu - this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { 
+ for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarSubTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecMultScalar) +{ + using T = TypeParam; + T ground_truth_output = this->multScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu * this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarMultTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecDivScalar) +{ + using T = TypeParam; + T ground_truth_output = this->divScalar(this->input1_cpu, this->scalar); + + this->output_cpu = this->input1_cpu / this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorScalarDivTestKernel<<>>(this->input1_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecAddVec) +{ + using T = TypeParam; + T ground_truth_output = this->addVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu + this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorAddTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} 
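
These typed tests exercise element-wise __host__ __device__ operator overloads on float2/float3/float4 supplied by the library's CUDA math utilities. As a reviewer aid, here is a representative sketch of the shape of the float2 overloads under test; it is an assumption about their form, not a copy of the library source:

// Illustrative only: element-wise overloads of the kind these tests compare
// against the hand-rolled addVec()/multScalar() reference implementations.
__host__ __device__ inline float2 operator+(const float2& a, const float2& b)
{
  return make_float2(a.x + b.x, a.y + b.y);  // vector-vector add, per component
}

__host__ __device__ inline float2 operator*(const float2& a, const float s)
{
  return make_float2(a.x * s, a.y * s);  // vector-scalar multiply, per component
}

Each test then launches a trivial one-element kernel across many block/thread shapes to confirm the same overload produces identical results on host and device.
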
+ +TYPED_TEST(CudaFloatStructsTests, VecSubVec) +{ + using T = TypeParam; + T ground_truth_output = this->subVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu - this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorSubTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecDivVec) +{ + using T = TypeParam; + T ground_truth_output = this->divVec(this->input1_cpu, this->input2_cpu); + + this->output_cpu = this->input1_cpu / this->input2_cpu; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorDivTestKernel<<>>(this->input1_d, this->input2_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} + +TYPED_TEST(CudaFloatStructsTests, VecAddVecMultScalar) +{ + using T = TypeParam; + T ground_truth_output = this->addVec(this->input1_cpu, this->multScalar(this->input2_cpu, this->scalar)); + + this->output_cpu = this->input1_cpu + this->input2_cpu * this->scalar; + std::string cpu_fail = "Doesn't pass cpu test"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_cpu)) << cpu_fail; + + // Parallelization tests + T zero = this->output_gpu; + for (int blocks = 1; blocks < 10; blocks++) + { + for (int threads = 1; threads < 128; threads++) + { + HANDLE_ERROR(cudaMemcpy(this->output_d, &zero, sizeof(T), cudaMemcpyHostToDevice)); + VectorVectorScalarAddMultTestKernel + <<>>(this->input1_d, this->input2_d, this->scalar_d, this->output_d); + HANDLE_ERROR(cudaMemcpy(&this->output_gpu, this->output_d, sizeof(T), cudaMemcpyDeviceToHost)); + std::string fail_gpu = + " failed with " + std::to_string(blocks) + " blocks of " + std::to_string(threads) + " threads"; + ASSERT_TRUE(this->assert_same(ground_truth_output, this->output_gpu, fail_gpu)) << fail_gpu; + } + } +} From 6d1d0609f9299e102a97fc4eb21c64159ca99228 Mon Sep 17 00:00:00 2001 From: Bogdan Vlahov Date: Tue, 3 Sep 2024 13:18:42 -0400 Subject: [PATCH 4/4] Add tests for sampling distributions and bugfixes - Make computeLikelihoodRatioCost with Eigen match device version of the method in GaussianDistribution - Add an Eigen version of computeFeedbackCost() in base sampling distribution class - Add Gaussian-specific unit tests to ensure Gaussian distributions are 
Gaussian as well as ensure that the number of distributions asked
  for does not go over the max allowed in the GaussianParams struct
- Add generic unit tests for sampling distributions that check that
  they construct correctly, give access to the same controls on host
  and device, calculate the same costs between host and device, and
  ensure that the number of distributions, timesteps and samples can
  be adjusted.
---
 .../colored_noise/colored_noise.cu            |   2 +-
 .../gaussian/gaussian.cu                      |  48 +-
 .../gaussian/gaussian.cuh                     |  18 +-
 .../mppi/sampling_distributions/nln/nln.cu    |   2 +-
 .../sampling_distribution.cuh                 |  18 +-
 .../cost_generic_kernel_tests.cu              |   2 +-
 .../gaussian_noise_tests.cu                   | 245 +++++++
 .../generic_sampling_distribution_tests.cu    | 637 ++++++++++++++++++
 8 files changed, 948 insertions(+), 24 deletions(-)
 create mode 100644 tests/sampling_distributions/gaussian_noise_tests.cu
 create mode 100644 tests/sampling_distributions/generic_sampling_distribution_tests.cu

diff --git a/include/mppi/sampling_distributions/colored_noise/colored_noise.cu b/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
index 51b0876..0e10905 100644
--- a/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
+++ b/include/mppi/sampling_distributions/colored_noise/colored_noise.cu
@@ -298,7 +298,7 @@ __host__ void COLORED_NOISE::generateSamples(const int& optimization_stride, con
   const int freq_size = sample_freq.size();
   int smaller_index = 0;
-  const int local_control_dim = this->CONTROL_DIM; // Needed for methods which use pass by reference
+  const int local_control_dim = this->CONTROL_DIM;  // Needed for methods which use pass by reference
   Eigen::MatrixXf sample_freqs(freq_size, local_control_dim);
 
   // Adjust the weighting of each frequency by the exponents
diff --git a/include/mppi/sampling_distributions/gaussian/gaussian.cu b/include/mppi/sampling_distributions/gaussian/gaussian.cu
index 09a6cde..9841906 100644
--- a/include/mppi/sampling_distributions/gaussian/gaussian.cu
+++ b/include/mppi/sampling_distributions/gaussian/gaussian.cu
@@ -495,13 +495,8 @@ __host__ __device__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const float
   float* control_cost_coeff = params_p->control_cost_coeff;
 
   float cost = 0.0f;
-#ifdef __CUDA_ARCH__
-  int i = threadIdx.y;
-  int step = blockDim.y;
-#else
-  int i = 0;
-  int step = 1;
-#endif
+  int i, step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(i, step);
 
   if (CONTROL_DIM % 4 == 0)
   {
@@ -584,13 +579,8 @@ __host__ __device__ float GAUSSIAN_CLASS::computeFeedbackCost(const float* __res
   float* control_cost_coeff = params_p->control_cost_coeff;
 
   float cost = 0.0f;
-#ifdef __CUDA_ARCH__
-  int i = threadIdx.y;
-  int step = blockDim.y;
-#else
-  int i = 0;
-  int step = 1;
-#endif
+  int i, step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(i, step);
 
   if (CONTROL_DIM % 4 == 0)
   {
@@ -629,7 +619,26 @@ __host__ __device__ float GAUSSIAN_CLASS::computeFeedbackCost(const float* __res
 }
 
 GAUSSIAN_TEMPLATE
-__host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::Ref<const control_array>& u, const int t,
+__host__ float GAUSSIAN_CLASS::computeFeedbackCost(const Eigen::Ref<const control_array>& u_fb, const int t,
+                                                   const int distribution_idx, const float lambda, const float alpha)
+{
+  float cost = 0.0f;
+  const int distribution_i = distribution_idx >= this->params_.num_distributions ?
0 : distribution_idx; + float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); + if (this->params_.time_specific_std_dev) + { + std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); + } + for (int i = 0; i < CONTROL_DIM; i++) + { + cost += this->params_.control_cost_coeff[i] * u_fb(i) * u_fb(i) / (std_dev[i] * std_dev[i]); + } + return 0.5f * lambda * (1.0f - alpha) * cost; +} + +GAUSSIAN_TEMPLATE +__host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::Ref& u, + const int sample_index, const int t, const int distribution_idx, const float lambda, const float alpha) { @@ -638,16 +647,21 @@ __host__ float GAUSSIAN_CLASS::computeLikelihoodRatioCost(const Eigen::RefgetNumTimesteps() + t) * CONTROL_DIM; float* mean = &(this->means_[mean_index]); float* std_dev = &(this->params_.std_dev[CONTROL_DIM * distribution_i]); + control_array zero_mean = control_array::Zero(); + if (sample_index >= (1.0f - this->params_.pure_noise_trajectories_percentage) * this->params_.num_rollouts) + { + mean = zero_mean.data(); + } if (this->params_.time_specific_std_dev) { std_dev = &(this->params_.std_dev[(distribution_i * this->getNumTimesteps() + t) * CONTROL_DIM]); } for (int i = 0; i < CONTROL_DIM; i++) { - cost += this->params_.control_cost_coeff[i] * mean[i] * (mean[i] + 2.0f * u(i)) / + cost += this->params_.control_cost_coeff[i] * mean[i] * (mean[i] - 2.0f * u(i)) / (std_dev[i] * std_dev[i]); // Proper way } - return cost; + return 0.5f * lambda * (1.0f - alpha) * cost; } GAUSSIAN_TEMPLATE diff --git a/include/mppi/sampling_distributions/gaussian/gaussian.cuh b/include/mppi/sampling_distributions/gaussian/gaussian.cuh index f2ce267..3becc20 100644 --- a/include/mppi/sampling_distributions/gaussian/gaussian.cuh +++ b/include/mppi/sampling_distributions/gaussian/gaussian.cuh @@ -125,12 +125,15 @@ public: const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, + const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + __host__ __device__ float computeLikelihoodRatioCost(const float* __restrict__ u, float* __restrict__ theta_d, const int sample_index, const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); - __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int t, - const int distribution_idx, const float lambda = 1.0, + __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, + const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); __host__ void copyImportanceSamplerToDevice(const float* importance_sampler, const int& distribution_idx, @@ -146,6 +149,17 @@ public: __host__ void setHostOptimalControlSequence(float* optimal_control_trajectory, const int& distribution_idx, bool synchronize = true); + __host__ void setNumDistributions(const int num_distributions, bool synchronize = false) + { + if (num_distributions > SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS) + { + this->logger_->error("GaussianParams can't handle more than %d distributions but %d were requested\n", + SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS, num_distributions); + throw std::out_of_range("Can't set num distributions higher than allowed in params"); + } + PARENT_CLASS::setNumDistributions(num_distributions, synchronize); + } + __host__ void updateDistributionParamsFromDevice(const float* trajectory_weights_d, float 
normalizer, const int& distribution_i, bool synchronize = false) override; diff --git a/include/mppi/sampling_distributions/nln/nln.cu b/include/mppi/sampling_distributions/nln/nln.cu index 179a1e5..4ce40e8 100644 --- a/include/mppi/sampling_distributions/nln/nln.cu +++ b/include/mppi/sampling_distributions/nln/nln.cu @@ -74,7 +74,7 @@ void NLN_NOISE::setParams(const SAMPLING_PARAMS_T& params, bool synchronize) bool adjusted_variance = false; for (int i = 0; i < this->CONTROL_DIM * this->getNumDistributions(); i++) { - if (this->params.std_dev[i] != params.std_dev[i]) + if (this->params_.std_dev[i] != params.std_dev[i]) { adjusted_variance = true; break; diff --git a/include/mppi/sampling_distributions/sampling_distribution.cuh b/include/mppi/sampling_distributions/sampling_distribution.cuh index 1c156b8..eb591ff 100644 --- a/include/mppi/sampling_distributions/sampling_distribution.cuh +++ b/include/mppi/sampling_distributions/sampling_distribution.cuh @@ -325,6 +325,20 @@ public: const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + /** + * @brief host method to calculate the cost for feedback based on the sampling distribution in RMPPI + * + * @param u_fb - feedback control + * @param t - timestep + * @param distribution_idx - distribution index + * @param lambda - MPPI temperature parameter + * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood + * + * @return feedback cost + */ + __host__ float computeFeedbackCost(const Eigen::Ref& u_fb, const int t, + const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); + /** * @brief Device method to calculate the likelihood ratio cost for a given sample u * @@ -352,8 +366,8 @@ public: * @param alpha - coeff to turn off the likelihood cost (set to 1 -> no likelihood cost, set to 0 -> all likelihood * cost) */ - __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int t, - const int distribution_idx, const float lambda = 1.0, + __host__ float computeLikelihoodRatioCost(const Eigen::Ref& u, const int sample_index, + const int t, const int distribution_idx, const float lambda = 1.0, const float alpha = 0.0); /** diff --git a/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu b/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu index 8a0b382..3204926 100644 --- a/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu +++ b/tests/include/kernel_tests/cost_functions/cost_generic_kernel_tests.cu @@ -442,7 +442,7 @@ void checkGPURolloutCost(COST_T& cost, float dt) int crash = 0; float cpu_cost = cost.computeRunningCost(y_index, u_index, t, &crash); #ifdef USE_ROLLOUT_COST_KERNEL - cpu_cost += sampler.computeLikelihoodRatioCost(u_index, t, 0); + cpu_cost += sampler.computeLikelihoodRatioCost(u_index, n, t, 0); total_cost += cpu_cost; #else ASSERT_NEAR(cpu_cost, output_cost_gpu[index], cpu_cost * 0.0015) diff --git a/tests/sampling_distributions/gaussian_noise_tests.cu b/tests/sampling_distributions/gaussian_noise_tests.cu new file mode 100644 index 0000000..eb64728 --- /dev/null +++ b/tests/sampling_distributions/gaussian_noise_tests.cu @@ -0,0 +1,245 @@ +#include +#include +#include +#include +#include + +namespace ms = mppi::sampling_distributions; + +template +class GaussianTests : public ::testing::Test +{ +public: + using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T; + static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS); 
+ + typedef Eigen::Matrix output_array; + int num_samples_ = 1000; + int num_timesteps_ = 100; + int num_distributions_ = 1; + int num_verifications_ = 0; + int* sampling_indices_d_ = nullptr; + int* times_d_ = nullptr; + int* distribution_indices_d_ = nullptr; + float* outputs_d_ = nullptr; + float* controls_d_ = nullptr; + float* costs_d_ = nullptr; + float lambda_ = 1.0f; + float alpha_ = 0.0f; + + std::shared_ptr sampler_ = nullptr; + cudaStream_t stream_ = 0; + curandGenerator_t gen_; + dim3 thread_block_ = dim3(32, 1, 1); + + std::vector sampled_indices_; + std::vector sampled_times_; + std::vector sampled_distributions_; + std::vector means_cpu_; + + void SetUp() override + { + cudaStreamCreate(&stream_); + sampler_ = std::make_shared(stream_); + updateSamplerSizes(); + sampler_->GPUSetup(); + HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT)); + HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42)); + } + + void TearDown() override + { + } + + void updateSamplerSizes() + { + sampler_->setNumTimesteps(num_timesteps_); + sampler_->setNumRollouts(num_samples_); + sampler_->setNumDistributions(num_distributions_); + means_cpu_.resize(num_distributions_); + } + + void setRandomMeans() + { + for (int i = 0; i < num_distributions_; i++) + { + Eigen::MatrixXf mean_i = 5 * Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_); + sampler_->copyImportanceSamplerToDevice(mean_i.data(), i, false); + means_cpu_[i] = mean_i; + } + } +}; + +template +using GaussianParams3 = ms::GaussianParamsImpl; + +template +class TestGaussianDistribution + : public ms::GaussianDistributionImpl, GaussianParams3, DYN_PARAMS_T> +{ +public: + using PARENT_CLASS = ms::GaussianDistributionImpl; + using SAMPLING_PARAMS_T = typename PARENT_CLASS::SAMPLING_PARAMS_T; + + TestGaussianDistribution(cudaStream_t stream = 0) : PARENT_CLASS(stream) + { + } + TestGaussianDistribution(const SAMPLING_PARAMS_T& params, cudaStream_t stream = 0) : PARENT_CLASS(params, stream) + { + } +}; + +using TYPE_TESTS = ::testing::Types>, + ms::GaussianDistribution>, + TestGaussianDistribution>>; + +TYPED_TEST_SUITE(GaussianTests, TYPE_TESTS); + +TYPED_TEST(GaussianTests, SetNumDistributions) +{ + using T = TypeParam; + this->num_distributions_ = 3; + if (this->num_distributions_ > T::SAMPLING_PARAMS_T::MAX_DISTRIBUTIONS) + { + EXPECT_THROW(this->sampler_->setNumDistributions(this->num_distributions_), std::out_of_range); + } + else + { + this->sampler_->setNumDistributions(this->num_distributions_); + auto params = this->sampler_->getParams(); + EXPECT_EQ(this->num_distributions_, params.num_distributions); + } +} + +TYPED_TEST(GaussianTests, CheckSamplesAreGaussian) +{ + using T = TypeParam; + float std_dev = 2.3f; + auto params = this->sampler_->getParams(); + params.pure_noise_trajectories_percentage = 0.0f; + for (int i = 0; i < this->num_distributions_; i++) + { + for (int j = 0; j < T::CONTROL_DIM; j++) + { + params.std_dev[j + i * T::CONTROL_DIM] = std_dev; + } + } + this->sampler_->setParams(params); + this->setRandomMeans(); + this->setRandomMeans(); + + this->sampler_->generateSamples(0, 0, this->gen_, false); + std::vector sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ * + this->num_distributions_); + HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0), + sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ * + T::CONTROL_DIM, + cudaMemcpyDeviceToHost, this->stream_)); + 
HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  int k_bins = 3;
+  std::vector<int> binned_counts(k_bins, 0);
+  for (int d = 0; d < this->num_distributions_; d++)
+  {
+    for (int s = 0; s < this->num_samples_; s++)
+    {
+      for (int t = 0; t < this->num_timesteps_; t++)
+      {
+        for (int i = 0; i < T::CONTROL_DIM; i++)
+        {
+          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
+          float z_score = (sampled_controls[index] - this->means_cpu_[d](i, t)) / std_dev;
+          for (int j = k_bins - 1; j >= 0; j--)
+          {
+            if (fabsf(z_score) < j + 1)
+            {
+              binned_counts[j]++;
+            }
+            else
+            {
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check how many samples are properly sampled
+  std::vector<float> normalized_bins(k_bins, 0.0f);
+  std::vector<double> sigma_rules(k_bins);
+  for (int i = 0; i < k_bins; i++)
+  {
+    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
+    normalized_bins[i] = (double)binned_counts[i] /
+                         (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
+    double diff = fabs(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
+    EXPECT_NEAR(diff, 0.0, 1e-3);
+  }
+}
+
+TYPED_TEST(GaussianTests, CheckZeroMeanSamplesAreGaussian)
+{
+  using T = TypeParam;
+  auto params = this->sampler_->getParams();
+  params.pure_noise_trajectories_percentage = 1.0f;
+  float std_dev = 2.3f;
+  for (int i = 0; i < this->num_distributions_; i++)
+  {
+    for (int j = 0; j < T::CONTROL_DIM; j++)
+    {
+      params.std_dev[j + i * T::CONTROL_DIM] = std_dev;
+    }
+  }
+  this->sampler_->setParams(params);
+  this->setRandomMeans();  // should end up not affecting the samples
+
+  this->sampler_->generateSamples(0, 0, this->gen_, false);
+  std::vector<float> sampled_controls(T::CONTROL_DIM * this->num_timesteps_ * this->num_samples_ *
+                                      this->num_distributions_);
+  HANDLE_ERROR(cudaMemcpyAsync(sampled_controls.data(), this->sampler_->getControlSample(0, 0, 0),
+                               sizeof(float) * this->num_samples_ * this->num_timesteps_ * this->num_distributions_ *
+                                   T::CONTROL_DIM,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  int k_bins = 3;
+  std::vector<int> binned_counts(k_bins, 0);
+  for (int d = 0; d < this->num_distributions_; d++)
+  {
+    for (int s = 0; s < this->num_samples_; s++)
+    {
+      for (int t = 0; t < this->num_timesteps_; t++)
+      {
+        for (int i = 0; i < T::CONTROL_DIM; i++)
+        {
+          int index = ((d * this->num_samples_ + s) * this->num_timesteps_ + t) * T::CONTROL_DIM + i;
+          float z_score = (sampled_controls[index]) / std_dev;
+          for (int j = k_bins - 1; j >= 0; j--)
+          {
+            if (fabsf(z_score) < j + 1)
+            {
+              binned_counts[j]++;
+            }
+            else
+            {
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check how many samples are properly sampled
+  std::vector<float> normalized_bins(k_bins, 0.0f);
+  std::vector<double> sigma_rules(k_bins);
+  for (int i = 0; i < k_bins; i++)
+  {
+    sigma_rules[i] = 0.5 * (erf((i + 1) / sqrt(2.0)) - erf(-(i + 1) / sqrt(2.0)));
+    normalized_bins[i] = (double)binned_counts[i] /
+                         (this->num_distributions_ * this->num_samples_ * this->num_timesteps_ * T::CONTROL_DIM);
+    double diff = fabs(normalized_bins[i] - sigma_rules[i]) / sigma_rules[i];
+    EXPECT_NEAR(diff, 0.0, 1e-3);
+  }
+}
diff --git a/tests/sampling_distributions/generic_sampling_distribution_tests.cu b/tests/sampling_distributions/generic_sampling_distribution_tests.cu
new file mode 100644
index 0000000..03e8abc
--- /dev/null
+++ b/tests/sampling_distributions/generic_sampling_distribution_tests.cu
@@ -0,0 +1,637 @@
+#include
+#include +#include +#include + +#if __cplusplus < 201703L // std::void_t is a C++17 feature, used for inherited_from_gaussian struct +namespace std +{ +template +struct make_void +{ + typedef void type; +}; + +template +using void_t = typename make_void::type; +} // namespace std +#endif + +namespace ms = mppi::sampling_distributions; + +template +__global__ void getControlSamplesKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, + const int* __restrict__ sample_indexes_d, const int* __restrict__ times_d, + const int* __restrict__ distribution_indexes_d, + const float* __restrict__ outputs_d, float* __restrict__ control_samples_d) +{ + const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim); + using OutputIndex = typename SAMPLER_T::OutputIndex; + const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS); + extern __shared__ float entire_buffer[]; + float* theta_d_shared = &entire_buffer[0 / sizeof(float)]; + float* outputs_shared = &entire_buffer[size_of_theta_d_bytes / sizeof(float)]; // THREAD_BLOCK_X * OUTPUT_DIM in size + + int x_index, x_step; + mppi::p1::getParallel1DIndex(x_index, x_step); + int y_index, y_step; + mppi::p1::getParallel1DIndex(y_index, y_step); + int test_index = threadIdx.x + blockDim.x * blockIdx.x; + // load output into shared memory + if (test_index < num_control_samples) + { + for (int j = y_index; j < OUTPUT_DIM; j += y_step) + { + outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM]; + } + } + __syncthreads(); + float* output = &outputs_shared[x_index * OUTPUT_DIM]; + // initialize sampling distributions + sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared); + __syncthreads(); + // get control samples + if (test_index < num_control_samples) + { + int sample_index = sample_indexes_d[test_index]; + int t = times_d[test_index]; + int distribution_idx = distribution_indexes_d[test_index]; + float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM]; + sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index, + output); + } +} + +template +__global__ void getLikelihoodCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda, + float alpha, const int* __restrict__ sample_indexes_d, + const int* __restrict__ times_d, const int* __restrict__ distribution_indexes_d, + const float* __restrict__ outputs_d, float* __restrict__ control_samples_d, + float* __restrict__ costs_d) +{ + const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim); + using OutputIndex = typename SAMPLER_T::OutputIndex; + const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS); + extern __shared__ float entire_buffer[]; + float* theta_d_shared = &entire_buffer[0 / sizeof(float)]; + float* outputs_shared = + &theta_d_shared[size_of_theta_d_bytes / sizeof(float)]; // THREAD_BLOCK_X * OUTPUT_DIM in size + float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM]; + int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); + float* running_cost = &running_cost_shared[running_cost_index]; + running_cost[0] = 0.0f; + + int x_index, x_step; + mppi::p1::getParallel1DIndex(x_index, x_step); + int y_index, y_step; + mppi::p1::getParallel1DIndex(y_index, y_step); + int test_index = threadIdx.x + blockDim.x * blockIdx.x; + // load output into shared memory + if (test_index < num_control_samples) + { + for (int j = y_index; j < 
+  __syncthreads();
+  float* output = &outputs_shared[x_index * OUTPUT_DIM];
+  // initialize sampling distributions
+  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
+  __syncthreads();
+  // get control samples
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
+                               output);
+  }
+  __syncthreads();
+  // Calculate likelihood ratio cost
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    running_cost[0] =
+        sampler->computeLikelihoodRatioCost(u, theta_d_shared, sample_index, t, distribution_idx, lambda, alpha);
+  }
+  // __syncthreads();
+  // if (threadIdx.x == 1 && threadIdx.y + threadIdx.z == 0 && blockIdx.x == 0)
+  // {
+  //   printf("Running costs:\n");
+  //   for (int i = 0; i < blockDim.x; i++)
+  //   {
+  //     float cost_sum = 0.0f;
+  //     for (int j = 0; j < blockDim.y; j++)
+  //     {
+  //       cost_sum += running_cost_shared[i + j * blockDim.x];
+  //       printf("(%2d, %2d): %6.3f\n", i, j, running_cost_shared[i + j * blockDim.x]);
+  //     }
+  //     printf("Sum of y dim for %2d: %8.5f\n", i, cost_sum);
+  //   }
+  // }
+  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
+  __syncthreads();
+  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
+                                    blockDim.x);
+  if (test_index < num_control_samples)
+  {
+    costs_d[test_index] = running_cost[x_index];
+  }
+}
+
+template <class SAMPLER_T>
+__global__ void getFeedbackCostKernel(SAMPLER_T* __restrict__ sampler, const int num_control_samples, float lambda,
+                                      float alpha, const int* __restrict__ sample_indexes_d,
+                                      const int* __restrict__ times_d, const int* __restrict__ distribution_indexes_d,
+                                      const float* __restrict__ outputs_d, float* __restrict__ control_samples_d,
+                                      float* __restrict__ costs_d)
+{
+  const int size_of_theta_d_bytes = mppi::kernels::calcClassSharedMemSize(sampler, blockDim);
+  using OutputIndex = typename SAMPLER_T::OutputIndex;
+  const int OUTPUT_DIM = E_INDEX(OutputIndex, NUM_OUTPUTS);
+  extern __shared__ float entire_buffer[];
+  float* theta_d_shared = &entire_buffer[0 / sizeof(float)];
+  float* outputs_shared =
+      &theta_d_shared[size_of_theta_d_bytes / sizeof(float)];  // THREAD_BLOCK_X * OUTPUT_DIM in size
+  float* running_cost_shared = &outputs_shared[blockDim.x * OUTPUT_DIM];
+  int running_cost_index = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
+  float* running_cost = &running_cost_shared[running_cost_index];
+  running_cost[0] = 0.0f;
+
+  int x_index, x_step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_X>(x_index, x_step);
+  int y_index, y_step;
+  mppi::p1::getParallel1DIndex<mppi::p1::Parallel1Dir::THREAD_Y>(y_index, y_step);
+  int test_index = threadIdx.x + blockDim.x * blockIdx.x;
+  // load output into shared memory
+  if (test_index < num_control_samples)
+  {
+    for (int j = y_index; j < OUTPUT_DIM; j += y_step)
+    {
+      outputs_shared[j + x_index * OUTPUT_DIM] = outputs_d[j + test_index * OUTPUT_DIM];
+    }
+  }
+  __syncthreads();
+  float* output = &outputs_shared[x_index * OUTPUT_DIM];
+  // initialize sampling distributions
+  sampler->initializeDistributions(output, 0.0f, 0.01f, theta_d_shared);
+  __syncthreads();
+  // get control samples
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u_to_save_to = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    sampler->readControlSample(sample_index, t, distribution_idx, u_to_save_to, theta_d_shared, y_step, y_index,
+                               output);
+  }
+  __syncthreads();
+  // Calculate feedback control cost using random control samples from the distribution
+  if (test_index < num_control_samples)
+  {
+    int sample_index = sample_indexes_d[test_index];
+    int t = times_d[test_index];
+    int distribution_idx = distribution_indexes_d[test_index];
+    float* u = &control_samples_d[test_index * SAMPLER_T::CONTROL_DIM];
+    running_cost[0] = sampler->computeFeedbackCost(u, theta_d_shared, t, distribution_idx, lambda, alpha);
+  }
+  running_cost = &running_cost_shared[blockDim.x * blockDim.y * threadIdx.z];
+  __syncthreads();
+  mppi::kernels::costArrayReduction(&running_cost[x_index], blockDim.y, threadIdx.y, blockDim.y, threadIdx.y == 0,
+                                    blockDim.x);
+  if (test_index < num_control_samples)
+  {
+    costs_d[test_index] = running_cost[x_index];
+  }
+}
+
+template <class SAMPLER_T>
+class SamplingDistributionTests : public ::testing::Test
+{
+public:
+  using SAMPLER_PARAMS_T = typename SAMPLER_T::SAMPLING_PARAMS_T;
+  static const int OUTPUT_DIM = E_INDEX(SAMPLER_T::OutputIndex, NUM_OUTPUTS);
+  typedef Eigen::Matrix<float, OUTPUT_DIM, 1> output_array;
+  int num_samples_ = 1000;
+  int num_timesteps_ = 100;
+  int num_distributions_ = 1;
+  int num_verifications_ = 0;
+  int* sampling_indices_d_ = nullptr;
+  int* times_d_ = nullptr;
+  int* distribution_indices_d_ = nullptr;
+  float* outputs_d_ = nullptr;
+  float* controls_d_ = nullptr;
+  float* costs_d_ = nullptr;
+  float lambda_ = 1.0f;
+  float alpha_ = 0.0f;
+
+  std::shared_ptr<SAMPLER_T> sampler_ = nullptr;
+  cudaStream_t stream_ = 0;
+  curandGenerator_t gen_;
+  dim3 thread_block_ = dim3(32, 2, 1);
+
+  std::vector<int> sampled_indices_;
+  std::vector<int> sampled_times_;
+  std::vector<int> sampled_distributions_;
+  util::EigenAlignedVector<float, OUTPUT_DIM, 1> sampled_outputs_;
+  util::EigenAlignedVector<float, SAMPLER_T::CONTROL_DIM, 1> sampled_controls_cpu_;
+  util::EigenAlignedVector<float, SAMPLER_T::CONTROL_DIM, 1> sampled_controls_gpu_;
+
+  void SetUp() override
+  {
+    cudaStreamCreate(&stream_);
+    sampler_ = std::make_shared<SAMPLER_T>(stream_);
+    sampler_->GPUSetup();
+    setUpCudaMemory(3000);
+    HANDLE_CURAND_ERROR(curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT));
+    HANDLE_CURAND_ERROR(curandSetPseudoRandomGeneratorSeed(gen_, 42));
+  }
+
+  template <class T>
+  void freeCudaPtr(T*& ptr)
+  {
+    if (ptr != nullptr)
+    {
+      cudaFree(ptr);
+    }
+    ptr = nullptr;
+  }
+
+  void setUpCudaMemory(const int num_verifications)
+  {
+    if (num_verifications > num_verifications_)
+    {
+      freeCudaPtr(outputs_d_);
+      freeCudaPtr(sampling_indices_d_);
+      freeCudaPtr(times_d_);
+      freeCudaPtr(distribution_indices_d_);
+      freeCudaPtr(controls_d_);
+      freeCudaPtr(costs_d_);
+
+      // Allocate GPU memory
+      HANDLE_ERROR(cudaMalloc((void**)&sampling_indices_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&times_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&distribution_indices_d_, sizeof(int) * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&outputs_d_, sizeof(float) * OUTPUT_DIM * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&controls_d_, sizeof(float) * SAMPLER_T::CONTROL_DIM * num_verifications));
+      HANDLE_ERROR(cudaMalloc((void**)&costs_d_, sizeof(float) * num_verifications));
+
+      num_verifications_ = num_verifications;
+    }
+  }
+
+  void generateNewSamples()
+  {
+    // Create sample index
+    std::vector<int> samples =
+        mppi::math::sample_without_replacement(num_verifications_, num_timesteps_ * num_samples_ * num_distributions_);
+
+    // Fill in sampled indices and copy to GPU
+    sampled_indices_.resize(num_verifications_);
+    sampled_times_.resize(num_verifications_);
+    sampled_distributions_.resize(num_verifications_);
+    sampled_outputs_.resize(num_verifications_);
+    for (int i = 0; i < num_verifications_; i++)
+    {
+      const int sample = samples[i];
+      sampled_indices_[i] = (sample / (num_timesteps_ * num_distributions_)) % num_samples_;
+      sampled_times_[i] = (sample / num_distributions_) % num_timesteps_;
+      sampled_distributions_[i] = sample % num_distributions_;
+      sampled_outputs_[i] = output_array::Random();
+      HANDLE_ERROR(cudaMemcpyAsync(outputs_d_ + i * OUTPUT_DIM, sampled_outputs_[i].data(), sizeof(float) * OUTPUT_DIM,
+                                   cudaMemcpyHostToDevice, stream_));
+    }
+    HANDLE_ERROR(cudaMemcpyAsync(sampling_indices_d_, sampled_indices_.data(), sizeof(int) * num_verifications_,
+                                 cudaMemcpyHostToDevice, stream_));
+    HANDLE_ERROR(cudaMemcpyAsync(times_d_, sampled_times_.data(), sizeof(int) * num_verifications_,
+                                 cudaMemcpyHostToDevice, stream_));
+    HANDLE_ERROR(cudaMemcpyAsync(distribution_indices_d_, sampled_distributions_.data(),
+                                 sizeof(int) * num_verifications_, cudaMemcpyHostToDevice, stream_));
+
+    // Create non-zero mean
+    Eigen::MatrixXf random_mean = Eigen::MatrixXf::Random(SAMPLER_T::CONTROL_DIM, num_timesteps_);
+    for (int i = 0; i < num_distributions_; i++)
+    {
+      sampler_->copyImportanceSamplerToDevice(random_mean.data(), i, false);
+    }
+    float mean_update = 1.0f;
+    sampled_controls_cpu_.resize(num_verifications_);
+    sampled_controls_gpu_.resize(num_verifications_);
+    sampler_->generateSamples(0, 0, gen_, false);
+    HANDLE_ERROR(cudaMemcpyAsync(costs_d_, &mean_update, sizeof(float), cudaMemcpyHostToDevice, stream_));
+    for (int i = 0; i < num_distributions_; i++)
+    {
+      sampler_->updateDistributionParamsFromDevice(costs_d_, 1.0f, i, false);
+    }
+    HANDLE_ERROR(cudaStreamSynchronize(stream_));
+  }
+
+  void updateSampler()
+  {
+    sampler_->setNumTimesteps(num_timesteps_);
+    sampler_->setNumRollouts(num_samples_);
+    sampler_->setNumDistributions(num_distributions_);
+  }
+
+  void TearDown() override
+  {
+    HANDLE_ERROR(cudaStreamSynchronize(stream_));
+    freeCudaPtr(sampling_indices_d_);
+    freeCudaPtr(times_d_);
+    freeCudaPtr(distribution_indices_d_);
+    freeCudaPtr(outputs_d_);
+    freeCudaPtr(controls_d_);
+    freeCudaPtr(costs_d_);
+  }
+};
+
+/**
+ * Special setup for Gaussian-based distributions
+ **/
+template <class T, class Enable = void>
+struct inherited_from_gaussian : std::false_type
+{
+public:
+  void operator()(std::shared_ptr<T> sampler) const
+  {  // Empty setup
+  }
+};
+
+template <class T>
+struct inherited_from_gaussian<
+    T, std::void_t<typename std::enable_if<
+           std::is_base_of<ms::GaussianParamsImpl<T::CONTROL_DIM, T::MAX_DISTRIBUTIONS>,
+                           typename T::SAMPLING_PARAMS_T>::value,
+           bool>::type>> : std::true_type
+{
+public:
+  void operator()(std::shared_ptr<T> sampler) const
+  {  // Setup std dev and cost coefficients
+    auto params = sampler->getParams();
+    for (int i = 0; i < T::CONTROL_DIM; i++)
+    {
+      params.control_cost_coeff[i] = 1.0f;
+    }
+    for (int dist_i = 0; dist_i < params.num_distributions; dist_i++)
+    {
+      for (int std_dev_i = 0; std_dev_i < T::CONTROL_DIM; std_dev_i++)
+      {
+        params.std_dev[std_dev_i + dist_i * T::CONTROL_DIM] = 2.0f;
+      }
+    }
+    sampler->setParams(params);
+  }
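+  // The std::void_t/std::enable_if detection above selects this specialization
+  // only when T::SAMPLING_PARAMS_T derives from the Gaussian parameter struct
+  // (assumed here to be ms::GaussianParamsImpl; the original template arguments
+  // were garbled). That way the std_dev / control_cost_coeff setup runs only
+  // for Gaussian-family samplers, while everything else falls through to the
+  // empty primary template.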
+};
+
+using TYPE_TESTS = ::testing::Types<
+    ms::GaussianDistribution>, ms::GaussianDistribution>,
+    ms::GaussianDistribution>, ms::GaussianDistribution>,
+    ms::GaussianDistribution>, ms::ColoredNoiseDistribution>,
+    ms::ColoredNoiseDistribution>, ms::ColoredNoiseDistribution>,
+    ms::ColoredNoiseDistribution>, ms::NLNDistribution>,
+    ms::NLNDistribution>, ms::NLNDistribution>,
+    ms::NLNDistribution>>;
+
+TYPED_TEST_SUITE(SamplingDistributionTests, TYPE_TESTS);
+
+TYPED_TEST(SamplingDistributionTests, TestCreation)
+{
+  using T = TypeParam;
+  EXPECT_TRUE(true);
+  inherited_from_gaussian<T>()(this->sampler_);
+}
+
+TYPED_TEST(SamplingDistributionTests, SetNumDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  this->updateSampler();
+  this->generateNewSamples();
+  EXPECT_TRUE(true);
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPU)
+{
+  using T = TypeParam;
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPUMoreDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, ReadControlsFromGPUDifferentNoiseForDistributions)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes = mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+                                 sizeof(float) * this->OUTPUT_DIM * this->thread_block_.x;
+
+  getControlSamplesKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->sampling_indices_d_, this->times_d_,
+      this->distribution_indices_d_, this->outputs_d_, this->controls_d_);
+  HANDLE_ERROR(cudaGetLastError());
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float diff = (this->sampled_controls_cpu_[i] - this->sampled_controls_gpu_[i]).array().abs().sum();
+    ASSERT_FLOAT_EQ(diff, 0.0f);
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, CompareLikelihoodRatioCostsCPUvsGPU)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  inherited_from_gaussian<T>()(this->sampler_);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes =
+      mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+      sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x +
+                       this->thread_block_.x * this->thread_block_.y * this->thread_block_.z);
+
+  getLikelihoodCostKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_,
+      this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_);
+  HANDLE_ERROR(cudaGetLastError());
+
+  std::vector<float> costs_gpu(this->num_verifications_);
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float cost = this->sampler_->computeLikelihoodRatioCost(this->sampled_controls_cpu_[i], this->sampled_indices_[i],
+                                                            this->sampled_times_[i], this->sampled_distributions_[i],
+                                                            this->lambda_, this->alpha_);
+    ASSERT_NEAR(cost, costs_gpu[i], fabsf(cost) * 5e-5)
+        << " failed on sample " << this->sampled_indices_[i] << ", time " << this->sampled_times_[i]
+        << ", dist: " << this->sampled_distributions_[i];
+  }
+}
+
+TYPED_TEST(SamplingDistributionTests, CompareFeedbackCostsCPUvsGPU)
+{
+  using T = TypeParam;
+  this->num_distributions_ = 2;
+  auto params = this->sampler_->getParams();
+  params.use_same_noise_for_all_distributions = false;
+  this->sampler_->setParams(params);
+  inherited_from_gaussian<T>()(this->sampler_);
+  this->updateSampler();
+  this->generateNewSamples();
+
+  dim3 grid_dim(mppi::math::int_ceil(this->num_verifications_, this->thread_block_.x), 1, 1);
+  std::size_t shared_mem_bytes =
+      mppi::kernels::calcClassSharedMemSize(this->sampler_.get(), this->thread_block_) +
+      sizeof(float) * (this->OUTPUT_DIM * this->thread_block_.x +
+                       this->thread_block_.x * this->thread_block_.y * this->thread_block_.z);
+
+  getFeedbackCostKernel<<<grid_dim, this->thread_block_, shared_mem_bytes, this->stream_>>>(
+      this->sampler_->sampling_d_, this->num_verifications_, this->lambda_, this->alpha_, this->sampling_indices_d_,
+      this->times_d_, this->distribution_indices_d_, this->outputs_d_, this->controls_d_, this->costs_d_);
+  HANDLE_ERROR(cudaGetLastError());
+
+  std::vector<float> costs_gpu(this->num_verifications_);
+  // Copy back to CPU
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_gpu_[i].data(), this->controls_d_ + i * T::CONTROL_DIM,
+                                 sizeof(float) * T::CONTROL_DIM, cudaMemcpyDeviceToHost, this->stream_));
+    // Query on CPU
+    float* u_i =
+        this->sampler_->getControlSample(this->sampled_indices_[i], this->sampled_times_[i],
+                                         this->sampled_distributions_[i], nullptr, this->sampled_outputs_[i].data());
+    HANDLE_ERROR(cudaMemcpyAsync(this->sampled_controls_cpu_[i].data(), u_i, sizeof(float) * T::CONTROL_DIM,
+                                 cudaMemcpyDeviceToHost, this->stream_));
+  }
+  HANDLE_ERROR(cudaMemcpyAsync(costs_gpu.data(), this->costs_d_, sizeof(float) * this->num_verifications_,
+                               cudaMemcpyDeviceToHost, this->stream_));
+  HANDLE_ERROR(cudaStreamSynchronize(this->stream_));
+
+  for (int i = 0; i < this->num_verifications_; i++)
+  {
+    float cost = this->sampler_->computeFeedbackCost(this->sampled_controls_cpu_[i], this->sampled_times_[i],
+                                                     this->sampled_distributions_[i], this->lambda_, this->alpha_);
+    ASSERT_FLOAT_EQ(cost, costs_gpu[i]) << " failed on sample " << this->sampled_indices_[i] << ", time "
+                                        << this->sampled_times_[i] << ", dist: " << this->sampled_distributions_[i];
+  }
+}
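+
+// For reference, the shared-memory layout assumed by every kernel launch in
+// this file, gathered into one helper. This is a sketch and is not called by
+// the tests above (they compute the size inline); `with_running_cost`
+// distinguishes the two cost kernels from the plain sample read-back kernel.
+template <class SAMPLER_T>
+std::size_t testKernelSharedMemSize(SAMPLER_T* sampler, const dim3& block, const int output_dim,
+                                    const bool with_running_cost)
+{
+  // theta_d_shared: per-class shared memory requested by the sampler
+  std::size_t bytes = mppi::kernels::calcClassSharedMemSize(sampler, block);
+  // outputs_shared: one output vector per thread in the x dimension
+  bytes += sizeof(float) * output_dim * block.x;
+  if (with_running_cost)
+  {
+    // running_cost_shared: one float per thread for the cost reduction
+    bytes += sizeof(float) * block.x * block.y * block.z;
+  }
+  return bytes;
+}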