From 6a18cb9ab4f7c1b6d514dcb7c14583ce68735fb4 Mon Sep 17 00:00:00 2001
From: "petro.zarytskyi" <petro.zarytskyi@gmail.com>
Date: Fri, 25 Oct 2024 19:21:49 +0200
Subject: [PATCH] Reimplement jacobians using the vectorized forward mode. 
 Previously, jacobians were based on the non-vectorized reverse mode, which
 was mostly incapable of capturing multiple outputs. The implementation worked
 in a few particular cases. For example, it was not possible to differentiate
 function calls or declare variables inside the original function body.  This
 PR implements jacobians using the vectorized forward mode. At the very least,
 this will solve the issues described above and give a way forward to solve
 other ones. This also means introducing features to the vectorized fwd mode
 will introduce the same features to jacobians.  Let's take a look at the new
 signature of jacobians. Since the function to be differentiated is expected
 to have multiple outputs, we should expect the output in the form of
 array/pointer/reference parameters (just like before). And for every output
 parameter, we should generate a corresponding adjoint parameter for the user
 to acquire the results. Since there is no way to specify which parameters are
 used as output and which are not, adjoints are generated for all
 array/pointer/reference parameters. For example:

```
void f(double a, double b, double* c)
 -->
void f_jac(double a, double b, double* c, <matrix<double>* _d_c)
```

or

```
void f(double a, double b, double* c, double[7] t)
 -->
void f_jac(double a, double b, double* c, double[7] t,
 array_ref<matrix<double>> _d_c, matrix<double>* _d_t)
```

This signature is also similar to the old one. e.g.
```
df.execute(a, b, c, result); // old behavior
df.execute(a, b, c, &result); // new behavior
```
However, the behavior differs for multiple output parameters, which the old jacobians did not support.

 Note: the same functionality can be achieved by using the vectorized reverse mode, which should probably be implemented at some point. However, the old code for jacobians is unlikely to be useful for that, and there is not much point in keeping it.

Fixes #472, Fixes #1057, Fixes #480, Fixes #527
---
 .../clad/Differentiator/BuiltinDerivatives.h  |  60 +--
 include/clad/Differentiator/Differentiator.h  |  14 +-
 include/clad/Differentiator/FunctionTraits.h  |  57 ++-
 .../clad/Differentiator/ReverseModeVisitor.h  |   4 -
 .../Differentiator/VectorForwardModeVisitor.h |   2 +-
 include/clad/Differentiator/VisitorBase.h     |   6 +
 lib/Differentiator/BaseForwardModeVisitor.cpp |  15 +-
 lib/Differentiator/CMakeLists.txt             |   1 +
 lib/Differentiator/DerivativeBuilder.cpp      |   5 +-
 lib/Differentiator/DiffPlanner.cpp            |   9 +-
 lib/Differentiator/JacobianModeVisitor.cpp    | 284 ++++++++++++++
 lib/Differentiator/JacobianModeVisitor.h      |  20 +
 lib/Differentiator/ReverseModeVisitor.cpp     | 352 ++----------------
 .../VectorForwardModeVisitor.cpp              |   2 +
 lib/Differentiator/VisitorBase.cpp            | 145 ++++++++
 test/FirstDerivative/BuiltinDerivatives.C     |   8 +-
 test/ForwardMode/UserDefinedTypes.C           |   2 +-
 test/Hessian/BuiltinDerivatives.C             |  28 +-
 test/Jacobian/FunctionCalls.C                 |  54 +--
 test/Jacobian/Functors.C                      | 299 ++++++---------
 test/Jacobian/Jacobian.C                      | 331 +++++++---------
 test/Jacobian/Pointers.C                      |  26 +-
 test/Jacobian/TemplateFunctors.C              | 112 +++---
 test/Jacobian/constexprTest.C                 |  91 ++---
 test/Jacobian/testUtility.C                   |  91 ++---
 test/NthDerivative/CustomDerivatives.C        |  24 +-
 test/TestUtils.h                              |  25 +-
 27 files changed, 1016 insertions(+), 1051 deletions(-)
 create mode 100644 lib/Differentiator/JacobianModeVisitor.cpp
 create mode 100644 lib/Differentiator/JacobianModeVisitor.h
diff --git a/include/clad/Differentiator/BuiltinDerivatives.h b/include/clad/Differentiator/BuiltinDerivatives.h
index 62296ab92..d309ababa 100644
--- a/include/clad/Differentiator/BuiltinDerivatives.h
+++ b/include/clad/Differentiator/BuiltinDerivatives.h
@@ -199,31 +199,31 @@ CUDA_HOST_DEVICE inline void __builtin_powf_pullback(float x, float exponent,
 // FIXME: Add the rest of the __builtin_ routines for log, sqrt, abs, etc.
 
 namespace std {
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> abs_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> abs_pushforward(T x, dT d_x) {
   if (x >= 0)
     return {x, d_x};
   else
     return {-x, -d_x};
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> exp_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> exp_pushforward(T x, dT d_x) {
   return {::std::exp(x), ::std::exp(x) * d_x};
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> sin_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> sin_pushforward(T x, dT d_x) {
   return {::std::sin(x), ::std::cos(x) * d_x};
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> cos_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> cos_pushforward(T x, dT d_x) {
   return {::std::cos(x), (-1) * ::std::sin(x) * d_x};
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> sqrt_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> sqrt_pushforward(T x, dT d_x) {
   return {::std::sqrt(x), (((T)1) / (((T)2) * ::std::sqrt(x))) * d_x};
 }
 
@@ -232,9 +232,9 @@ CUDA_HOST_DEVICE ValueAndPushforward<T, T> floor_pushforward(T x, T /*d_x*/) {
   return {::std::floor(x), (T)0};
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> atan2_pushforward(T y, T x, T d_y,
-                                                             T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> atan2_pushforward(T y, T x, dT d_y,
+                                                              dT d_x) {
   return {::std::atan2(y, x),
           -(y / ((x * x) + (y * y))) * d_x + x / ((x * x) + (y * y)) * d_y};
 }
@@ -246,8 +246,8 @@ CUDA_HOST_DEVICE void atan2_pullback(T y, T x, U d_z, T* d_y, T* d_x) {
   *d_x += -(y / ((x * x) + (y * y))) * d_z;
 }
 
-template <typename T>
-CUDA_HOST_DEVICE ValueAndPushforward<T, T> acos_pushforward(T x, T d_x) {
+template <typename T, typename dT>
+CUDA_HOST_DEVICE ValueAndPushforward<T, dT> acos_pushforward(T x, dT d_x) {
   return {::std::acos(x), ((-1) / (::std::sqrt(1 - x * x))) * d_x};
 }
 
@@ -263,17 +263,27 @@ ValueAndPushforward<float, float> sqrtf_pushforward(float x, float d_x) {
 
 #endif
 
-template <typename T1, typename T2>
-CUDA_HOST_DEVICE ValueAndPushforward<decltype(::std::pow(T1(), T2())),
-                                     decltype(::std::pow(T1(), T2()))>
-pow_pushforward(T1 x, T2 exponent, T1 d_x, T2 d_exponent) {
-  auto val = ::std::pow(x, exponent);
-  auto derivative = (exponent * ::std::pow(x, exponent - 1)) * d_x;
+template <typename T, typename dT> struct AdjOutType {
+  using type = T;
+};
+
+template <typename T, typename dT> struct AdjOutType<T, clad::array<dT>> {
+  using type = clad::array<T>;
+};
+
+template <typename T1, typename T2, typename dT1, typename dT2,
+          typename T_out = decltype(::std::pow(T1(), T2())),
+          typename dT_out = typename AdjOutType<T_out, dT1>::type>
+CUDA_HOST_DEVICE ValueAndPushforward<T_out, dT_out>
+pow_pushforward(T1 x, T2 exponent, dT1 d_x, dT2 d_exponent) {
+  T_out val = ::std::pow(x, exponent);
+  dT_out derivative = (exponent * ::std::pow(x, exponent - 1)) * d_x;
   // Only add directional derivative of base^exp w.r.t exp if the directional
-  // seed d_exponent is non-zero. This is required because if base is less than or
-  // equal to 0, then log(base) is undefined, and therefore if user only requested
-  // directional derivative of base^exp w.r.t base -- which is valid --, the result would
-  // be undefined because as per C++ valid number + NaN * 0 = NaN.
+  // seed d_exponent is non-zero. This is required because if base is less than
+  // or equal to 0, then log(base) is undefined, and therefore if user only
+  // requested directional derivative of base^exp w.r.t base -- which is valid
+  // --, the result would be undefined because as per C++ valid number + NaN * 0
+  // = NaN.
   if (d_exponent)
     derivative += (::std::pow(x, exponent) * ::std::log(x)) * d_exponent;
   return {val, derivative};
diff --git a/include/clad/Differentiator/Differentiator.h b/include/clad/Differentiator/Differentiator.h
index a4d450aff..1ae60d961 100644
--- a/include/clad/Differentiator/Differentiator.h
+++ b/include/clad/Differentiator/Differentiator.h
@@ -624,12 +624,13 @@ CUDA_HOST_DEVICE T push(tape<T>& to, ArgsT... val) {
             typename F, typename DerivedFnType = JacobianDerivedFnTraits_t<F>,
             typename = typename std::enable_if<
                 !std::is_class<remove_reference_and_pointer_t<F>>::value>::type>
-  constexpr CladFunction<
-      DerivedFnType, ExtractFunctorTraits_t<F>> __attribute__((annotate("J")))
+  constexpr CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>,
+                         /*EnablePadding=*/true> __attribute__((annotate("J")))
   jacobian(F f, ArgSpec args = "",
            DerivedFnType derivedFn = static_cast<DerivedFnType>(nullptr),
            const char* code = "") {
-    return CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>>(
+    return CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>,
+                        /*EnablePadding=*/true>(
         derivedFn /* will be replaced by Jacobian*/, code);
   }
 
@@ -640,12 +641,13 @@ CUDA_HOST_DEVICE T push(tape<T>& to, ArgsT... val) {
             typename F, typename DerivedFnType = JacobianDerivedFnTraits_t<F>,
             typename = typename std::enable_if<
                 std::is_class<remove_reference_and_pointer_t<F>>::value>::type>
-  constexpr CladFunction<
-      DerivedFnType, ExtractFunctorTraits_t<F>> __attribute__((annotate("J")))
+  constexpr CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>,
+                         /*EnablePadding=*/true> __attribute__((annotate("J")))
   jacobian(F&& f, ArgSpec args = "",
            DerivedFnType derivedFn = static_cast<DerivedFnType>(nullptr),
            const char* code = "") {
-    return CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>>(
+    return CladFunction<DerivedFnType, ExtractFunctorTraits_t<F>,
+                        /*EnablePadding=*/true>(
         derivedFn /* will be replaced by Jacobian*/, code, f);
   }
 
diff --git a/include/clad/Differentiator/FunctionTraits.h b/include/clad/Differentiator/FunctionTraits.h
index bc568e51d..9f8cda94f 100644
--- a/include/clad/Differentiator/FunctionTraits.h
+++ b/include/clad/Differentiator/FunctionTraits.h
@@ -2,6 +2,7 @@
 #define FUNCTION_TRAITS
 
 #include "clad/Differentiator/ArrayRef.h"
+#include "clad/Differentiator/Matrix.h"
 
 #include <type_traits>
 
@@ -548,15 +549,25 @@ namespace clad {
     using type = NoFunction*;
   };
 
-  template <class... Args> struct SelectLast;
+  // OutputVecParamType is used to deduce the type of derivative arguments
+  // for vector forward mode.
+  template <class T, class R> struct OutputVecParamType {
+    using type = array_ref<typename std::remove_pointer<R>::type>;
+  };
 
-  template <class... Args>
-  using SelectLast_t = typename SelectLast<Args...>::type;
+  template <class T, class R>
+  using OutputVecParamType_t = typename OutputVecParamType<T, R>::type;
+
+  /// Specialization for vector forward mode type.
+  template <class F, class = void> struct ExtractDerivedFnTraitsVecForwMode {};
 
-  template <class T> struct SelectLast<T> { using type = T; };
+  template <class F>
+  using ExtractDerivedFnTraitsVecForwMode_t =
+      typename ExtractDerivedFnTraitsVecForwMode<F>::type;
 
-  template <class T, class... Args> struct SelectLast<T, Args...> {
-    using type = typename SelectLast<Args...>::type;
+  template <class ReturnType, class... Args>
+  struct ExtractDerivedFnTraitsVecForwMode<ReturnType (*)(Args...)> {
+    using type = void (*)(Args..., OutputVecParamType_t<Args, void>...);
   };
 
   template <class T, class = void> struct JacobianDerivedFnTraits {};
@@ -569,7 +580,7 @@ namespace clad {
   // JacobianDerivedFnTraits specializations for pure function pointer types
   template <class ReturnType, class... Args>
   struct JacobianDerivedFnTraits<ReturnType (*)(Args...)> {
-    using type = void (*)(Args..., SelectLast_t<Args...>);
+    using type = void (*)(Args..., OutputParamType_t<Args, void>...);
   };
 
   /// These macro expansions are used to cover all possible cases of
@@ -581,11 +592,12 @@ namespace clad {
   /// qualifier and reference respectively. The AddNOEX adds cases for noexcept
   /// qualifier only if it is supported and finally AddSPECS declares the
   /// function with all the cases
-#define JacobianDerivedFnTraits_AddSPECS(var, cv, vol, ref, noex)              \
-  template <typename R, typename C, typename... Args>                          \
-  struct JacobianDerivedFnTraits<R (C::*)(Args...) cv vol ref noex> {          \
-    using type = void (C::*)(Args..., SelectLast_t<Args...>) cv vol ref noex;  \
-  };
+#define JacobianDerivedFnTraits_AddSPECS(var, cv, vol, ref, noex)            \
+    template <typename R, typename C, typename... Args>                        \
+    struct JacobianDerivedFnTraits<R (C::*)(Args...) cv vol ref noex> {        \
+      using type = void (C::*)(                                                \
+          Args..., OutputParamType_t<Args, void>...) cv vol ref noex;          \
+    };
 
 #if __cpp_noexcept_function_type > 0
 #define JacobianDerivedFnTraits_AddNOEX(var, con, vol, ref)                    \
@@ -739,27 +751,6 @@ namespace clad {
   using ExtractDerivedFnTraitsForwMode_t =
       typename ExtractDerivedFnTraitsForwMode<F>::type;
 
-  // OutputVecParamType is used to deduce the type of derivative arguments
-  // for vector forward mode.
-  template <class T, class R> struct OutputVecParamType {
-    using type = array_ref<typename std::remove_pointer<R>::type>;
-  };
-
-  template <class T, class R>
-  using OutputVecParamType_t = typename OutputVecParamType<T, R>::type;
-
-  /// Specialization for vector forward mode type.
-  template <class F, class = void> struct ExtractDerivedFnTraitsVecForwMode {};
-
-  template <class F>
-  using ExtractDerivedFnTraitsVecForwMode_t =
-      typename ExtractDerivedFnTraitsVecForwMode<F>::type;
-
-  template <class ReturnType, class... Args>
-  struct ExtractDerivedFnTraitsVecForwMode<ReturnType (*)(Args...)> {
-    using type = void (*)(Args..., OutputVecParamType_t<Args, void>...);
-  };
-
   /// Specialization for free function pointer type
   template <class F>
   struct ExtractDerivedFnTraitsForwMode<
diff --git a/include/clad/Differentiator/ReverseModeVisitor.h b/include/clad/Differentiator/ReverseModeVisitor.h
index a73513f94..ba0cf7c14 100644
--- a/include/clad/Differentiator/ReverseModeVisitor.h
+++ b/include/clad/Differentiator/ReverseModeVisitor.h
@@ -489,10 +489,6 @@ namespace clad {
            "attempt to differentiate unsupported operator, ignored.",
            args);
     }
-    /// Builds an overload for the gradient function that has derived params for
-    /// all the arguments of the requested function and it calls the original
-    /// gradient function internally
-    clang::FunctionDecl* CreateGradientOverload();
 
     /// Returns the type that should be used to represent the derivative of a
     /// variable of type `yType` with respect to a parameter variable of type
diff --git a/include/clad/Differentiator/VectorForwardModeVisitor.h b/include/clad/Differentiator/VectorForwardModeVisitor.h
index 6890b622a..6955dc5ed 100644
--- a/include/clad/Differentiator/VectorForwardModeVisitor.h
+++ b/include/clad/Differentiator/VectorForwardModeVisitor.h
@@ -9,7 +9,7 @@ namespace clad {
 /// A visitor for processing the function code in vector forward mode.
 /// Used to compute derivatives by clad::vector_forward_differentiate.
 class VectorForwardModeVisitor : public BaseForwardModeVisitor {
-private:
+protected:
   llvm::SmallVector<const clang::ValueDecl*, 16> m_IndependentVars;
   /// Map used to keep track of parameter variables w.r.t which the
   /// the derivative is being computed. This is separate from the
diff --git a/include/clad/Differentiator/VisitorBase.h b/include/clad/Differentiator/VisitorBase.h
index 210f82112..2ab7e35bc 100644
--- a/include/clad/Differentiator/VisitorBase.h
+++ b/include/clad/Differentiator/VisitorBase.h
@@ -628,6 +628,12 @@ namespace clad {
     /// Returns type clad::ConstructorReverseForwTag<T>
     clang::QualType GetCladConstructorReverseForwTagOfType(clang::QualType T);
 
+    /// Builds an overload for the derivative function that has derived params
+    /// for all the arguments of the requested function and it calls the
+    /// original derivative function internally. Used in gradient and jacobian
+    /// modes.
+    clang::FunctionDecl* CreateDerivativeOverload();
+
   public:
     /// Rebuild a sequence of nested namespaces ending with DC.
     clang::NamespaceDecl* RebuildEnclosingNamespaces(clang::DeclContext* DC);
diff --git a/lib/Differentiator/BaseForwardModeVisitor.cpp b/lib/Differentiator/BaseForwardModeVisitor.cpp
index 8015b8fdb..4a9c81412 100644
--- a/lib/Differentiator/BaseForwardModeVisitor.cpp
+++ b/lib/Differentiator/BaseForwardModeVisitor.cpp
@@ -1228,7 +1228,16 @@ StmtDiff BaseForwardModeVisitor::VisitCallExpr(const CallExpr* CE) {
   callDiff = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
       customPushforward, customDerivativeArgs, getCurrentScope(),
       const_cast<DeclContext*>(FD->getDeclContext()));
-
+  // Custom derivative templates can be written in a
+  // general way that works for both vectorized and non-vectorized
+  // modes. We have to also look for the pushforward with the regular name.
+  if (!callDiff && m_DiffReq.Mode != DiffMode::forward) {
+    customPushforward =
+        clad::utils::ComputeEffectiveFnName(FD) + "_pushforward";
+    callDiff = m_Builder.BuildCallToCustomDerivativeOrNumericalDiff(
+        customPushforward, customDerivativeArgs, getCurrentScope(),
+        const_cast<DeclContext*>(FD->getDeclContext()));
+  }
   if (!isLambda) {
     // Check if it is a recursive call.
     if (!callDiff && (FD == m_DiffReq.Function) &&
@@ -1446,7 +1455,9 @@ BaseForwardModeVisitor::VisitBinaryOperator(const BinaryOperator* BinOp) {
       derivedR = BuildParens(derivedR);
     opDiff = BuildOp(opCode, derivedL, derivedR);
   } else if (BinOp->isAssignmentOp()) {
-    if (Ldiff.getExpr_dx()->isModifiableLvalue(m_Context) != Expr::MLV_Valid) {
+    if ((Ldiff.getExpr_dx()->isModifiableLvalue(m_Context) !=
+         Expr::MLV_Valid) &&
+        !isCladArrayType(Ldiff.getExpr_dx()->getType())) {
       diag(DiagnosticsEngine::Warning, BinOp->getEndLoc(),
            "derivative of an assignment attempts to assign to unassignable "
            "expr, assignment ignored");
diff --git a/lib/Differentiator/CMakeLists.txt b/lib/Differentiator/CMakeLists.txt
index 7f928b2ac..177608449 100644
--- a/lib/Differentiator/CMakeLists.txt
+++ b/lib/Differentiator/CMakeLists.txt
@@ -31,6 +31,7 @@ llvm_add_library(cladDifferentiator
   DiffPlanner.cpp
   ErrorEstimator.cpp
   EstimationModel.cpp
+  JacobianModeVisitor.cpp
   HessianModeVisitor.cpp
   MultiplexExternalRMVSource.cpp
   PushForwardModeVisitor.cpp
diff --git a/lib/Differentiator/DerivativeBuilder.cpp b/lib/Differentiator/DerivativeBuilder.cpp
index 946a68719..465f35aee 100644
--- a/lib/Differentiator/DerivativeBuilder.cpp
+++ b/lib/Differentiator/DerivativeBuilder.cpp
@@ -6,6 +6,7 @@
 
 #include "clad/Differentiator/DerivativeBuilder.h"
 
+#include "JacobianModeVisitor.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/TemplateBase.h"
 #include "clang/Sema/Lookup.h"
@@ -432,8 +433,8 @@ static void registerDerivative(FunctionDecl* derivedFD, Sema& semaRef) {
       HessianModeVisitor H(*this, request);
       result = H.Derive();
     } else if (request.Mode == DiffMode::jacobian) {
-      ReverseModeVisitor R(*this, request);
-      result = R.Derive();
+      JacobianModeVisitor J(*this, request);
+      result = J.DeriveJacobian();
     } else if (request.Mode == DiffMode::error_estimation) {
       ReverseModeVisitor R(*this, request);
       InitErrorEstimation(m_ErrorEstHandler, m_EstModel, *this, request);
diff --git a/lib/Differentiator/DiffPlanner.cpp b/lib/Differentiator/DiffPlanner.cpp
index 8f4cb12f6..1f1fe761f 100644
--- a/lib/Differentiator/DiffPlanner.cpp
+++ b/lib/Differentiator/DiffPlanner.cpp
@@ -579,7 +579,7 @@ namespace clad {
       
       // If the function has no parameters, then we cannot differentiate it."
       // and if the DiffMode is Jacobian, we must have atleast 2 parameters.
-      if (params.empty() || (params.size()==1 && this->Mode == DiffMode::jacobian)) {
+      if (params.empty()) {
         utils::EmitDiag(semaRef, DiagnosticsEngine::Error,
                         CallContext->getEndLoc(),
                         "Attempted to differentiate a function without "
@@ -587,13 +587,6 @@ namespace clad {
         return;
       }
 
-      // If it is a Vector valued function, the last parameter is to store the
-      // output vector and hence is not a differentiable parameter, so we must
-      // pop it out
-      if (this->Mode == DiffMode::jacobian){
-        params.pop_back();
-      }
-
       // insert an empty index for each parameter.
       for (unsigned i=0; i<params.size(); ++i) {
         DiffInputVarInfo dVarInfo(params[i], IndexInterval());
diff --git a/lib/Differentiator/JacobianModeVisitor.cpp b/lib/Differentiator/JacobianModeVisitor.cpp
new file mode 100644
index 000000000..09f118478
--- /dev/null
+++ b/lib/Differentiator/JacobianModeVisitor.cpp
@@ -0,0 +1,284 @@
+#include "JacobianModeVisitor.h"
+
+#include "ConstantFolder.h"
+#include "clad/Differentiator/CladUtils.h"
+
+#include "llvm/Support/SaveAndRestore.h"
+
+using namespace clang;
+
+namespace clad {
+JacobianModeVisitor::JacobianModeVisitor(DerivativeBuilder& builder,
+                                         const DiffRequest& request)
+    : VectorPushForwardModeVisitor(builder, request) {}
+
+DerivativeAndOverload JacobianModeVisitor::DeriveJacobian() {
+  const FunctionDecl* FD = m_DiffReq.Function;
+  assert(m_DiffReq.Mode == DiffMode::jacobian);
+
+  DiffParams args{};
+  for (const DiffInputVarInfo& dParam : m_DiffReq.DVI)
+    args.push_back(dParam.param);
+
+  // Generate name for the derivative function.
+  std::string derivedFnName = m_DiffReq.BaseFunctionName + "_jac";
+  if (args.size() != FD->getNumParams()) {
+    for (const ValueDecl* arg : args) {
+      const auto* it = std::find(FD->param_begin(), FD->param_end(), arg);
+      auto idx = std::distance(FD->param_begin(), it);
+      derivedFnName += ('_' + std::to_string(idx));
+    }
+  }
+  IdentifierInfo* II = &m_Context.Idents.get(derivedFnName);
+  SourceLocation loc{m_DiffReq->getLocation()};
+  DeclarationNameInfo name(II, loc);
+
+  llvm::SmallVector<QualType, 8> paramTypes;
+
+  // Generate the function type for the derivative.
+  paramTypes.reserve(m_DiffReq->getNumParams() + args.size());
+  for (auto* PVD : m_DiffReq->parameters())
+    paramTypes.push_back(PVD->getType());
+  for (auto* PVD : m_DiffReq->parameters()) {
+    QualType paramTy = PVD->getType();
+    if (utils::isArrayOrPointerType(paramTy) || paramTy->isReferenceType())
+      paramTypes.push_back(getParamAdjointType(paramTy));
+  }
+  QualType vectorDiffFunctionType = m_Context.getFunctionType(
+      m_Context.VoidTy,
+      llvm::ArrayRef<QualType>(paramTypes.data(), paramTypes.size()),
+      // Cast to function pointer.
+      dyn_cast<FunctionProtoType>(m_DiffReq->getType())->getExtProtoInfo());
+
+  // Create the function declaration for the derivative.
+  // FIXME: We should not use const_cast to get the decl context here.
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+  auto* DC = const_cast<DeclContext*>(m_DiffReq->getDeclContext());
+  m_Sema.CurContext = DC;
+  DeclWithContext result = m_Builder.cloneFunction(
+      m_DiffReq.Function, *this, DC, loc, name, vectorDiffFunctionType);
+  FunctionDecl* vectorDiffFD = result.first;
+  m_Derivative = vectorDiffFD;
+
+  // Function declaration scope
+  llvm::SaveAndRestore<DeclContext*> SaveContext(m_Sema.CurContext);
+  llvm::SaveAndRestore<Scope*> SaveScope(getCurrentScope());
+  beginScope(Scope::FunctionPrototypeScope | Scope::FunctionDeclarationScope |
+             Scope::DeclScope);
+  m_Sema.PushFunctionScope();
+  m_Sema.PushDeclContext(getCurrentScope(), m_Derivative);
+
+  // Create the body of the derivative.
+  beginScope(Scope::FnScope | Scope::DeclScope);
+  m_DerivativeFnScope = getCurrentScope();
+  beginBlock();
+
+  // Count the number of non-array independent variables requested for
+  // differentiation.
+  size_t nonArrayIndVarCount = 0;
+
+  // Set the parameters for the derivative.
+  llvm::SmallVector<ParmVarDecl*, 16> params;
+  llvm::SmallVector<ParmVarDecl*, 16> derivedParams;
+  llvm::SmallVector<DeclStmt*, 8> adjointDecls;
+  for (const auto* PVD : FD->parameters()) {
+    IdentifierInfo* PVDII = PVD->getIdentifier();
+    auto* newPVD = CloneParmVarDecl(PVD, PVDII,
+                                    /*pushOnScopeChains=*/true,
+                                    /*cloneDefaultArg=*/false);
+    params.push_back(newPVD);
+
+    if (!BaseForwardModeVisitor::IsDifferentiableType(PVD->getType()))
+      continue;
+    auto derivedPVDName = "_d_vector_" + std::string(PVDII->getName());
+    IdentifierInfo* derivedPVDII = CreateUniqueIdentifier(derivedPVDName);
+    Expr* derivedExpr = nullptr;
+    if (utils::isArrayOrPointerType(PVD->getType())) {
+      ParmVarDecl* derivedPVD = utils::BuildParmVarDecl(
+          m_Sema, m_Derivative, derivedPVDII,
+          getParamAdjointType(PVD->getType()), PVD->getStorageClass());
+      derivedParams.push_back(derivedPVD);
+      derivedExpr =
+          BuildOp(UO_Deref, BuildDeclRef(derivedPVD), PVD->getBeginLoc());
+      Expr* getSize = BuildCallExprToMemFn(BuildDeclRef(derivedPVD),
+                                           /*MemberFunctionName=*/"rows", {});
+      if (!m_IndVarCountExpr)
+        m_IndVarCountExpr = getSize;
+      else
+        m_IndVarCountExpr =
+            BuildOp(BinaryOperatorKind::BO_Add, m_IndVarCountExpr, getSize);
+    } else if (PVD->getType()->isReferenceType()) {
+      ParmVarDecl* derivedPVD = utils::BuildParmVarDecl(
+          m_Sema, m_Derivative, derivedPVDII,
+          getParamAdjointType(PVD->getType()), PVD->getStorageClass());
+      derivedParams.push_back(derivedPVD);
+      derivedExpr =
+          BuildOp(UO_Deref, BuildDeclRef(derivedPVD), PVD->getBeginLoc());
+      nonArrayIndVarCount += 1;
+    } else {
+      VarDecl* derivedPVD = BuildVarDecl(
+          GetPushForwardDerivativeType(PVD->getType()), derivedPVDII);
+      adjointDecls.push_back(BuildDeclStmt(derivedPVD));
+      derivedExpr = BuildDeclRef(derivedPVD);
+      nonArrayIndVarCount += 1;
+    }
+    m_Variables[newPVD] = derivedExpr;
+  }
+
+  params.insert(params.end(), derivedParams.begin(), derivedParams.end());
+
+  // Process the expression for the number independent variables.
+  // This will be the sum of the sizes of all array parameters and the number
+  // of non-array parameters.
+  Expr* nonArrayIndVarCountExpr = ConstantFolder::synthesizeLiteral(
+      m_Context.UnsignedLongTy, m_Context, nonArrayIndVarCount);
+  if (!m_IndVarCountExpr) {
+    m_IndVarCountExpr = nonArrayIndVarCountExpr;
+  } else if (nonArrayIndVarCount != 0) {
+    m_IndVarCountExpr = BuildOp(BinaryOperatorKind::BO_Add, m_IndVarCountExpr,
+                                nonArrayIndVarCountExpr);
+  }
+
+  vectorDiffFD->setParams(
+      clad_compat::makeArrayRef(params.data(), params.size()));
+  vectorDiffFD->setBody(nullptr);
+
+  // Instantiate a variable indepVarCount to store the total number of
+  // independent variables requested.
+  // size_t indepVarCount = m_IndVarCountExpr;
+  auto* totalIndVars = BuildVarDecl(m_Context.UnsignedLongTy, "indepVarCount",
+                                    m_IndVarCountExpr);
+  addToCurrentBlock(BuildDeclStmt(totalIndVars));
+  m_IndVarCountExpr = BuildDeclRef(totalIndVars);
+
+  for (DeclStmt* decl : adjointDecls)
+    addToCurrentBlock(decl);
+
+  // Expression for maintaining the number of independent variables processed
+  // till now present as array elements. This will be sum of sizes of all such
+  // arrays.
+  Expr* arrayIndVarCountExpr = nullptr;
+
+  // Number of non-array independent variables processed till now.
+  nonArrayIndVarCount = 0;
+
+  // Current Index of independent variable in the param list of the function.
+  size_t independentVarIndex = 0;
+
+  size_t numParamsOriginalFn = m_DiffReq->getNumParams();
+  for (size_t i = 0; i < numParamsOriginalFn; ++i) {
+    bool is_array =
+        utils::isArrayOrPointerType(m_DiffReq->getParamDecl(i)->getType());
+    ParmVarDecl* param = params[i];
+    Expr* paramDiff = m_Variables[param];
+    QualType dParamType = clad::utils::GetValueType(param->getType());
+    // Desugaring the type is necessary to pass it to other templates
+    dParamType = dParamType.getDesugaredType(m_Context);
+    Expr* dVectorParam = nullptr;
+    if (m_DiffReq.DVI.size() > independentVarIndex &&
+        m_DiffReq.DVI[independentVarIndex].param ==
+            m_DiffReq->getParamDecl(i)) {
+      // Current offset for independent variable.
+      Expr* offsetExpr = arrayIndVarCountExpr;
+      Expr* nonArrayIndVarCountExpr = ConstantFolder::synthesizeLiteral(
+          m_Context.UnsignedLongTy, m_Context, nonArrayIndVarCount);
+      if (!offsetExpr)
+        offsetExpr = nonArrayIndVarCountExpr;
+      else if (nonArrayIndVarCount != 0)
+        offsetExpr = BuildOp(BinaryOperatorKind::BO_Add, offsetExpr,
+                             nonArrayIndVarCountExpr);
+
+      if (is_array) {
+        Expr* base = cast<UnaryOperator>(paramDiff)->getSubExpr();
+        // Get size of the array.
+        Expr* getSize = BuildCallExprToMemFn(Clone(base),
+                                             /*MemberFunctionName=*/"rows", {});
+        // Create an identity matrix for the parameter,
+        // with number of rows equal to the size of the array,
+        // and number of columns equal to the number of independent variables
+        llvm::SmallVector<Expr*, 3> args = {getSize, m_IndVarCountExpr,
+                                            offsetExpr};
+        dVectorParam = BuildIdentityMatrixExpr(dParamType, args, loc);
+
+        // Update the array independent expression.
+        if (!arrayIndVarCountExpr)
+          arrayIndVarCountExpr = getSize;
+        else
+          arrayIndVarCountExpr = BuildOp(BinaryOperatorKind::BO_Add,
+                                         arrayIndVarCountExpr, getSize);
+      } else {
+        // Create a one hot vector for the parameter.
+        llvm::SmallVector<Expr*, 2> args = {m_IndVarCountExpr, offsetExpr};
+        dVectorParam = BuildCallExprToCladFunction("one_hot_vector", args,
+                                                   {dParamType}, loc);
+        ++nonArrayIndVarCount;
+      }
+      ++independentVarIndex;
+    } else {
+      // We cannot initialize derived variable for pointer types because
+      // we do not know the correct size.
+      if (is_array)
+        continue;
+      // This parameter is not an independent variable.
+      // Initialize by all zeros.
+      dVectorParam = BuildCallExprToCladFunction(
+          "zero_vector", {m_IndVarCountExpr}, {dParamType}, loc);
+    }
+
+    // For each function arg to be differentiated, create a variable
+    // _d_vector_arg to store the vector of derivatives for that arg.
+    // for ex: double f(double x, double y, double z);
+    // and we want to differentiate w.r.t. x and z, then we will have
+    // -> clad::array<double> _d_vector_x = {1, 0};
+    // -> clad::array<double> _d_vector_y = {0, 0};
+    // -> clad::array<double> _d_vector_z = {0, 1};
+    if (utils::isArrayOrPointerType(param->getType()) ||
+        param->getType()->isReferenceType()) {
+      Expr* paramAssignment =
+          BuildOp(BO_Assign, Clone(paramDiff), dVectorParam);
+      addToCurrentBlock(paramAssignment);
+    } else {
+      auto* paramDecl = cast<VarDecl>(cast<DeclRefExpr>(paramDiff)->getDecl());
+      m_Sema.AddInitializerToDecl(paramDecl, dVectorParam, true);
+      paramDecl->setInitStyle(VarDecl::InitializationStyle::CInit);
+    }
+  }
+
+  // Traverse the function body and generate the derivative.
+  Stmt* BodyDiff = Visit(FD->getBody()).getStmt();
+  for (Stmt* S : cast<CompoundStmt>(BodyDiff)->body())
+    addToCurrentBlock(S);
+
+  Stmt* vectorDiffBody = endBlock();
+  m_Derivative->setBody(vectorDiffBody);
+  endScope(); // Function body scope
+  m_Sema.PopFunctionScopeInfo();
+  m_Sema.PopDeclContext();
+  endScope(); // Function decl scope
+  // Create the overload declaration for the derivative.
+  FunctionDecl* overloadFD = CreateDerivativeOverload();
+  return DerivativeAndOverload{vectorDiffFD, overloadFD};
+}
+
+QualType JacobianModeVisitor::getParamAdjointType(QualType T) {
+  QualType derivedTy = GetPushForwardDerivativeType(T);
+  derivedTy = m_Context.getPointerType(derivedTy.getNonReferenceType());
+  return derivedTy;
+}
+
+StmtDiff JacobianModeVisitor::VisitReturnStmt(const clang::ReturnStmt* RS) {
+  // If there is no return value, we must not attempt to differentiate
+  if (!RS->getRetValue())
+    return nullptr;
+
+  StmtDiff retValDiff = Visit(RS->getRetValue());
+  // This can instantiate as part of the move or copy initialization and
+  // needs a fake source location.
+  SourceLocation fakeLoc = utils::GetValidSLoc(m_Sema);
+  Stmt* returnStmt =
+      m_Sema
+          .ActOnReturnStmt(fakeLoc, retValDiff.getExpr_dx(), getCurrentScope())
+          .get();
+  return StmtDiff(returnStmt);
+}
+} // end namespace clad
diff --git a/lib/Differentiator/JacobianModeVisitor.h b/lib/Differentiator/JacobianModeVisitor.h
new file mode 100644
index 000000000..0a9aca311
--- /dev/null
+++ b/lib/Differentiator/JacobianModeVisitor.h
@@ -0,0 +1,20 @@
+#ifndef CLAD_DIFFERENTIATOR_JACOBIANMODEVISITOR_H
+#define CLAD_DIFFERENTIATOR_JACOBIANMODEVISITOR_H
+
+#include "clad/Differentiator/VectorPushForwardModeVisitor.h"
+
+namespace clad {
+class JacobianModeVisitor : public VectorPushForwardModeVisitor {
+
+public:
+  JacobianModeVisitor(DerivativeBuilder& builder, const DiffRequest& request);
+
+  DerivativeAndOverload DeriveJacobian();
+
+  clang::QualType getParamAdjointType(clang::QualType T);
+
+  StmtDiff VisitReturnStmt(const clang::ReturnStmt* RS) override;
+};
+} // end namespace clad
+
+#endif // CLAD_DIFFERENTIATOR_JACOBIANMODEVISITOR_H
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 19b902ce1..b90dacb99 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -189,160 +189,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     }
   }
 
-  FunctionDecl* ReverseModeVisitor::CreateGradientOverload() {
-    auto gradientParams = m_Derivative->parameters();
-    auto gradientNameInfo = m_Derivative->getNameInfo();
-    // Calculate the total number of parameters that would be required for
-    // automatic differentiation in the derived function if all args are
-    // requested.
-    // FIXME: Here we are assuming all function parameters are of differentiable
-    // type. Ideally, we should not make any such assumption.
-    std::size_t totalDerivedParamsSize = m_DiffReq->getNumParams() * 2;
-    std::size_t numOfDerivativeParams = m_DiffReq->getNumParams();
-
-    // Account for the this pointer.
-    if (isa<CXXMethodDecl>(m_DiffReq.Function) &&
-        !utils::IsStaticMethod(m_DiffReq.Function))
-      ++numOfDerivativeParams;
-    // All output parameters will be of type `void*`. These
-    // parameters will be casted to correct type before the call to the actual
-    // derived function.
-    // We require each output parameter to be of same type in the overloaded
-    // derived function due to limitations of generating the exact derived
-    // function type at the compile-time (without clad plugin help).
-    QualType outputParamType = m_Context.getPointerType(m_Context.VoidTy);
-
-    llvm::SmallVector<QualType, 16> paramTypes;
-
-    // Add types for representing original function parameters.
-    for (auto* PVD : m_DiffReq->parameters())
-      paramTypes.push_back(PVD->getType());
-    // Add types for representing parameter derivatives.
-    // FIXME: We are assuming all function parameters are differentiable. We
-    // should not make any such assumptions.
-    for (std::size_t i = 0; i < numOfDerivativeParams; ++i)
-      paramTypes.push_back(outputParamType);
-
-    auto gradFuncOverloadEPI =
-        dyn_cast<FunctionProtoType>(m_DiffReq->getType())->getExtProtoInfo();
-    QualType gradientFunctionOverloadType =
-        m_Context.getFunctionType(m_Context.VoidTy, paramTypes,
-                                  // Cast to function pointer.
-                                  gradFuncOverloadEPI);
-
-    // FIXME: We should not use const_cast to get the decl context here.
-    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-    auto* DC = const_cast<DeclContext*>(m_DiffReq->getDeclContext());
-    m_Sema.CurContext = DC;
-    DeclWithContext gradientOverloadFDWC =
-        m_Builder.cloneFunction(m_DiffReq.Function, *this, DC, noLoc,
-                                gradientNameInfo, gradientFunctionOverloadType);
-    FunctionDecl* gradientOverloadFD = gradientOverloadFDWC.first;
-
-    beginScope(Scope::FunctionPrototypeScope | Scope::FunctionDeclarationScope |
-               Scope::DeclScope);
-    m_Sema.PushFunctionScope();
-    m_Sema.PushDeclContext(getCurrentScope(), gradientOverloadFD);
-
-    llvm::SmallVector<ParmVarDecl*, 4> overloadParams;
-    llvm::SmallVector<Expr*, 4> callArgs;
-
-    overloadParams.reserve(totalDerivedParamsSize);
-    callArgs.reserve(gradientParams.size());
-
-    for (auto* PVD : m_DiffReq->parameters()) {
-      auto* VD = utils::BuildParmVarDecl(
-          m_Sema, gradientOverloadFD, PVD->getIdentifier(), PVD->getType(),
-          PVD->getStorageClass(), /*defArg=*/nullptr, PVD->getTypeSourceInfo());
-      overloadParams.push_back(VD);
-      callArgs.push_back(BuildDeclRef(VD));
-    }
-
-    for (std::size_t i = 0; i < numOfDerivativeParams; ++i) {
-      IdentifierInfo* II = nullptr;
-      StorageClass SC = StorageClass::SC_None;
-      std::size_t effectiveGradientIndex = m_DiffReq->getNumParams() + i;
-      // `effectiveGradientIndex < gradientParams.size()` implies that this
-      // parameter represents an actual derivative of one of the function
-      // original parameters.
-      if (effectiveGradientIndex < gradientParams.size()) {
-        auto* GVD = gradientParams[effectiveGradientIndex];
-        II = CreateUniqueIdentifier("_temp_" + GVD->getNameAsString());
-        SC = GVD->getStorageClass();
-      } else {
-        II = CreateUniqueIdentifier("_d_" + std::to_string(i));
-      }
-      auto* PVD = utils::BuildParmVarDecl(m_Sema, gradientOverloadFD, II,
-                                          outputParamType, SC);
-      overloadParams.push_back(PVD);
-    }
-
-    for (auto* PVD : overloadParams)
-      if (PVD->getIdentifier())
-        m_Sema.PushOnScopeChains(PVD, getCurrentScope(),
-                                 /*AddToContext=*/false);
-
-    gradientOverloadFD->setParams(overloadParams);
-    gradientOverloadFD->setBody(/*B=*/nullptr);
-
-    beginScope(Scope::FnScope | Scope::DeclScope);
-    m_DerivativeFnScope = getCurrentScope();
-    beginBlock();
-
-    // Build derivatives to be used in the call to the actual derived function.
-    // These are initialised by effectively casting the derivative parameters of
-    // overloaded derived function to the correct type.
-    for (std::size_t i = m_DiffReq->getNumParams(); i < gradientParams.size();
-         ++i) {
-      auto* overloadParam = overloadParams[i];
-      auto* gradientParam = gradientParams[i];
-      TypeSourceInfo* typeInfo =
-          m_Context.getTrivialTypeSourceInfo(gradientParam->getType());
-      SourceLocation fakeLoc = utils::GetValidSLoc(m_Sema);
-      auto* init = m_Sema
-                       .BuildCStyleCastExpr(fakeLoc, typeInfo, fakeLoc,
-                                            BuildDeclRef(overloadParam))
-                       .get();
-
-      auto* gradientVD = BuildGlobalVarDecl(gradientParam->getType(),
-                                            gradientParam->getName(), init);
-      callArgs.push_back(BuildDeclRef(gradientVD));
-      addToCurrentBlock(BuildDeclStmt(gradientVD));
-    }
-
-    // If the function is a global kernel, we need to transform it
-    // into a device function when calling it inside the overload function
-    // which is the final global kernel returned.
-    if (m_Derivative->hasAttr<clang::CUDAGlobalAttr>()) {
-      m_Derivative->dropAttr<clang::CUDAGlobalAttr>();
-      m_Derivative->addAttr(clang::CUDADeviceAttr::CreateImplicit(m_Context));
-    }
-
-    Expr* callExpr = BuildCallExprToFunction(m_Derivative, callArgs,
-                                             /*UseRefQualifiedThisObj=*/true);
-    addToCurrentBlock(callExpr);
-    Stmt* gradientOverloadBody = endBlock();
-
-    gradientOverloadFD->setBody(gradientOverloadBody);
-
-    endScope(); // Function body scope
-    m_Sema.PopFunctionScopeInfo();
-    m_Sema.PopDeclContext();
-    endScope(); // Function decl scope
-
-    return gradientOverloadFD;
-  }
-
   DerivativeAndOverload ReverseModeVisitor::Derive() {
     const FunctionDecl* FD = m_DiffReq.Function;
     if (m_ExternalSource)
       m_ExternalSource->ActOnStartOfDerive();
-
-    // FIXME: reverse mode plugins may have request mode other than
-    // `DiffMode::reverse`, but they still need the `DiffMode::reverse` mode
-    // specific behaviour, because they are "reverse" mode plugins.
-    // assert(m_DiffReq.Mode == DiffMode::reverse ||
-    //        m_DiffReq.Mode == DiffMode::jacobian && "Unexpected Mode.");
     if (m_DiffReq.Mode == DiffMode::error_estimation)
       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
       const_cast<DiffRequest&>(m_DiffReq).Mode = DiffMode::reverse;
@@ -362,29 +212,12 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
 
     if (m_ExternalSource)
       m_ExternalSource->ActAfterParsingDiffArgs(m_DiffReq, args);
-    // Save the type of the output parameter(s) that is add by clad to the
-    // derived function
-    if (m_DiffReq.Mode == DiffMode::jacobian) {
-      unsigned lastArgN = m_DiffReq->getNumParams() - 1;
-      outputArrayStr = m_DiffReq->getParamDecl(lastArgN)->getNameAsString();
-    }
 
     auto derivativeBaseName = m_DiffReq.BaseFunctionName;
     std::string gradientName = derivativeBaseName + funcPostfix();
     // To be consistent with older tests, nothing is appended to 'f_grad' if
     // we differentiate w.r.t. all the parameters at once.
-    if (m_DiffReq.Mode == DiffMode::jacobian) {
-      // If Jacobian is asked, the last parameter is the result parameter
-      // and should be ignored
-      if (args.size() != FD->getNumParams()-1){
-        for (const auto* arg : args) {
-          const auto* const it =
-              std::find(FD->param_begin(), FD->param_end() - 1, arg);
-          auto idx = std::distance(FD->param_begin(), it);
-          gradientName += ('_' + std::to_string(idx));
-        }
-      }
-    } else if (args.size() != FD->getNumParams()) {
+    if (args.size() != FD->getNumParams()) {
       for (const auto* arg : args) {
         const auto* it = std::find(FD->param_begin(), FD->param_end(), arg);
         auto idx = std::distance(FD->param_begin(), it);
@@ -411,7 +244,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     bool shouldCreateOverload = false;
     // FIXME: Gradient overload doesn't know how to handle additional parameters
     // added by the plugins yet.
-    if (m_DiffReq.Mode != DiffMode::jacobian && numExtraParam == 0)
+    if (numExtraParam == 0)
       shouldCreateOverload = true;
     if (!m_DiffReq.DeclarationOnly && !m_DiffReq.DerivedFDPrototypes.empty())
       // If the overload is already created, we don't need to create it again.
@@ -440,7 +273,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       m_Derivative = customDerivative;
       FunctionDecl* gradientOverloadFD = nullptr;
       if (shouldCreateOverload)
-        gradientOverloadFD = CreateGradientOverload();
+        gradientOverloadFD = CreateDerivativeOverload();
       return DerivativeAndOverload{customDerivative, gradientOverloadFD};
     }
 
@@ -484,37 +317,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
     gradientFD->setBody(nullptr);
 
     if (!m_DiffReq.DeclarationOnly) {
-      if (m_DiffReq.Mode == DiffMode::jacobian) {
-        // Reference to the output parameter.
-        m_Result = BuildDeclRef(params.back());
-        numParams = args.size();
-
-        // Creates the ArraySubscriptExprs for the independent variables
-        size_t idx = 0;
-        for (const auto* arg : args) {
-          // FIXME: fix when adding array inputs, now we are just skipping all
-          // array/pointer inputs (not treating them as independent variables).
-          if (utils::isArrayOrPointerType(arg->getType())) {
-            if (arg->getName() == "p")
-              m_Variables[arg] = m_Result;
-            idx += 1;
-            continue;
-          }
-          auto size_type = m_Context.getSizeType();
-          unsigned size_type_bits = m_Context.getIntWidth(size_type);
-          // Create the idx literal.
-          auto* i = IntegerLiteral::Create(
-              m_Context, llvm::APInt(size_type_bits, idx), size_type, noLoc);
-          // Create the jacobianMatrix[idx] expression.
-          auto* result_at_i =
-              m_Sema.CreateBuiltinArraySubscriptExpr(m_Result, noLoc, i, noLoc)
-                  .get();
-          m_Variables[arg] = result_at_i;
-          idx += 1;
-          m_IndependentVars.push_back(arg);
-        }
-      }
-
       if (m_ExternalSource)
         m_ExternalSource->ActBeforeCreatingDerivedFnBodyScope();
 
@@ -550,8 +352,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
 
     FunctionDecl* gradientOverloadFD = nullptr;
     if (shouldCreateOverload) {
-      gradientOverloadFD =
-          CreateGradientOverload();
+      gradientOverloadFD = CreateDerivativeOverload();
     }
 
     return DerivativeAndOverload{result.first, gradientOverloadFD};
@@ -701,10 +502,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       // derived variables are already created for independent variables.
       if (m_Variables.count(param))
         continue;
-      // in vector mode last non diff parameter is output parameter.
-      if (m_DiffReq.Mode == DiffMode::jacobian &&
-          i == m_DiffReq->getNumParams() - 1)
-        continue;
       auto VDDerivedType = getNonConstType(param->getType(), m_Context, m_Sema);
       // We cannot initialize derived variable for pointer types because
       // we do not know the correct size.
@@ -1610,48 +1407,31 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       if (DRE->getDecl()->getType()->isReferenceType() &&
           VD->getType()->isPointerType())
         clonedDRE = BuildOp(UO_Deref, clonedDRE);
-      if (m_DiffReq.Mode == DiffMode::jacobian) {
-        if (m_VectorOutput.size() <= outputArrayCursor)
-          return StmtDiff(clonedDRE);
-
-        auto it = m_VectorOutput[outputArrayCursor].find(VD);
-        if (it == std::end(m_VectorOutput[outputArrayCursor]))
-          return StmtDiff(clonedDRE); // Not an independent variable, ignored.
-
-        // Create the (jacobianMatrix[idx] += dfdx) statement.
-        if (dfdx()) {
-          auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
-          // Add it to the body statements.
-          addToCurrentBlock(add_assign, direction::reverse);
-        }
-        return StmtDiff(clonedDRE, it->second, it->second);
-      } else {
-        // Check DeclRefExpr is a reference to an independent variable.
-        auto it = m_Variables.find(VD);
-        if (it == std::end(m_Variables)) {
-          // Is not an independent variable, ignored.
-          return StmtDiff(clonedDRE);
-        }
-        // Create the (_d_param[idx] += dfdx) statement.
-        if (dfdx()) {
-          // FIXME: not sure if this is generic.
-          // Don't update derivatives of record types.
-          if (!VD->getType()->isRecordType()) {
-            Expr* base = it->second;
-            if (auto* UO = dyn_cast<UnaryOperator>(it->second))
-              base = UO->getSubExpr()->IgnoreImpCasts();
-            if (shouldUseCudaAtomicOps(base)) {
-              Expr* atomicCall = BuildCallToCudaAtomicAdd(it->second, dfdx());
-              // Add it to the body statements.
-              addToCurrentBlock(atomicCall, direction::reverse);
-            } else {
-              auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
-              addToCurrentBlock(add_assign, direction::reverse);
-            }
+      // Check DeclRefExpr is a reference to an independent variable.
+      auto it = m_Variables.find(VD);
+      if (it == std::end(m_Variables)) {
+        // Is not an independent variable, ignored.
+        return StmtDiff(clonedDRE);
+      }
+      // Create the (_d_param[idx] += dfdx) statement.
+      if (dfdx()) {
+        // FIXME: not sure if this is generic.
+        // Don't update derivatives of record types.
+        if (!VD->getType()->isRecordType()) {
+          Expr* base = it->second;
+          if (auto* UO = dyn_cast<UnaryOperator>(it->second))
+            base = UO->getSubExpr()->IgnoreImpCasts();
+          if (shouldUseCudaAtomicOps(base)) {
+            Expr* atomicCall = BuildCallToCudaAtomicAdd(it->second, dfdx());
+            // Add it to the body statements.
+            addToCurrentBlock(atomicCall, direction::reverse);
+          } else {
+            auto* add_assign = BuildOp(BO_AddAssign, it->second, dfdx());
+            addToCurrentBlock(add_assign, direction::reverse);
           }
         }
-        return StmtDiff(clonedDRE, it->second, it->second);
       }
+      return StmtDiff(clonedDRE, it->second, it->second);
     }
 
     return StmtDiff(clonedDRE);
@@ -2720,66 +2500,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         return BuildOp(opCode, LExpr, RExpr);
       }
 
-      // FIXME: Put this code into a separate subroutine and break out early
-      // using return if the diff mode is not jacobian and we are not dealing
-      // with the `outputArray`.
-      if (auto* ASE = dyn_cast<ArraySubscriptExpr>(L)) {
-        if (auto* DRE =
-                dyn_cast<DeclRefExpr>(ASE->getBase()->IgnoreImplicit())) {
-          auto type = QualType(DRE->getType()->getPointeeOrArrayElementType(),
-                               /*Quals=*/0);
-          std::string DRE_str = DRE->getDecl()->getNameAsString();
-
-          llvm::APSInt intIdx;
-          Expr::EvalResult res;
-          Expr::SideEffectsKind AllowSideEffects =
-              Expr::SideEffectsKind::SE_NoSideEffects;
-          auto isIdxValid =
-              ASE->getIdx()->EvaluateAsInt(res, m_Context, AllowSideEffects);
-
-          if (DRE_str == outputArrayStr && isIdxValid) {
-            intIdx = res.Val.getInt();
-            if (m_DiffReq.Mode == DiffMode::jacobian) {
-              outputArrayCursor = intIdx.getExtValue();
-
-              std::unordered_map<const clang::ValueDecl*, clang::Expr*>
-                  temp_m_Variables;
-              for (unsigned i = 0; i < numParams; i++) {
-                auto size_type = m_Context.getSizeType();
-                unsigned size_type_bits = m_Context.getIntWidth(size_type);
-                llvm::APInt idxValue(size_type_bits,
-                                     i + (outputArrayCursor * numParams));
-                auto* idx = IntegerLiteral::Create(m_Context, idxValue,
-                                                   size_type, noLoc);
-                // Create the jacobianMatrix[idx] expression.
-                auto* result_at_i = m_Sema
-                                        .CreateBuiltinArraySubscriptExpr(
-                                            m_Result, noLoc, idx, noLoc)
-                                        .get();
-                temp_m_Variables[m_IndependentVars[i]] = result_at_i;
-              }
-              if (m_VectorOutput.size() <= outputArrayCursor)
-                m_VectorOutput.resize(outputArrayCursor + 1);
-              m_VectorOutput[outputArrayCursor] = std::move(temp_m_Variables);
-            }
-
-            auto* dfdf = ConstantFolder::synthesizeLiteral(m_Context.IntTy,
-                                                           m_Context, 1);
-            ExprResult tmp = dfdf;
-            dfdf = m_Sema
-                       .ImpCastExprToType(tmp.get(), type,
-                                          m_Sema.PrepareScalarCast(tmp, type))
-                       .get();
-            auto ReturnResult = DifferentiateSingleExpr(R, dfdf);
-            StmtDiff ReturnDiff = ReturnResult.first;
-            Stmt* Reverse = ReturnDiff.getStmt_dx();
-            addToCurrentBlock(Reverse, direction::reverse);
-            for (Stmt* S : cast<CompoundStmt>(ReturnDiff.getStmt())->body())
-              addToCurrentBlock(S, direction::forward);
-          }
-        }
-      }
-
       // Visit LHS, but delay emission of its derivative statements, save them
       // in Lblock
       beginBlock(direction::reverse);
@@ -4730,11 +4450,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         if (it != std::end(diffParams))
           paramTypes.push_back(ComputeParamType(PVD->getType()));
       }
-    } else if (m_DiffReq.Mode == DiffMode::jacobian) {
-      std::size_t lastArgIdx = m_DiffReq->getNumParams() - 1;
-      QualType derivativeParamType =
-          m_DiffReq->getParamDecl(lastArgIdx)->getType();
-      paramTypes.push_back(derivativeParamType);
     }
     return paramTypes;
   }
@@ -4756,8 +4471,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
 
     if (const auto* MD = dyn_cast<CXXMethodDecl>(m_DiffReq.Function)) {
       const CXXRecordDecl* RD = MD->getParent();
-      if (m_DiffReq.Mode != DiffMode::jacobian && MD->isInstance() &&
-          !RD->isLambda()) {
+      if (MD->isInstance() && !RD->isLambda()) {
         auto* thisDerivativePVD = utils::BuildParmVarDecl(
             m_Sema, m_Derivative, CreateUniqueIdentifier("_d_this"),
             derivativeFnType->getParamType(dParamTypesIdx));
@@ -4839,18 +4553,6 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
       ++dParamTypesIdx;
     }
 
-    if (m_DiffReq.Mode == DiffMode::jacobian) {
-      IdentifierInfo* II = CreateUniqueIdentifier("jacobianMatrix");
-      // FIXME: Why are we taking storageClass of `params.front()`?
-      auto* dPVD = utils::BuildParmVarDecl(
-          m_Sema, m_Derivative, II,
-          derivativeFnType->getParamType(dParamTypesIdx),
-          params.front()->getStorageClass());
-      paramDerivatives.push_back(dPVD);
-      if (dPVD->getIdentifier())
-        m_Sema.PushOnScopeChains(dPVD, getCurrentScope(),
-                                 /*AddToContext=*/false);
-    }
     params.insert(params.end(), paramDerivatives.begin(),
                   paramDerivatives.end());
     // FIXME: If we do not consider diffParams as an independent argument for
diff --git a/lib/Differentiator/VectorForwardModeVisitor.cpp b/lib/Differentiator/VectorForwardModeVisitor.cpp
index 1c9ff820d..8199957de 100644
--- a/lib/Differentiator/VectorForwardModeVisitor.cpp
+++ b/lib/Differentiator/VectorForwardModeVisitor.cpp
@@ -26,6 +26,8 @@ DiffMode VectorForwardModeVisitor::GetPushForwardMode() {
 
 QualType
 VectorForwardModeVisitor::GetPushForwardDerivativeType(QualType ParamType) {
+  if (ParamType == m_Context.VoidTy)
+    return ParamType;
   QualType valueType = utils::GetValueType(ParamType);
   QualType resType;
   if (utils::isArrayOrPointerType(ParamType)) {
diff --git a/lib/Differentiator/VisitorBase.cpp b/lib/Differentiator/VisitorBase.cpp
index 0e6453625..87c55b4cd 100644
--- a/lib/Differentiator/VisitorBase.cpp
+++ b/lib/Differentiator/VisitorBase.cpp
@@ -902,4 +902,149 @@ namespace clad {
   VisitorBase::GetCladConstructorReverseForwTagOfType(clang::QualType T) {
     return InstantiateTemplate(GetCladConstructorReverseForwTag(), {T});
   }
+
+  FunctionDecl* VisitorBase::CreateDerivativeOverload() {
+    auto diffParams = m_Derivative->parameters();
+    auto diffNameInfo = m_Derivative->getNameInfo();
+    // Calculate the total number of parameters that would be required for
+    // automatic differentiation in the derived function if all args are
+    // requested.
+    // FIXME: Here we are assuming all function parameters are of differentiable
+    // type. Ideally, we should not make any such assumption.
+    std::size_t totalDerivedParamsSize = m_DiffReq->getNumParams() * 2;
+    std::size_t numOfDerivativeParams = m_DiffReq->getNumParams();
+
+    // Account for the this pointer.
+    if (isa<CXXMethodDecl>(m_DiffReq.Function) &&
+        !utils::IsStaticMethod(m_DiffReq.Function) &&
+        (!m_DiffReq.Functor || m_DiffReq.Mode != DiffMode::jacobian))
+      ++numOfDerivativeParams;
+    // All output parameters will be of type `void*`. These
+    // parameters will be casted to correct type before the call to the actual
+    // derived function.
+    // We require each output parameter to be of same type in the overloaded
+    // derived function due to limitations of generating the exact derived
+    // function type at the compile-time (without clad plugin help).
+    QualType outputParamType = m_Context.getPointerType(m_Context.VoidTy);
+
+    llvm::SmallVector<QualType, 16> paramTypes;
+
+    // Add types for representing original function parameters.
+    for (auto* PVD : m_DiffReq->parameters())
+      paramTypes.push_back(PVD->getType());
+    // Add types for representing parameter derivatives.
+    // FIXME: We are assuming all function parameters are differentiable. We
+    // should not make any such assumptions.
+    for (std::size_t i = 0; i < numOfDerivativeParams; ++i)
+      paramTypes.push_back(outputParamType);
+
+    auto diffFuncOverloadEPI =
+        dyn_cast<FunctionProtoType>(m_DiffReq->getType())->getExtProtoInfo();
+    QualType diffFunctionOverloadType =
+        m_Context.getFunctionType(m_Context.VoidTy, paramTypes,
+                                  // Cast to function pointer.
+                                  diffFuncOverloadEPI);
+
+    // FIXME: We should not use const_cast to get the decl context here.
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    auto* DC = const_cast<DeclContext*>(m_DiffReq->getDeclContext());
+    m_Sema.CurContext = DC;
+    DeclWithContext diffOverloadFDWC =
+        m_Builder.cloneFunction(m_DiffReq.Function, *this, DC, noLoc,
+                                diffNameInfo, diffFunctionOverloadType);
+    FunctionDecl* diffOverloadFD = diffOverloadFDWC.first;
+
+    beginScope(Scope::FunctionPrototypeScope | Scope::FunctionDeclarationScope |
+               Scope::DeclScope);
+    m_Sema.PushFunctionScope();
+    m_Sema.PushDeclContext(getCurrentScope(), diffOverloadFD);
+
+    llvm::SmallVector<ParmVarDecl*, 4> overloadParams;
+    llvm::SmallVector<Expr*, 4> callArgs;
+
+    overloadParams.reserve(totalDerivedParamsSize);
+    callArgs.reserve(diffParams.size());
+
+    for (auto* PVD : m_DiffReq->parameters()) {
+      auto* VD = utils::BuildParmVarDecl(
+          m_Sema, diffOverloadFD, PVD->getIdentifier(), PVD->getType(),
+          PVD->getStorageClass(), /*defArg=*/nullptr, PVD->getTypeSourceInfo());
+      overloadParams.push_back(VD);
+      callArgs.push_back(BuildDeclRef(VD));
+    }
+
+    for (std::size_t i = 0; i < numOfDerivativeParams; ++i) {
+      IdentifierInfo* II = nullptr;
+      StorageClass SC = StorageClass::SC_None;
+      std::size_t effectiveDiffIndex = m_DiffReq->getNumParams() + i;
+      // `effectiveDiffIndex < diffParams.size()` implies that this
+      // parameter represents an actual derivative of one of the function
+      // original parameters.
+      if (effectiveDiffIndex < diffParams.size()) {
+        auto* GVD = diffParams[effectiveDiffIndex];
+        II = CreateUniqueIdentifier("_temp_" + GVD->getNameAsString());
+        SC = GVD->getStorageClass();
+      } else {
+        II = CreateUniqueIdentifier("_d_" + std::to_string(i));
+      }
+      auto* PVD = utils::BuildParmVarDecl(m_Sema, diffOverloadFD, II,
+                                          outputParamType, SC);
+      overloadParams.push_back(PVD);
+    }
+
+    for (auto* PVD : overloadParams)
+      if (PVD->getIdentifier())
+        m_Sema.PushOnScopeChains(PVD, getCurrentScope(),
+                                 /*AddToContext=*/false);
+
+    diffOverloadFD->setParams(overloadParams);
+    diffOverloadFD->setBody(/*B=*/nullptr);
+
+    beginScope(Scope::FnScope | Scope::DeclScope);
+    m_DerivativeFnScope = getCurrentScope();
+    beginBlock();
+
+    // Build derivatives to be used in the call to the actual derived function.
+    // These are initialised by effectively casting the derivative parameters of
+    // overloaded derived function to the correct type.
+    for (std::size_t i = m_DiffReq->getNumParams(); i < diffParams.size();
+         ++i) {
+      auto* overloadParam = overloadParams[i];
+      auto* diffParam = diffParams[i];
+      TypeSourceInfo* typeInfo =
+          m_Context.getTrivialTypeSourceInfo(diffParam->getType());
+      SourceLocation fakeLoc = utils::GetValidSLoc(m_Sema);
+      auto* init = m_Sema
+                       .BuildCStyleCastExpr(fakeLoc, typeInfo, fakeLoc,
+                                            BuildDeclRef(overloadParam))
+                       .get();
+
+      auto* diffVD =
+          BuildGlobalVarDecl(diffParam->getType(), diffParam->getName(), init);
+      callArgs.push_back(BuildDeclRef(diffVD));
+      addToCurrentBlock(BuildDeclStmt(diffVD));
+    }
+
+    // If the function is a global kernel, we need to transform it
+    // into a device function when calling it inside the overload function
+    // which is the final global kernel returned.
+    if (m_Derivative->hasAttr<clang::CUDAGlobalAttr>()) {
+      m_Derivative->dropAttr<clang::CUDAGlobalAttr>();
+      m_Derivative->addAttr(clang::CUDADeviceAttr::CreateImplicit(m_Context));
+    }
+
+    Expr* callExpr = BuildCallExprToFunction(m_Derivative, callArgs,
+                                             /*useRefQualifiedThisObj=*/true);
+    addToCurrentBlock(callExpr);
+    Stmt* diffOverloadBody = endBlock();
+
+    diffOverloadFD->setBody(diffOverloadBody);
+
+    endScope(); // Function body scope
+    m_Sema.PopFunctionScopeInfo();
+    m_Sema.PopDeclContext();
+    endScope(); // Function decl scope
+
+    return diffOverloadFD;
+  }
 } // end namespace clad
diff --git a/test/FirstDerivative/BuiltinDerivatives.C b/test/FirstDerivative/BuiltinDerivatives.C
index 0ff3b6e62..95c658204 100644
--- a/test/FirstDerivative/BuiltinDerivatives.C
+++ b/test/FirstDerivative/BuiltinDerivatives.C
@@ -99,7 +99,7 @@ float f7(float x) {
 
 // CHECK: float f7_darg0(float x) {
 // CHECK-NEXT:     float _d_x = 1;
-// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<decltype(::std::pow(float(), double())), decltype(::std::pow(float(), double()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 2., _d_x, 0.);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<double, double> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 2., _d_x, 0.);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
@@ -120,7 +120,7 @@ double f8(float x) {
 
 // CHECK: double f8_darg0(float x) {
 // CHECK-NEXT:     float _d_x = 1;
-// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 2, _d_x, 0);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<double, double> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 2, _d_x, 0);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
@@ -142,7 +142,7 @@ float f9(float x, float y) {
 // CHECK: float f9_darg0(float x, float y) {
 // CHECK-NEXT:     float _d_x = 1;
 // CHECK-NEXT:     float _d_y = 0;
-// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
@@ -165,7 +165,7 @@ double f10(float x, int y) {
 // CHECK: double f10_darg0(float x, int y) {
 // CHECK-NEXT:     float _d_x = 1;
 // CHECK-NEXT:     int _d_y = 0;
-// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<double, double> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
diff --git a/test/ForwardMode/UserDefinedTypes.C b/test/ForwardMode/UserDefinedTypes.C
index fc7d6c865..0acd9a2e1 100644
--- a/test/ForwardMode/UserDefinedTypes.C
+++ b/test/ForwardMode/UserDefinedTypes.C
@@ -1167,7 +1167,7 @@ int main() {
 // CHECK-NEXT:     {
 // CHECK-NEXT:         unsigned int _d_i = 0;
 // CHECK-NEXT:         for (unsigned int i = 0; i < 5U; ++i) {
-// CHECK-NEXT:             ValueAndPushforward<decltype(::std::pow(double(), double())), decltype(::std::pow(double(), double()))> _t0 = clad::custom_derivatives::pow_pushforward(a.data[i], b.data[i], _d_a.data[i], _d_b.data[i]);
+// CHECK-NEXT:             {{(clad::)?}}ValueAndPushforward<double, double> _t0 = clad::custom_derivatives::pow_pushforward(a.data[i], b.data[i], _d_a.data[i], _d_b.data[i]);
 // CHECK-NEXT:             _d_res.data[i] = _t0.pushforward;
 // CHECK-NEXT:             res.data[i] = _t0.value;
 // CHECK-NEXT:         }
diff --git a/test/Hessian/BuiltinDerivatives.C b/test/Hessian/BuiltinDerivatives.C
index 942175b4f..b4d12a240 100644
--- a/test/Hessian/BuiltinDerivatives.C
+++ b/test/Hessian/BuiltinDerivatives.C
@@ -266,17 +266,17 @@ int main() {
 
 // CHECK: float f4_darg0(float x) {
 // CHECK-NEXT:     float _d_x = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 4.F, _d_x, 0.F);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 4.F, _d_x, 0.F);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
-// CHECK: void pow_pushforward_pullback(float x, float exponent, float d_x, float d_exponent, ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d_y, float *_d_x, float *_d_exponent, float *_d_d_x, float *_d_d_exponent);
+// CHECK: void pow_pushforward_pullback(float x, float exponent, float d_x, float d_exponent, ValueAndPushforward<float, float> _d_y, float *_d_x, float *_d_exponent, float *_d_d_x, float *_d_d_exponent);
 
 // CHECK: void f4_darg0_grad(float x, float *_d_x) {
 // CHECK-NEXT:     float _d__d_x = 0.F;
 // CHECK-NEXT:     float _d_x0 = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d__t0 = {};
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 4.F, _d_x0, 0.F);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _d__t0 = {};
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, 4.F, _d_x0, 0.F);
 // CHECK-NEXT:     _d__t0.pushforward += 1;
 // CHECK-NEXT:     {
 // CHECK-NEXT:         {{.*}} _r0 = {};
@@ -293,15 +293,15 @@ int main() {
 
 // CHECK: float f5_darg0(float x) {
 // CHECK-NEXT:     float _d_x = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(2.F, x, 0.F, _d_x);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(2.F, x, 0.F, _d_x);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
 // CHECK: void f5_darg0_grad(float x, float *_d_x) {
 // CHECK-NEXT:     float _d__d_x = 0.F;
 // CHECK-NEXT:     float _d_x0 = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d__t0 = {};
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(2.F, x, 0.F, _d_x0);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _d__t0 = {};
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(2.F, x, 0.F, _d_x0);
 // CHECK-NEXT:     _d__t0.pushforward += 1;
 // CHECK-NEXT:     {
 // CHECK-NEXT:         {{.*}} _r0 = {};
@@ -319,7 +319,7 @@ int main() {
 // CHECK: float f6_darg0(float x, float y) {
 // CHECK-NEXT:     float _d_x = 1;
 // CHECK-NEXT:     float _d_y = 0;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
@@ -328,8 +328,8 @@ int main() {
 // CHECK-NEXT:     float _d_x0 = 1;
 // CHECK-NEXT:     float _d__d_y = 0.F;
 // CHECK-NEXT:     float _d_y0 = 0;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d__t0 = {};
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x0, _d_y0);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _d__t0 = {};
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x0, _d_y0);
 // CHECK-NEXT:     _d__t0.pushforward += 1;
 // CHECK-NEXT:     {
 // CHECK-NEXT:         {{.*}} _r0 = {};
@@ -349,7 +349,7 @@ int main() {
 // CHECK: float f6_darg1(float x, float y) {
 // CHECK-NEXT:     float _d_x = 0;
 // CHECK-NEXT:     float _d_y = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t0 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x, _d_y);
 // CHECK-NEXT:     return _t0.pushforward;
 // CHECK-NEXT: }
 
@@ -358,8 +358,8 @@ int main() {
 // CHECK-NEXT:     float _d_x0 = 0;
 // CHECK-NEXT:     float _d__d_y = 0.F;
 // CHECK-NEXT:     float _d_y0 = 1;
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d__t0 = {};
-// CHECK-NEXT:     ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x0, _d_y0);
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _d__t0 = {};
+// CHECK-NEXT:     {{(clad::)?}}ValueAndPushforward<float, float> _t00 = clad::custom_derivatives{{(::std)?}}::pow_pushforward(x, y, _d_x0, _d_y0);
 // CHECK-NEXT:     _d__t0.pushforward += 1;
 // CHECK-NEXT:     {
 // CHECK-NEXT:         {{.*}} _r0 = {};
@@ -434,7 +434,7 @@ int main() {
 // CHECK-NEXT:     }
 // CHECK-NEXT: }
 
-// CHECK: void pow_pushforward_pullback(float x, float exponent, float d_x, float d_exponent, ValueAndPushforward<decltype(::std::pow(float(), float())), decltype(::std::pow(float(), float()))> _d_y, float *_d_x, float *_d_exponent, float *_d_d_x, float *_d_d_exponent) {
+// CHECK: void pow_pushforward_pullback(float x, float exponent, float d_x, float d_exponent, ValueAndPushforward<float, float> _d_y, float *_d_x, float *_d_exponent, float *_d_d_x, float *_d_d_exponent) {
 // CHECK-NEXT:     bool _cond0;
 // CHECK-NEXT:     float _t1;
 // CHECK-NEXT:     float _t2;
diff --git a/test/Jacobian/FunctionCalls.C b/test/Jacobian/FunctionCalls.C
index da7d5420c..a27d09963 100644
--- a/test/Jacobian/FunctionCalls.C
+++ b/test/Jacobian/FunctionCalls.C
@@ -6,38 +6,25 @@
 #include <cmath>
 #include "clad/Differentiator/Differentiator.h"
 
-double outputs[4], results[4];
+double outputs[4];
+clad::matrix<double> results(2, 2);
 
 void fn1(double i, double j, double* output) {
   output[0] = std::pow(i, j);
   output[1] = std::pow(j, i);
 }
 
-// CHECK: void fn1_jac(double i, double j, double *output, double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = std::pow(i, j);
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = std::pow(j, i);
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r2 = 0.;
-// CHECK-NEXT:             double _r3 = 0.;
-// CHECK-NEXT:             clad::custom_derivatives::pow_pullback(j, i, 1, &_r2, &_r3);
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += _r2;
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += _r3;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r0 = 0.;
-// CHECK-NEXT:             double _r1 = 0.;
-// CHECK-NEXT:             clad::custom_derivatives::pow_pullback(i, j, 1, &_r0, &_r1);
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += _r0;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += _r1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void fn1_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     {{.*}} _t0 = clad::custom_derivatives::pow_pushforward(i, j, _d_vector_i, _d_vector_j);
+// CHECK-NEXT:     *_d_vector_output[0] = _t0.pushforward;
+// CHECK-NEXT:     output[0] = _t0.value;
+// CHECK-NEXT:     {{.*}} _t1 = clad::custom_derivatives::pow_pushforward(j, i, _d_vector_j, _d_vector_i);
+// CHECK-NEXT:     *_d_vector_output[1] = _t1.pushforward;
+// CHECK-NEXT:     output[1] = _t1.value;
 // CHECK-NEXT: }
 
 #define INIT(F) auto d_##F = clad::jacobian(F);
@@ -47,17 +34,16 @@ void fn1(double i, double j, double* output) {
 template <unsigned numOfOutputs, typename Fn, typename... Args>
 void test(Fn derivedFn, Args... args) {
   unsigned numOfParameters = sizeof...(args);
-  unsigned numOfResults = numOfOutputs * numOfParameters;
   for (unsigned i = 0; i < numOfOutputs; ++i)
     outputs[i] = 0;
-  for (unsigned i = 0; i < numOfResults; ++i)
-    results[i] = 0;
-  derivedFn.execute(args..., outputs, results);
+  derivedFn.execute(args..., outputs, &results);
   printf("{");
-  for (unsigned i = 0; i < numOfResults; ++i) {
-    printf("%.2f", results[i]);
-    if (i != numOfResults - 1)
-      printf(", ");
+  for (unsigned i = 0; i < numOfOutputs; ++i) {
+    for (unsigned j = 0; j < numOfParameters; ++j) {
+      printf("%.2f", results[i][j]);
+      if (i != numOfOutputs - 1 || j != numOfParameters - 1)
+        printf(", ");
+    }
   }
   printf("}\n");
 }
diff --git a/test/Jacobian/Functors.C b/test/Jacobian/Functors.C
index cf04e1291..22902b3de 100644
--- a/test/Jacobian/Functors.C
+++ b/test/Jacobian/Functors.C
@@ -16,27 +16,21 @@ struct Experiment {
     x = val;
   }
 
-  // CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) {
-  // CHECK-NEXT:     double _t0 = output[0];
-  // CHECK-NEXT:     output[0] = this->x * i * i * j;
-  // CHECK-NEXT:     double _t1 = output[1];
-  // CHECK-NEXT:     output[1] = this->y * i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += this->y * 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t1;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += this->x * i * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t0;
-  // CHECK-NEXT:     }
+  // CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     double &_t0 = this->x;
+  // CHECK-NEXT:     double _t1 = _t0 * i;
+  // CHECK-NEXT:     double _t2 = _t1 * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = ((0 * i + _t0 * _d_vector_i) * i + _t1 * _d_vector_i) * j + _t2 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t2 * j;
+  // CHECK-NEXT:     double &_t3 = this->y;
+  // CHECK-NEXT:     double _t4 = _t3 * i;
+  // CHECK-NEXT:     double _t5 = _t4 * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = ((0 * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j) * j + _t5 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t5 * j;
   // CHECK-NEXT: }
 
 };
@@ -52,27 +46,21 @@ struct ExperimentConst {
     x = val;
   }
 
-  // CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) const {
-  // CHECK-NEXT:     double _t0 = output[0];
-  // CHECK-NEXT:     output[0] = this->x * i * i * j;
-  // CHECK-NEXT:     double _t1 = output[1];
-  // CHECK-NEXT:     output[1] = this->y * i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += this->y * 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t1;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += this->x * i * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t0;
-  // CHECK-NEXT:     }
+  // CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) const {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     double &_t0 = this->x;
+  // CHECK-NEXT:     double _t1 = _t0 * i;
+  // CHECK-NEXT:     double _t2 = _t1 * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = ((0 * i + _t0 * _d_vector_i) * i + _t1 * _d_vector_i) * j + _t2 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t2 * j;
+  // CHECK-NEXT:     double &_t3 = this->y;
+  // CHECK-NEXT:     double _t4 = _t3 * i;
+  // CHECK-NEXT:     double _t5 = _t4 * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = ((0 * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j) * j + _t5 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t5 * j;
   // CHECK-NEXT: }
 
 };
@@ -88,29 +76,21 @@ struct ExperimentVolatile {
     x = val;
   }
 
-  // CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) volatile {
-  // CHECK-NEXT:     double _t0 = this->x * i;
-  // CHECK-NEXT:     double _t1 = output[0];
-  // CHECK-NEXT:     output[0] = this->x * i * i * j;
-  // CHECK-NEXT:     double _t2 = this->y * i;
-  // CHECK-NEXT:     double _t3 = output[1];
-  // CHECK-NEXT:     output[1] = this->y * i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += this->y * 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += _t2 * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += _t2 * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t3;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += _t0 * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += _t0 * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t1;
-  // CHECK-NEXT:     }
+  // CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) volatile {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     volatile double &_t0 = this->x;
+  // CHECK-NEXT:     double _t1 = _t0 * i;
+  // CHECK-NEXT:     double _t2 = _t1 * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = ((0 * i + _t0 * _d_vector_i) * i + _t1 * _d_vector_i) * j + _t2 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t2 * j;
+  // CHECK-NEXT:     volatile double &_t3 = this->y;
+  // CHECK-NEXT:     double _t4 = _t3 * i;
+  // CHECK-NEXT:     double _t5 = _t4 * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = ((0 * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j) * j + _t5 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t5 * j;
   // CHECK-NEXT: }
 
 };
@@ -126,29 +106,21 @@ struct ExperimentConstVolatile {
     x = val;
   }
 
-  // CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) const volatile {
-  // CHECK-NEXT:     double _t0 = this->x * i;
-  // CHECK-NEXT:     double _t1 = output[0];
-  // CHECK-NEXT:     output[0] = this->x * i * i * j;
-  // CHECK-NEXT:     double _t2 = this->y * i;
-  // CHECK-NEXT:     double _t3 = output[1];
-  // CHECK-NEXT:     output[1] = this->y * i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += this->y * 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += _t2 * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += _t2 * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t3;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += _t0 * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += _t0 * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t1;
-  // CHECK-NEXT:     }
+  // CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) const volatile {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     volatile double &_t0 = this->x;
+  // CHECK-NEXT:     double _t1 = _t0 * i;
+  // CHECK-NEXT:     double _t2 = _t1 * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = ((0 * i + _t0 * _d_vector_i) * i + _t1 * _d_vector_i) * j + _t2 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t2 * j;
+  // CHECK-NEXT:     volatile double &_t3 = this->y;
+  // CHECK-NEXT:     double _t4 = _t3 * i;
+  // CHECK-NEXT:     double _t5 = _t4 * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = ((0 * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j) * j + _t5 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t5 * j;
   // CHECK-NEXT: }
 
 };
@@ -166,27 +138,21 @@ namespace outer {
         x = val;
       }
       
-  // CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) {
-  // CHECK-NEXT:     double _t0 = output[0];
-  // CHECK-NEXT:     output[0] = this->x * i * i * j;
-  // CHECK-NEXT:     double _t1 = output[1];
-  // CHECK-NEXT:     output[1] = this->y * i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += this->y * 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += this->y * i * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t1;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += this->x * i * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t0;
-  // CHECK-NEXT:     }
+  // CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     double &_t0 = this->x;
+  // CHECK-NEXT:     double _t1 = _t0 * i;
+  // CHECK-NEXT:     double _t2 = _t1 * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = ((0 * i + _t0 * _d_vector_i) * i + _t1 * _d_vector_i) * j + _t2 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t2 * j;
+  // CHECK-NEXT:     double &_t3 = this->y;
+  // CHECK-NEXT:     double _t4 = _t3 * i;
+  // CHECK-NEXT:     double _t5 = _t4 * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = ((0 * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j) * j + _t5 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t5 * j;
   // CHECK-NEXT: }
 
     };
@@ -196,27 +162,17 @@ namespace outer {
       output[1] = i*j*j;
     };
 
-  // CHECK: inline void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) const {
-  // CHECK-NEXT:     double _t0 = output[0];
-  // CHECK-NEXT:     output[0] = i * i * j;
-  // CHECK-NEXT:     double _t1 = output[1];
-  // CHECK-NEXT:     output[1] = i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += i * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t1;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += i * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t0;
-  // CHECK-NEXT:     }
+  // CHECK: inline void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) const {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     double _t0 = i * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_i * i + i * _d_vector_i) * j + _t0 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t0 * j;
+  // CHECK-NEXT:     double _t1 = i * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_i * j + i * _d_vector_j) * j + _t1 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t1 * j;
   // CHECK-NEXT: }
 
   }
@@ -227,21 +183,20 @@ namespace outer {
   auto d_##E##Ref = clad::jacobian(E);
 
 #define TEST(E)                                                                \
-  result[0] = result[1] = result[2] = result[3] = 0;                           \
   output[0] = output[1] = 0;                                                   \
-  d_##E.execute(7, 9, output, result);                                         \
-  printf("{%.2f, %.2f, %.2f, %.2f}, ", result[0], result[1], result[2],        \
-         result[3]);                                                           \
-  result[0] = result[1] = result[2] = result[3] = 0;                           \
+  d_##E.execute(7, 9, output, &result);                                        \
+  printf("{%.2f, %.2f, %.2f, %.2f}, ", result[0][0], result[0][1],             \
+                                       result[1][0], result[1][1]);            \
   output[0] = output[1] = 0;                                                   \
-  d_##E##Ref.execute(7, 9, output, result);                                    \
-  printf("{%.2f, %.2f, %.2f, %.2f}\n", result[0], result[1], result[2],        \
-         result[3]);
+  d_##E##Ref.execute(7, 9, output, &result);                                   \
+  printf("{%.2f, %.2f, %.2f, %.2f}, ", result[0][0], result[0][1],             \
+                                       result[1][0], result[1][1]);
 
 double x = 3;
 double y = 5;
 int main() {
-  double output[2], result[4];
+  double output[2];
+  clad::matrix<double> result(2, 2);
   Experiment E(3, 5);
   auto E_Again = E;
   const ExperimentConst E_Const(3, 5);
@@ -254,27 +209,17 @@ int main() {
     output[1] = i*j*j;
   };
 
-  // CHECK: inline void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) const {
-  // CHECK-NEXT:     double _t0 = output[0];
-  // CHECK-NEXT:     output[0] = i * i * j;
-  // CHECK-NEXT:     double _t1 = output[1];
-  // CHECK-NEXT:     output[1] = i * j * j;
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += 1 * j * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += i * j * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[1] = _t1;
-  // CHECK-NEXT:     }
-  // CHECK-NEXT:     {
-  // CHECK-NEXT:         {
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * j * i;
-  // CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += i * 1 * j;
-  // CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += i * i * 1;
-  // CHECK-NEXT:         }
-  // CHECK-NEXT:         output[0] = _t0;
-  // CHECK-NEXT:     }
+  // CHECK-NEXT: inline void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) const {
+  // CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+  // CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+  // CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+  // CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+  // CHECK-NEXT:     double _t0 = i * i;
+  // CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_i * i + i * _d_vector_i) * j + _t0 * _d_vector_j;
+  // CHECK-NEXT:     output[0] = _t0 * j;
+  // CHECK-NEXT:     double _t1 = i * j;
+  // CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_i * j + i * _d_vector_j) * j + _t1 * _d_vector_j;
+  // CHECK-NEXT:     output[1] = _t1 * j;
   // CHECK-NEXT: }
 
   auto lambdaWithCapture = [&](double i, double jj, double *output) {
@@ -282,27 +227,19 @@ int main() {
     output[1] = y*i*jj*jj;
   };
 
-// CHECK: inline void operator_call_jac(double i, double jj, double *output, double *jacobianMatrix) const {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = x * i * i * jj;
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = y * i * jj * jj;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += y * 1 * jj * jj;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += y * i * 1 * jj;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += y * i * jj * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * 1 * jj * i;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * i * 1 * jj;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += x * i * i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: inline void operator_call_jac(double i, double jj, double *output, clad::matrix<double> *_d_vector_output) const {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_jj = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     double _t0 = x * i;
+// CHECK-NEXT:     double _t1 = _t0 * i;
+// CHECK-NEXT:     *_d_vector_output[0] = ((0. * i + x * _d_vector_i) * i + _t0 * _d_vector_i) * jj + _t1 * _d_vector_jj;
+// CHECK-NEXT:     output[0] = _t1 * jj;
+// CHECK-NEXT:     double _t2 = y * i;
+// CHECK-NEXT:     double _t3 = _t2 * jj;
+// CHECK-NEXT:     *_d_vector_output[1] = ((0. * i + y * _d_vector_i) * jj + _t2 * _d_vector_jj) * jj + _t3 * _d_vector_jj;
+// CHECK-NEXT:     output[1] = _t3 * jj;
 // CHECK-NEXT: }
 
   auto lambdaNNS = outer::inner::lambdaNNS;
diff --git a/test/Jacobian/Jacobian.C b/test/Jacobian/Jacobian.C
index 4d9537022..b348c7574 100644
--- a/test/Jacobian/Jacobian.C
+++ b/test/Jacobian/Jacobian.C
@@ -12,43 +12,22 @@ void f_1(double a, double b, double c, double output[]) {
   output[2] = c * c * 10 - a * a;
 }
 
-void f_1_jac(double a, double b, double c, double output[], double *_result);
-
-// CHECK: void f_1_jac(double a, double b, double c, double output[], double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = a * a * a;
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = a * a * a + b * b * b;
-// CHECK-NEXT:     double _t2 = output[2];
-// CHECK-NEXT:     output[2] = c * c * 10 - a * a;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += 1 * 10 * c;
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += c * 1 * 10;
-// CHECK-NEXT:             jacobianMatrix[{{6U|6UL|6ULL}}] += -1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{6U|6UL|6ULL}}] += a * -1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[2] = _t2;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 1 * a * a;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += a * 1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += a * a * 1;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += 1 * b * b;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += b * 1 * b;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += b * b * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * a * a;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += a * 1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += a * a * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void f_1_jac(double a, double b, double c, double output[], clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_a = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_b = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_c = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{3U|3UL|3ULL}});
+// CHECK-NEXT:     double _t0 = a * a;
+// CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_a * a + a * _d_vector_a) * a + _t0 * _d_vector_a;
+// CHECK-NEXT:     output[0] = _t0 * a;
+// CHECK-NEXT:     double _t1 = a * a;
+// CHECK-NEXT:     double _t2 = b * b;
+// CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_a * a + a * _d_vector_a) * a + _t1 * _d_vector_a + (_d_vector_b * b + b * _d_vector_b) * b + _t2 * _d_vector_b;
+// CHECK-NEXT:     output[1] = _t1 * a + _t2 * b;
+// CHECK-NEXT:     double _t3 = c * c;
+// CHECK-NEXT:     *_d_vector_output[2] = (_d_vector_c * c + c * _d_vector_c) * 10 + _t3 * (clad::zero_vector(indepVarCount)) - (_d_vector_a * a + a * _d_vector_a);
+// CHECK-NEXT:     output[2] = _t3 * 10 - a * a;
 // CHECK-NEXT: }
 
 void f_3(double x, double y, double z, double *_result) {
@@ -59,47 +38,29 @@ void f_3(double x, double y, double z, double *_result) {
   _result[2] = sin(z) * constant;
 }
 
-void f_3_jac(double x, double y, double z, double *_result, double *jacobianMatrix);
-
-// CHECK: void f_3_jac(double x, double y, double z, double *_result, double *jacobianMatrix) {
-// CHECK-NEXT:     double _d_constant = 0.;
+// CHECK: void f_3_jac(double x, double y, double z, double *_result, clad::matrix<double> *_d_vector__result) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector__result->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_x = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_y = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_z = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector__result = clad::identity_matrix(_d_vector__result->rows(), indepVarCount, {{3U|3UL|3ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_constant(clad::zero_vector(indepVarCount)); 
 // CHECK-NEXT:     double constant = 42;
-// CHECK-NEXT:     double _t0 = sin(x);
-// CHECK-NEXT:     double _t1 = _result[0];
-// CHECK-NEXT:     _result[0] = sin(x) * constant;
-// CHECK-NEXT:     double _t2 = sin(y);
-// CHECK-NEXT:     double _t3 = _result[1];
-// CHECK-NEXT:     _result[1] = sin(y) * constant;
-// CHECK-NEXT:     double _t4 = sin(z);
-// CHECK-NEXT:     double _t5 = _result[2];
-// CHECK-NEXT:     _result[2] = sin(z) * constant;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r2 = 0.;
-// CHECK-NEXT:             _r2 += 1 * constant * clad::custom_derivatives::sin_pushforward(z, 1.).pushforward;
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += _r2;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[2] = _t5;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r1 = 0.;
-// CHECK-NEXT:             _r1 += 1 * constant * clad::custom_derivatives::sin_pushforward(y, 1.).pushforward;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += _r1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[1] = _t3;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r0 = 0.;
-// CHECK-NEXT:             _r0 += 1 * constant * clad::custom_derivatives::sin_pushforward(x, 1.).pushforward;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += _r0;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[0] = _t1;
-// CHECK-NEXT:     }
+// CHECK-NEXT:     {{.*}} _t0 = clad::custom_derivatives::sin_pushforward(x, _d_vector_x);
+// CHECK-NEXT:     double &_t1 = _t0.value;
+// CHECK-NEXT:     *_d_vector__result[0] = _t0.pushforward * constant + _t1 * _d_vector_constant;
+// CHECK-NEXT:     _result[0] = _t1 * constant;
+// CHECK-NEXT:     {{.*}} _t2 = clad::custom_derivatives::sin_pushforward(y, _d_vector_y);
+// CHECK-NEXT:     double &_t3 = _t2.value;
+// CHECK-NEXT:     *_d_vector__result[1] = _t2.pushforward * constant + _t3 * _d_vector_constant;
+// CHECK-NEXT:     _result[1] = _t3 * constant;
+// CHECK-NEXT:     {{.*}} _t4 = clad::custom_derivatives::sin_pushforward(z, _d_vector_z);
+// CHECK-NEXT:     double &_t5 = _t4.value;
+// CHECK-NEXT:     *_d_vector__result[2] = _t4.pushforward * constant + _t5 * _d_vector_constant;
+// CHECK-NEXT:     _result[2] = _t5 * constant;
 // CHECK-NEXT: }
 double multiply(double x, double y) { return x * y; }
-//CHECK: void multiply_pullback(double x, double y, double _d_y0, double *_d_x, double *_d_y);
+// CHECK: clad::ValueAndPushforward<double, clad::array<double> > multiply_vector_pushforward(double x, double y, clad::array<double> _d_x, clad::array<double> _d_y);
 
 void f_4(double x, double y, double z, double *_result) {
   double constant = 42;
@@ -109,84 +70,43 @@ void f_4(double x, double y, double z, double *_result) {
   _result[2] = multiply(z, x) * constant;
 }
 
-void f_4_jac(double x, double y, double z, double *_result, double *jacobianMatrix);
-// CHECK: void f_4_jac(double x, double y, double z, double *_result, double *jacobianMatrix) {
-// CHECK-NEXT:     double _d_constant = 0.;
+// CHECK: void f_4_jac(double x, double y, double z, double *_result, clad::matrix<double> *_d_vector__result) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector__result->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_x = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_y = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_z = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector__result = clad::identity_matrix(_d_vector__result->rows(), indepVarCount, {{3U|3UL|3ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_constant(clad::zero_vector(indepVarCount)); 
 // CHECK-NEXT:     double constant = 42;
-// CHECK-NEXT:     double _t0 = multiply(x, y);
-// CHECK-NEXT:     double _t1 = _result[0];
-// CHECK-NEXT:     _result[0] = multiply(x, y) * constant;
-// CHECK-NEXT:     double _t2 = multiply(y, z);
-// CHECK-NEXT:     double _t3 = _result[1];
-// CHECK-NEXT:     _result[1] = multiply(y, z) * constant;
-// CHECK-NEXT:     double _t4 = multiply(z, x);
-// CHECK-NEXT:     double _t5 = _result[2];
-// CHECK-NEXT:     _result[2] = multiply(z, x) * constant;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r4 = 0.;
-// CHECK-NEXT:             double _r5 = 0.;
-// CHECK-NEXT:             multiply_pullback(z, x, 1 * constant, &_r4, &_r5);
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += _r4;
-// CHECK-NEXT:             jacobianMatrix[{{6U|6UL|6ULL}}] += _r5;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[2] = _t5;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r2 = 0.;
-// CHECK-NEXT:             double _r3 = 0.;
-// CHECK-NEXT:             multiply_pullback(y, z, 1 * constant, &_r2, &_r3);
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += _r2;
-// CHECK-NEXT:             jacobianMatrix[{{5U|5UL|5ULL}}] += _r3;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[1] = _t3;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             double _r0 = 0.;
-// CHECK-NEXT:             double _r1 = 0.;
-// CHECK-NEXT:             multiply_pullback(x, y, 1 * constant, &_r0, &_r1);
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += _r0;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += _r1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         _result[0] = _t1;
-// CHECK-NEXT:     }
+// CHECK-NEXT:     clad::ValueAndPushforward<double, clad::array<double> > _t0 = multiply_vector_pushforward(x, y, _d_vector_x, _d_vector_y);
+// CHECK-NEXT:     double &_t1 = _t0.value;
+// CHECK-NEXT:     *_d_vector__result[0] = _t0.pushforward * constant + _t1 * _d_vector_constant;
+// CHECK-NEXT:     _result[0] = _t1 * constant;
+// CHECK-NEXT:     clad::ValueAndPushforward<double, clad::array<double> > _t2 = multiply_vector_pushforward(y, z, _d_vector_y, _d_vector_z);
+// CHECK-NEXT:     double &_t3 = _t2.value;
+// CHECK-NEXT:     *_d_vector__result[1] = _t2.pushforward * constant + _t3 * _d_vector_constant;
+// CHECK-NEXT:     _result[1] = _t3 * constant;
+// CHECK-NEXT:     clad::ValueAndPushforward<double, clad::array<double> > _t4 = multiply_vector_pushforward(z, x, _d_vector_z, _d_vector_x);
+// CHECK-NEXT:     double &_t5 = _t4.value;
+// CHECK-NEXT:     *_d_vector__result[2] = _t4.pushforward * constant + _t5 * _d_vector_constant;
+// CHECK-NEXT:     _result[2] = _t5 * constant;
 // CHECK-NEXT: }
 
-void f_1_jac_0(double a, double b, double c, double output[], double *jacobianMatrix);
-// CHECK: void f_1_jac_0(double a, double b, double c, double output[], double *jacobianMatrix) {
-// CHECK-NEXT:     double _d_b = 0.;
-// CHECK-NEXT:     double _d_c = 0.;
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = a * a * a;
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = a * a * a + b * b * b;
-// CHECK-NEXT:     double _t2 = output[2];
-// CHECK-NEXT:     output[2] = c * c * 10 - a * a;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += -1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += a * -1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[2] = _t2;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += 1 * a * a;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += a * 1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += a * a * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * a * a;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += a * 1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += a * a * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void f_1_jac_0(double a, double b, double c, double output[], clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_a = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_b = clad::zero_vector(indepVarCount);
+// CHECK-NEXT:     clad::array<double> _d_vector_c = clad::zero_vector(indepVarCount);
+// CHECK-NEXT:     double _t0 = a * a;
+// CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_a * a + a * _d_vector_a) * a + _t0 * _d_vector_a;
+// CHECK-NEXT:     output[0] = _t0 * a;
+// CHECK-NEXT:     double _t1 = a * a;
+// CHECK-NEXT:     double _t2 = b * b;
+// CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_a * a + a * _d_vector_a) * a + _t1 * _d_vector_a + (_d_vector_b * b + b * _d_vector_b) * b + _t2 * _d_vector_b;
+// CHECK-NEXT:     output[1] = _t1 * a + _t2 * b;
+// CHECK-NEXT:     double _t3 = c * c;
+// CHECK-NEXT:     *_d_vector_output[2] = (_d_vector_c * c + c * _d_vector_c) * 10 + _t3 * (clad::zero_vector(indepVarCount)) - (_d_vector_a * a + a * _d_vector_a);
+// CHECK-NEXT:     output[2] = _t3 * 10 - a * a;
 // CHECK-NEXT: }
 
 void f_5(float a, double output[]){
@@ -194,66 +114,101 @@ void f_5(float a, double output[]){
   output[0]=a*a;  
 }
 
-// CHECK: void f_5_jac(float a, double output[], double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[1];
+// CHECK: void f_5_jac(float a, double output[], clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{1U|1UL|1ULL}};
+// CHECK-NEXT:     clad::array<float> _d_vector_a = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_output[1] = _d_vector_a;
 // CHECK-NEXT:     output[1] = a;
-// CHECK-NEXT:     double _t1 = output[0];
+// CHECK-NEXT:     *_d_vector_output[0] = _d_vector_a * a + a * _d_vector_a;
 // CHECK-NEXT:     output[0] = a * a;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * a;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += a * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         jacobianMatrix[{{1U|1UL|1ULL}}] += 1;
-// CHECK-NEXT:         output[1] = _t0;
-// CHECK-NEXT:     }
 // CHECK-NEXT: }
 
+void f_6(double a[3], double b, double output[]) {
+    output[0] = a[0] * a[0] * a[0];
+    output[1] = a[0] * a[0] * a[0] + b * b * b;
+    output[2] = 2 * (a[0] + b);
+}
+
+// CHECK: void f_6_jac(double a[3], double b, double output[], clad::matrix<double> *_d_vector_a, clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_a->rows() + _d_vector_output->rows() + {{1U|1UL|1ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_b = clad::one_hot_vector(indepVarCount, _d_vector_a->rows());
+// CHECK-NEXT:     *_d_vector_a = clad::identity_matrix(_d_vector_a->rows(), indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, _d_vector_a->rows() + {{1U|1UL|1ULL}});
+// CHECK-NEXT:     double _t0 = a[0] * a[0];
+// CHECK-NEXT:     *_d_vector_output[0] = ((*_d_vector_a[0]) * a[0] + a[0] * (*_d_vector_a[0])) * a[0] + _t0 * (*_d_vector_a[0]);
+// CHECK-NEXT:     output[0] = _t0 * a[0];
+// CHECK-NEXT:     double _t1 = a[0] * a[0];
+// CHECK-NEXT:     double _t2 = b * b;
+// CHECK-NEXT:     *_d_vector_output[1] = ((*_d_vector_a[0]) * a[0] + a[0] * (*_d_vector_a[0])) * a[0] + _t1 * (*_d_vector_a[0]) + (_d_vector_b * b + b * _d_vector_b) * b + _t2 * _d_vector_b;
+// CHECK-NEXT:     output[1] = _t1 * a[0] + _t2 * b;
+// CHECK-NEXT:     double _t3 = (a[0] + b);
+// CHECK-NEXT:     *_d_vector_output[2] = (clad::zero_vector(indepVarCount)) * _t3 + 2 * (*_d_vector_a[0] + _d_vector_b);
+// CHECK-NEXT:     output[2] = 2 * _t3;
+// CHECK-NEXT: }
 
-#define TEST(F, x, y, z) { \
-  result[0] = 0; result[1] = 0; result[2] = 0;\
-  result[3] = 0; result[4] = 0; result[5] = 0;\
-  result[6] = 0; result[7] = 0; result[8] = 0;\
+void f_7(double a, double& b, double&c) {
+    b = 3 * a;
+    c = -5 * b;
+}
+
+// CHECK: void f_7_jac(double a, double &b, double &c, clad::array<double> *_d_vector_b, clad::array<double> *_d_vector_c) {
+// CHECK-NEXT:     unsigned long indepVarCount = {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_a = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     *_d_vector_b = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_c = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_b = (clad::zero_vector(indepVarCount)) * a + 3 * _d_vector_a;
+// CHECK-NEXT:     b = 3 * a;
+// CHECK-NEXT:     *_d_vector_c = - clad::zero_vector(indepVarCount) * b + -5 * *_d_vector_b;
+// CHECK-NEXT:     c = -5 * b;
+// CHECK-NEXT: }
+
+
+#define TEST(F, ...) { \
   outputarr[0] = 0; outputarr[1] = 1; outputarr[2] = 0;\
   auto j = clad::jacobian(F);\
-  j.execute(x, y, z, outputarr, result);\
+  j.execute(__VA_ARGS__, &result);\
   printf("Result is = {%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f}\n",\
-  result[0], result[1], result[2],\
-  result[3], result[4], result[5],\
-  result[6], result[7], result[8]);\
-  F##_jac(x, y, z, outputarr, result);\
+  result[0][0], result[0][1], result[0][2],\
+  result[1][0], result[1][1], result[1][2],\
+  result[2][0], result[2][1], result[2][2]);\
 }
 
 #define TEST_F_1_SINGLE_PARAM(x, y, z) { \
-  result[0] = 0; result[1] = 0; result[2] = 0;\
   outputarr[0] = 0; outputarr[1] = 1; outputarr[2] = 0;\
   auto j = clad::jacobian(f_1,"a");\
-  j.execute(x, y, z, outputarr, result);\
+  j.execute(x, y, z, outputarr, &result);\
   printf("Result is = {%.2f, %.2f, %.2f}\n",\
-  result[0], result[1], result[2]);\
+  result[0][0], result[1][0], result[2][0]);\
 }
 
 
 int main() {
-  double result[10];
+  clad::matrix<double> result (3, 3);
   double outputarr[9];
-  TEST(f_1, 1, 2, 3); // CHECK-EXEC: Result is = {3.00, 0.00, 0.00, 3.00, 12.00, 0.00, -2.00, 0.00, 60.00}
-  TEST(f_3, 1, 2, 3); // CHECK-EXEC: Result is = {22.69, 0.00, 0.00, 0.00, -17.48, 0.00, 0.00, 0.00, -41.58}
-  TEST(f_4, 1, 2, 3); // CHECK-EXEC: Result is = {84.00, 42.00, 0.00, 0.00, 126.00, 84.00, 126.00, 0.00, 42.00}
+  TEST(f_1, 1, 2, 3, outputarr); // CHECK-EXEC: Result is = {3.00, 0.00, 0.00, 3.00, 12.00, 0.00, -2.00, 0.00, 60.00}
+  TEST(f_3, 1, 2, 3, outputarr); // CHECK-EXEC: Result is = {22.69, 0.00, 0.00, 0.00, -17.48, 0.00, 0.00, 0.00, -41.58}
+  TEST(f_4, 1, 2, 3, outputarr); // CHECK-EXEC: Result is = {84.00, 42.00, 0.00, 0.00, 126.00, 84.00, 126.00, 0.00, 42.00}
   TEST_F_1_SINGLE_PARAM(1, 2, 3); // CHECK-EXEC: Result is = {3.00, 3.00, -2.00}
 
   auto df5 = clad::jacobian(f_5);
   result[0] = 0; result[1] = 0;
-  df5.execute(3, outputarr, result);
-  printf("Result is = {%.2f, %.2f}", result[0], result[1]); // CHECK-EXEC: Result is = {6.00, 1.00}
+  df5.execute(3, outputarr, &result);
+  printf("Result is = {%.2f, %.2f}\n", result[0][0], result[1][0]); // CHECK-EXEC: Result is = {6.00, 1.00}
+  
+  double a[] = {3, 1};
+  clad::matrix<double> da (2, 5);
+  TEST(f_6, a, 2, outputarr, &da); // CHECK-EXEC: Result is = {27.00, 0.00, 0.00, 27.00, 0.00, 12.00, 2.00, 0.00, 2.00}
+
+  auto df7 = clad::jacobian(f_7);
+  clad::array<double> db, dc;
+  double a7=4, b=5, c=6;
+  df7.execute(a7, b, c, &db, &dc);
+  printf("Result is = {%.2f, %.2f}\n", db[0], dc[0]); // CHECK-EXEC: Result is = {3.00, -15.00}
+  
 }
 
-//CHECK: void multiply_pullback(double x, double y, double _d_y0, double *_d_x, double *_d_y) {
-//CHECK-NEXT:    {
-//CHECK-NEXT:        *_d_x += _d_y0 * y;
-//CHECK-NEXT:        *_d_y += x * _d_y0;
-//CHECK-NEXT:    }
-//CHECK-NEXT:}
+// CHECK: clad::ValueAndPushforward<double, clad::array<double> > multiply_vector_pushforward(double x, double y, clad::array<double> _d_x, clad::array<double> _d_y) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_y.size();
+// CHECK-NEXT:     return {x * y, _d_x * y + x * _d_y};
+// CHECK-NEXT: }
diff --git a/test/Jacobian/Pointers.C b/test/Jacobian/Pointers.C
index 01bdcecc4..1a90b61e7 100644
--- a/test/Jacobian/Pointers.C
+++ b/test/Jacobian/Pointers.C
@@ -10,31 +10,27 @@ void nonMemFn(double i, double j, double* out) {
   out[1] = j;
 }
 
-// CHECK: void nonMemFn_jac(double i, double j, double *out, double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = out[0];
+// CHECK: void nonMemFn_jac(double i, double j, double *out, clad::matrix<double> *_d_vector_out) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_out->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_out = clad::identity_matrix(_d_vector_out->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_out[0] = _d_vector_i;
 // CHECK-NEXT:     out[0] = i;
-// CHECK-NEXT:     double _t1 = out[1];
+// CHECK-NEXT:     *_d_vector_out[1] = _d_vector_j;
 // CHECK-NEXT:     out[1] = j;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         jacobianMatrix[{{3U|3UL|3ULL}}] += 1;
-// CHECK-NEXT:         out[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         jacobianMatrix[{{0U|0UL|0ULL}}] += 1;
-// CHECK-NEXT:         out[0] = _t0;
-// CHECK-NEXT:     }
 // CHECK-NEXT: }
 
 #define NON_MEM_FN_TEST(var)\
-res[0]=res[1]=res[2]=res[3]=0;\
-var.execute(5, 7, out, res);\
-printf("{%.2f %.2f %.2f %.2f}\n", res[0], res[1], res[2], res[3]);
+var.execute(5, 7, out, &res);\
+printf("{%.2f %.2f %.2f %.2f}\n", res[0][0], res[0][1],\
+                                  res[1][0], res[1][1]);
 
 int main() {
   auto nonMemFnPtr = &nonMemFn;
   auto nonMemFnPtrToPtr = &nonMemFnPtr;
 
-  double res[4];
+  clad::matrix<double> res(2, 2);
   double out[2];
   auto d_nonMemFn = clad::jacobian(nonMemFn);
   auto d_nonMemFnPar = clad::jacobian((nonMemFn));
diff --git a/test/Jacobian/TemplateFunctors.C b/test/Jacobian/TemplateFunctors.C
index 9762d355e..62c0cb897 100644
--- a/test/Jacobian/TemplateFunctors.C
+++ b/test/Jacobian/TemplateFunctors.C
@@ -15,25 +15,24 @@ template <typename T> struct Experiment {
   void setX(T val) { x = val; }
 };
 
-// CHECK: void operator_call_jac(double i, double j, double *output, double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = this->x * this->y * i * j;
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = 2 * this->x * this->y * i * j;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += 2 * this->x * this->y * 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 2 * this->x * this->y * i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * this->y * 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += this->x * this->y * i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void operator_call_jac(double i, double j, double *output, clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     double &_t0 = this->x;
+// CHECK-NEXT:     double &_t1 = this->y;
+// CHECK-NEXT:     double _t2 = _t0 * _t1;
+// CHECK-NEXT:     double _t3 = _t2 * i;
+// CHECK-NEXT:     *_d_vector_output[0] = ((0 * _t1 + _t0 * 0) * i + _t2 * _d_vector_i) * j + _t3 * _d_vector_j;
+// CHECK-NEXT:     output[0] = _t3 * j;
+// CHECK-NEXT:     double &_t4 = this->x;
+// CHECK-NEXT:     double _t5 = 2 * _t4;
+// CHECK-NEXT:     double &_t6 = this->y;
+// CHECK-NEXT:     double _t7 = _t5 * _t6;
+// CHECK-NEXT:     double _t8 = _t7 * i;
+// CHECK-NEXT:     *_d_vector_output[1] = ((((clad::zero_vector(indepVarCount)) * _t4 + 2 * 0) * _t6 + _t5 * 0) * i + _t7 * _d_vector_i) * j + _t8 * _d_vector_j;
+// CHECK-NEXT:     output[1] = _t8 * j;
 // CHECK-NEXT: }
 
 template <> struct Experiment<long double> {
@@ -46,27 +45,26 @@ template <> struct Experiment<long double> {
   void setX(long double val) { x = val; }
 };
 
-// CHECK: void operator_call_jac(long double i, long double j, long double *output, long double *jacobianMatrix) {
-// CHECK-NEXT:     long double _t0 = output[0];
-// CHECK-NEXT:     output[0] = this->x * this->y * i * i * j;
-// CHECK-NEXT:     long double _t1 = output[1];
-// CHECK-NEXT:     output[1] = 2 * this->x * this->y * i * i * j;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += 2 * this->x * this->y * 1 * j * i;
-// CHECK-NEXT:             jacobianMatrix[{{2U|2UL|2ULL}}] += 2 * this->x * this->y * i * 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 2 * this->x * this->y * i * i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * this->y * 1 * j * i;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += this->x * this->y * i * 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{1U|1UL|1ULL}}] += this->x * this->y * i * i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void operator_call_jac(long double i, long double j, long double *output, clad::matrix<long double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<long double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<long double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     long double &_t0 = this->x;
+// CHECK-NEXT:     long double &_t1 = this->y;
+// CHECK-NEXT:     long double _t2 = _t0 * _t1;
+// CHECK-NEXT:     long double _t3 = _t2 * i;
+// CHECK-NEXT:     long double _t4 = _t3 * i;
+// CHECK-NEXT:     *_d_vector_output[0] = (((0 * _t1 + _t0 * 0) * i + _t2 * _d_vector_i) * i + _t3 * _d_vector_i) * j + _t4 * _d_vector_j;
+// CHECK-NEXT:     output[0] = _t4 * j;
+// CHECK-NEXT:     long double &_t5 = this->x;
+// CHECK-NEXT:     long double _t6 = 2 * _t5;
+// CHECK-NEXT:     long double &_t7 = this->y;
+// CHECK-NEXT:     long double _t8 = _t6 * _t7;
+// CHECK-NEXT:     long double _t9 = _t8 * i;
+// CHECK-NEXT:     long double _t10 = _t9 * i;
+// CHECK-NEXT:     *_d_vector_output[1] = (((((clad::zero_vector(indepVarCount)) * _t5 + 2 * 0) * _t7 + _t6 * 0) * i + _t8 * _d_vector_i) * i + _t9 * _d_vector_i) * j + _t10 * _d_vector_j;
+// CHECK-NEXT:     output[1] = _t10 * j;
 // CHECK-NEXT: }
 
 #define INIT(E)                                                                \
@@ -74,32 +72,30 @@ template <> struct Experiment<long double> {
   auto d_##E##Ref = clad::jacobian(E);
 
 #define TEST_DOUBLE(E, ...)                                                    \
-  result[0] = result[1] = result[2] = result[3] = 0;                           \
   output[0] = output[1] = 0;                                                   \
-  d_##E.execute(__VA_ARGS__, output, result);                                  \
-  printf("{%.2f, %.2f, %.2f, %.2f} ", result[0], result[1], result[2],        \
-         result[3]);                                                           \
-  result[0] = result[1] = result[2] = result[3] = 0;                           \
+  d_##E.execute(__VA_ARGS__, output, &result);                                 \
+  printf("{%.2f, %.2f, %.2f, %.2f} ", result[0][0], result[0][1],              \
+                                      result[1][0], result[1][1]);             \
   output[0] = output[1] = 0;                                                   \
-  d_##E##Ref.execute(__VA_ARGS__, output, result);                             \
-  printf("{%.2f, %.2f, %.2f, %.2f}\n", result[0], result[1], result[2],        \
-         result[3]);
+  d_##E##Ref.execute(__VA_ARGS__, output, &result);                            \
+  printf("{%.2f, %.2f, %.2f, %.2f} ", result[0][0], result[0][1],              \
+                                      result[1][0], result[1][1]);             
 
 #define TEST_LONG_DOUBLE(E, ...)                                               \
-  result_ld[0] = result_ld[1] = result_ld[2] = result_ld[3] = 0;               \
   output_ld[0] = output_ld[1] = 0;                                             \
-  d_##E.execute(__VA_ARGS__, output_ld, result_ld);                            \
-  printf("{%.2Lf, %.2Lf, %.2Lf, %.2Lf} ", result_ld[0], result_ld[1],         \
-         result_ld[2], result_ld[3]);                                          \
-  result_ld[0] = result_ld[1] = result_ld[2] = result_ld[3] = 0;               \
+  d_##E.execute(__VA_ARGS__, output_ld, &result_ld);                           \
+  printf("{%.2Lf, %.2Lf, %.2Lf, %.2Lf} ", result_ld[0][0], result_ld[0][1],    \
+                                      result_ld[1][0], result_ld[1][1]);       \
   output_ld[0] = output_ld[1] = 0;                                             \
-  d_##E##Ref.execute(__VA_ARGS__, output_ld, result_ld);                       \
-  printf("{%.2Lf, %.2Lf, %.2Lf, %.2Lf}\n", result_ld[0], result_ld[1],         \
-         result_ld[2], result_ld[3]);
+  d_##E##Ref.execute(__VA_ARGS__, output_ld, &result_ld);                      \
+  printf("{%.2Lf, %.2Lf, %.2Lf, %.2Lf} ", result_ld[0][0], result_ld[0][1],    \
+                                      result_ld[1][0], result_ld[1][1]);             
 
 int main() {
-  double result[4], output[2];
-  long double result_ld[4], output_ld[2];
+  double output[2];
+  clad::matrix<double> result(2, 2);
+  long double output_ld[2];
+  clad::matrix<long double> result_ld(2, 2);
   Experiment<double> E(3, 5);
   Experiment<long double> E_ld(3, 5);
 
diff --git a/test/Jacobian/constexprTest.C b/test/Jacobian/constexprTest.C
index 1d34cd196..f8d6ef42e 100644
--- a/test/Jacobian/constexprTest.C
+++ b/test/Jacobian/constexprTest.C
@@ -8,9 +8,8 @@
 #include "../TestUtils.h"
 
   double result[3] = {0};
-  double jacobianou[6] = {0};
   double result1[3] = {0};
-  double jacobianou1[9] = {0};
+  clad::matrix<double> jacobian(3, 2);
 
 constexpr void fn_mul(double i, double j, double *res) {
    res[0] = i*i;
@@ -18,34 +17,17 @@ constexpr void fn_mul(double i, double j, double *res) {
    res[2] = i*j;
 }
 
-// CHECK: void fn_mul_jac(double i, double j, double *res, double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = res[0];
+// CHECK: constexpr void fn_mul_jac(double i, double j, double *res, clad::matrix<double> *_d_vector_res) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_res->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_res = clad::identity_matrix(_d_vector_res->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_res[0] = _d_vector_i * i + i * _d_vector_i;
 // CHECK-NEXT:     res[0] = i * i;
-// CHECK-NEXT:     double _t1 = res[1];
+// CHECK-NEXT:     *_d_vector_res[1] = _d_vector_j * j + j * _d_vector_j;
 // CHECK-NEXT:     res[1] = j * j;
-// CHECK-NEXT:     double _t2 = res[2];
+// CHECK-NEXT:     *_d_vector_res[2] = _d_vector_i * j + i * _d_vector_j;
 // CHECK-NEXT:     res[2] = i * j;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{5U|5UL|5ULL}}] += i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[2] = _t2;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += j * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * i;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[0] = _t0;
-// CHECK-NEXT:     }
 // CHECK-NEXT: }
 
 
@@ -55,41 +37,22 @@ constexpr void f_1(double x, double y, double z, double output[]) {
   output[2] = z * x * 10 - y * z;
 }
 
-// CHECK: constexpr void f_1_jac(double x, double y, double z, double output[], double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = x * x * x; 
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = x * y * x + y * x * x; 
-// CHECK-NEXT:     double _t2 = output[2];
-// CHECK-NEXT:     output[2] = z * x * 10 - y * z; 
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += 1 * 10 * x; 
-// CHECK-NEXT:             jacobianMatrix[{{6U|6UL|6ULL}}] += z * 1 * 10; 
-// CHECK-NEXT:             jacobianMatrix[{{7U|7UL|7ULL}}] += -1 * z;     
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += y * -1;     
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[2] = _t2; 
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 1 * x * y; 
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += x * 1 * x;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += x * y * 1;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += 1 * x * x;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += y * 1 * x; 
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += y * x * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1; 
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * x * x; 
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * 1 * x; 
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * x * 1; 
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: constexpr void f_1_jac(double x, double y, double z, double output[], clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_x = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_y = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_z = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{3U|3UL|3ULL}});
+// CHECK-NEXT:     double _t0 = x * x;
+// CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_x * x + x * _d_vector_x) * x + _t0 * _d_vector_x;
+// CHECK-NEXT:     output[0] = _t0 * x;
+// CHECK-NEXT:     double _t1 = x * y;
+// CHECK-NEXT:     double _t2 = y * x;
+// CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_x * y + x * _d_vector_y) * x + _t1 * _d_vector_x + (_d_vector_y * x + y * _d_vector_x) * x + _t2 * _d_vector_x;
+// CHECK-NEXT:     output[1] = _t1 * x + _t2 * x;
+// CHECK-NEXT:     double _t3 = z * x;
+// CHECK-NEXT:     *_d_vector_output[2] = (_d_vector_z * x + z * _d_vector_x) * 10 + _t3 * (clad::zero_vector(indepVarCount)) - (_d_vector_y * z + y * _d_vector_z); 
+// CHECK-NEXT:     output[2] = _t3 * 10 - y * z;
 // CHECK-NEXT: }
 
 int main() {
@@ -97,7 +60,7 @@ int main() {
     INIT_JACOBIAN(fn_mul);
     INIT_JACOBIAN(f_1);
 
-    TEST_JACOBIAN(fn_mul, 2, 6, 3, 1, result, jacobianou); // CHECK-EXEC: {6.00, 0.00, 0.00, 2.00, 1.00, 3.00}
-    TEST_JACOBIAN(f_1, 3, 9, 4, 5, 6, result1, jacobianou1); // CHECK-EXEC: {48.00, 0.00, 0.00, 80.00, 32.00, 0.00, 60.00, -6.00, 35.00}
+    TEST_JACOBIAN(fn_mul, 2, 6, 3, 1, result, &jacobian); // CHECK-EXEC: {6.00, 0.00, 0.00, 2.00, 1.00, 3.00}
+    TEST_JACOBIAN(f_1, 3, 9, 4, 5, 6, result1, &jacobian); // CHECK-EXEC: {48.00, 0.00, 0.00, 80.00, 32.00, 0.00, 60.00, -6.00, 35.00}
 }
 
diff --git a/test/Jacobian/testUtility.C b/test/Jacobian/testUtility.C
index 7b38eb7ec..ae1e65b08 100644
--- a/test/Jacobian/testUtility.C
+++ b/test/Jacobian/testUtility.C
@@ -8,9 +8,8 @@
 #include "../TestUtils.h"
 
   double output[3] = {0};
-  double jacobian[6] = {0};
   double output1[3] = {0};
-  double jacobian1[9] = {0};
+  clad::matrix<double> jacobian(3, 3);
 
 void fn_mul(double i, double j, double *res) {
    res[0] = i*i;
@@ -18,34 +17,17 @@ void fn_mul(double i, double j, double *res) {
    res[2] = i*j;
 }
 
-// CHECK: void fn_mul_jac(double i, double j, double *res, double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = res[0];
+// CHECK: void fn_mul_jac(double i, double j, double *res, clad::matrix<double> *_d_vector_res) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_res->rows() + {{2U|2UL|2ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_i = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_j = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     *_d_vector_res = clad::identity_matrix(_d_vector_res->rows(), indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_res[0] = _d_vector_i * i + i * _d_vector_i;
 // CHECK-NEXT:     res[0] = i * i;
-// CHECK-NEXT:     double _t1 = res[1];
+// CHECK-NEXT:     *_d_vector_res[1] = _d_vector_j * j + j * _d_vector_j;
 // CHECK-NEXT:     res[1] = j * j;
-// CHECK-NEXT:     double _t2 = res[2];
+// CHECK-NEXT:     *_d_vector_res[2] = _d_vector_i * j + i * _d_vector_j;
 // CHECK-NEXT:     res[2] = i * j;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{5U|5UL|5ULL}}] += i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[2] = _t2;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 1 * j;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += j * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * i;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += i * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         res[0] = _t0;
-// CHECK-NEXT:     }
 // CHECK-NEXT: }
 
 
@@ -56,41 +38,22 @@ void f_1(double x, double y, double z, double output[]) {
   output[2] = z * x * 10 - y * z;
 }
 
-// CHECK: void f_1_jac(double x, double y, double z, double output[], double *jacobianMatrix) {
-// CHECK-NEXT:     double _t0 = output[0];
-// CHECK-NEXT:     output[0] = x * x * x;
-// CHECK-NEXT:     double _t1 = output[1];
-// CHECK-NEXT:     output[1] = x * y * x + y * x * x;
-// CHECK-NEXT:     double _t2 = output[2];
-// CHECK-NEXT:     output[2] = z * x * 10 - y * z;
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += 1 * 10 * x;
-// CHECK-NEXT:             jacobianMatrix[{{6U|6UL|6ULL}}] += z * 1 * 10;
-// CHECK-NEXT:             jacobianMatrix[{{7U|7UL|7ULL}}] += -1 * z;
-// CHECK-NEXT:             jacobianMatrix[{{8U|8UL|8ULL}}] += y * -1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[2] = _t2;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += 1 * x * y;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += x * 1 * x;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3ULL}}] += x * y * 1;
-// CHECK-NEXT:             jacobianMatrix[{{4U|4UL|4ULL}}] += 1 * x * x;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3OLL}}] += y * 1 * x;
-// CHECK-NEXT:             jacobianMatrix[{{3U|3UL|3OLL}}] += y * x * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[1] = _t1;
-// CHECK-NEXT:     }
-// CHECK-NEXT:     {
-// CHECK-NEXT:         {
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += 1 * x * x;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * 1 * x;
-// CHECK-NEXT:             jacobianMatrix[{{0U|0UL|0ULL}}] += x * x * 1;
-// CHECK-NEXT:         }
-// CHECK-NEXT:         output[0] = _t0;
-// CHECK-NEXT:     }
+// CHECK: void f_1_jac(double x, double y, double z, double output[], clad::matrix<double> *_d_vector_output) {
+// CHECK-NEXT:     unsigned long indepVarCount = _d_vector_output->rows() + {{3U|3UL|3ULL}};
+// CHECK-NEXT:     clad::array<double> _d_vector_x = clad::one_hot_vector(indepVarCount, {{0U|0UL|0ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_y = clad::one_hot_vector(indepVarCount, {{1U|1UL|1ULL}});
+// CHECK-NEXT:     clad::array<double> _d_vector_z = clad::one_hot_vector(indepVarCount, {{2U|2UL|2ULL}});
+// CHECK-NEXT:     *_d_vector_output = clad::identity_matrix(_d_vector_output->rows(), indepVarCount, {{3U|3UL|3ULL}});
+// CHECK-NEXT:     double _t0 = x * x;
+// CHECK-NEXT:     *_d_vector_output[0] = (_d_vector_x * x + x * _d_vector_x) * x + _t0 * _d_vector_x;
+// CHECK-NEXT:     output[0] = _t0 * x;
+// CHECK-NEXT:     double _t1 = x * y;
+// CHECK-NEXT:     double _t2 = y * x;
+// CHECK-NEXT:     *_d_vector_output[1] = (_d_vector_x * y + x * _d_vector_y) * x + _t1 * _d_vector_x + (_d_vector_y * x + y * _d_vector_x) * x + _t2 * _d_vector_x;
+// CHECK-NEXT:     output[1] = _t1 * x + _t2 * x;
+// CHECK-NEXT:     double _t3 = z * x;
+// CHECK-NEXT:     *_d_vector_output[2] = (_d_vector_z * x + z * _d_vector_x) * 10 + _t3 * (clad::zero_vector(indepVarCount)) - (_d_vector_y * z + y * _d_vector_z); 
+// CHECK-NEXT:     output[2] = _t3 * 10 - y * z;
 // CHECK-NEXT: }
 
 
@@ -98,7 +61,7 @@ int main(){
     INIT_JACOBIAN(fn_mul);
     INIT_JACOBIAN(f_1);
 
-    TEST_JACOBIAN(fn_mul, 2, 6, 3, 1, output, jacobian); // CHECK-EXEC: {6.00, 0.00, 0.00, 2.00, 1.00, 3.00}
-    TEST_JACOBIAN(f_1, 3, 9, 4, 5, 6, output1, jacobian1); // CHECK-EXEC: {48.00, 0.00, 0.00, 80.00, 32.00, 0.00, 60.00, -6.00, 35.00}
+    TEST_JACOBIAN(fn_mul, 2, 6, 3, 1, output, &jacobian); // CHECK-EXEC: {6.00, 0.00, 0.00, 2.00, 1.00, 3.00}
+    TEST_JACOBIAN(f_1, 3, 9, 4, 5, 6, output1, &jacobian); // CHECK-EXEC: {48.00, 0.00, 0.00, 80.00, 32.00, 0.00, 60.00, -6.00, 35.00}
 
 }
diff --git a/test/NthDerivative/CustomDerivatives.C b/test/NthDerivative/CustomDerivatives.C
index e5bcbecbb..827320a0d 100644
--- a/test/NthDerivative/CustomDerivatives.C
+++ b/test/NthDerivative/CustomDerivatives.C
@@ -55,15 +55,15 @@ float test_trig(float x, float y, int a, int b) {
 // CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<float, float>, ValueAndPushforward<float, float> > _t0 = clad::custom_derivatives::std::sin_pushforward_pushforward(x * y, _d_x0 * y + x * _d_y0, _d_x * y + x * _d_y, _d__d_x * y + _d_x0 * _d_y + _d_x * _d_y0 + x * _d__d_y);
 // CHECK-NEXT:    ValueAndPushforward<float, float> _d__t0 = _t0.pushforward;
 // CHECK-NEXT:    ValueAndPushforward<float, float> _t00 = _t0.value;
-// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))>, ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> > _t1 = pow_pushforward_pushforward(_t00.value, a, _t00.pushforward, _d_a0, _d__t0.value, _d_a, _d__t0.pushforward, _d__d_a);
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _d__t1 = _t1.pushforward;
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t10 = _t1.value;
+// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<double, double>, ValueAndPushforward<double, double> > _t1 = pow_pushforward_pushforward(_t00.value, a, _t00.pushforward, _d_a0, _d__t0.value, _d_a, _d__t0.pushforward, _d__d_a);
+// CHECK-NEXT:    ValueAndPushforward<double, double> _d__t1 = _t1.pushforward;
+// CHECK-NEXT:    ValueAndPushforward<double, double> _t10 = _t1.value;
 // CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<float, float>, ValueAndPushforward<float, float> > _t2 = clad::custom_derivatives::std::cos_pushforward_pushforward(x * y, _d_x0 * y + x * _d_y0, _d_x * y + x * _d_y, _d__d_x * y + _d_x0 * _d_y + _d_x * _d_y0 + x * _d__d_y);
 // CHECK-NEXT:    ValueAndPushforward<float, float> _d__t2 = _t2.pushforward;
 // CHECK-NEXT:    ValueAndPushforward<float, float> _t20 = _t2.value;
-// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))>, ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> > _t3 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t20.value, b, _t20.pushforward, _d_b0, _d__t2.value, _d_b, _d__t2.pushforward, _d__d_b);
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _d__t3 = _t3.pushforward;
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t30 = _t3.value;
+// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<double, double>, ValueAndPushforward<double, double> > _t3 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t20.value, b, _t20.pushforward, _d_b0, _d__t2.value, _d_b, _d__t2.pushforward, _d__d_b);
+// CHECK-NEXT:    ValueAndPushforward<double, double> _d__t3 = _t3.pushforward;
+// CHECK-NEXT:    ValueAndPushforward<double, double> _t30 = _t3.value;
 // CHECK-NEXT:    double &_d__t4 = _d__t1.value;
 // CHECK-NEXT:    double &_t40 = _t10.value;
 // CHECK-NEXT:    double &_d__t5 = _d__t3.value;
@@ -92,15 +92,15 @@ float test_trig(float x, float y, int a, int b) {
 // CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<float, float>, ValueAndPushforward<float, float> > _t0 = clad::custom_derivatives::std::sin_pushforward_pushforward(x * y, _d_x0 * y + x * _d_y0, _d_x * y + x * _d_y, _d__d_x * y + _d_x0 * _d_y + _d_x * _d_y0 + x * _d__d_y);
 // CHECK-NEXT:    ValueAndPushforward<float, float> _d__t0 = _t0.pushforward;
 // CHECK-NEXT:    ValueAndPushforward<float, float> _t00 = _t0.value;
-// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))>, ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> > _t1 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t00.value, a, _t00.pushforward, _d_a0, _d__t0.value, _d_a, _d__t0.pushforward, _d__d_a);
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _d__t1 = _t1.pushforward;
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t10 = _t1.value;
+// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<double, double>, ValueAndPushforward<double, double> > _t1 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t00.value, a, _t00.pushforward, _d_a0, _d__t0.value, _d_a, _d__t0.pushforward, _d__d_a);
+// CHECK-NEXT:    ValueAndPushforward<double, double> _d__t1 = _t1.pushforward;
+// CHECK-NEXT:    ValueAndPushforward<double, double> _t10 = _t1.value;
 // CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<float, float>, ValueAndPushforward<float, float> > _t2 = clad::custom_derivatives::std::cos_pushforward_pushforward(x * y, _d_x0 * y + x * _d_y0, _d_x * y + x * _d_y, _d__d_x * y + _d_x0 * _d_y + _d_x * _d_y0 + x * _d__d_y);
 // CHECK-NEXT:    ValueAndPushforward<float, float> _d__t2 = _t2.pushforward;
 // CHECK-NEXT:    ValueAndPushforward<float, float> _t20 = _t2.value;
-// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))>, ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> > _t3 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t20.value, b, _t20.pushforward, _d_b0, _d__t2.value, _d_b, _d__t2.pushforward, _d__d_b);
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _d__t3 = _t3.pushforward;
-// CHECK-NEXT:    ValueAndPushforward<decltype(::std::pow(float(), int())), decltype(::std::pow(float(), int()))> _t30 = _t3.value;
+// CHECK-NEXT:    clad::ValueAndPushforward<ValueAndPushforward<double, double>, ValueAndPushforward<double, double> > _t3 = clad::custom_derivatives::std::pow_pushforward_pushforward(_t20.value, b, _t20.pushforward, _d_b0, _d__t2.value, _d_b, _d__t2.pushforward, _d__d_b);
+// CHECK-NEXT:    ValueAndPushforward<double, double> _d__t3 = _t3.pushforward;
+// CHECK-NEXT:    ValueAndPushforward<double, double> _t30 = _t3.value;
 // CHECK-NEXT:    double &_d__t4 = _d__t1.value;
 // CHECK-NEXT:    double &_t40 = _t10.value;
 // CHECK-NEXT:    double &_d__t5 = _d__t3.value;
diff --git a/test/TestUtils.h b/test/TestUtils.h
index 6d76c59f6..13d78df2c 100644
--- a/test/TestUtils.h
+++ b/test/TestUtils.h
@@ -116,16 +116,25 @@ void run_gradient_impl(CF cf, index_pack<S...> s, Args&&... args) {
   display(std::get<S>(t)...);
 }
 
-template <class CF, std::size_t... S, class... Args>
-void run_jacobian_impl(CF cf, std::size_t size, index_pack<S...> s,
-                       Args&&... args) {
+template <class CF, class... Args>
+void run_jacobian_impl(CF cf, std::size_t size, Args&&... args) {
   std::tuple<Args...> t = {args...};
-  reset(std::get<S>(t)...);
   cf.execute(args...);
   typedef
       typename std::remove_reference<decltype(*std::get<sizeof...(args) - 1>(
           t))>::type arrElemType;
-  displayarr<arrElemType>(std::get<sizeof...(args) - 1>(t), size);
+  auto& res = *std::get<sizeof...(args) - 1>(t);
+  unsigned numOfParameters = sizeof...(args) - 2;
+  unsigned numOfOutputs = res.rows();
+  printf("{");
+  for (unsigned i = 0; i < numOfOutputs; ++i) {
+    for (unsigned j = 0; j < numOfParameters; ++j) {
+      printf("%.2f", res[i][j]);
+      if (i != numOfOutputs - 1 || j != numOfParameters - 1)
+        printf(", ");
+    }
+  }
+  printf("}\n");
 }
 
 template <class CF, std::size_t... S, class... Args>
@@ -150,11 +159,7 @@ void run_gradient(CF cf, Args&&... args) {
 template <std::size_t NumOfDerivativeArgs, std::size_t size, class CF,
           class... Args>
 void run_jacobian(CF cf, Args&&... args) {
-  using DerivativeArgRange =
-      typename GenerateRange<sizeof...(Args) - NumOfDerivativeArgs,
-                             sizeof...(Args) - 1>::type;
-  run_jacobian_impl(cf, size, DerivativeArgRange(),
-                    std::forward<Args>(args)...);
+  run_jacobian_impl(cf, size, std::forward<Args>(args)...);
 }
 
 template <std::size_t NumOfDerivativeArgs, class CF, class... Args>