su2code · areenraj · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -133,6 +133,7 @@ class CConfig {
   Sens_Remove_Sharp,        /*!< \brief Flag for removing or not the sharp edges from the sensitivity computation. */
   Hold_GridFixed,           /*!< \brief Flag hold fixed some part of the mesh during the deformation. */
   Axisymmetric,             /*!< \brief Flag for axisymmetric calculations */
+  Enable_Cuda,              /*!< \brief Flag for switching GPU computing*/
   Integrated_HeatFlux;      /*!< \brief Flag for heat flux BC whether it deals with integrated values.*/
   su2double Buffet_k;       /*!< \brief Sharpness coefficient for buffet sensor.*/
   su2double Buffet_lambda;  /*!< \brief Offset parameter for buffet sensor.*/
@@ -1131,6 +1132,7 @@ class CConfig {
   su2double Theta_Interior_Penalty_DGFEM;    /*!< \brief Factor for the symmetrizing terms in the DG discretization of the viscous fluxes. */
   unsigned short byteAlignmentMatMul;        /*!< \brief Number of bytes in the vectorization direction for the matrix multiplication. Multipe of 64. */
   unsigned short sizeMatMulPadding;          /*!< \brief The matrix size in the vectorization direction padded to a multiple of 8. Computed from byteAlignmentMatMul. */
+  unsigned short gpuSizeMatMulPadding;
   bool Compute_Entropy;                      /*!< \brief Whether or not to compute the entropy in the fluid model. */
   bool Use_Lumped_MassMatrix_DGFEM;          /*!< \brief Whether or not to use the lumped mass matrix for DGFEM. */
   bool Jacobian_Spatial_Discretization_Only; /*!< \brief Flag to know if only the exact Jacobian of the spatial discretization must be computed. */
@@ -6224,6 +6226,12 @@ class CConfig {
    */
   bool GetAxisymmetric(void) const { return Axisymmetric; }
 
+  /*!
+   * \brief Get information about GPU support.
+   * \return <code>TRUE</code> if cuda is enabled; otherwise <code>FALSE</code>.
+   */
+  bool GetCUDA(void) const { return Enable_Cuda; }
+
   /*!
    * \brief Get information about there is a smoothing of the grid coordinates.
    * \return <code>TRUE</code> if there is smoothing of the grid coordinates; otherwise <code>FALSE</code>.
@@ -9089,6 +9097,8 @@ class CConfig {
    */
   unsigned short GetSizeMatMulPadding(void) const { return sizeMatMulPadding; }
 
+  unsigned short GetGPUSizeMatMulPadding(void) const { return gpuSizeMatMulPadding; }
+
   /*!
    * \brief Function to make available whether or not the entropy must be computed.
    * \return The boolean whether or not the entropy must be computed.

diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
@@ -50,6 +50,12 @@
  * handle the different types of matrix-vector products and still be
  * passed to a single implementation of the Krylov solvers.
  * This abstraction may also be used to define matrix-free products.
+ *
+ * There is also the use of a dummy class being made to select the
+ * correct function as defined by the user while deciding between
+ * CPU or GPU execution. This dummy class calls the correct member
+ * functions from its derived classes to map the suitable path of
+ * execution - CPU or GPU.
  */
 template <class ScalarType>
 class CMatrixVectorProduct {
@@ -60,6 +66,48 @@ class CMatrixVectorProduct {
 template <class ScalarType>
 CMatrixVectorProduct<ScalarType>::~CMatrixVectorProduct() {}
 
+/*!
+ * \class CExecutionPath
+ * \brief Dummy super class that holds the correct member functions in its child classes
+ */
+
+template <class ScalarType>
+class CExecutionPath {
+ public:
+  virtual void mat_vec_prod(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v, CGeometry* geometry,
+                            const CConfig* config, const CSysMatrix<ScalarType>& matrix) = 0;
+};
+
+/*!
+ * \class CCpuExecution
+ * \brief Derived class containing the CPU Matrix Vector Product Function
+ */
+template <class ScalarType>
+class CCpuExecution : public executionPath<ScalarType> {
+ public:
+  void mat_vec_prod(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v, CGeometry* geometry,
+                    const CConfig* config, const CSysMatrix<ScalarType>& matrix) override {
+    matrix.MatrixVectorProduct(u, v, geometry, config);
+  }
+};
+
+/*!
+ * \class CGpuExecution
+ * \brief Derived class containing the GPU Matrix Vector Product Function
+ */
+template <class ScalarType>
+class CGpuExecution : public executionPath<ScalarType> {
+ public:
+  void mat_vec_prod(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v, CGeometry* geometry,
+                    const CConfig* config, const CSysMatrix<ScalarType>& matrix) override {
+#ifdef HAVE_CUDA
+    matrix.GPUMatrixVectorProduct(u, v, geometry, config);
+#else
+    matrix.MatrixVectorProduct(u, v, geometry, config);
+#endif
+  }
+};
+
 /*!
  * \class CSysMatrixVectorProduct
  * \ingroup SpLinSys
@@ -71,6 +119,7 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
   const CSysMatrix<ScalarType>& matrix; /*!< \brief pointer to matrix that defines the product. */
   CGeometry* geometry;                  /*!< \brief geometry associated with the matrix. */
   const CConfig* config;                /*!< \brief config of the problem. */
+  CExecutionPath<ScalarType>* exec;     /*!< \brief interface that decides which path of execution to choose from. */
 
  public:
   /*!
@@ -81,7 +130,13 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
    */
   inline CSysMatrixVectorProduct(const CSysMatrix<ScalarType>& matrix_ref, CGeometry* geometry_ref,
                                  const CConfig* config_ref)
-      : matrix(matrix_ref), geometry(geometry_ref), config(config_ref) {}
+      : matrix(matrix_ref), geometry(geometry_ref), config(config_ref) {
+    if (config->GetCUDA()) {
+      exec = new CGpuExecution<ScalarType>;
+    } else {
+      exec = new CCpuExecution<ScalarType>;
+    }
+  }
 
   /*!
    * \note This class cannot be default constructed as that would leave us with invalid pointers.
@@ -94,6 +149,6 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
    * \param[out] v - CSysVector that is the result of the product
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
-    matrix.MatrixVectorProduct(u, v, geometry, config);
+    exec->mat_vec_prod(u, v, geometry, config, matrix);
   }
 };
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -145,6 +145,11 @@ class CSysMatrix {
   const unsigned long* col_ind; /*!< \brief Column index for each of the elements in val(). */
   const unsigned long* col_ptr; /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
 
+  ScalarType* d_matrix; /*!< \brief Entries ot be stored on the device. */
+
+  unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
+  unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
+
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
   const unsigned long* row_ptr_ilu; /*!< \brief Pointers to the first element in each row (ILU). */
@@ -838,6 +843,21 @@ class CSysMatrix {
   void MatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                            const CConfig* config) const;
 
+  /*!
+   * \brief Performs the product of a sparse matrix by a CSysVector.
+   * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+
+  void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
+                              const CConfig* config) const;
+
+  void FGMRESMainLoop(std::vector<ScalarType> W, std::vector<ScalarType> Z, su2vector<ScalarType>& g,
+                      su2vector<ScalarType>& sn, CSysVector<ScalarType>& cs, su2vector<ScalarType>& y,
+                      su2vector<ScalarType>& H, int m, CGeometry* geometry, const CConfig* config) const;
+
   /*!
    * \brief Build the Jacobi preconditioner.
    */

diff --git a/Common/include/linear_algebra/GPU_lin_alg.cuh b/Common/include/linear_algebra/GPU_lin_alg.cuh
@@ -0,0 +1,62 @@
+/*!
+ * \file GPU_lin_alg.cuh
+ * \brief Declaration of the GPU Matrix Vector Product CUDA Kernel.
+ *        The implemtation is in <i>GPU_lin_alg.cu</i>.
+ * \author A. Raj
+ * \version 8.0.1 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2024, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include<cuda_runtime.h>
+#include"../../include/linear_algebra/CSysMatrix.hpp"
+#include"iostream"
+
+/*!
+   * \brief assert style function that reads return codes after intercepting CUDA API calls.
+   *        It returns the result code and its location if the call is unsuccessful.
+   * \param[in] code - result code of CUDA function
+   * \param[in] file - name of file holding the function
+   * \param[in] line - line containing the function
+   */
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+/*!
+   * \brief CUDA Kernel that performs the Matrix Vector Product: All threads execute product += matrix*vector
+   * \param[in] matrix - matrix to be multiplied
+   * \param[in] vec - vector to multiply the matrix with
+   * \param[in] prod - storing the output of the operation
+   * \param[in] d_row_ptr - a device array of pointers pointing to the first non-zero element in each row of the block matrix
+   * \param[in] d_col_ind - a device array holding the column index of each element of the block matrix
+   * \param[in] nPointDomain - number of real points of the mesh
+   * \param[in] nVar - number of variables of the problem
+   * \param[in] nEqn - number of equations of the problem
+   */
+
+template<typename matrixType, typename vectorType>
+__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_row_ptr, unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn);
diff --git a/Common/include/template_nvblas.conf b/Common/include/template_nvblas.conf
@@ -0,0 +1,7 @@
+NVBLAS_CPU_BLAS_LIB /path/to/libopenblas.so
+
+NVBLAS_GPU_LIST ALL
+
+NVBLAS_TILE_DIM 2048
+
+NVBLAS_AUTOPIN_MEM_ENABLED
diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
@@ -1144,6 +1144,8 @@ void CConfig::SetConfig_Options() {
 
   /*\brief AXISYMMETRIC \n DESCRIPTION: Axisymmetric simulation \n DEFAULT: false \ingroup Config */
   addBoolOption("AXISYMMETRIC", Axisymmetric, false);
+  /*\brief ENABLE_CUDA \n DESCRIPTION: GPU Acceleration \n DEFAULT: false \ingroup Config */
+  addBoolOption("ENABLE_CUDA", Enable_Cuda, false);
   /* DESCRIPTION: Add the gravity force */
   addBoolOption("GRAVITY_FORCE", GravityForce, false);
   /* DESCRIPTION: Add the Vorticity Confinement term*/
@@ -2399,6 +2401,7 @@ void CConfig::SetConfig_Options() {
   /* DESCRIPTION: Number of aligned bytes for the matrix multiplications. Multiple of 64. (128 by default) */
   addUnsignedShortOption("ALIGNED_BYTES_MATMUL", byteAlignmentMatMul, 128);
 
+  addUnsignedShortOption("GPU Matrix Multiplication Size", gpuSizeMatMulPadding, 65472);
   /*!\par CONFIG_CATEGORY: FEA solver \ingroup Config*/
   /*--- Options related to the FEA solver ---*/
 

diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -32,6 +32,16 @@
 
 #include <cmath>
 
+#ifdef HAVE_CUDA
+#include "../../include/linear_algebra/GPU_lin_alg.cuh"
+
+#ifndef gpuErrChk
+#define gpuErrChk(ans) \
+  { gpuAssert((ans), __FILE__, __LINE__); }
+#endif
+
+#endif
+
 template <class ScalarType>
 CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) {
   nPoint = nPointDomain = nVar = nEqn = 0;
@@ -131,6 +141,17 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
   col_ind = csr.innerIdx();
   dia_ptr = csr.diagPtr();
 
+#if defined(HAVE_CUDA)
+  gpuErrChk(cudaMalloc((void**)(&d_row_ptr), (sizeof(row_ptr) * (nPointDomain + 1.0))));
+  gpuErrChk(cudaMalloc((void**)(&d_col_ind), (sizeof(col_ind) * nnz)));
+  gpuErrChk(cudaMalloc((void**)(&d_matrix), (sizeof(ScalarType) * nnz * nVar * nEqn)));
+
+  gpuErrChk(
+      cudaMemcpy((void*)(d_row_ptr), (void*)row_ptr, (sizeof(row_ptr) * (nPointDomain + 1.0)), cudaMemcpyHostToDevice));
+  gpuErrChk(cudaMemcpy((void*)(d_col_ind), (void*)col_ind, (sizeof(col_ind)) * nnz, cudaMemcpyHostToDevice));
+
+#endif
+
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
 
   if (type == ConnectivityType::FiniteVolume) {

diff --git a/Common/src/linear_algebra/GPU_lin_alg.cu b/Common/src/linear_algebra/GPU_lin_alg.cu
@@ -0,0 +1,103 @@
+/*!
+ * \file GPU_lin_alg.cu
+ * \brief Implementation of Matrix Vector Product CUDA Kernel
+ * \author A. Raj
+ * \version 8.0.1 "Harrier"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2024, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../../include/linear_algebra/CSysMatrix.hpp"
+#include "../../include/linear_algebra/GPU_lin_alg.cuh"
+
+#ifndef gpuErrChk
+#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+#endif
+
+template<typename matrixType, typename vectorType>
+__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_row_ptr, unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
+{
+
+   int i = blockIdx.x * blockDim.x + threadIdx.x;
+   int j = threadIdx.y;
+   int k = threadIdx.z;
+
+   int prod_index = i * nVar;
+
+   if(i<nPointDomain)
+   {
+      prod[prod_index + j] = 0.0;
+   }
+
+   __syncthreads();
+
+   vectorType res = 0.0;
+
+   if(i<nPointDomain)
+   {
+      for(int index = d_row_ptr[i]; index<d_row_ptr[i+1]; index++)
+      {
+        int matrix_index = index * nVar * nEqn;
+        int vec_index = d_col_ind[index] * nEqn;
+
+        res += matrix[matrix_index + (j * nEqn + k)] * vec[vec_index + k];
+      }
+
+      atomicAdd(&prod[prod_index + j],res);
+   }
+
+}
+
+
+template<class ScalarType>
+void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                                 CGeometry* geometry, const CConfig* config) const
+                                                 {
+
+  ScalarType* d_vec;
+  ScalarType* d_prod;
+
+  unsigned long mat_size = nnz*nVar*nEqn;
+  unsigned long vec_size = nPointDomain*nVar;
+
+  gpuErrChk(cudaMalloc((void**)(&d_vec), (sizeof(ScalarType)*vec_size)));
+  gpuErrChk(cudaMalloc((void**)(&d_prod), (sizeof(ScalarType)*vec_size)));
+
+  gpuErrChk(cudaMemcpy((void*)(d_matrix), (void*)&matrix[0], (sizeof(ScalarType)*mat_size), cudaMemcpyHostToDevice));
+  gpuErrChk(cudaMemcpy((void*)(d_vec), (void*)&vec[0], (sizeof(ScalarType)*vec_size), cudaMemcpyHostToDevice));
+  gpuErrChk(cudaMemcpy((void*)(d_prod), (void*)&prod[0], (sizeof(ScalarType)*vec_size), cudaMemcpyHostToDevice));
+
+  double xDim = (double) 1024.0/(nVar*nEqn);
+  dim3 blockDim(floor(xDim), nVar, nEqn);
+  double gridx = (double) nPointDomain/xDim;
+  dim3 gridDim(ceil(gridx), 1, 1);
+
+  GPUMatrixVectorProductAdd<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn);
+  gpuErrChk( cudaPeekAtLastError() );
+
+  gpuErrChk(cudaMemcpy((void*)(&prod[0]), (void*)d_prod, (sizeof(ScalarType)*vec_size), cudaMemcpyDeviceToHost));
+
+  gpuErrChk(cudaFree(d_vec));
+  gpuErrChk(cudaFree(d_prod));
+
+}
+
+template class CSysMatrix<su2mixedfloat>;
diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build
@@ -2,5 +2,6 @@ common_src += files(['CSysSolve_b.cpp',
                      'CSysSolve.cpp',
                      'CSysVector.cpp',
                      'CSysMatrix.cpp',
+                     'GPU_lin_alg.cu',
                      'CPastixWrapper.cpp',
                      'blas_structure.cpp'])