Merge pull request #2083 from su2code/fix_mz_adjoint_wall_time
Fix wall time for discrete adjoint MZ driver + OpenMP the GMRES orthogonalization when used by MZ driver
pcarruscag authored Jul 24, 2023
2 parents 02da4b6 + 19e200e commit 0699292
Showing 4 changed files with 61 additions and 31 deletions.
3 changes: 2 additions & 1 deletion Common/include/linear_algebra/CSysSolve.hpp
@@ -168,6 +168,7 @@ class CSysSolve {
* \brief Modified Gram-Schmidt orthogonalization
* \author Based on Kesheng John Wu's mgsro subroutine in Saad's SPARSKIT
*
* \param[in] shared_hsbg - if the Hessenberg matrix is shared by multiple threads
* \param[in] i - index indicating which vector in w is being orthogonalized
   * \param[in,out] Hsbg - the upper Hessenberg being updated
* \param[in,out] w - the (i+1)th vector of w is orthogonalized against the
@@ -181,7 +182,7 @@ class CSysSolve {
* vector is kept in nrm0 and updated after operating with each vector
*
*/
void ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg, std::vector<VectorType>& w) const;
void ModGramSchmidt(bool shared_hsbg, int i, su2matrix<ScalarType>& Hsbg, std::vector<VectorType>& w) const;

/*!
* \brief writes header information for a CSysSolve residual history
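The routine documented above performs one step of the Arnoldi basis construction used by FGMRES. As a rough orientation, a minimal standalone sketch of that step (not SU2 code; std::vector<double> stands in for CSysVector, a vector of rows for su2matrix, and the reorthogonalization and the new thread handling are omitted) looks like this:

#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;

static double Dot(const Vec& a, const Vec& b) {
  double s = 0.0;
  for (std::size_t j = 0; j < a.size(); ++j) s += a[j] * b[j];
  return s;
}

// Orthogonalize w[i+1] against w[0..i], store the coefficients in column i of H,
// then normalize w[i+1] to obtain the next basis vector.
void ModGramSchmidtSketch(int i, std::vector<Vec>& H, std::vector<Vec>& w) {
  for (int k = 0; k <= i; ++k) {
    const double h_ki = Dot(w[i + 1], w[k]);
    H[k][i] = h_ki;
    for (std::size_t j = 0; j < w[k].size(); ++j) w[i + 1][j] -= h_ki * w[k][j];
  }
  const double nrm = std::sqrt(Dot(w[i + 1], w[i + 1]));
  H[i + 1][i] = nrm;  // sub-diagonal entry of the upper Hessenberg matrix
  for (double& x : w[i + 1]) x /= nrm;
}
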
46 changes: 24 additions & 22 deletions Common/include/linear_algebra/vector_expressions.hpp
@@ -146,28 +146,30 @@ MAKE_UNARY_FUN(sign, sign_, sign_impl)

/*--- Macro to create expressions and overloads for binary functions. ---*/

#define MAKE_BINARY_FUN(FUN, EXPR, IMPL) \
/*!--- Expression class. ---*/ \
template <class U, class V, class Scalar> \
class EXPR : public CVecExpr<EXPR<U, V, Scalar>, Scalar> { \
store_t<const U> u; \
store_t<const V> v; \
\
public: \
static constexpr bool StoreAsRef = false; \
FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {} \
FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \
}; \
/*!--- Vector with vector function overload. ---*/ \
template <class U, class V, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<U, V, S>(u.derived(), v.derived())) /*!--- Vector with scalar function overload. ---*/ \
template <class U, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, decay_t<S> v) \
RETURNS(EXPR<U, Bcast<S>, S>(u.derived(), Bcast<S>(v))) /*!--- Scalar with vector function overload. ---*/ \
template <class S, class V> \
FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<Bcast<S>, V, S>(Bcast<S>(u), v.derived()))
// clang-format off
#define MAKE_BINARY_FUN(FUN, EXPR, IMPL) \
/*!--- Expression class. ---*/ \
template <class U, class V, class Scalar> \
class EXPR : public CVecExpr<EXPR<U, V, Scalar>, Scalar> { \
store_t<const U> u; \
store_t<const V> v; \
\
public: \
static constexpr bool StoreAsRef = false; \
FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {} \
FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \
}; \
/*!--- Vector with vector function overload. ---*/ \
template <class U, class V, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<U, V, S>(u.derived(), v.derived())) \
/*!--- Vector with scalar function overload. ---*/ \
template <class U, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, decay_t<S> v) RETURNS(EXPR<U, Bcast<S>, S>(u.derived(), Bcast<S>(v))) \
/*!--- Scalar with vector function overload. ---*/ \
template <class S, class V> \
FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V, S>& v) RETURNS(EXPR<Bcast<S>, V, S>(Bcast<S>(u), v.derived()))
// clang-format on

/*--- std::max/min have issues (because they return by reference).
* fmin and fmax return by value and thus are fine, but they would force
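The change above is only a clang-format adjustment of the macro, but for orientation, the pattern it generates is one expression class plus three overloads per binary function. A condensed standalone sketch (not SU2 code; a hypothetical elementwise emax, with operands stored by value instead of SU2's store_t/StoreAsRef machinery):

#include <cmath>
#include <cstddef>
#include <vector>

template <class Derived>
struct VecExpr {
  const Derived& derived() const { return static_cast<const Derived&>(*this); }
};

// Broadcasts a scalar so it can participate as a vector operand.
struct Bcast : VecExpr<Bcast> {
  double s;
  explicit Bcast(double s_) : s(s_) {}
  double operator[](std::size_t) const { return s; }
};

struct Vec : VecExpr<Vec> {
  std::vector<double> data;
  double operator[](std::size_t i) const { return data[i]; }
};

// Lazy per-element expression: nothing is evaluated until operator[] is called.
template <class U, class V>
struct MaxExpr : VecExpr<MaxExpr<U, V>> {
  U u;  // stored by value in this sketch
  V v;
  MaxExpr(const U& u_, const V& v_) : u(u_), v(v_) {}
  double operator[](std::size_t i) const { return std::fmax(u[i], v[i]); }
};

/*--- Vector with vector overload. ---*/
template <class U, class V>
MaxExpr<U, V> emax(const VecExpr<U>& u, const VecExpr<V>& v) {
  return MaxExpr<U, V>(u.derived(), v.derived());
}
/*--- Vector with scalar overload. ---*/
template <class U>
MaxExpr<U, Bcast> emax(const VecExpr<U>& u, double v) {
  return MaxExpr<U, Bcast>(u.derived(), Bcast(v));
}
/*--- Scalar with vector overload. ---*/
template <class V>
MaxExpr<Bcast, V> emax(double u, const VecExpr<V>& v) {
  return MaxExpr<Bcast, V>(Bcast(u), v.derived());
}

With this, emax(a, 0.0)[i] evaluates std::fmax(a.data[i], 0.0) on demand without allocating a temporary vector, which is the point of the expression-template design.
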
42 changes: 34 additions & 8 deletions Common/src/linear_algebra/CSysSolve.cpp
@@ -109,8 +109,19 @@ void CSysSolve<ScalarType>::SolveReduced(int n, const su2matrix<ScalarType>& Hsb
}

template <class ScalarType>
void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,
void CSysSolve<ScalarType>::ModGramSchmidt(bool shared_hsbg, int i, su2matrix<ScalarType>& Hsbg,
vector<CSysVector<ScalarType> >& w) const {
const auto thread = omp_get_thread_num();

/*--- If Hsbg is shared by multiple threads calling this function, only one
* thread can write into it. If Hsbg is private, all threads need to write. ---*/

auto SetHsbg = [&](int row, int col, const ScalarType& value) {
if (!shared_hsbg || thread == 0) {
Hsbg(row, col) = value;
}
};

/*--- Parameter for reorthonormalization ---*/

const ScalarType reorth = 0.98;
@@ -132,28 +143,29 @@ void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,

for (int k = 0; k < i + 1; k++) {
ScalarType prod = w[i + 1].dot(w[k]);
Hsbg(k, i) = prod;
ScalarType h_ki = prod;
w[i + 1] -= prod * w[k];

/*--- Check if reorthogonalization is necessary ---*/

if (prod * prod > thr) {
prod = w[i + 1].dot(w[k]);
Hsbg(k, i) += prod;
h_ki += prod;
w[i + 1] -= prod * w[k];
}
SetHsbg(k, i, h_ki);

/*--- Update the norm and check its size ---*/

nrm -= pow(Hsbg(k, i), 2);
nrm -= pow(h_ki, 2);
nrm = max<ScalarType>(nrm, 0.0);
thr = nrm * reorth;
}

/*--- Test the resulting vector ---*/

nrm = w[i + 1].norm();
Hsbg(i + 1, i) = nrm;
SetHsbg(i + 1, i, nrm);

/*--- Scale the resulting vector ---*/

@@ -343,6 +355,9 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
const CConfig* config) const {
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
const bool flexible = !precond.IsIdentity();
/*--- If we call the solver outside of a parallel region, but the number of threads allows,
* we still want to parallelize some of the expensive operations. ---*/
const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;

/*--- Check the subspace size ---*/

@@ -452,7 +467,14 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp

/*--- Modified Gram-Schmidt orthogonalization ---*/

ModGramSchmidt(i, H, W);
if (nestedParallel) {
/*--- "omp parallel if" does not work well here ---*/
SU2_OMP_PARALLEL
ModGramSchmidt(true, i, H, W);
END_SU2_OMP_PARALLEL
} else {
ModGramSchmidt(false, i, H, W);
}
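The explicit branch above (also used for the solution update further down) is the pattern this commit introduces for running expensive vector operations in parallel when the solver is called from serial code, e.g. by the multizone discrete adjoint driver. A minimal standalone sketch of the dispatch (not SU2 code; plain OpenMP pragmas instead of the SU2_OMP_PARALLEL macros):

#include <cstdio>
#include <omp.h>

template <class Work>
void RunMaybeParallel(Work work) {
  // Spawn threads only when the caller is serial and more than one thread is available.
  const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;
  if (nestedParallel) {
#pragma omp parallel
    work(true);   // the work must guard writes to data shared between threads
  } else {
    work(false);  // single caller, no concurrent writers
  }
}

int main() {
  RunMaybeParallel([](bool shared) {
    std::printf("thread %d of %d (shared=%d)\n", omp_get_thread_num(), omp_get_num_threads(), int(shared));
  });
  return 0;
}

The boolean passed to the work mirrors the shared_hsbg flag of ModGramSchmidt: when the parallel region is opened here, the Hessenberg matrix is shared, so only one thread may write it.
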

/*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the
new Givens rotation matrix and apply it to the last two elements of H[:][i] and g ---*/
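For reference, the plane-rotation step the comment above refers to is the standard way GMRES keeps its small least-squares problem triangular; a textbook-style sketch (not the SU2 implementation, which treats scaling and signs more carefully):

#include <cmath>

// Build the rotation that annihilates dy (the sub-diagonal entry H(i+1,i)).
void GenerateGivens(double& dx, double& dy, double& c, double& s) {
  const double r = std::sqrt(dx * dx + dy * dy);
  if (r == 0.0) { c = 1.0; s = 0.0; return; }
  c = dx / r;
  s = dy / r;
  dx = r;    // new diagonal entry of the triangularized Hessenberg
  dy = 0.0;  // sub-diagonal entry is eliminated
}

// Apply the same rotation to another pair, e.g. older entries of H or the RHS g.
void ApplyGivens(double c, double s, double& h1, double& h2) {
  const double t = c * h1 + s * h2;
  h2 = c * h2 - s * h1;
  h1 = t;
}
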
@@ -480,8 +502,12 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp

const auto& basis = flexible ? Z : W;

for (unsigned long k = 0; k < i; k++) {
x += y[k] * basis[k];
if (nestedParallel) {
SU2_OMP_PARALLEL
for (unsigned long k = 0; k < i; k++) x += y[k] * basis[k];
END_SU2_OMP_PARALLEL
} else {
for (unsigned long k = 0; k < i; k++) x += y[k] * basis[k];
}

/*--- Recalculate final (neg.) residual (this should be optional) ---*/
1 change: 1 addition & 0 deletions SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -308,6 +308,7 @@ void CDiscAdjMultizoneDriver::Run() {

const unsigned long nOuterIter = driver_config->GetnOuter_Iter();
const bool time_domain = driver_config->GetTime_Domain();
driver_config->Set_StartTime(SU2_MPI::Wtime());

/*--- If the gradient of the objective function is 0 so are the adjoint variables.
* Unless in unsteady problems where there are other contributions to the RHS. ---*/
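This single added line stores the wall-clock time at the start of CDiscAdjMultizoneDriver::Run(), so that the wall time reported by the multizone discrete adjoint is measured from the start of the run. A minimal MPI sketch of the bookkeeping it enables (not SU2 code):

#include <cstdio>
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  const double start_time = MPI_Wtime();  // analogous to Set_StartTime(SU2_MPI::Wtime())

  /* ... outer adjoint iterations would run here ... */

  const double elapsed = MPI_Wtime() - start_time;  // wall time since the driver started

  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (rank == 0) std::printf("Wall time: %.3f s\n", elapsed);

  MPI_Finalize();
  return 0;
}
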
