Merge pull request #2083 from su2code/fix_mz_adjoint_wall_time
Fix wall time for discrete adjoint MZ driver + OpenMP the GMRES orthogonalization when used by MZ driver
pcarruscag authored Jul 24, 2023
2 parents 02da4b6 + 19e200e commit 0699292
Showing 4 changed files with 61 additions and 31 deletions.
3 changes: 2 additions & 1 deletion Common/include/linear_algebra/CSysSolve.hpp
@@ -168,6 +168,7 @@ class CSysSolve {
* \brief Modified Gram-Schmidt orthogonalization
* \author Based on Kesheng John Wu's mgsro subroutine in Saad's SPARSKIT
*
* \param[in] shared_hsbg - if the Hessenberg matrix is shared by multiple threads
* \param[in] i - index indicating which vector in w is being orthogonalized
   * \param[in,out] Hsbg - the upper Hessenberg being updated
* \param[in,out] w - the (i+1)th vector of w is orthogonalized against the
@@ -181,7 +182,7 @@ class CSysSolve {
* vector is kept in nrm0 and updated after operating with each vector
*
*/
void ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg, std::vector<VectorType>& w) const;
void ModGramSchmidt(bool shared_hsbg, int i, su2matrix<ScalarType>& Hsbg, std::vector<VectorType>& w) const;

/*!
* \brief writes header information for a CSysSolve residual history
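The routine documented above performs one step of the Arnoldi basis construction used by FGMRES. As a rough orientation, a minimal standalone sketch of that step (not SU2 code; std::vector<double> stands in for CSysVector, a vector of rows for su2matrix, and the reorthogonalization and the new thread handling are omitted) looks like this:

#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;

static double Dot(const Vec& a, const Vec& b) {
  double s = 0.0;
  for (std::size_t j = 0; j < a.size(); ++j) s += a[j] * b[j];
  return s;
}

// Orthogonalize w[i+1] against w[0..i], store the coefficients in column i of H,
// then normalize w[i+1] to obtain the next basis vector.
void ModGramSchmidtSketch(int i, std::vector<Vec>& H, std::vector<Vec>& w) {
  for (int k = 0; k <= i; ++k) {
    const double h_ki = Dot(w[i + 1], w[k]);
    H[k][i] = h_ki;
    for (std::size_t j = 0; j < w[k].size(); ++j) w[i + 1][j] -= h_ki * w[k][j];
  }
  const double nrm = std::sqrt(Dot(w[i + 1], w[i + 1]));
  H[i + 1][i] = nrm;  // sub-diagonal entry of the upper Hessenberg matrix
  for (double& x : w[i + 1]) x /= nrm;
}
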
46 changes: 24 additions & 22 deletions Common/include/linear_algebra/vector_expressions.hpp
@@ -146,28 +146,30 @@ MAKE_UNARY_FUN(sign, sign_, sign_impl)

/*--- Macro to create expressions and overloads for binary functions. ---*/

#define MAKE_BINARY_FUN(FUN, EXPR, IMPL) \
/*!--- Expression class. ---*/ \
template <class U, class V, class Scalar> \
class EXPR : public CVecExpr<EXPR<U, V, Scalar>, Scalar> { \
store_t<const U> u; \
store_t<const V> v; \
\
public: \
static constexpr bool StoreAsRef = false; \
FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {} \
FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \
}; \
/*!--- Vector with vector function overload. ---*/ \
template <class U, class V, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<U, V, S>(u.derived(), v.derived())) /*!--- Vector with scalar function overload. ---*/ \
template <class U, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, decay_t<S> v) \
RETURNS(EXPR<U, Bcast<S>, S>(u.derived(), Bcast<S>(v))) /*!--- Scalar with vector function overload. ---*/ \
template <class S, class V> \
FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<Bcast<S>, V, S>(Bcast<S>(u), v.derived()))
// clang-format off
#define MAKE_BINARY_FUN(FUN, EXPR, IMPL) \
/*!--- Expression class. ---*/ \
template <class U, class V, class Scalar> \
class EXPR : public CVecExpr<EXPR<U, V, Scalar>, Scalar> { \
store_t<const U> u; \
store_t<const V> v; \
\
public: \
static constexpr bool StoreAsRef = false; \
FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {} \
FORCEINLINE auto operator[](size_t i) const RETURNS(IMPL(u[i], v[i])) \
}; \
/*!--- Vector with vector function overload. ---*/ \
template <class U, class V, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, const CVecExpr<V, S>& v) \
RETURNS(EXPR<U, V, S>(u.derived(), v.derived())) \
/*!--- Vector with scalar function overload. ---*/ \
template <class U, class S> \
FORCEINLINE auto FUN(const CVecExpr<U, S>& u, decay_t<S> v) RETURNS(EXPR<U, Bcast<S>, S>(u.derived(), Bcast<S>(v))) \
/*!--- Scalar with vector function overload. ---*/ \
template <class S, class V> \
FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V, S>& v) RETURNS(EXPR<Bcast<S>, V, S>(Bcast<S>(u), v.derived()))
// clang-format on

/*--- std::max/min have issues (because they return by reference).
* fmin and fmax return by value and thus are fine, but they would force
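The change above is only a clang-format adjustment of the macro, but for orientation, the pattern it generates is one expression class plus three overloads per binary function. A condensed standalone sketch (not SU2 code; a hypothetical elementwise emax, with operands stored by value instead of SU2's store_t/StoreAsRef machinery):

#include <cmath>
#include <cstddef>
#include <vector>

template <class Derived>
struct VecExpr {
  const Derived& derived() const { return static_cast<const Derived&>(*this); }
};

// Broadcasts a scalar so it can participate as a vector operand.
struct Bcast : VecExpr<Bcast> {
  double s;
  explicit Bcast(double s_) : s(s_) {}
  double operator[](std::size_t) const { return s; }
};

struct Vec : VecExpr<Vec> {
  std::vector<double> data;
  double operator[](std::size_t i) const { return data[i]; }
};

// Lazy per-element expression: nothing is evaluated until operator[] is called.
template <class U, class V>
struct MaxExpr : VecExpr<MaxExpr<U, V>> {
  U u;  // stored by value in this sketch
  V v;
  MaxExpr(const U& u_, const V& v_) : u(u_), v(v_) {}
  double operator[](std::size_t i) const { return std::fmax(u[i], v[i]); }
};

/*--- Vector with vector overload. ---*/
template <class U, class V>
MaxExpr<U, V> emax(const VecExpr<U>& u, const VecExpr<V>& v) {
  return MaxExpr<U, V>(u.derived(), v.derived());
}
/*--- Vector with scalar overload. ---*/
template <class U>
MaxExpr<U, Bcast> emax(const VecExpr<U>& u, double v) {
  return MaxExpr<U, Bcast>(u.derived(), Bcast(v));
}
/*--- Scalar with vector overload. ---*/
template <class V>
MaxExpr<Bcast, V> emax(double u, const VecExpr<V>& v) {
  return MaxExpr<Bcast, V>(Bcast(u), v.derived());
}

With this, emax(a, 0.0)[i] evaluates std::fmax(a.data[i], 0.0) on demand without allocating a temporary vector, which is the point of the expression-template design.
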
42 changes: 34 additions & 8 deletions Common/src/linear_algebra/CSysSolve.cpp
@@ -109,8 +109,19 @@ void CSysSolve<ScalarType>::SolveReduced(int n, const su2matrix<ScalarType>& Hsb
}

template <class ScalarType>
void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,
void CSysSolve<ScalarType>::ModGramSchmidt(bool shared_hsbg, int i, su2matrix<ScalarType>& Hsbg,
vector<CSysVector<ScalarType> >& w) const {
const auto thread = omp_get_thread_num();

/*--- If Hsbg is shared by multiple threads calling this function, only one
* thread can write into it. If Hsbg is private, all threads need to write. ---*/

auto SetHsbg = [&](int row, int col, const ScalarType& value) {
if (!shared_hsbg || thread == 0) {
Hsbg(row, col) = value;
}
};

/*--- Parameter for reorthonormalization ---*/

const ScalarType reorth = 0.98;
@@ -132,28 +143,29 @@ void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,

for (int k = 0; k < i + 1; k++) {
ScalarType prod = w[i + 1].dot(w[k]);
Hsbg(k, i) = prod;
ScalarType h_ki = prod;
w[i + 1] -= prod * w[k];

/*--- Check if reorthogonalization is necessary ---*/

if (prod * prod > thr) {
prod = w[i + 1].dot(w[k]);
Hsbg(k, i) += prod;
h_ki += prod;
w[i + 1] -= prod * w[k];
}
SetHsbg(k, i, h_ki);

/*--- Update the norm and check its size ---*/

nrm -= pow(Hsbg(k, i), 2);
nrm -= pow(h_ki, 2);
nrm = max<ScalarType>(nrm, 0.0);
thr = nrm * reorth;
}

/*--- Test the resulting vector ---*/

nrm = w[i + 1].norm();
Hsbg(i + 1, i) = nrm;
SetHsbg(i + 1, i, nrm);

/*--- Scale the resulting vector ---*/

@@ -343,6 +355,9 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
const CConfig* config) const {
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
const bool flexible = !precond.IsIdentity();
/*--- If we call the solver outside of a parallel region, but the number of threads allows,
* we still want to parallelize some of the expensive operations. ---*/
const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;

/*--- Check the subspace size ---*/

@@ -452,7 +467,14 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp

/*--- Modified Gram-Schmidt orthogonalization ---*/

ModGramSchmidt(i, H, W);
if (nestedParallel) {
/*--- "omp parallel if" does not work well here ---*/
SU2_OMP_PARALLEL
ModGramSchmidt(true, i, H, W);
END_SU2_OMP_PARALLEL
} else {
ModGramSchmidt(false, i, H, W);
}
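The explicit branch above (also used for the solution update further down) is the pattern this commit introduces for running expensive vector operations in parallel when the solver is called from serial code, e.g. by the multizone discrete adjoint driver. A minimal standalone sketch of the dispatch (not SU2 code; plain OpenMP pragmas instead of the SU2_OMP_PARALLEL macros):

#include <cstdio>
#include <omp.h>

template <class Work>
void RunMaybeParallel(Work work) {
  // Spawn threads only when the caller is serial and more than one thread is available.
  const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;
  if (nestedParallel) {
#pragma omp parallel
    work(true);   // the work must guard writes to data shared between threads
  } else {
    work(false);  // single caller, no concurrent writers
  }
}

int main() {
  RunMaybeParallel([](bool shared) {
    std::printf("thread %d of %d (shared=%d)\n", omp_get_thread_num(), omp_get_num_threads(), int(shared));
  });
  return 0;
}

The boolean passed to the work mirrors the shared_hsbg flag of ModGramSchmidt: when the parallel region is opened here, the Hessenberg matrix is shared, so only one thread may write it.
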

/*--- Apply old Givens rotations to new column of the Hessenberg matrix then generate the
new Givens rotation matrix and apply it to the last two elements of H[:][i] and g ---*/
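For reference, the plane-rotation step the comment above refers to is the standard way GMRES keeps its small least-squares problem triangular; a textbook-style sketch (not the SU2 implementation, which treats scaling and signs more carefully):

#include <cmath>

// Build the rotation that annihilates dy (the sub-diagonal entry H(i+1,i)).
void GenerateGivens(double& dx, double& dy, double& c, double& s) {
  const double r = std::sqrt(dx * dx + dy * dy);
  if (r == 0.0) { c = 1.0; s = 0.0; return; }
  c = dx / r;
  s = dy / r;
  dx = r;    // new diagonal entry of the triangularized Hessenberg
  dy = 0.0;  // sub-diagonal entry is eliminated
}

// Apply the same rotation to another pair, e.g. older entries of H or the RHS g.
void ApplyGivens(double c, double s, double& h1, double& h2) {
  const double t = c * h1 + s * h2;
  h2 = c * h2 - s * h1;
  h1 = t;
}
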
@@ -480,8 +502,12 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp

const auto& basis = flexible ? Z : W;

for (unsigned long k = 0; k < i; k++) {
x += y[k] * basis[k];
if (nestedParallel) {
SU2_OMP_PARALLEL
for (unsigned long k = 0; k < i; k++) x += y[k] * basis[k];
END_SU2_OMP_PARALLEL
} else {
for (unsigned long k = 0; k < i; k++) x += y[k] * basis[k];
}

/*--- Recalculate final (neg.) residual (this should be optional) ---*/
1 change: 1 addition & 0 deletions SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -308,6 +308,7 @@ void CDiscAdjMultizoneDriver::Run() {

const unsigned long nOuterIter = driver_config->GetnOuter_Iter();
const bool time_domain = driver_config->GetTime_Domain();
driver_config->Set_StartTime(SU2_MPI::Wtime());

/*--- If the gradient of the objective function is 0 so are the adjoint variables.
* Unless in unsteady problems where there are other contributions to the RHS. ---*/
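This single added line stores the wall-clock time at the start of CDiscAdjMultizoneDriver::Run(), so that the wall time reported by the multizone discrete adjoint is measured from the start of the run. A minimal MPI sketch of the bookkeeping it enables (not SU2 code):

#include <cstdio>
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  const double start_time = MPI_Wtime();  // analogous to Set_StartTime(SU2_MPI::Wtime())

  /* ... outer adjoint iterations would run here ... */

  const double elapsed = MPI_Wtime() - start_time;  // wall time since the driver started

  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (rank == 0) std::printf("Wall time: %.3f s\n", elapsed);

  MPI_Finalize();
  return 0;
}
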
