From 332ad014db7d6d2e120b142083c0b3c1168e767a Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Tue, 22 Aug 2023 16:57:14 -0600 Subject: [PATCH 1/9] initial attempt at BLAS implementation, commented out since it doesn't actually pass the test --- src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp | 9 ++++++++- src/QMCWaveFunctions/BsplineFactory/SplineR2R.h | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index ebe548dd25..918784cd35 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -17,6 +17,7 @@ #include "SplineR2R.h" #include "spline2/MultiBsplineEval.hpp" #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp" +#include "Platforms/CPU/BLAS.hpp" namespace qmcplusplus { @@ -56,7 +57,7 @@ void SplineR2R::storeParamsBeforeRotation() { const auto spline_ptr = SplineInst->getSplinePtr(); const auto coefs_tot_size = spline_ptr->coefs_size; - coef_copy_ = std::make_shared>(coefs_tot_size); + coef_copy_ = std::make_shared>(coefs_tot_size); std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin()); } @@ -135,6 +136,12 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co spl_coefs[cur_elem] = newval; } } + + //std::vector rot_mat_padded(Nsplines * Nsplines, 0); + //for (auto i = 0; i < OrbitalSetSize; i++) + // for (auto j = 0; j < OrbitalSetSize; j++) + // rot_mat_padded[i * Nsplines + j] = rot_mat[i][j]; + //BLAS::gemm('N', 'N', BasisSetSize, Nsplines, Nsplines, ST(1.0), (*coef_copy_).data(), BasisSetSize, rot_mat_padded.data(), Nsplines, ST(0.0), spl_coefs, BasisSetSize); } diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h index a3ac0f919d..3de6fc33fc 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h @@ -59,7 +59,7 @@ class SplineR2R : public BsplineSet std::shared_ptr> SplineInst; ///Copy of original splines for orbital rotation - std::shared_ptr> coef_copy_; + std::shared_ptr> coef_copy_; ///thread private ratios for reduction when using nested threading, numVP x numThread Matrix ratios_private; From 047b9016bbbc3aff1d3f619ba3840b7b917d3866 Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Wed, 23 Aug 2023 17:42:55 -0600 Subject: [PATCH 2/9] got blas call working. row major vs. column major was the problem --- .../BsplineFactory/SplineR2R.cpp | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index 918784cd35..9863aefdb9 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -121,27 +121,12 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin()); } - // Apply rotation the dumb way b/c I can't get BLAS::gemm to work... - for (auto i = 0; i < BasisSetSize; i++) - { + std::vector rot_mat_padded(Nsplines * Nsplines, 0); + for (auto i = 0; i < OrbitalSetSize; i++) for (auto j = 0; j < OrbitalSetSize; j++) - { - const auto cur_elem = Nsplines * i + j; - auto newval{0.}; - for (auto k = 0; k < OrbitalSetSize; k++) - { - const auto index = i * Nsplines + k; - newval += (*coef_copy_)[index] * rot_mat[k][j]; - } - spl_coefs[cur_elem] = newval; - } - } - - //std::vector rot_mat_padded(Nsplines * Nsplines, 0); - //for (auto i = 0; i < OrbitalSetSize; i++) - // for (auto j = 0; j < OrbitalSetSize; j++) - // rot_mat_padded[i * Nsplines + j] = rot_mat[i][j]; - //BLAS::gemm('N', 'N', BasisSetSize, Nsplines, Nsplines, ST(1.0), (*coef_copy_).data(), BasisSetSize, rot_mat_padded.data(), Nsplines, ST(0.0), spl_coefs, BasisSetSize); + rot_mat_padded[i * Nsplines + j] = rot_mat.data()[i * OrbitalSetSize + j]; + BLAS::gemm('N', 'N', Nsplines, BasisSetSize, Nsplines, ST(1.0), rot_mat_padded.data(), Nsplines, (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); + } From 53f111e9ecbf98db70daef3c7a57c3024045bdc1 Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Thu, 24 Aug 2023 13:26:01 -0600 Subject: [PATCH 3/9] make split in implementation depending on float vs. double --- .../BsplineFactory/SplineR2R.cpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index 9863aefdb9..cf29232e51 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -121,11 +121,32 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin()); } - std::vector rot_mat_padded(Nsplines * Nsplines, 0); - for (auto i = 0; i < OrbitalSetSize; i++) - for (auto j = 0; j < OrbitalSetSize; j++) - rot_mat_padded[i * Nsplines + j] = rot_mat.data()[i * OrbitalSetSize + j]; - BLAS::gemm('N', 'N', Nsplines, BasisSetSize, Nsplines, ST(1.0), rot_mat_padded.data(), Nsplines, (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); + + if constexpr (std::is_same_v) + { + //Here, ST should be equal to ValueType, which will be double for R2R. Using BLAS to make things faster + std::vector rot_mat_padded(Nsplines * Nsplines, 0); + for (auto i = 0; i < OrbitalSetSize; i++) + for (auto j = 0; j < OrbitalSetSize; j++) + rot_mat_padded[i * Nsplines + j] = rot_mat.data()[i * OrbitalSetSize + j]; + BLAS::gemm('N', 'N', Nsplines, BasisSetSize, Nsplines, ST(1.0), rot_mat_padded.data(), Nsplines, (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); + } + else + { + //Here, ST is float but ValueType is double for R2R. Due to issues with type conversions, just doing naive matrix multiplication in this case to not lose precision on rot_mat + for (auto i = 0; i < BasisSetSize; i++) + for (auto j = 0; j < OrbitalSetSize; j++) + { + const auto cur_elem = Nsplines * i + j; + auto newval{0.}; + for (auto k = 0; k < OrbitalSetSize; k++) + { + const auto index = i * Nsplines + k; + newval += (*coef_copy_)[index] * rot_mat[k][j]; + } + spl_coefs[cur_elem] = newval; + } + } } From c86e331c5250a97616ae443068bfa660c2c2026b Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Thu, 24 Aug 2023 14:27:53 -0600 Subject: [PATCH 4/9] removed padded rot_mat --- src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index cf29232e51..9c55f27f04 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -121,15 +121,12 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin()); } - + if constexpr (std::is_same_v) { //Here, ST should be equal to ValueType, which will be double for R2R. Using BLAS to make things faster - std::vector rot_mat_padded(Nsplines * Nsplines, 0); - for (auto i = 0; i < OrbitalSetSize; i++) - for (auto j = 0; j < OrbitalSetSize; j++) - rot_mat_padded[i * Nsplines + j] = rot_mat.data()[i * OrbitalSetSize + j]; - BLAS::gemm('N', 'N', Nsplines, BasisSetSize, Nsplines, ST(1.0), rot_mat_padded.data(), Nsplines, (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); + BLAS::gemm('N', 'N', OrbitalSetSize, BasisSetSize, OrbitalSetSize, ST(1.0), rot_mat.data(), OrbitalSetSize, + (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); } else { @@ -147,7 +144,6 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co spl_coefs[cur_elem] = newval; } } - } From 85b18c591bca1daf167e363ddd45d6973dff75ab Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Thu, 24 Aug 2023 16:50:30 -0600 Subject: [PATCH 5/9] add blas for SplineC2C applyRotation --- .../BsplineFactory/SplineC2C.cpp | 54 ++++++++++++------- .../BsplineFactory/SplineC2C.h | 2 +- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp index 7f4b1d1bd5..dfa33f8da9 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp @@ -17,6 +17,7 @@ #include "spline2/MultiBsplineEval.hpp" #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp" #include "CPU/math.hpp" +#include "CPU/BLAS.hpp" namespace qmcplusplus { @@ -57,7 +58,7 @@ void SplineC2C::storeParamsBeforeRotation() { const auto spline_ptr = SplineInst->getSplinePtr(); const auto coefs_tot_size = spline_ptr->coefs_size; - coef_copy_ = std::make_shared>(coefs_tot_size); + coef_copy_ = std::make_shared>(coefs_tot_size); std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin()); } @@ -120,27 +121,40 @@ void SplineC2C::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin()); } - for (int i = 0; i < basis_set_size; i++) - for (int j = 0; j < OrbitalSetSize; j++) - { - // cur_elem points to the real componend of the coefficient. - // Imag component is adjacent in memory. - const auto cur_elem = Nsplines * i + 2 * j; - ST newval_r{0.}; - ST newval_i{0.}; - for (auto k = 0; k < OrbitalSetSize; k++) + if constexpr (std::is_same_v) + { + //if ST is double, go ahead and use blas to make things faster + //Note that Nsplines needs to be divided by 2 since spl_coefs and coef_copy_ are stored as reals. + //Also casting them as ValueType so they are complex to do the correct gemm + BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ST(1.0), rot_mat.data(), OrbitalSetSize, + (ValueType*)(*coef_copy_).data(), Nsplines / 2, ST(0.0), (ValueType*)spl_coefs, Nsplines / 2); + } + else + { + // if ST is float, RealType is double and ValueType is std::complex for C2C + // Just use naive matrix multiplication in order to avoid losing precision on rotation matrix + for (int i = 0; i < basis_set_size; i++) + for (int j = 0; j < OrbitalSetSize; j++) { - const auto index = Nsplines * i + 2 * k; - ST zr = (*coef_copy_)[index]; - ST zi = (*coef_copy_)[index + 1]; - ST wr = rot_mat[k][j].real(); - ST wi = rot_mat[k][j].imag(); - newval_r += zr * wr - zi * wi; - newval_i += zr * wi + zi * wr; + // cur_elem points to the real componend of the coefficient. + // Imag component is adjacent in memory. + const auto cur_elem = Nsplines * i + 2 * j; + ST newval_r{0.}; + ST newval_i{0.}; + for (auto k = 0; k < OrbitalSetSize; k++) + { + const auto index = Nsplines * i + 2 * k; + ST zr = (*coef_copy_)[index]; + ST zi = (*coef_copy_)[index + 1]; + ST wr = rot_mat[k][j].real(); + ST wi = rot_mat[k][j].imag(); + newval_r += zr * wr - zi * wi; + newval_i += zr * wi + zi * wr; + } + spl_coefs[cur_elem] = newval_r; + spl_coefs[cur_elem + 1] = newval_i; } - spl_coefs[cur_elem] = newval_r; - spl_coefs[cur_elem + 1] = newval_i; - } + } } template diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.h index af082e0cea..9410e80cfb 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.h +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.h @@ -64,7 +64,7 @@ class SplineC2C : public BsplineSet std::shared_ptr> SplineInst; ///Copy of original splines for orbital rotation - std::shared_ptr> coef_copy_; + std::shared_ptr> coef_copy_; vContainer_type mKK; VectorSoaContainer myKcart; From 3259ddeef6fefba3aabf7bc7ad656fef74b7cbee Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Thu, 24 Aug 2023 16:58:34 -0600 Subject: [PATCH 6/9] use valuetype in blas call --- src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp index dfa33f8da9..e608c272f3 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp @@ -126,8 +126,8 @@ void SplineC2C::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co //if ST is double, go ahead and use blas to make things faster //Note that Nsplines needs to be divided by 2 since spl_coefs and coef_copy_ are stored as reals. //Also casting them as ValueType so they are complex to do the correct gemm - BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ST(1.0), rot_mat.data(), OrbitalSetSize, - (ValueType*)(*coef_copy_).data(), Nsplines / 2, ST(0.0), (ValueType*)spl_coefs, Nsplines / 2); + BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ValueType(1.0,0.0), rot_mat.data(), OrbitalSetSize, + (ValueType*)(*coef_copy_).data(), Nsplines / 2, ValueType(0.0,0.0), (ValueType*)spl_coefs, Nsplines / 2); } else { From 2fb7ce3a9119d1638b97dd4fd38aaa02225c168b Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Thu, 24 Aug 2023 16:59:37 -0600 Subject: [PATCH 7/9] clang format --- src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp index e608c272f3..403cdaac10 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp @@ -126,8 +126,9 @@ void SplineC2C::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co //if ST is double, go ahead and use blas to make things faster //Note that Nsplines needs to be divided by 2 since spl_coefs and coef_copy_ are stored as reals. //Also casting them as ValueType so they are complex to do the correct gemm - BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ValueType(1.0,0.0), rot_mat.data(), OrbitalSetSize, - (ValueType*)(*coef_copy_).data(), Nsplines / 2, ValueType(0.0,0.0), (ValueType*)spl_coefs, Nsplines / 2); + BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ValueType(1.0, 0.0), rot_mat.data(), + OrbitalSetSize, (ValueType*)(*coef_copy_).data(), Nsplines / 2, ValueType(0.0, 0.0), + (ValueType*)spl_coefs, Nsplines / 2); } else { From b1b093af8f0c0f6720be811c6ef8748a337a7f3a Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Fri, 25 Aug 2023 08:37:24 -0600 Subject: [PATCH 8/9] address comments --- src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp | 6 +++--- src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp index 403cdaac10..805e8d2ef2 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp @@ -134,15 +134,15 @@ void SplineC2C::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co { // if ST is float, RealType is double and ValueType is std::complex for C2C // Just use naive matrix multiplication in order to avoid losing precision on rotation matrix - for (int i = 0; i < basis_set_size; i++) - for (int j = 0; j < OrbitalSetSize; j++) + for (IndexType i = 0; i < basis_set_size; i++) + for (IndexType j = 0; j < OrbitalSetSize; j++) { // cur_elem points to the real componend of the coefficient. // Imag component is adjacent in memory. const auto cur_elem = Nsplines * i + 2 * j; ST newval_r{0.}; ST newval_i{0.}; - for (auto k = 0; k < OrbitalSetSize; k++) + for (IndexType k = 0; k < OrbitalSetSize; k++) { const auto index = Nsplines * i + 2 * k; ST zr = (*coef_copy_)[index]; diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index 9c55f27f04..1da6aa5023 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -131,12 +131,12 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co else { //Here, ST is float but ValueType is double for R2R. Due to issues with type conversions, just doing naive matrix multiplication in this case to not lose precision on rot_mat - for (auto i = 0; i < BasisSetSize; i++) - for (auto j = 0; j < OrbitalSetSize; j++) + for (IndexType i = 0; i < BasisSetSize; i++) + for (IndexType j = 0; j < OrbitalSetSize; j++) { const auto cur_elem = Nsplines * i + j; - auto newval{0.}; - for (auto k = 0; k < OrbitalSetSize; k++) + FullPrecValueType newval{0.}; + for (IndexType k = 0; k < OrbitalSetSize; k++) { const auto index = i * Nsplines + k; newval += (*coef_copy_)[index] * rot_mat[k][j]; From 91adaa9b714834f2e9c08ffaa36e990cd66c1694 Mon Sep 17 00:00:00 2001 From: Cody Melton Date: Mon, 28 Aug 2023 13:45:33 -0600 Subject: [PATCH 9/9] address comments --- src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp | 2 +- src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp index 805e8d2ef2..8a3cd77c60 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2C.cpp @@ -127,7 +127,7 @@ void SplineC2C::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co //Note that Nsplines needs to be divided by 2 since spl_coefs and coef_copy_ are stored as reals. //Also casting them as ValueType so they are complex to do the correct gemm BLAS::gemm('N', 'N', OrbitalSetSize, basis_set_size, OrbitalSetSize, ValueType(1.0, 0.0), rot_mat.data(), - OrbitalSetSize, (ValueType*)(*coef_copy_).data(), Nsplines / 2, ValueType(0.0, 0.0), + OrbitalSetSize, (ValueType*)coef_copy_->data(), Nsplines / 2, ValueType(0.0, 0.0), (ValueType*)spl_coefs, Nsplines / 2); } else diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp index 1da6aa5023..5b0fa59ed3 100644 --- a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp +++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp @@ -122,11 +122,11 @@ void SplineR2R::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co } - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { //Here, ST should be equal to ValueType, which will be double for R2R. Using BLAS to make things faster BLAS::gemm('N', 'N', OrbitalSetSize, BasisSetSize, OrbitalSetSize, ST(1.0), rot_mat.data(), OrbitalSetSize, - (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines); + coef_copy_->data(), Nsplines, ST(0.0), spl_coefs, Nsplines); } else {