From 687ccaab934322fe0126fb5978a7e0de49e106e1 Mon Sep 17 00:00:00 2001 From: Anton Kozhevnikov Date: Thu, 13 Jul 2023 13:44:32 +0200 Subject: [PATCH] [feature] new split index implementation (#862) new split index implementation --- apps/tests/test_allgather.cpp | 4 +- apps/tests/test_wf_ortho.cpp | 7 +- apps/tests/test_wf_trans.cpp | 16 +- apps/unit_tests/test_splindex.cpp | 44 +- python_module/py_sirius.cpp | 2 +- src/SDDK/splindex.hpp | 635 +++++++++--------- src/SDDK/wave_functions.hpp | 44 +- src/api/sirius_api.cpp | 13 +- src/band/band.cpp | 5 +- src/band/band.hpp | 30 +- src/band/diag_full_potential.cpp | 9 +- src/band/residuals.hpp | 10 +- src/band/solve.cpp | 5 +- src/density/density.cpp | 84 ++- src/density/density.hpp | 17 +- src/dft/dft_ground_state.cpp | 12 +- src/dft/energy.cpp | 7 +- src/fft/fft.hpp | 2 +- src/fft/gvec.cpp | 8 +- src/fft/gvec.hpp | 6 +- src/function3d/field4d.hpp | 2 +- src/function3d/paw_field4d.hpp | 8 +- src/function3d/periodic_function.hpp | 9 +- src/function3d/spheric_function_set.hpp | 83 ++- src/geometry/force.cpp | 30 +- src/geometry/non_local_functor.hpp | 7 +- src/geometry/stress.cpp | 15 +- src/hamiltonian/hamiltonian.cpp | 19 +- src/hamiltonian/hamiltonian_k.cpp | 58 +- src/hamiltonian/non_local_operator.hpp | 5 +- src/hamiltonian/non_local_operator_base.hpp | 20 +- .../hubbard_occupancies_derivatives.cpp | 4 +- src/k_point/generate_fv_states.cpp | 15 +- src/k_point/k_point.cpp | 39 +- src/k_point/k_point_set.cpp | 79 +-- src/k_point/k_point_set.hpp | 26 +- src/linalg/dmatrix.cpp | 98 +-- src/linalg/dmatrix.hpp | 32 +- src/mixer/mixer_functions.cpp | 24 +- src/nlcglib/adaptor.cpp | 65 +- src/nlcglib/inverse_overlap.hpp | 2 +- src/nlcglib/overlap.hpp | 8 +- src/nlcglib/ultrasoft_precond.hpp | 8 +- src/potential/paw_potential.cpp | 30 +- src/potential/poisson.cpp | 24 +- src/potential/potential.hpp | 28 +- src/potential/xc.cpp | 4 +- src/potential/xc_mt.cpp | 19 +- src/radial/radial_integrals.cpp | 20 +- src/radial/radial_integrals.hpp | 24 +- src/strong_type.hpp | 4 +- src/symmetry/symmetrize_forces.hpp | 6 +- src/symmetry/symmetrize_mt_function.hpp | 15 +- src/traits.hpp | 6 +- src/typedefs.hpp | 1 - src/unit_cell/atom.hpp | 2 +- src/unit_cell/basis_functions_index.hpp | 3 - src/unit_cell/unit_cell.cpp | 35 +- src/unit_cell/unit_cell.hpp | 22 +- 59 files changed, 898 insertions(+), 961 deletions(-) diff --git a/apps/tests/test_allgather.cpp b/apps/tests/test_allgather.cpp index ad790ccf80..4efa77a0d8 100644 --- a/apps/tests/test_allgather.cpp +++ b/apps/tests/test_allgather.cpp @@ -7,10 +7,10 @@ void test_allgather() int N = 11; std::vector vec(N, 0.0); - sddk::splindex spl(N, mpi::Communicator::world().size(), mpi::Communicator::world().rank()); + sddk::splindex_block<> spl(N, n_blocks(mpi::Communicator::world().size()), block_id(mpi::Communicator::world().rank())); for (int i = 0; i < spl.local_size(); i++) { - vec[spl[i]] = mpi::Communicator::world().rank() + 1.0; + vec[spl.global_index(i)] = mpi::Communicator::world().rank() + 1.0; } { diff --git a/apps/tests/test_wf_ortho.cpp b/apps/tests/test_wf_ortho.cpp index 1828630dad..cf1936b1cf 100644 --- a/apps/tests/test_wf_ortho.cpp +++ b/apps/tests/test_wf_ortho.cpp @@ -39,10 +39,9 @@ void test_wf_ortho(BLACS_grid const& blacs_grid__, double cutoff__, int num_band for (int igloc = 0; igloc < gvec->count(); igloc++) { phi.pw_coeffs(igloc, s, wf::band_index(i)) = utils::random>(); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - 
for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) = utils::random>(); + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) = utils::random>(); } } } diff --git a/apps/tests/test_wf_trans.cpp b/apps/tests/test_wf_trans.cpp index 2949087152..dc81715ed3 100644 --- a/apps/tests/test_wf_trans.cpp +++ b/apps/tests/test_wf_trans.cpp @@ -37,10 +37,9 @@ void test_wf_trans(la::BLACS_grid const& blacs_grid__, double cutoff__, int num_ for (int igloc = 0; igloc < gvec->count(); igloc++) { phi.pw_coeffs(igloc, s, wf::band_index(i)) = utils::random>(); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) = utils::random>(); + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) = utils::random>(); } } } @@ -86,12 +85,11 @@ void test_wf_trans(la::BLACS_grid const& blacs_grid__, double cutoff__, int num_ phi.pw_coeffs(igloc, s, wf::band_index(i)) - psi.pw_coeffs(igloc, s, wf::band_index(num_bands__ - i - 1))); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { diff += std::abs( - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) - - psi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(num_bands__ - i - 1))); + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) - + psi.mt_coeffs(xi, it.li, s, wf::band_index(num_bands__ - i - 1))); } } } diff --git a/apps/unit_tests/test_splindex.cpp b/apps/unit_tests/test_splindex.cpp index 52e6809860..dfd0150441 100644 --- a/apps/unit_tests/test_splindex.cpp +++ b/apps/unit_tests/test_splindex.cpp @@ -8,10 +8,10 @@ int test1() { for (int num_ranks = 1; num_ranks < 20; num_ranks++) { for (int N = 1; N < 1130; N++) { - splindex spl(N, num_ranks, 0); + splindex_block<> spl(N, n_blocks(num_ranks), block_id(0)); int sz = 0; for (int i = 0; i < num_ranks; i++) { - sz += (int)spl.local_size(i); + sz += spl.local_size(block_id(i)); } if (sz != N) { std::stringstream s; @@ -20,21 +20,21 @@ int test1() s << "computed global index size: " << sz << std::endl; s << "number of ranks: " << num_ranks << std::endl; for (int i = 0; i < num_ranks; i++) { - s << "i, local_size(i): " << i << ", " << spl.local_size(i) << std::endl; + s << "i, local_size(i): " << i << ", " << spl.local_size(block_id(i)) << std::endl; } throw std::runtime_error(s.str()); } for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = (int)spl.local_index(i); - if (i != (int)spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != (int)spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test1: wrong index." 
<< std::endl; s << "global index size: " << N << std::endl; s << "number of ranks: " << num_ranks << std::endl; s << "global index: " << i << std::endl; s << "rank, offset: " << rank << ", " << offset << std::endl; - s << "computed global index: " << spl.global_index(offset, rank) << std::endl; + s << "computed global index: " << spl.global_index(offset, block_id(rank)) << std::endl; throw std::runtime_error(s.str()); } } @@ -48,22 +48,28 @@ int test2() for (int bs = 1; bs < 17; bs++) { for (int num_ranks = 1; num_ranks < 13; num_ranks++) { for (int N = 1; N < 1113; N++) { - splindex spl(N, num_ranks, 0, bs); + sddk::splindex_block_cyclic<> spl(N, n_blocks(num_ranks), block_id(0), bs); int sz = 0; for (int i = 0; i < num_ranks; i++) { - sz += (int)spl.local_size(i); + sz += (int)spl.local_size(block_id(i)); } if (sz != N) { std::stringstream s; - s << "test2: wrong sum of local sizes" << std::endl; + s << "test2: wrong sum of local sizes" << std::endl + << "N : " << N << std::endl + << "num_ranks :" << num_ranks << std::endl + << "block size : " << bs << std::endl; + for (int i = 0; i < num_ranks; i++) { + s << "rank, local_size : " << i << ", " << spl.local_size(block_id(i)) << std::endl; + } throw std::runtime_error(s.str()); } for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = (int)spl.local_index(i); - if (i != (int)spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != (int)spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test2: wrong index" << std::endl; s << "bs = " << bs << std::endl @@ -72,7 +78,7 @@ int test2() << "idx = " << i << std::endl << "rank = " << rank << std::endl << "offset = " << offset << std::endl - << "computed index = " << spl.global_index(offset, rank) << std::endl; + << "computed index = " << spl.global_index(offset, block_id(rank)) << std::endl; throw std::runtime_error(s.str()); } } @@ -86,14 +92,14 @@ int test3() { for (int num_ranks = 1; num_ranks < 20; num_ranks++) { for (int N = 1; N < 1130; N++) { - splindex spl_tmp(N, num_ranks, 0); + splindex_block<> spl_tmp(N, n_blocks(num_ranks), block_id(0)); - splindex spl(N, num_ranks, 0, spl_tmp.counts()); + splindex_chunk<> spl(N, n_blocks(num_ranks), block_id(0), spl_tmp.counts()); for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = spl.local_index(i); - if (i != spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test3: wrong index" << std::endl; throw std::runtime_error(s.str()); diff --git a/python_module/py_sirius.cpp b/python_module/py_sirius.cpp index 05c95aad05..d789712b15 100644 --- a/python_module/py_sirius.cpp +++ b/python_module/py_sirius.cpp @@ -409,7 +409,7 @@ PYBIND11_MODULE(py_sirius, m) [](K_point_set& ks, int i) -> K_point& { if (i >= ks.spl_num_kpoints().local_size()) throw pybind11::index_error("out of bounds"); - return *ks.get(ks.spl_num_kpoints(i)); + return *ks.get(ks.spl_num_kpoints().global_index(typename kp_index_t::local(i))); }, py::return_value_policy::reference_internal) .def("__len__", [](K_point_set const& ks) { return ks.spl_num_kpoints().local_size(); }) diff --git a/src/SDDK/splindex.hpp b/src/SDDK/splindex.hpp index 026b6b71d9..cc3d808ed7 100644 --- a/src/SDDK/splindex.hpp +++ b/src/SDDK/splindex.hpp @@ -25,26 +25,57 @@ #ifndef __SPLINDEX_HPP__ #define 
__SPLINDEX_HPP__ -#include -#include -#include -#include -#include +#include "strong_type.hpp" +#include "utils/rte.hpp" +#include + +/// Basic index type. +template +struct basic_index_t { + typedef T value_type; + typedef T global; + typedef T local; +}; -namespace sddk { +/// K-point index type. +struct kp_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; -/// Type of split index. -enum class splindex_t -{ - /// Block distribution. - block, - /// Block-cyclic distribution. - block_cyclic, - /// Custom distribution in continuous chunks of arbitrary size. - chunk +struct atom_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +struct atom_type_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; }; -/// Type of index domain. +struct atom_symmetry_class_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +struct paw_atom_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +/// Number of blocks to which the global index is split. +using n_blocks = strong_type; +/// ID of the block. +/** The id of the block has the range [0, n_blocks) */ +using block_id = strong_type; + +namespace sddk { + enum class index_domain_t { /// Global index. @@ -54,456 +85,420 @@ enum class index_domain_t }; /// Base class for split index. -template -class splindex_base +template > +class splindex { + public: + using value_type = typename Index_t::value_type; protected: - /// Rank of the block with local fraction of the global index. - int rank_{-1}; + /// Number of blocks over which the global index is distributed. + n_blocks n_blocks_{-1}; - /// Number of ranks over which the global index is distributed. - int num_ranks_{-1}; + /// Index of the block with local fraction of the global index. + block_id block_id_{-1}; - /// size of the global index - T global_index_size_; + /// Size (aka length) of the global index. + value_type size_{-1}; - /// Default constructor. - splindex_base() - { - } - - /// Pair of describing the location of a global index. + /// Pair of describing the location of a global index element. struct location_t { - T local_index; - int rank; - location_t(T local_index__, int rank__) - : local_index(local_index__) - , rank(rank__) + /// Local index inside a block. + typename Index_t::local index_local; + /// Index of the block. + block_id ib; + /// Constructor. + location_t(typename Index_t::local index_local__, block_id ib__) + : index_local(index_local__) + , ib(ib__) { } }; public: - /// Rank id. - inline int rank() const - { - return rank_; - } - /// Number of ranks that are participating in the distribution of an index. - inline int num_ranks() const - { - return num_ranks_; - } - - inline T global_index_size() const + /// Default constructor. + splindex() { - return global_index_size_; } - static inline T block_size(T size__, int num_ranks__) - { - return size__ / num_ranks__ + std::min(T(1), size__ % num_ranks__); - } -}; - -/// Split index. -template -class splindex : public splindex_base -{ -}; - -/// Specialization for the block distribution. -template -class splindex : public splindex_base -{ - private: - T block_size_; - - void init(T global_index_size__, int num_ranks__, int rank__) + /// Constructor. + /** Check and set index size, number of blocks and block id. 
*/ + splindex(value_type size__, n_blocks n_blocks__, block_id block_id__) { - this->global_index_size_ = global_index_size__; - - if (num_ranks__ < 0) { + if (size__ < 0) { std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; + s << "wrong size : " << size__; throw std::runtime_error(s.str()); } - this->num_ranks_ = num_ranks__; + this->size_ = size__; - if (rank__ < 0 || rank__ >= num_ranks__) { + if (n_blocks__.get() < 0) { std::stringstream s; - s << "wrong rank: " << rank__; + s << "wrong number of blocks : " << n_blocks__.get(); throw std::runtime_error(s.str()); } - this->rank_ = rank__; + this->n_blocks_ = n_blocks__; - block_size_ = this->block_size(global_index_size__, num_ranks__); + if (block_id__.get() < 0 || block_id__.get() >= n_blocks__.get()) { + std::stringstream s; + s << "wrong rank block id : " << block_id__.get(); + throw std::runtime_error(s.str()); + } + this->block_id_ = block_id__; } - public: - /// Default constructor - splindex() + virtual ~splindex() { } - /// Constructor. - splindex(T global_index_size__, int num_ranks__, int rank__) - { - init(global_index_size__, num_ranks__, rank__); - } + /// Return local size of the split index for a given block. + virtual value_type local_size(block_id block_id__) const = 0; - /// Return "local index, rank" pair for a global index. - inline typename splindex_base::location_t location(T idxglob__) const - { - assert(idxglob__ < this->global_index_size_); + /// Return location (block_id and local offset) of the global index. + virtual location_t location(typename Index_t::global idx__) const = 0; - int rank = int(idxglob__ / block_size_); - T idxloc = idxglob__ - rank * block_size_; + /// Return global index by block id and local index. + virtual typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const = 0; - return typename splindex_base::location_t(idxloc, rank); + /// Return local size for the current block. + value_type local_size() const + { + return this->local_size(this->block_id_); } - /// Return local size of the split index for an arbitrary rank. - inline T local_size(int rank__) const + /// Return global index of an element by local index and block id. + inline auto global_index(typename splindex::location_t loc__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); - - if (this->global_index_size_ == 0) { - return 0; - } - - int n = static_cast(this->global_index_size_ / block_size_); - if (rank__ < n) { - return block_size_; - } else if (rank__ == n) { - return this->global_index_size_ - rank__ * block_size_; - } else { - return 0; - } + return this->global_index(loc__.index_local, loc__.ib); } - /// Return local size of the split index for a current rank. - inline T local_size() const + inline auto global_index(typename Index_t::local idxloc__) const { - return local_size(this->rank_); + return this->global_index(idxloc__, this->block_id_); } - /// Return rank which holds the element with the given global index. - inline int local_rank(T idxglob__) const + /// Return total length of the index (global number of elements). + inline auto size() const noexcept { - return location(idxglob__).rank; + return size_; } - /// Return local index of the element for the rank which handles the given global index. - inline T local_index(T idxglob__) const + /// Compute size of the block from global index size and number of blocks. 
+ static inline auto block_size(value_type size__, n_blocks n_blocks__) { - return location(idxglob__).local_index; + return size__ / n_blocks__ + std::min(value_type(1), size__ % n_blocks__); } +}; - /// Return global index of an element by local index and rank. - inline T global_index(T idxloc__, int rank__) const - { - assert(rank__ >= 0 && rank__ < this->num_ranks_); +template +class splindex_iterator_t +{ + private: + splindex const* idx_{nullptr}; + public: + using difference_type = std::ptrdiff_t; + typename Index_t::local li; - if (local_size(rank__) == 0) { - return std::numeric_limits::max(); - } + splindex_iterator_t& operator=(splindex_iterator_t const& lhs_) = default; - assert(idxloc__ < local_size(rank__)); + splindex_iterator_t(splindex const& idx__) + : idx_{&idx__} + , li{0} + { + } - return rank__ * block_size_ + idxloc__; + inline bool operator!=(splindex_iterator_t const& rhs__) + { + return this->li != rhs__.li; } - inline T global_offset() const + inline splindex_iterator_t& operator++() { - return global_index(0, this->rank_); + this->li++; + return *this; } - inline T global_offset(int rank__) const + inline splindex_iterator_t operator++(int) { - return global_index(0, rank__); + splindex_iterator_t tmp(this->idx()); + this->li++; + return tmp; } - inline T operator[](T idxloc__) const + inline auto operator*() { - return global_index(idxloc__, this->rank_); + struct { + typename Index_t::global i; + typename Index_t::local li; } ret{idx_->global_index(this->li), this->li}; + return ret; } - inline std::vector offsets() const + inline difference_type operator-(splindex_iterator_t const& rhs__) const { - std::vector v(this->num_ranks_); - for (int i = 0; i < this->num_ranks_; i++) { - v[i] = global_offset(i); - } - return v; + return li - rhs__.li; } - inline std::vector counts() const + inline splindex_iterator_t& operator+=(difference_type rhs__) { - std::vector v(this->num_ranks_); - for (int i = 0; i < this->num_ranks_; i++) { - v[i] = local_size(i); - } - return v; + li += rhs__; + return *this; } }; -/// Specialization for the block-cyclic distribution. -template -class splindex : public splindex_base +template > +class splindex_block : public splindex { + public: + using value_type = typename splindex::value_type; private: - /// cyclic block size of the distribution - int block_size_{-1}; + /// Local index size of a given block. + value_type block_size_; + public: + splindex_block() + { + } + /// Constructor. + splindex_block(value_type size__, n_blocks n_blocks__, block_id block_id__) + : splindex(size__, n_blocks__, block_id__) + { + this->block_size_ = this->block_size(size__, n_blocks__); + } - // Check and initialize variables. - void init(T global_index_size__, int num_ranks__, int rank__, int block_size__) + using splindex::local_size; + + /// Return local size of the split index for a given block. 
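// Worked example of the block-size rule defined just above, as a minimal sketch
// (the include path and the free function name are illustrative, not part of the patch):
#include "SDDK/splindex.hpp"
#include <cassert>

inline void block_size_example()
{
    /* 11 elements over 4 blocks: 11/4 + min(1, 11%4) = 3, i.e. the global index
       is covered by blocks of local sizes 3, 3, 3 and 2 */
    assert(sddk::splindex_block<>::block_size(11, n_blocks(4)) == 3);
}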
+ inline value_type local_size(block_id block_id__) const { - this->global_index_size_ = global_index_size__; + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - if (num_ranks__ < 0) { - std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; - throw std::runtime_error(s.str()); + if (this->size_ == 0) { + return 0; } - this->num_ranks_ = num_ranks__; - if (rank__ < 0 || rank__ >= num_ranks__) { - std::stringstream s; - s << "wrong rank: " << rank__; - throw std::runtime_error(s.str()); + auto n = static_cast(this->size_ / block_size_); + if (block_id__ < n) { + return block_size_; + } else { + return std::max(0, this->size_ - block_id__ * block_size_); } - this->rank_ = rank__; + } - if (block_size__ <= 0) { - std::stringstream s; - s << "wrong block size: " << block_size__; - throw std::runtime_error(s.str()); - } - block_size_ = block_size__; + /// Return "local index, rank" pair for a global index. + inline typename splindex::location_t location(typename Index_t::global idx__) const + { + RTE_ASSERT(idx__ < this->size_); + + auto ib = static_cast(idx__ / this->block_size_); + value_type idxloc = idx__ - ib * this->block_size_; + + return typename splindex::location_t(typename Index_t::local(idxloc), block_id(ib)); } - public: - struct iterator - { - T idxloc_; - T idxglob_; - T num_blocks_min_; - int block_size_; - int rank_; - int num_ranks_; - - iterator(T idxglob__, int num_ranks__, int block_size__) - : idxglob_(idxglob__) - , num_ranks_(num_ranks__) - , block_size_(block_size__) - { - /* number of full blocks */ - T num_blocks = idxglob__ / block_size_; - num_blocks_min_ = num_blocks / num_ranks_; - idxloc_ = num_blocks_min_ * block_size_ + idxglob_ % block_size_; - rank_ = static_cast(num_blocks % num_ranks_); - } + using splindex::global_index; - bool operator!=(iterator const& rhs__) const - { - return idxglob_ != rhs__.idxglob_; - } + /// Return global index of an element by local index and block id. + inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const + { + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - iterator& operator++() - { - idxglob_++; - idxloc_++; - if (idxloc_ % block_size_ == 0) { - rank_++; - if (rank_ % num_ranks_ == 0) { - num_blocks_min_++; - rank_ = 0; - } - idxloc_ = num_blocks_min_ * block_size_;// + idxglob_ % block_size_; - } + if (this->local_size(block_id__) == 0) { + return typename Index_t::global(-1); } - }; - /// Default constructor - splindex() - { + RTE_ASSERT(idxloc__ < local_size(block_id__)); + + return typename Index_t::global(this->block_size_ * block_id__ + idxloc__); } - /// Constructor with implicit cyclic block size - splindex(T global_index_size__, int num_ranks__, int rank__, int bs__) + inline auto global_offset() const { - init(global_index_size__, num_ranks__, rank__, bs__); + return this->global_index(typename Index_t::local(0), this->block_id_); } - iterator at(T idxglob__) const + inline auto global_offset(block_id iblock__) const { - return iterator(idxglob__, this->num_ranks_, this->block_size_); + return this->global_index(typename Index_t::local(0), iblock__); } - /// Return "local index, rank" pair for a global index. 
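// A minimal usage sketch of splindex_block, mirroring test_allgather.cpp above;
// rank/size stand in for an MPI communicator and the function name is illustrative:
#include "SDDK/splindex.hpp"
#include <cassert>
#include <vector>

inline void splindex_block_example(int rank, int size)
{
    const int N = 11;
    sddk::splindex_block<> spl(N, n_blocks(size), block_id(rank));
    std::vector<double> vec(N, 0.0);
    /* write only the local fraction of the global array */
    for (int i = 0; i < spl.local_size(); i++) {
        vec[spl.global_index(i)] = rank + 1.0;
    }
    /* map a global index back to its owner: loc.ib is the owning block id,
       loc.index_local the offset inside that block */
    auto loc = spl.location(N / 2);
    assert(spl.global_index(loc.index_local, loc.ib) == N / 2);
}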
- inline typename splindex_base::location_t location(T idxglob__) const + inline auto counts() const { - assert(idxglob__ < this->global_index_size_); - - /* number of full blocks */ - T num_blocks = idxglob__ / block_size_; - - /* local index */ - T idxloc = (num_blocks / this->num_ranks_) * block_size_ + idxglob__ % block_size_; - - /* corresponding rank */ - int rank = static_cast(num_blocks % this->num_ranks_); + std::vector v(this->n_blocks_); + for (int i = 0; i < this->n_blocks_; i++) { + v[i] = local_size(block_id(i)); + } + return v; + } +}; - return typename splindex_base::location_t(idxloc, rank); +template > +class splindex_block_cyclic : public splindex +{ + public: + using value_type = typename splindex::value_type; + private: + /// Cyclic block size. + value_type block_size_; + public: + splindex_block_cyclic() + { } + /// Constructor. + splindex_block_cyclic(value_type size__, n_blocks n_blocks__, block_id block_id__, value_type block_size__) + : splindex(size__, n_blocks__, block_id__) + , block_size_{block_size__} + { + } + + using splindex::local_size; - /// Return local size of the split index for an arbitrary rank. - inline T local_size(int rank__) const + /// Return local size of the split index for a given block. + inline value_type local_size(block_id block_id__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + if (this->size_ == 0) { + return 0; + } /* number of full blocks */ - T num_blocks = this->global_index_size_ / block_size_; - - T n = (num_blocks / this->num_ranks_) * block_size_; + auto num_blocks = this->size() / this->block_size_; - int rank_offs = static_cast(num_blocks % this->num_ranks_); + auto n = (num_blocks / this->n_blocks_) * this->block_size_; + auto rank_offs = static_cast(num_blocks % this->n_blocks_); - if (rank__ < rank_offs) { - n += block_size_; - } else if (rank__ == rank_offs) { - n += this->global_index_size_ % block_size_; + if (block_id__ < rank_offs) { + n += this->block_size_; + } else if (block_id__ == rank_offs) { + n += this->size_ % this->block_size_; } return n; } - /// Return local size of the split index for a current rank. - inline T local_size() const + /// Return "local index, rank" pair for a global index. + inline typename splindex::location_t location(typename Index_t::global idx__) const { - return local_size(this->rank_); - } + RTE_ASSERT(idx__ < this->size_); - /// Return rank which holds the element with the given global index. - inline int local_rank(T idxglob__) const - { - return location(idxglob__).rank; - } + /* number of full blocks */ + auto num_blocks = idx__ / this->block_size_; - /// Return local index of the element for the rank which handles the given global index. - inline T local_index(T idxglob__) const - { - return location(idxglob__).local_index; + /* local index */ + value_type idxloc = (num_blocks / this->n_blocks_) * block_size_ + idx__ % this->block_size_; + + /* corresponding rank */ + auto ib = static_cast(num_blocks % this->n_blocks_); + + return typename splindex::location_t(typename Index_t::local(idxloc), block_id(ib)); } - /// Get a global index by local index of a rank. - inline T global_index(T idxloc__, int rank__) const + using splindex::global_index; + + /// Return global index of an element by local index and block id. 
+ inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); - assert(idxloc__ < local_size(rank__)); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + RTE_ASSERT(idxloc__ < local_size(block_id__)); - T nb = idxloc__ / block_size_; + auto nb = idxloc__ / this->block_size_; - return (nb * this->num_ranks_ + rank__) * block_size_ + idxloc__ % block_size_; + return typename Index_t::global((nb * this->n_blocks_ + block_id__) * this->block_size_ + + idxloc__ % this->block_size_); } - /// Get global index of this rank. - inline T operator[](T idxloc__) const - { - return global_index(idxloc__, this->rank_); - } }; -/// Specialization for the block distribution. -template -class splindex : public splindex_base +/// Externally defined block distribution. +template > +class splindex_chunk : public splindex { + public: + using value_type = typename splindex::value_type; private: - std::vector> global_index_; - std::vector::location_t> locations_; + std::vector> global_index_; + std::vector::location_t> locations_; public: /// Default constructor. - splindex() + splindex_chunk() { } /// Constructor with specific partitioning. - splindex(T global_index_size__, int num_ranks__, int rank__, std::vector const& counts__) - { - this->global_index_size_ = global_index_size__; - - if (num_ranks__ < 0) { - std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; - throw std::runtime_error(s.str()); - } - this->num_ranks_ = num_ranks__; - - if (rank__ < 0 || rank__ >= num_ranks__) { - std::stringstream s; - s << "wrong rank: " << rank__; - throw std::runtime_error(s.str()); - } - this->rank_ = rank__; - - for (int r = 0; r < num_ranks__; r++) { - global_index_.push_back(std::vector()); - for (int i = 0; i < counts__[r]; i++) { - global_index_.back().push_back(static_cast(locations_.size())); - locations_.push_back(typename splindex_base::location_t(i, r)); + splindex_chunk(value_type size__, n_blocks n_blocks__, block_id block_id__, std::vector const counts__) + : splindex(size__, n_blocks__, block_id__) + { + for (int r = 0; r < n_blocks__.get(); r++) { + global_index_.push_back(std::vector()); + for (value_type i = 0; i < counts__[r]; i++) { + global_index_.back().push_back(static_cast(locations_.size())); + locations_.push_back(typename splindex::location_t(typename Index_t::local(i), block_id(r))); } } - assert(static_cast(locations_.size()) == global_index_size__); + RTE_ASSERT(static_cast(locations_.size()) == this->size()); } - inline T local_size(int rank__) const - { - assert(rank__ >= 0); - assert(rank__ < this->num_ranks_); - return static_cast(global_index_[rank__].size()); - } + using splindex::local_size; - inline T local_size() const + inline value_type local_size(block_id block_id__) const { - return local_size(this->rank_); - } + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - inline int local_rank(T idxglob__) const - { - return locations_[idxglob__].rank; + if (this->size_ == 0) { + return 0; + } + return static_cast(global_index_[block_id__].size()); } - inline T local_index(T idxglob__) const + inline typename splindex::location_t location(typename Index_t::global idx__) const { - return locations_[idxglob__].local_index; + return locations_[idx__]; } - inline T global_index(T idxloc__, int rank__) const - { - if (local_size(rank__) == 0) { - return std::numeric_limits::max(); - } + using splindex::global_index; - assert(idxloc__ < 
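// The round trip checked in test_splindex.cpp (test2) for the block-cyclic case,
// condensed into a sketch with illustrative sizes:
#include "SDDK/splindex.hpp"
#include <cassert>

inline void splindex_block_cyclic_example()
{
    const int N = 113, bs = 16;
    sddk::splindex_block_cyclic<> spl(N, n_blocks(4), block_id(0), bs);
    for (int i = 0; i < N; i++) {
        /* (local index, block id) identifies each element uniquely */
        auto loc = spl.location(i);
        assert(spl.global_index(loc.index_local, loc.ib) == i);
    }
}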
local_size(rank__)); - - return global_index_[rank__][idxloc__]; - } - - inline T operator[](T idxloc__) const + inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const { - return global_index(idxloc__, this->rank_); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + RTE_ASSERT(idxloc__ < local_size(block_id__)); + + return typename Index_t::global(global_index_[block_id__][idxloc__]); } - inline T global_offset() const + inline auto global_offset() const { - return global_index(0, this->rank_); + return this->global_index(typename Index_t::local(0), this->block_id_); } }; +template +auto begin_global(splindex const& a__) +{ + return typename Index_t::global(0); +} + +template +auto end_global(splindex const& a__) +{ + return typename Index_t::global(a__.size()); +} + +template +auto begin(splindex const& a__) +{ + splindex_iterator_t it(a__); + it.li = typename Index_t::local(0); + return it; +} + +template +auto end(splindex const& a__) +{ + splindex_iterator_t it(a__); + it.li = typename Index_t::local(a__.local_size()); + return it; +} + } // namespace sddk #endif // __SPLINDEX_HPP__ diff --git a/src/SDDK/wave_functions.hpp b/src/SDDK/wave_functions.hpp index e202622253..28f6e0c25a 100644 --- a/src/SDDK/wave_functions.hpp +++ b/src/SDDK/wave_functions.hpp @@ -113,10 +113,8 @@ auto checksum_gpu(std::complex const* wf__, int ld__, int num_rows_loc__, int /// Namespace for the wave-functions. namespace wf { -using sirius::strong_type; - using spin_index = strong_type; -using atom_index = strong_type; +//using atom_index = strong_type; using band_index = strong_type; using num_bands = strong_type; @@ -494,7 +492,7 @@ class Wave_functions_mt : public Wave_functions_base /// Total number of atoms. int num_atoms_{0}; /// Distribution of atoms between MPI ranks. - sddk::splindex spl_num_atoms_; + sddk::splindex_block spl_num_atoms_; /// Local size of muffin-tin coefficients for each rank. /** Each rank stores local fraction of atoms. Each atom has a set of MT coefficients. */ mpi::block_data_descriptor mt_coeffs_distr_; @@ -509,10 +507,12 @@ class Wave_functions_mt : public Wave_functions_base static int get_local_num_mt_coeffs(std::vector num_mt_coeffs__, mpi::Communicator const& comm__) { int num_atoms = static_cast(num_mt_coeffs__.size()); - sddk::splindex spl_atoms(num_atoms, comm__.size(), comm__.rank()); - auto it_begin = num_mt_coeffs__.begin() + spl_atoms.global_offset(); - auto it_end = it_begin + spl_atoms.local_size(); - return std::accumulate(it_begin, it_end, 0); + sddk::splindex_block spl_atoms(num_atoms, n_blocks(comm__.size()), block_id(comm__.rank())); + int result{0}; + for (auto it : spl_atoms) { + result += num_mt_coeffs__[it.i]; + } + return result; } /// Construct without muffin-tin part. 
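// The free begin()/end() defined above make the local fraction of a split index
// iterable in a range-based for loop; dereferencing yields the global index (i)
// and the local index (li). A minimal sketch with the strongly-typed atom index
// (the function name and sizes are illustrative):
#include "SDDK/splindex.hpp"
#include <cassert>

inline void iterate_local_fraction(int rank, int size, int num_atoms)
{
    sddk::splindex_block<atom_index_t> spl(num_atoms, n_blocks(size), block_id(rank));
    for (auto it : spl) {
        /* it.i : atom_index_t::global, it.li : atom_index_t::local */
        assert(spl.global_index(it.li).get() == it.i.get());
    }
}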
@@ -520,7 +520,7 @@ class Wave_functions_mt : public Wave_functions_base sddk::memory_t default_mem__, int num_pw__) : Wave_functions_base(num_pw__, 0, num_md__, num_wf__, default_mem__) , comm_{comm__} - , spl_num_atoms_{sddk::splindex(num_atoms_, comm_.size(), comm_.rank())} + , spl_num_atoms_{sddk::splindex_block(num_atoms_, n_blocks(comm_.size()), block_id(comm_.rank()))} { } @@ -537,13 +537,13 @@ class Wave_functions_mt : public Wave_functions_base default_mem__) , comm_{comm__} , num_atoms_{static_cast(num_mt_coeffs__.size())} - , spl_num_atoms_{sddk::splindex(num_atoms_, comm_.size(), comm_.rank())} + , spl_num_atoms_{sddk::splindex_block(num_atoms_, n_blocks(comm_.size()), block_id(comm_.rank()))} , num_mt_coeffs_{num_mt_coeffs__} { mt_coeffs_distr_ = mpi::block_data_descriptor(comm_.size()); for (int ia = 0; ia < num_atoms_; ia++) { - int rank = spl_num_atoms_.local_rank(ia); + auto rank = spl_num_atoms_.location(atom_index_t::global(ia)).ib; if (rank == comm_.rank()) { offset_in_local_mt_coeffs_.push_back(mt_coeffs_distr_.counts[rank]); } @@ -555,14 +555,14 @@ class Wave_functions_mt : public Wave_functions_base /// Return reference to the coefficient by atomic orbital index, atom, spin and band indices. inline auto& - mt_coeffs(int xi__, atom_index ia__, spin_index ispn__, band_index i__) + mt_coeffs(int xi__, atom_index_t::local ia__, spin_index ispn__, band_index i__) { return this->data_[ispn__.get()](this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], i__.get()); } /// Return const reference to the coefficient by atomic orbital index, atom, spin and band indices. inline auto const& - mt_coeffs(int xi__, atom_index ia__, spin_index ispn__, band_index i__) const + mt_coeffs(int xi__, atom_index_t::local ia__, spin_index ispn__, band_index i__) const { return this->data_[ispn__.get()](this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], i__.get()); } @@ -571,14 +571,14 @@ class Wave_functions_mt : public Wave_functions_base /// Return const pointer to the coefficient by atomic orbital index, atom, spin and band indices. inline std::complex const* - at(sddk::memory_t mem__, int xi__, atom_index ia__, spin_index s__, band_index b__) const + at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) const { return this->data_[s__.get()].at(mem__, this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], b__.get()); } /// Return pointer to the coefficient by atomic orbital index, atom, spin and band indices. inline auto - at(sddk::memory_t mem__, int xi__, atom_index ia__, spin_index s__, band_index b__) + at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) { return this->data_[s__.get()].at(mem__, this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], b__.get()); } @@ -837,7 +837,7 @@ class Wave_functions_fft : public Wave_functions_base /// Pointer to FFT-friendly G+k vector deistribution. std::shared_ptr gkvec_fft_; /// Split number of wave-functions between column communicator. - sddk::splindex spl_num_wf_; + sddk::splindex_block<> spl_num_wf_; /// Pointer to the original wave-functions. 
Wave_functions* wf_{nullptr}; /// Spin-index of the wave-function component @@ -866,7 +866,7 @@ class Wave_functions_fft : public Wave_functions_base std::vector colsplit(comm_col.size() + 1); colsplit[0] = 0; for (int i = 0; i < comm_col.size(); i++) { - colsplit[i + 1] = colsplit[i] + spl_num_wf_.local_size(i); + colsplit[i + 1] = colsplit[i] + spl_num_wf_.local_size(block_id(i)); } std::vector owners(gkvec_fft_->gvec().comm().size()); @@ -930,8 +930,8 @@ class Wave_functions_fft : public Wave_functions_base /* send and receive dimensions */ mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size()); for (int j = 0; j < comm_col.size(); j++) { - sd.counts[j] = spl_num_wf_.local_size(j) * row_distr.counts[comm_col.rank()]; - rd.counts[j] = spl_num_wf_.local_size(comm_col.rank()) * row_distr.counts[j]; + sd.counts[j] = spl_num_wf_.local_size(block_id(j)) * row_distr.counts[comm_col.rank()]; + rd.counts[j] = spl_num_wf_.local_size(block_id(comm_col.rank())) * row_distr.counts[j]; } sd.calc_offsets(); rd.calc_offsets(); @@ -1003,8 +1003,8 @@ class Wave_functions_fft : public Wave_functions_base /* send and receive dimensions */ mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size()); for (int j = 0; j < comm_col.size(); j++) { - sd.counts[j] = spl_num_wf_.local_size(comm_col.rank()) * row_distr.counts[j]; - rd.counts[j] = spl_num_wf_.local_size(j) * row_distr.counts[comm_col.rank()]; + sd.counts[j] = spl_num_wf_.local_size(block_id(comm_col.rank())) * row_distr.counts[j]; + rd.counts[j] = spl_num_wf_.local_size(block_id(j)) * row_distr.counts[comm_col.rank()]; } sd.calc_offsets(); rd.calc_offsets(); @@ -1069,7 +1069,7 @@ class Wave_functions_fft : public Wave_functions_base , shuffle_flag_{shuffle_flag___} { auto& comm_col = gkvec_fft_->comm_ortho_fft(); - spl_num_wf_ = sddk::splindex(br__.size(), comm_col.size(), comm_col.rank()); + spl_num_wf_ = sddk::splindex_block<>(br__.size(), n_blocks(comm_col.size()), block_id(comm_col.rank())); this->num_mt_ = 0; this->num_md_ = wf::num_mag_dims(0); this->num_sc_ = wf::num_spins(1); diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp index 8edbff7163..b075dbd244 100644 --- a/src/api/sirius_api.cpp +++ b/src/api/sirius_api.cpp @@ -3302,11 +3302,11 @@ sirius_get_wave_functions(void* const* ks_handler__, double const* vkl__, int co ks.comm().bcast(&spin, 1, dest_rank); /* rank where k-point vkl resides on the SIRIUS side */ - int src_rank = ks.spl_num_kpoints().local_rank(jk); + int src_rank = ks.spl_num_kpoints().location(typename kp_index_t::global(jk)).ib; if (ks.comm().rank() == src_rank || ks.comm().rank() == dest_rank) { /* send G+k copy to destination rank (where host code receives the data) */ - auto gkvec = ks.get_gkvec(jk, dest_rank); + auto gkvec = ks.get_gkvec(typename kp_index_t::global(jk), dest_rank); sddk::mdarray, 2> wf; if (ks.comm().rank() == dest_rank) { @@ -3963,7 +3963,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__, auto kp = ks.get(*ik__ - 1); /* get rank that stores a given k-point */ - int rank = ks.spl_num_kpoints().local_rank(*ik__ - 1); + int rank = ks.spl_num_kpoints().location(kp_index_t::global(*ik__ - 1)).ib; auto& comm_k = ks.ctx().comm_k(); @@ -5229,7 +5229,7 @@ sirius_get_rg_values(void* const* handler__, char const* label__, int const* gri for (int rank = 0; rank < fft_comm.size(); rank++) { /* slab of FFT grid for a given rank */ - sddk::mdarray buf(f->spfft().dim_x(), f->spfft().dim_y(), spl_z.local_size(rank)); + sddk::mdarray 
buf(f->spfft().dim_x(), f->spfft().dim_y(), spl_z.local_size(block_id(rank))); if (rank == fft_comm.rank()) { std::copy(&f->value(0), &f->value(0) + f->spfft().local_slice_size(), &buf[0]); } @@ -5247,9 +5247,10 @@ sirius_get_rg_values(void* const* handler__, char const* label__, int const* gri for (int iz = 0; iz < nz; iz++) { /* global z coordinate inside FFT box */ int z = local_box_origin(2, r) + iz - 1; /* Fortran counts from 1 */ - if (z >= spl_z.global_offset(rank) && z < spl_z.global_offset(rank) + spl_z.local_size(rank)) { + if (z >= spl_z.global_offset(block_id(rank)) && z < spl_z.global_offset(block_id(rank)) + + spl_z.local_size(block_id(rank))) { /* make z local for SIRIUS FFT partitioning */ - z -= spl_z.global_offset(rank); + z -= spl_z.global_offset(block_id(rank)); for (int iy = 0; iy < ny; iy++) { /* global y coordinate inside FFT box */ int y = local_box_origin(1, r) + iy - 1; /* Fortran counts from 1 */ diff --git a/src/band/band.cpp b/src/band/band.cpp index 881fa35432..01cd7a5c93 100644 --- a/src/band/band.cpp +++ b/src/band/band.cpp @@ -53,9 +53,8 @@ Band::initialize_subspace(K_point_set& kset__, Hamiltonian0& H0__) const N = unit_cell_.num_ps_atomic_wf().first; } - for (int ikloc = 0; ikloc < kset__.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset__.spl_num_kpoints(ikloc); - auto kp = kset__.get(ik); + for (auto it: kset__.spl_num_kpoints()) { + auto kp = kset__.get(it.i); auto Hk = H0__(*kp); if (ctx_.gamma_point() && (ctx_.so_correction() == false)) { ::sirius::initialize_subspace(Hk, N); diff --git a/src/band/band.hpp b/src/band/band.hpp index 15ff268cf3..5fb4652f02 100644 --- a/src/band/band.hpp +++ b/src/band/band.hpp @@ -88,10 +88,12 @@ class Band // TODO: Band class is lightweight and in principle can be converted /* copy old N - num_locked x N - num_locked distributed matrix */ if (N__ > 0) { - sddk::splindex spl_row(N__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); if (mtrx_old__) { if (spl_row.local_size()) { @@ -135,10 +137,12 @@ class Band // TODO: Band class is lightweight and in principle can be converted } if (ctx_.print_checksum()) { - sddk::splindex spl_row(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); auto cs = mtrx__.checksum(N__ + n__ - num_locked__, N__ + n__ - num_locked__); if (ctx_.comm_band().rank() == 0) { utils::print_checksum("subspace_mtrx", cs, RTE_OUT(std::cout)); @@ -150,10 +154,12 @@ class Band // TODO: Band class is lightweight and in principle 
can be converted /* save new matrix */ if (mtrx_old__) { - sddk::splindex spl_row(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); if (spl_row.local_size()) { #pragma omp parallel for schedule(static) diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp index 2415dcfdfb..ef2b56744a 100644 --- a/src/band/diag_full_potential.cpp +++ b/src/band/diag_full_potential.cpp @@ -300,11 +300,10 @@ void Band::diag_full_potential_first_variation_davidson(Hamiltonian_k& H /* add pure local orbitals to the basis staring from ncomp index */ if (nlo) { - for (int ialoc = 0; ialoc < phi_extra_new->spl_num_atoms().local_size(); ialoc++) { - int ia = phi_extra_new->spl_num_atoms()[ialoc]; - for (int xi = 0; xi < unit_cell_.atom(ia).mt_lo_basis_size(); xi++) { - phi_extra_new->mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), - wf::band_index(offset_lo[ia] + xi + ncomp)) = 1.0; + for (auto it : phi_extra_new->spl_num_atoms()) { + for (int xi = 0; xi < unit_cell_.atom(it.i).mt_lo_basis_size(); xi++) { + phi_extra_new->mt_coeffs(xi, it.li, wf::spin_index(0), + wf::band_index(offset_lo[it.i] + xi + ncomp)) = 1.0; } } } diff --git a/src/band/residuals.hpp b/src/band/residuals.hpp index 0034458ade..4a7bd1ef05 100644 --- a/src/band/residuals.hpp +++ b/src/band/residuals.hpp @@ -340,11 +340,13 @@ residuals(Simulation_context& ctx__, sddk::memory_t mem__, wf::spin_range sr__, auto pos_src = evec__.spl_col().location(ev_idx[j]); auto pos_dest = evec_tmp.spl_col().location(j); /* do MPI send / receive */ - if (pos_src.rank == evec__.blacs_grid().comm_col().rank() && num_rows_local) { - evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.local_index), num_rows_local, pos_dest.rank, ev_idx[j]); + if (pos_src.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) { + evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.index_local), num_rows_local, pos_dest.ib, + ev_idx[j]); } - if (pos_dest.rank == evec__.blacs_grid().comm_col().rank() && num_rows_local) { - evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.local_index), num_rows_local, pos_src.rank, ev_idx[j]); + if (pos_dest.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) { + evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.index_local), num_rows_local, pos_src.ib, + ev_idx[j]); } } } diff --git a/src/band/solve.cpp b/src/band/solve.cpp index 7de53db43c..af9932c201 100644 --- a/src/band/solve.cpp +++ b/src/band/solve.cpp @@ -154,9 +154,8 @@ Band::solve(K_point_set& kset__, Hamiltonian0& H0__, double itsol_tol__) cons int num_dav_iter{0}; /* solve secular equation and generate wave functions */ - for (int ikloc = 0; ikloc < kset__.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset__.spl_num_kpoints(ikloc); - auto kp = kset__.get(ik); + for (auto it : kset__.spl_num_kpoints()) { + auto kp = kset__.get(it.i); auto Hk = H0__(*kp); if (ctx_.full_potential()) { diff --git a/src/density/density.cpp b/src/density/density.cpp index 
5b1398797f..b6839905fd 100644 --- a/src/density/density.cpp +++ b/src/density/density.cpp @@ -391,10 +391,10 @@ Density::initial_density_full_pot() /* initialize the magnetization */ if (ctx_.num_mag_dims()) { - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - r3::vector v = unit_cell_.atom(ia).vector_field(); - double len = v.length(); + for (auto it : unit_cell_.spl_num_atoms()) { + int ia = it.i; + auto v = unit_cell_.atom(ia).vector_field(); + auto len = v.length(); int nmtp = unit_cell_.atom(ia).num_mt_points(); Spline rho_s(unit_cell_.atom(ia).type().radial_grid()); @@ -436,7 +436,7 @@ void Density::init_density_matrix_for_paw() { for (int ipaw = 0; ipaw < unit_cell_.num_paw_atoms(); ipaw++) { - int ia = unit_cell_.paw_atom_index(ipaw); + int ia = unit_cell_.paw_atom_index(paw_atom_index_t::global(ipaw)); auto& dm = density_matrix(ia); dm.zero(); @@ -479,10 +479,10 @@ Density::init_density_matrix_for_paw() } void -Density::generate_paw_atom_density(int ialoc__) +Density::generate_paw_atom_density(paw_atom_index_t::local ialoc__) { - int ia_paw = ctx_.unit_cell().spl_num_paw_atoms(ialoc__); - int ia = ctx_.unit_cell().paw_atom_index(ia_paw); + auto ia_paw = ctx_.unit_cell().spl_num_paw_atoms(ialoc__); + auto ia = ctx_.unit_cell().paw_atom_index(ia_paw); auto& atom_type = ctx_.unit_cell().atom(ia).type(); @@ -570,8 +570,8 @@ Density::generate_paw_loc_density() PROFILE("sirius::Density::generate_paw_loc_density"); #pragma omp parallel for - for (int ialoc = 0; ialoc < unit_cell_.spl_num_paw_atoms().local_size(); ialoc++) { - generate_paw_atom_density(ialoc); + for (auto it : unit_cell_.spl_num_paw_atoms()) { + generate_paw_atom_density(it.li); } } @@ -713,7 +713,7 @@ Density::add_k_point_contribution_rg(K_point* kp__, std::arrayband_occupancy(j, ispn) * kp__->weight() / omega; auto inp_wf = wf_fft__[ispn].pw_coeffs_spfft(wf_mem, wf::band_index(i)); @@ -732,7 +732,7 @@ Density::add_k_point_contribution_rg(K_point* kp__, std::arrayband_occupancy(j, 0) * kp__->weight() / omega; auto wf_mem_up = wf_fft__[0].on_device() ? 
sddk::memory_t::device : sddk::memory_t::host; @@ -793,17 +793,18 @@ add_k_point_contribution_dm_fplapw(Simulation_context const& ctx__, K_point c /* add |psi_j> n_j >(z) * kp__.band_occupancy(j, ispn) * kp__.weight(); + wf2(xi, j, ispn) = static_cast>(z) * kp__.band_occupancy(j, ispn) * + kp__.weight(); } } } @@ -841,7 +842,7 @@ add_k_point_contribution_dm_pwpp_collinear(Simulation_context& ctx__, K_point bp_coeffs__, kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); /* use communicator of the k-point to split band index */ - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); if (nbnd_loc) { // TODO: this part can also be moved to GPU @@ -861,7 +862,7 @@ add_k_point_contribution_dm_pwpp_collinear(Simulation_context& ctx__, K_point for (int i = 0; i < nbnd_loc; i++) { /* global index of band */ - int j = spl_nbnd[i]; + auto j = spl_nbnd.global_index(i); for (int xi = 0; xi < nbf; xi++) { bp1(xi, i) = beta_psi(offs + xi, j); @@ -891,7 +892,7 @@ add_k_point_contribution_dm_pwpp_noncollinear(Simulation_context& ctx__, K_point /* total number of occupied bands */ int nbnd = kp__.num_occupied_bands(); - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); /* auxiliary arrays */ @@ -909,7 +910,7 @@ add_k_point_contribution_dm_pwpp_noncollinear(Simulation_context& ctx__, K_point #pragma omp parallel for schedule(static) for (int i = 0; i < nbnd_loc; i++) { - int j = spl_nbnd[i]; + auto j = spl_nbnd.global_index(i); for (int m = 0; m < nbeta; m++) { bp1(m, i, ispn) = beta_psi(m, j); @@ -1105,11 +1106,10 @@ Density::generate(K_point_set const& ks__, bool symmetrize__, bool add_core__, b /* find the core states */ generate_core_charge_density(); /* add core contribution */ - for (int ialoc = 0; ialoc < (int)unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++) { - rho().mt()[ia](0, ir) += - unit_cell_.atom(ia).symmetry_class().ae_core_charge_density(ir) / y00; + for (auto it : unit_cell_.spl_num_atoms()) { + for (int ir = 0; ir < unit_cell_.atom(it.i).num_mt_points(); ir++) { + rho().mt()[it.i](0, ir) += + unit_cell_.atom(it.i).symmetry_class().ae_core_charge_density(ir) / y00; } } } @@ -1267,9 +1267,8 @@ Density::generate_valence(K_point_set const& ks__) auto mem = ctx_.processing_unit() == sddk::device_t::CPU ? 
sddk::memory_t::host : sddk::memory_t::device; /* start the main loop over k-points */ - for (int ikloc = 0; ikloc < ks__.spl_num_kpoints().local_size(); ikloc++) { - int ik = ks__.spl_num_kpoints(ikloc); - auto kp = ks__.get(ik); + for (auto it : ks__.spl_num_kpoints()) { + auto kp = ks__.get(it.i); std::array, 2> wf_fft; @@ -1645,9 +1644,8 @@ Density::generate_valence_mt() sddk::mdarray rf_pairs(unit_cell_.max_num_mt_points(), max_num_rf_pairs); sddk::mdarray dlm(ctx_.lmmax_rho(), unit_cell_.max_num_mt_points(), ctx_.num_mag_dims() + 1); - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - auto& atom_type = unit_cell_.atom(ia).type(); + for (auto it : unit_cell_.spl_num_atoms()) { + auto& atom_type = unit_cell_.atom(it.i).type(); int nmtp = atom_type.num_mt_points(); int num_rf_pairs = atom_type.mt_radial_basis_size() * (atom_type.mt_radial_basis_size() + 1) / 2; @@ -1655,15 +1653,15 @@ Density::generate_valence_mt() PROFILE_START("sirius::Density::generate|sum_zdens"); switch (ctx_.num_mag_dims()) { case 3: { - reduce_density_matrix<3>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<3>(atom_type, density_matrix(it.i), mt_density_matrix); break; } case 1: { - reduce_density_matrix<1>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<1>(atom_type, density_matrix(it.i), mt_density_matrix); break; } case 0: { - reduce_density_matrix<0>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<0>(atom_type, density_matrix(it.i), mt_density_matrix); break; } } @@ -1676,9 +1674,9 @@ Density::generate_valence_mt() for (int idxrf1 = 0; idxrf1 <= idxrf2; idxrf1++) { /* off-diagonal pairs are taken two times: d_{12}*f_1*f_2 + d_{21}*f_2*f_1 = d_{12}*2*f_1*f_2 */ int n = (idxrf1 == idxrf2) ? 
1 : 2; - for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++) { - rf_pairs(ir, offs + idxrf1) = n * unit_cell_.atom(ia).symmetry_class().radial_function(ir, idxrf1) * - unit_cell_.atom(ia).symmetry_class().radial_function(ir, idxrf2); + for (int ir = 0; ir < unit_cell_.atom(it.i).num_mt_points(); ir++) { + rf_pairs(ir, offs + idxrf1) = n * unit_cell_.atom(it.i).symmetry_class().radial_function(ir, idxrf1) * + unit_cell_.atom(it.i).symmetry_class().radial_function(ir, idxrf2); } } } @@ -1693,21 +1691,21 @@ Density::generate_valence_mt() switch (ctx_.num_mag_dims()) { case 3: { /* copy x component */ - std::copy(&dlm(0, 0, 2), &dlm(0, 0, 2) + sz, &mag(1).mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 2), &dlm(0, 0, 2) + sz, &mag(1).mt()[it.i](0, 0)); /* copy y component */ - std::copy(&dlm(0, 0, 3), &dlm(0, 0, 3) + sz, &mag(2).mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 3), &dlm(0, 0, 3) + sz, &mag(2).mt()[it.i](0, 0)); } case 1: { for (int ir = 0; ir < nmtp; ir++) { for (int lm = 0; lm < ctx_.lmmax_rho(); lm++) { - rho().mt()[ia](lm, ir) = dlm(lm, ir, 0) + dlm(lm, ir, 1); - mag(0).mt()[ia](lm, ir) = dlm(lm, ir, 0) - dlm(lm, ir, 1); + rho().mt()[it.i](lm, ir) = dlm(lm, ir, 0) + dlm(lm, ir, 1); + mag(0).mt()[it.i](lm, ir) = dlm(lm, ir, 0) - dlm(lm, ir, 1); } } break; } case 0: { - std::copy(&dlm(0, 0, 0), &dlm(0, 0, 0) + sz, &rho().mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 0), &dlm(0, 0, 0) + sz, &rho().mt()[it.i](0, 0)); } } } diff --git a/src/density/density.hpp b/src/density/density.hpp index 117c1a1465..379dda8f35 100644 --- a/src/density/density.hpp +++ b/src/density/density.hpp @@ -251,7 +251,7 @@ class Density : public Field4D PAW_density, Hubbard_matrix>> mixer_; /// Generate atomic densities in the case of PAW. - void generate_paw_atom_density(int iapaw__); + void generate_paw_atom_density(paw_atom_index_t::local iapaw__); /// Initialize PAW density matrix. 
void init_density_matrix_for_paw(); @@ -311,13 +311,14 @@ class Density : public Field4D { PROFILE("sirius::Density::generate_core_charge_density"); - for (int icloc = 0; icloc < unit_cell_.spl_num_atom_symmetry_classes().local_size(); icloc++) { - int ic = unit_cell_.spl_num_atom_symmetry_classes(icloc); - unit_cell_.atom_symmetry_class(ic).generate_core_charge_density(ctx_.core_relativity()); + auto& spl_idx = unit_cell_.spl_num_atom_symmetry_classes(); + + for (auto it : spl_idx) { + unit_cell_.atom_symmetry_class(it.i).generate_core_charge_density(ctx_.core_relativity()); } - for (int ic = 0; ic < unit_cell_.num_atom_symmetry_classes(); ic++) { - int rank = unit_cell_.spl_num_atom_symmetry_classes().local_rank(ic); + for (auto ic = begin_global(spl_idx); ic != end_global(spl_idx); ic++) { + auto rank = spl_idx.location(ic).ib; unit_cell_.atom_symmetry_class(ic).sync_core_charge_density(ctx_.comm(), rank); } } @@ -442,9 +443,9 @@ class Density : public Field4D return *rho_pseudo_core_; } - inline auto const& density_mt(int ialoc) const + inline auto const& density_mt(atom_index_t::local ialoc__) const { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); + auto ia = ctx_.unit_cell().spl_num_atoms().global_index(ialoc__); return rho().mt()[ia]; } diff --git a/src/dft/dft_ground_state.cpp b/src/dft/dft_ground_state.cpp index 67a4cf8834..fd0e8f072a 100644 --- a/src/dft/dft_ground_state.cpp +++ b/src/dft/dft_ground_state.cpp @@ -79,9 +79,8 @@ DFT_ground_state::energy_kin_sum_pw() const { double ekin{0}; - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); #pragma omp parallel for schedule(static) reduction(+:ekin) for (int igloc = 0; igloc < kp->num_gkvec_loc(); igloc++) { @@ -281,12 +280,11 @@ DFT_ground_state::find(double density_tol__, double energy_tol__, double iter_so ctx_.cfg().parameters().precision_hs("fp64"); ctx_.cfg().lock(); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); + for (auto it : kset_.spl_num_kpoints()) { for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) { - wf::copy(sddk::memory_t::host, kset_.get(ik)->spinor_wave_functions(), + wf::copy(sddk::memory_t::host, kset_.get(it.i)->spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, ctx_.num_bands()), - kset_.get(ik)->spinor_wave_functions(), wf::spin_index(ispn), + kset_.get(it.i)->spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, ctx_.num_bands())); } } diff --git a/src/dft/energy.cpp b/src/dft/energy.cpp index 77c9995c0c..48863017ff 100644 --- a/src/dft/energy.cpp +++ b/src/dft/energy.cpp @@ -106,10 +106,9 @@ energy_enuc(Simulation_context const& ctx, Potential const& potential) auto& unit_cell = ctx.unit_cell(); double enuc{0}; if (ctx.full_potential()) { - for (int ialoc = 0; ialoc < unit_cell.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell.spl_num_atoms(ialoc); - int zn = unit_cell.atom(ia).zn(); - enuc -= 0.5 * zn * potential.vh_el(ia); + for (auto it : unit_cell.spl_num_atoms()) { + int zn = unit_cell.atom(it.i).zn(); + enuc -= 0.5 * zn * potential.vh_el(it.i); } ctx.comm().allreduce(&enuc, 1); } diff --git a/src/fft/fft.hpp b/src/fft/fft.hpp index 24fdc2fd44..be15f08b97 100644 --- a/src/fft/fft.hpp +++ b/src/fft/fft.hpp @@ -232,7 +232,7 @@ inline size_t spfft_grid_size_local(T const& spfft__) * using block distribution. 
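// Sketch of the global-iteration idiom used in generate_core_charge_density()
// above: begin_global()/end_global() walk every element of the index, and
// location(ic).ib gives the block (MPI rank) that owns element ic; the actual
// synchronization call is only indicated by a comment:
#include "SDDK/splindex.hpp"

inline void visit_all_elements(sddk::splindex_block<atom_symmetry_class_index_t> const& spl)
{
    for (auto ic = begin_global(spl); ic != end_global(spl); ic++) {
        auto rank = spl.location(ic).ib;
        /* e.g. broadcast the data of element ic from `rank` to all other ranks */
        static_cast<void>(rank);
    }
}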
*/ inline auto split_z_dimension(int size_z__, mpi::Communicator const& comm_fft__) { - return sddk::splindex(size_z__, comm_fft__.size(), comm_fft__.rank()); + return sddk::splindex_block<>(size_z__, n_blocks(comm_fft__.size()), block_id(comm_fft__.rank())); } } // namespace fft diff --git a/src/fft/gvec.cpp b/src/fft/gvec.cpp index 1e10e67cbe..30529f9617 100644 --- a/src/fft/gvec.cpp +++ b/src/fft/gvec.cpp @@ -671,14 +671,14 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) a2a_recv_ = mpi::block_data_descriptor(comm_.size()); /* split G-vector shells between ranks in cyclic order */ - spl_num_gsh_ = sddk::splindex(gvec_.num_shells(), comm_.size(), comm_.rank(), 1); + spl_num_gsh_ = sddk::splindex_block_cyclic<>(gvec_.num_shells(), n_blocks(comm_.size()), block_id(comm_.rank()), 1); /* each rank sends a fraction of its local G-vectors to other ranks */ /* count this fraction */ for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - a2a_send_.counts[spl_num_gsh_.local_rank(igsh)]++; + a2a_send_.counts[spl_num_gsh_.location(igsh).ib]++; } a2a_send_.calc_offsets(); /* sanity check: total number of elements to send is equal to the local number of G-vector */ @@ -692,7 +692,7 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) int ig = gvec_.gvec_offset(r) + igloc; /* index of the G-vector shell */ int igsh = gvec_.shell(ig); - if (spl_num_gsh_.local_rank(igsh) == comm_.rank()) { + if (spl_num_gsh_.location(igsh).ib == comm_.rank()) { a2a_recv_.counts[r]++; } } @@ -714,7 +714,7 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) int ig = gvec_.gvec_offset(r) + igloc; int igsh = gvec_.shell(ig); auto G = gvec_.gvec(ig); - if (spl_num_gsh_.local_rank(igsh) == comm_.rank()) { + if (spl_num_gsh_.location(igsh).ib == comm_.rank()) { for (int x : {0, 1, 2}) { gvec_remapped_(x, a2a_recv_.offsets[r] + counts[r]) = G[x]; } diff --git a/src/fft/gvec.hpp b/src/fft/gvec.hpp index a51316bf38..d07dbd4dc8 100644 --- a/src/fft/gvec.hpp +++ b/src/fft/gvec.hpp @@ -920,7 +920,7 @@ class Gvec_shells mpi::block_data_descriptor a2a_recv_; /// Split global index of G-shells between MPI ranks. - sddk::splindex spl_num_gsh_; + sddk::splindex_block_cyclic<> spl_num_gsh_; /// List of G-vectors in the remapped storage. sddk::mdarray gvec_remapped_; @@ -996,7 +996,7 @@ class Gvec_shells for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - int r = spl_num_gsh_.local_rank(igsh); + int r = spl_num_gsh_.location(igsh).ib; send_buf[a2a_send_.offsets[r] + counts[r]] = data__[igloc]; counts[r]++; } @@ -1023,7 +1023,7 @@ class Gvec_shells for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - int r = spl_num_gsh_.local_rank(igsh); + int r = spl_num_gsh_.location(igsh).ib; data__[igloc] = recv_buf[a2a_send_.offsets[r] + counts[r]]; counts[r]++; } diff --git a/src/function3d/field4d.hpp b/src/function3d/field4d.hpp index 46348bab90..4ffbeb71e2 100644 --- a/src/function3d/field4d.hpp +++ b/src/function3d/field4d.hpp @@ -144,7 +144,7 @@ class Field4D auto mt_components() { - std::vector*> result; + std::vector*> result; result.push_back(&components_[0]->mt()); switch (ctx_.num_mag_dims()) { case 1: { diff --git a/src/function3d/paw_field4d.hpp b/src/function3d/paw_field4d.hpp index 3f608491a9..255758fc19 100644 --- a/src/function3d/paw_field4d.hpp +++ b/src/function3d/paw_field4d.hpp @@ -38,9 +38,9 @@ class PAW_field4D /// Unit cell. 
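NOTE (usage sketch, not part of the diff): Gvec_shells above distributes G-vector shells cyclically, one shell at a time, and asks location(igsh).ib for the owning rank. A self-contained sketch of that ownership query; sizes are illustrative:

    #include "SDDK/splindex.hpp"
    #include <vector>

    std::vector<int> shells_per_rank(int num_shells, int num_ranks, int rank)
    {
        /* last argument: cyclic block size of 1, as in the Gvec_shells hunk */
        sddk::splindex_block_cyclic<> spl(num_shells, n_blocks(num_ranks), block_id(rank), 1);
        std::vector<int> counts(num_ranks, 0);
        for (int igsh = 0; igsh < num_shells; igsh++) {
            counts[spl.location(igsh).ib]++; /* .ib: rank that stores this shell */
        }
        return counts;
    }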
Unit_cell const& uc_; /// All-electron part. - std::array, 4> ae_components_; + std::array, 4> ae_components_; /// Pseudo-potential part. - std::array, 4> ps_components_; + std::array, 4> ps_components_; /* copy constructor is forbidden */ PAW_field4D(PAW_field4D const& src__) = delete; /* copy assignment operator is forbidden */ @@ -57,9 +57,9 @@ class PAW_field4D auto ptr = (is_global__) ? nullptr : &uc__.spl_num_paw_atoms(); for (int j = 0; j < uc__.parameters().num_mag_dims() + 1; j++) { - ae_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), + ae_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), [&uc__](int ia){return lmax_t(2 * uc__.atom(ia).type().indexr().lmax());}, ptr); - ps_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), + ps_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), [&uc__](int ia){return lmax_t(2 * uc__.atom(ia).type().indexr().lmax());}, ptr); } } diff --git a/src/function3d/periodic_function.hpp b/src/function3d/periodic_function.hpp index de1deb304e..0352aaf9b6 100644 --- a/src/function3d/periodic_function.hpp +++ b/src/function3d/periodic_function.hpp @@ -64,7 +64,7 @@ class Periodic_function Smooth_periodic_function rg_component_; /// Muffin-tin part of the periodic function. - Spheric_function_set mt_component_; + Spheric_function_set mt_component_; /// Alias to G-vectors. fft::Gvec const& gvec_; @@ -88,7 +88,7 @@ class Periodic_function /// Constructor for interstitial and muffin-tin parts (FP-LAPW case). Periodic_function(Simulation_context const& ctx__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr, + sddk::splindex_block const* spl_atoms__ = nullptr, smooth_periodic_function_ptr_t const* rg_ptr__ = nullptr, spheric_function_set_ptr_t const* mt_ptr__ = nullptr) : ctx_(ctx__) @@ -162,9 +162,8 @@ class Periodic_function if (ctx_.full_potential()) { mt_val = std::vector(unit_cell_.num_atoms(), 0); - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - mt_val[ia] = mt_component_[ia].component(0).integrate(2) * fourpi * y00; + for (auto it : unit_cell_.spl_num_atoms()) { + mt_val[it.i] = mt_component_[it.i].component(0).integrate(2) * fourpi * y00; } comm_.allreduce(&mt_val[0], unit_cell_.num_atoms()); diff --git a/src/function3d/spheric_function_set.hpp b/src/function3d/spheric_function_set.hpp index f9e18217e7..5f99f6ea52 100644 --- a/src/function3d/spheric_function_set.hpp +++ b/src/function3d/spheric_function_set.hpp @@ -8,7 +8,7 @@ namespace sirius { using lmax_t = strong_type; -template +template class Spheric_function_set { private: @@ -18,7 +18,7 @@ class Spheric_function_set std::vector atoms_; /// Split the number of atoms between MPI ranks. /** If the pointer is null, spheric functions set is treated as global, without MPI distribution */ - sddk::splindex const* spl_atoms_{nullptr}; + sddk::splindex_block const* spl_atoms_{nullptr}; /// List of spheric functions. std::vector> func_; @@ -41,9 +41,8 @@ class Spheric_function_set }; if (spl_atoms_) { - for (int i = 0; i < spl_atoms_->local_size(); i++) { - int ia = atoms_[(*spl_atoms_)[i]]; - set_func(ia); + for (auto it : (*spl_atoms_)) { + set_func(atoms_[it.i]); } } else { for (int ia : atoms_) { @@ -59,7 +58,7 @@ class Spheric_function_set /// Constructor for all atoms. 
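NOTE (usage sketch, not part of the diff): Spheric_function_set gains a second template parameter fixing the strong index type of its split index, so PAW sets (paw_atom_index_t, as in PAW_field4D above) and full-cell sets (atom_index_t, as in Periodic_function above) become distinct types. A hedged instantiation sketch; the template argument lists are partly elided in this listing, so the exact parameters are inferred:

    /* lmax callback mirroring the PAW_field4D hunk above; uc is a Unit_cell */
    auto lmax_func = [&uc](int ia) { return lmax_t(2 * uc.atom(ia).type().indexr().lmax()); };

    /* full-cell set, distributed over the atom split index */
    Spheric_function_set<double, atom_index_t> f_all(uc, lmax_func, &uc.spl_num_atoms());

    /* PAW subset, distributed over the PAW-atom split index */
    Spheric_function_set<double, paw_atom_index_t> f_paw(uc, uc.paw_atoms(), lmax_func,
                                                         &uc.spl_num_paw_atoms());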
Spheric_function_set(Unit_cell const& unit_cell__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr, + sddk::splindex_block const* spl_atoms__ = nullptr, spheric_function_set_ptr_t const* sptr__ = nullptr) : unit_cell_{&unit_cell__} , spl_atoms_{spl_atoms__} @@ -68,7 +67,7 @@ class Spheric_function_set atoms_.resize(unit_cell__.num_atoms()); std::iota(atoms_.begin(), atoms_.end(), 0); if (spl_atoms_) { - if (spl_atoms_->global_index_size() != unit_cell__.num_atoms()) { + if (spl_atoms_->size() != unit_cell__.num_atoms()) { RTE_THROW("wrong split atom index"); } } @@ -77,14 +76,14 @@ class Spheric_function_set /// Constructor for a subset of atoms. Spheric_function_set(Unit_cell const& unit_cell__, std::vector atoms__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr) + sddk::splindex_block const* spl_atoms__ = nullptr) : unit_cell_{&unit_cell__} , atoms_{atoms__} , spl_atoms_{spl_atoms__} , all_atoms_{false} { if (spl_atoms_) { - if (spl_atoms_->global_index_size() != static_cast(atoms__.size())) { + if (spl_atoms_->size() != static_cast(atoms__.size())) { RTE_THROW("wrong split atom index"); } } @@ -125,16 +124,16 @@ class Spheric_function_set /// Synchronize global function. /** Assuming that each MPI rank was handling part of the global spherical function, broadcast data * from each rank. As a result, each rank stores a full and identical copy of global spherical function. */ - inline void sync(sddk::splindex const& spl_atoms__) + inline void sync(sddk::splindex_block const& spl_atoms__) { - for (int i = 0; i < spl_atoms__.global_index_size(); i++) { - auto loc = spl_atoms__.location(i); + for (int i = 0; i < spl_atoms__.size(); i++) { + auto loc = spl_atoms__.location(typename I::global(i)); int ia = atoms_[i]; - unit_cell_->comm().bcast(func_[ia].at(sddk::memory_t::host), static_cast(func_[ia].size()), loc.rank); + unit_cell_->comm().bcast(func_[ia].at(sddk::memory_t::host), static_cast(func_[ia].size()), loc.ib); } } - Spheric_function_set& operator+=(Spheric_function_set const& rhs__) + Spheric_function_set& operator+=(Spheric_function_set const& rhs__) { for (int ia = 0; ia < unit_cell_->num_atoms(); ia++) { if (func_[ia].size() && rhs__[ia].size()) { @@ -144,33 +143,33 @@ class Spheric_function_set return *this; } - template - friend F - inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__); + template + friend T_ + inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__); - template + template friend void - copy(Spheric_function_set const& src__, Spheric_function_set& dest__); + copy(Spheric_function_set const& src__, Spheric_function_set& dest__); - template + template friend void - copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__); + copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__); - template + template friend void - copy(spheric_function_set_ptr_t src__, Spheric_function_set const& dest__); + copy(spheric_function_set_ptr_t src__, Spheric_function_set const& dest__); - template + template friend void - scale(F alpha__, Spheric_function_set& x__); + scale(T_ alpha__, Spheric_function_set& x__); - template + template friend void - axpy(F alpha__, Spheric_function_set const& x__, Spheric_function_set& y__); + axpy(T_ alpha__, Spheric_function_set const& x__, Spheric_function_set& y__); }; -template -inline T inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__) +template +inline T inner(Spheric_function_set const& f1__, 
Spheric_function_set const& f2__) { auto ptr = (f1__.spl_atoms_) ? f1__.spl_atoms_ : f2__.spl_atoms_; @@ -185,13 +184,13 @@ inline T inner(Spheric_function_set const& f1__, Spheric_function_set cons if (ptr) { for (int i = 0; i < ptr->local_size(); i++) { - int ia = f1__.atoms_[(*ptr)[i]]; + int ia = f1__.atoms_[(*ptr).global_index(typename I::local(i))]; result += inner(f1__[ia], f2__[ia]); } } else { - sddk::splindex spl_atoms(f1__.atoms_.size(), comm.size(), comm.rank()); + sddk::splindex_block spl_atoms(f1__.atoms_.size(), n_blocks(comm.size()), block_id(comm.rank())); for (int i = 0; i < spl_atoms.local_size(); i++) { - int ia = f1__.atoms_[spl_atoms[i]]; + int ia = f1__.atoms_[spl_atoms.global_index(typename I::local(i))]; result += inner(f1__[ia], f2__[ia]); } } @@ -201,9 +200,9 @@ inline T inner(Spheric_function_set const& f1__, Spheric_function_set cons /// Copy from Spheric_function_set to external pointer. /** External pointer is assumed to be global. */ -template +template inline void -copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) +copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) { auto p = dest__.ptr; for (auto ia : src__.atoms()) { @@ -229,9 +228,9 @@ copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) /// Copy from external pointer to Spheric_function_set. /** External pointer is assumed to be global. */ -template +template inline void -copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) +copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) { auto p = src__.ptr; for (auto ia : dest__.atoms()) { @@ -250,9 +249,9 @@ copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) } } -template +template inline void -copy(Spheric_function_set const& src__, Spheric_function_set& dest__) +copy(Spheric_function_set const& src__, Spheric_function_set& dest__) { for (int ia = 0; ia < src__.unit_cell_->num_atoms(); ia++) { if (src__.func_[ia].size()) { @@ -261,9 +260,9 @@ copy(Spheric_function_set const& src__, Spheric_function_set& dest__) } } -template +template inline void -scale(T alpha__, Spheric_function_set& x__) +scale(T alpha__, Spheric_function_set& x__) { for (int ia = 0; ia < x__.unit_cell_->num_atoms(); ia++) { if (x__.func_[ia].size()) { @@ -272,9 +271,9 @@ scale(T alpha__, Spheric_function_set& x__) } } -template +template inline void -axpy(T alpha__, Spheric_function_set const& x__, Spheric_function_set& y__) +axpy(T alpha__, Spheric_function_set const& x__, Spheric_function_set& y__) { for (int ia = 0; ia < x__.unit_cell_->num_atoms(); ia++) { if (x__.func_[ia].size()) { diff --git a/src/geometry/force.cpp b/src/geometry/force.cpp index 1612f4a947..01751c0cd6 100644 --- a/src/geometry/force.cpp +++ b/src/geometry/force.cpp @@ -53,8 +53,8 @@ void Force::calc_forces_nonloc_aux() auto& spl_num_kp = kset_.spl_num_kpoints(); - for (int ikploc = 0; ikploc < spl_num_kp.local_size(); ikploc++) { - auto* kp = kset_.get(spl_num_kp[ikploc]); + for (auto it : spl_num_kp) { + auto* kp = kset_.get(it.i); if (ctx_.gamma_point()) { add_k_point_contribution(*kp, forces_nonloc_); @@ -211,10 +211,9 @@ Force::calc_forces_ibs() } Hamiltonian0 H0(potential_, false); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto hk = H0(*kset_.get(ik)); - add_ibs_force(kset_.get(ik), hk, ffac, forces_ibs_); + for (auto it : kset_.spl_num_kpoints()) { + auto hk = H0(*kset_.get(it.i)); + 
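NOTE (usage sketch, not part of the diff): inner() above is the canonical distributed reduction in this patch: each rank sums only the atoms of its own block, then a collective reduction makes the result identical everywhere. Minimal sketch; per_atom_term and the surrounding objects are hypothetical, the allreduce overload is the one used throughout this patch:

    double total{0};
    for (auto it : unit_cell.spl_num_atoms()) {
        total += per_atom_term(it.i); /* contribution of a locally stored atom */
    }
    comm.allreduce(&total, 1); /* every rank now holds the full sum */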
add_ibs_force(kset_.get(it.i), hk, ffac, forces_ibs_); } ctx_.comm().allreduce(&forces_ibs_(0, 0), (int)forces_ibs_.size()); symmetrize_forces(ctx_.unit_cell(), forces_ibs_); @@ -228,11 +227,10 @@ Force::calc_forces_rho() forces_rho_ = sddk::mdarray(3, ctx_.unit_cell().num_atoms()); forces_rho_.zero(); - for (int ialoc = 0; ialoc < ctx_.unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); - auto g = gradient(density_.density_mt(ialoc)); + for (auto it : ctx_.unit_cell().spl_num_atoms()) { + auto g = gradient(density_.density_mt(it.li)); for (int x = 0; x < 3; x++) { - forces_rho_(x, ia) = inner(potential_.effective_potential_mt(ialoc), g[x]); + forces_rho_(x, it.i) = inner(potential_.effective_potential_mt(it.li), g[x]); } } ctx_.comm().allreduce(&forces_rho_(0, 0), (int)forces_rho_.size()); @@ -247,11 +245,10 @@ Force::calc_forces_hf() forces_hf_ = sddk::mdarray(3, ctx_.unit_cell().num_atoms()); forces_hf_.zero(); - for (int ialoc = 0; ialoc < ctx_.unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); - auto g = gradient(potential_.hartree_potential_mt(ialoc)); + for (auto it : ctx_.unit_cell().spl_num_atoms()) { + auto g = gradient(potential_.hartree_potential_mt(it.li)); for (int x = 0; x < 3; x++) { - forces_hf_(x, ia) = ctx_.unit_cell().atom(ia).zn() * g[x](0, 0) * y00; + forces_hf_(x, it.i) = ctx_.unit_cell().atom(it.i).zn() * g[x](0, 0) * y00; } } ctx_.comm().allreduce(&forces_hf_(0, 0), (int)forces_hf_.size()); @@ -273,10 +270,9 @@ Force::calc_forces_hubbard() Q_operator q_op(ctx_); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { + for (auto it : kset_.spl_num_kpoints()) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + auto kp = kset_.get(it.i); // kp->beta_projectors().prepare(); auto mg1 = kp->spinor_wave_functions().memory_guard(ctx_.processing_unit_memory_t(), wf::copy_to::device); auto mg2 = kp->hubbard_wave_functions_S().memory_guard(ctx_.processing_unit_memory_t(), wf::copy_to::device); diff --git a/src/geometry/non_local_functor.hpp b/src/geometry/non_local_functor.hpp index 665d1bb800..e2f519e1b3 100644 --- a/src/geometry/non_local_functor.hpp +++ b/src/geometry/non_local_functor.hpp @@ -85,14 +85,11 @@ void add_k_point_contribution_nonlocal(Simulation_context& ctx__, Beta_projector int nbnd = kp__.num_occupied_bands(ispn); /* inner product of beta gradient and WF */ - // auto bp_base_phi_chunk = bp_base__.template inner(ctx__.processing_unit_memory_t(), icnk, - // kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); - auto bp_base_phi_chunk = inner_prod_beta( ctx__.spla_context(), mt, ctx__.host_memory_t(), sddk::is_device_memory(mt), beta_coeffs_base, kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); @@ -112,7 +109,7 @@ void add_k_point_contribution_nonlocal(Simulation_context& ctx__, Beta_projector sddk::matrix& beta_phi_chunk) { /* gather everything = - 2 Re[ occ(k,n) weight(k) beta_phi*(i,n) [Dij - E(n)Qij] beta_base_phi(j,n) ]*/ for (int ibnd_loc = 0; ibnd_loc < nbnd_loc; ibnd_loc++) { - int ibnd = spl_nbnd[ibnd_loc]; + int ibnd = spl_nbnd.global_index(ibnd_loc); auto d1 = main_two_factor * kp__.band_occupancy(ibnd, ispn) * kp__.weight(); auto z2 = dij - 
static_cast>(kp__.band_energy(ibnd, ispn) * qij); diff --git a/src/geometry/stress.cpp b/src/geometry/stress.cpp index 4dc10d332e..6f5d834a72 100644 --- a/src/geometry/stress.cpp +++ b/src/geometry/stress.cpp @@ -50,9 +50,8 @@ Stress::calc_stress_nonloc_aux() print_memory_usage(ctx_.out(), FILE_LINE); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); auto mem = ctx_.processing_unit_memory_t(); auto mg = kp->spinor_wave_functions().memory_guard(mem, wf::copy_to::device); Beta_projectors_strain_deriv bp_strain_deriv(ctx_, kp->gkvec()); @@ -131,9 +130,8 @@ Stress::calc_stress_hubbard() if (is_device_memory(ctx_.processing_unit_memory_t())) { dn.allocate(ctx_.processing_unit_memory_t()); } - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); dn.zero(); if (is_device_memory(ctx_.processing_unit_memory_t())) { dn.zero(ctx_.processing_unit_memory_t()); @@ -652,9 +650,8 @@ Stress::calc_stress_kin_aux() { stress_kin_.zero(); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); double fact = kp->gkvec().reduced() ? 2.0 : 1.0; fact *= kp->weight(); diff --git a/src/hamiltonian/hamiltonian.cpp b/src/hamiltonian/hamiltonian.cpp index fd5bc2e282..c16c7c3ed3 100644 --- a/src/hamiltonian/hamiltonian.cpp +++ b/src/hamiltonian/hamiltonian.cpp @@ -157,10 +157,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector, 3> zm(unit_cell_.max_mt_basis_size(), unit_cell_.max_mt_basis_size(), ctx_.num_mag_dims()); - for (int ialoc = 0; ialoc < psi__.spl_num_atoms().local_size(); ialoc++) { - int ia = psi__.spl_num_atoms()[ialoc]; + for (auto it : psi__.spl_num_atoms()) { + auto ia = it.i; auto& atom = unit_cell_.atom(ia); - //int offset = psi__.offset_mt_coeffs(ialoc); int mt_basis_size = atom.type().mt_basis_size(); zm.zero(); @@ -183,9 +182,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector */ la::wrap(la::lib_t::blas).hemm( 'L', 'U', mt_basis_size, ctx_.num_fv_states(), &la::constant>::one(), - zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), + zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), psi__.ld(), &la::constant>::zero(), - &bpsi__[0].mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), bpsi__[0].ld()); + &bpsi__[0].mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), bpsi__[0].ld()); /* compute bwf = (B_x - iB_y)|wf_j> */ if (bpsi__.size() == 3) { @@ -204,9 +203,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector>::one(), - zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), + zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), psi__.ld(), &la::constant>::zero(), - &bpsi__[2].mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), bpsi__[2].ld()); + &bpsi__[2].mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), bpsi__[2].ld()); } } } @@ -219,10 +218,10 @@ 
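NOTE (usage sketch, not part of the diff): in apply_bmt above, it.li is handed directly to mt_coeffs() where the old code constructed wf::atom_index(ialoc) by hand; the iterator already carries the correctly typed local index. Sketch, with the wave-function object and unit cell assumed to exist:

    for (auto it : psi.spl_num_atoms()) {
        auto& atom = unit_cell.atom(it.i);  /* global index selects the atom */
        auto z = psi.mt_coeffs(0, it.li,    /* typed local index selects local storage */
                               wf::spin_index(0), wf::band_index(0));
        (void)atom; (void)z;
    }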
Hamiltonian0::apply_so_correction(wf::Wave_functions& psi__, std::vector::get_h_o_diag_lapw() const auto const& uc = H0_.ctx().unit_cell(); - sddk::splindex spl_num_atoms(uc.num_atoms(), kp_.comm().size(), kp_.comm().rank()); + sddk::splindex_block spl_num_atoms(uc.num_atoms(), n_blocks(kp_.comm().size()), block_id(kp_.comm().rank())); int nlo{0}; - for (int ialoc = 0; ialoc < spl_num_atoms.local_size(); ialoc++) { - int ia = spl_num_atoms[ialoc]; - nlo += uc.atom(ia).mt_lo_basis_size(); + for (auto it : spl_num_atoms) { + nlo += uc.atom(it.i).mt_lo_basis_size(); } auto h_diag = (what & 1) ? sddk::mdarray(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray(); @@ -283,11 +282,10 @@ Hamiltonian_k::get_h_o_diag_lapw() const } nlo = 0; - for (int ialoc = 0; ialoc < spl_num_atoms.local_size(); ialoc++) { - int ia = spl_num_atoms[ialoc]; - auto& atom = uc.atom(ia); + for (auto it : spl_num_atoms) { + auto& atom = uc.atom(it.i); auto& type = atom.type(); - auto& hmt = H0_.hmt(ia); + auto& hmt = H0_.hmt(it.i); #pragma omp parallel for for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) { int xi_lo = type.mt_aw_basis_size() + ilo; @@ -890,15 +888,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_hmt_apw_lo = [this, &ctx, &phi__, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt& h_apw_lo__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it: spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + int ia = it.li; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = this->H0_.hmt(ia); @@ -921,14 +919,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range o_apw_lo__.zero(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, b__.size())); #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int j = 0; j < b__.size(); j++) { for (int ilo = 0; ilo < nlo; ilo++) { @@ -952,15 +950,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range { /* lo-lo contribution */ #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it : spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + auto ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -977,12 +975,12 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range { /* lo-lo contribution */ #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + auto ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) { int xi_lo = type.mt_aw_basis_size() + ilo; @@ -1012,14 +1010,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, 
wf::band_range wf::Wave_functions_mt& halm_phi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < alm_phi__.spl_num_atoms().local_size(); ialoc++) { + for (auto it : alm_phi__.spl_num_atoms()) { int tid = omp_get_thread_num(); - int ia = atom_begin__ + alm_phi__.spl_num_atoms()[ialoc]; + int ia = atom_begin__ + it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -1036,15 +1034,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_hmt_lo_apw = [this, &ctx, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt const& alm_phi__, wf::Wave_functions& hphi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it : spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -1062,14 +1060,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_omt_lo_apw = [this, &ctx, mem, &b__, &spl_atoms](wf::Wave_functions_mt const& alm_phi__, wf::Wave_functions& ophi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int ilo = 0; ilo < nlo; ilo++) { int xi_lo = naw + ilo; @@ -1213,7 +1211,7 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range /* loop over blocks of atoms */ for (auto na : utils::split_in_blocks(ctx.unit_cell().num_atoms(), 64)) { - sddk::splindex spl_atoms(na, comm.size(), comm.rank()); + sddk::splindex_block<> spl_atoms(na, n_blocks(comm.size()), block_id(comm.rank())); /* actual number of AW radial functions in a block of atoms */ int num_mt_aw{0}; diff --git a/src/hamiltonian/non_local_operator.hpp b/src/hamiltonian/non_local_operator.hpp index 0d813faeb5..fea1bd1f34 100644 --- a/src/hamiltonian/non_local_operator.hpp +++ b/src/hamiltonian/non_local_operator.hpp @@ -27,13 +27,10 @@ #include "SDDK/omp.hpp" #include "SDDK/memory.hpp" -// #include "beta_projectors/beta_projectors.hpp" -// #include "beta_projectors/beta_projectors_base.hpp" -// #include "beta_projectors/beta_projectors_strain_deriv.hpp" #include "non_local_operator_base.hpp" #include "context/simulation_context.hpp" #include "hubbard/hubbard_matrix.hpp" -#include "traits.hpp" +//#include "traits.hpp" #include "utils/rte.hpp" namespace sirius { diff --git a/src/hamiltonian/non_local_operator_base.hpp b/src/hamiltonian/non_local_operator_base.hpp index 9c62df4d43..bcf2db19a2 100644 --- a/src/hamiltonian/non_local_operator_base.hpp +++ b/src/hamiltonian/non_local_operator_base.hpp @@ -74,17 +74,17 @@ class Non_local_operator /// Apply beta projectors from one atom in a chunk of beta projectors to all wave-functions. 
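NOTE (usage sketch, not part of the diff): the apply() overload declared below now takes the strong type atom_index_t::local for its atom argument instead of the previous wf::atom_index. A hedged call sketch mirroring the Hubbard-derivatives hunk further down; every surrounding object is assumed to exist:

    /* i: local atom counter inside beta-projector chunk ichunk */
    q_op.apply(mem, ichunk, atom_index_t::local(i), 0, phi,
               wf::band_range(0, nawf), beta_coeffs, beta_phi);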
template std::enable_if_t, F>::value, void> - apply(sddk::memory_t mem__, int chunk__, wf::atom_index ia__, int ispn_block__, wf::Wave_functions& op_phi__, - wf::band_range br__, const beta_projectors_coeffs_t& beta_coeffs__, sddk::matrix& beta_phi__); + apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__, wf::Wave_functions& op_phi__, + wf::band_range br__, beta_projectors_coeffs_t const& beta_coeffs__, sddk::matrix& beta_phi__); /// computes α B*Q + β out template - void lmatmul(sddk::matrix& out, const sddk::matrix& B__, int ispn_block__, sddk::memory_t mem_t, + void lmatmul(sddk::matrix& out, sddk::matrix const& B__, int ispn_block__, sddk::memory_t mem_t, identity_t alpha = F{1}, identity_t beta = F{0}) const; /// computes α Q*B + β out template - void rmatmul(sddk::matrix& out, const sddk::matrix& B__, int ispn_block__, sddk::memory_t mem_t, + void rmatmul(sddk::matrix& out, sddk::matrix const& B__, int ispn_block__, sddk::memory_t mem_t, identity_t alpha = F{1}, identity_t beta = F{0}) const; template >::value>> @@ -207,9 +207,9 @@ Non_local_operator::apply(sddk::memory_t mem__, int chunk__, int ispn_block__ template template std::enable_if_t, F>::value, void> -Non_local_operator::apply(sddk::memory_t mem__, int chunk__, wf::atom_index ia__, int ispn_block__, +Non_local_operator::apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__, wf::Wave_functions& op_phi__, wf::band_range br__, - const beta_projectors_coeffs_t& beta_coeffs__, sddk::matrix& beta_phi__) + beta_projectors_coeffs_t const& beta_coeffs__, sddk::matrix& beta_phi__) { if (is_null_) { return; @@ -275,8 +275,8 @@ Non_local_operator::lmatmul(sddk::matrix& out, const sddk::matrix& B__, } // check shapes - assert(out.size(0) == B__.size(0) && static_cast(out.size(1)) == this->size_); - assert(static_cast(B__.size(1)) == this->size_); + RTE_ASSERT(out.size(0) == B__.size(0) && static_cast(out.size(1)) == this->size_); + RTE_ASSERT(static_cast(B__.size(1)) == this->size_); int num_atoms = uc.num_atoms(); @@ -314,8 +314,8 @@ Non_local_operator::rmatmul(sddk::matrix& out, const sddk::matrix& B__, } // check shapes - assert(static_cast(out.size(0)) == this->size_ && out.size(1) == B__.size(1)); - assert(static_cast(B__.size(0)) == this->size_); + RTE_ASSERT(static_cast(out.size(0)) == this->size_ && out.size(1) == B__.size(1)); + RTE_ASSERT(static_cast(B__.size(0)) == this->size_); int num_atoms = uc.num_atoms(); diff --git a/src/hubbard/hubbard_occupancies_derivatives.cpp b/src/hubbard/hubbard_occupancies_derivatives.cpp index 5c1f08fed2..da9b7373c3 100644 --- a/src/hubbard/hubbard_occupancies_derivatives.cpp +++ b/src/hubbard/hubbard_occupancies_derivatives.cpp @@ -297,9 +297,9 @@ Hubbard::compute_occupancies_derivatives(K_point& kp__, Q_operator Q < beta | phi_atomic > */ phi_atomic_tmp->zero(mt, wf::spin_index(0), wf::band_range(0, nawf)); if (ctx_.unit_cell().atom(ja).type().augment()) { - q_op__.apply(mt, ichunk, wf::atom_index(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), + q_op__.apply(mt, ichunk, atom_index_t::local(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), bp_grad_coeffs, beta_phi_atomic); - q_op__.apply(mt, ichunk, wf::atom_index(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), + q_op__.apply(mt, ichunk, atom_index_t::local(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), bp_coeffs, grad_beta_phi_atomic); } diff --git a/src/k_point/generate_fv_states.cpp b/src/k_point/generate_fv_states.cpp index 18eaad46a4..5bbf55f47f 100644 --- 
a/src/k_point/generate_fv_states.cpp +++ b/src/k_point/generate_fv_states.cpp @@ -97,18 +97,17 @@ void K_point::generate_fv_states() auto out_ptr = &fv_states_->pw_coeffs(0, wf::spin_index(0), wf::band_index(i)); std::copy(in_ptr, in_ptr + gkvec().count(), out_ptr); - for (int ialoc = 0; ialoc < alm_fv_slab.spl_num_atoms().local_size(); ialoc++) { - int ia = alm_fv_slab.spl_num_atoms()[ialoc]; - int num_mt_aw = uc.atom(ia).type().mt_aw_basis_size(); + for (auto it : alm_fv_slab.spl_num_atoms()) { + int num_mt_aw = uc.atom(it.i).type().mt_aw_basis_size(); /* aw part of the muffin-tin coefficients */ for (int xi = 0; xi < num_mt_aw; xi++) { - fv_states_->mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)) = - alm_fv_slab.mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)); + fv_states_->mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)) = + alm_fv_slab.mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)); } /* lo part of muffin-tin coefficients */ - for (int xi = 0; xi < uc.atom(ia).type().mt_lo_basis_size(); xi++) { - fv_states_->mt_coeffs(num_mt_aw + xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)) = - fv_eigen_vectors_slab().mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)); + for (int xi = 0; xi < uc.atom(it.i).type().mt_lo_basis_size(); xi++) { + fv_states_->mt_coeffs(num_mt_aw + xi, it.li, wf::spin_index(0), wf::band_index(i)) = + fv_eigen_vectors_slab().mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)); } } } diff --git a/src/k_point/k_point.cpp b/src/k_point/k_point.cpp index 054259ce95..abb664b10b 100644 --- a/src/k_point/k_point.cpp +++ b/src/k_point/k_point.cpp @@ -353,11 +353,13 @@ K_point::generate_gkvec(double gk_cutoff__) ctx_.spfft_coarse().local_z_length(), gkvec_partition_->count(), SPFFT_INDEX_TRIPLETS, gv.at(sddk::memory_t::host)))); - sddk::splindex spl_ngk_row(num_gkvec(), num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_row(num_gkvec(), n_blocks(num_ranks_row_), block_id(rank_row_), + ctx_.cyclic_block_size()); num_gkvec_row_ = spl_ngk_row.local_size(); sddk::mdarray gkvec_row(3, num_gkvec_row_); - sddk::splindex spl_ngk_col(num_gkvec(), num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_col(num_gkvec(), n_blocks(num_ranks_col_), block_id(rank_col_), + ctx_.cyclic_block_size()); num_gkvec_col_ = spl_ngk_col.local_size(); sddk::mdarray gkvec_col(3, num_gkvec_col_); @@ -367,14 +369,14 @@ K_point::generate_gkvec(double gk_cutoff__) int ig = gkvec_->gvec_offset(rank) + igloc; auto loc_row = spl_ngk_row.location(ig); auto loc_col = spl_ngk_col.location(ig); - if (loc_row.rank == comm_row().rank()) { + if (loc_row.ib == comm_row().rank()) { for (int x : {0, 1, 2}) { - gkvec_row(x, loc_row.local_index) = gv(x, igloc); + gkvec_row(x, loc_row.index_local) = gv(x, igloc); } } - if (loc_col.rank == comm_col().rank()) { + if (loc_col.ib == comm_col().rank()) { for (int x : {0, 1, 2}) { - gkvec_col(x, loc_col.local_index) = gv(x, igloc); + gkvec_col(x, loc_col.index_local) = gv(x, igloc); } } } @@ -453,11 +455,12 @@ K_point::get_fv_eigen_vectors(sddk::mdarray, 2>& fv_evec__) c for (int ia = 0; ia < ctx_.unit_cell().num_atoms(); ia++) { /* number of atom local orbitals */ int nlo = ctx_.unit_cell().atom(ia).mt_lo_basis_size(); - auto loc = fv_eigen_vectors_slab_->spl_num_atoms().location(ia); - if (loc.rank == this->comm().rank()) { + auto loc = 
fv_eigen_vectors_slab_->spl_num_atoms().location(typename atom_index_t::global(ia)); + if (loc.ib == this->comm().rank()) { for (int xi = 0; xi < nlo; xi++) { fv_evec__(this->num_gkvec() + offs + xi, ist) = - fv_eigen_vectors_slab_->mt_coeffs(xi, wf::atom_index(loc.local_index), wf::spin_index(0), wf::band_index(ist)); + fv_eigen_vectors_slab_->mt_coeffs(xi, atom_index_t::local(loc.index_local), wf::spin_index(0), + wf::band_index(ist)); } } offs += nlo; @@ -763,18 +766,20 @@ void K_point::generate_gklo_basis() { /* find local number of row G+k vectors */ - sddk::splindex spl_ngk_row(num_gkvec(), num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_row(num_gkvec(), n_blocks(num_ranks_row_), block_id(rank_row_), + ctx_.cyclic_block_size()); num_gkvec_row_ = spl_ngk_row.local_size(); /* find local number of column G+k vectors */ - sddk::splindex spl_ngk_col(num_gkvec(), num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_col(num_gkvec(), n_blocks(num_ranks_col_), block_id(rank_col_), + ctx_.cyclic_block_size()); num_gkvec_col_ = spl_ngk_col.local_size(); if (ctx_.full_potential()) { - sddk::splindex spl_nlo_row(num_gkvec() + unit_cell_.mt_lo_basis_size(), - num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); - sddk::splindex spl_nlo_col(num_gkvec() + unit_cell_.mt_lo_basis_size(), - num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_nlo_row(num_gkvec() + unit_cell_.mt_lo_basis_size(), + n_blocks(num_ranks_row_), block_id(rank_row_), ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_nlo_col(num_gkvec() + unit_cell_.mt_lo_basis_size(), + n_blocks(num_ranks_col_), block_id(rank_col_), ctx_.cyclic_block_size()); lo_basis_descriptor lo_desc; @@ -797,10 +802,10 @@ K_point::generate_gklo_basis() lo_desc.order = static_cast(order); lo_desc.idxrf = static_cast(idxrf); - if (spl_nlo_row.local_rank(num_gkvec() + idx) == rank_row_) { + if (spl_nlo_row.location(num_gkvec() + idx).ib == rank_row_) { lo_basis_descriptors_row_.push_back(lo_desc); } - if (spl_nlo_col.local_rank(num_gkvec() + idx) == rank_col_) { + if (spl_nlo_col.location(num_gkvec() + idx).ib == rank_col_) { lo_basis_descriptors_col_.push_back(lo_desc); } diff --git a/src/k_point/k_point_set.cpp b/src/k_point/k_point_set.cpp index 15d7a310b0..a2a6ee629a 100644 --- a/src/k_point/k_point_set.cpp +++ b/src/k_point/k_point_set.cpp @@ -33,26 +33,25 @@ void K_point_set::sync_band() sddk::mdarray data(ctx_.num_bands(), ctx_.num_spinors(), num_kpoints(), get_memory_pool(sddk::memory_t::host), "K_point_set::sync_band.data"); + data.zero(); int nb = ctx_.num_bands() * ctx_.num_spinors(); #pragma omp parallel - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; - auto kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto kp = this->get(it.i); switch (what) { case sync_band_t::energy: { - std::copy(&kp->band_energies_(0, 0), &kp->band_energies_(0, 0) + nb, &data(0, 0, ik)); + std::copy(&kp->band_energies_(0, 0), &kp->band_energies_(0, 0) + nb, &data(0, 0, it.i)); break; } case sync_band_t::occupancy: { - std::copy(&kp->band_occupancies_(0, 0), &kp->band_occupancies_(0, 0) + nb, &data(0, 0, ik)); + std::copy(&kp->band_occupancies_(0, 0), &kp->band_occupancies_(0, 0) + nb, &data(0, 0, it.i)); break; } } } - comm().allgather(data.at(sddk::memory_t::host), nb * spl_num_kpoints_.local_size(), - nb * spl_num_kpoints_.global_offset()); + 
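NOTE (usage sketch, not part of the diff): K_point_set below stores its k-points in a splindex_chunk, which accepts an explicit per-rank list of chunk sizes; the counts of a plain block distribution can seed it, as in K_point_set::initialize. Sketch (template argument inferred from the typed accesses in this patch, sizes illustrative):

    int const nkp{17};
    sddk::splindex_block<> tmp(nkp, n_blocks(nranks), block_id(rank));
    sddk::splindex_chunk<kp_index_t> spl(nkp, n_blocks(nranks), block_id(rank), tmp.counts());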
comm().allreduce(data.at(sddk::memory_t::host), static_cast(data.size())); #pragma omp parallel for for (int ik = 0; ik < num_kpoints(); ik++) { @@ -139,16 +138,18 @@ void K_point_set::initialize(std::vector const& counts) PROFILE("sirius::K_point_set::initialize"); /* distribute k-points along the 1-st dimension of the MPI grid */ if (counts.empty()) { - sddk::splindex spl_tmp(num_kpoints(), comm().size(), comm().rank()); - spl_num_kpoints_ = sddk::splindex(num_kpoints(), comm().size(), comm().rank(), spl_tmp.counts()); + sddk::splindex_block<> spl_tmp(num_kpoints(), n_blocks(comm().size()), block_id(comm().rank())); + spl_num_kpoints_ = sddk::splindex_chunk(num_kpoints(), n_blocks(comm().size()), + block_id(comm().rank()), spl_tmp.counts()); } else { - spl_num_kpoints_ = sddk::splindex(num_kpoints(), comm().size(), comm().rank(), counts); + spl_num_kpoints_ = sddk::splindex_chunk(num_kpoints(), n_blocks(comm().size()), + block_id(comm().rank()), counts); } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - kpoints_[spl_num_kpoints_[ikloc]]->initialize(); + for (auto it : spl_num_kpoints_) { + kpoints_[it.i]->initialize(); #if defined(USE_FP32) - kpoints_float_[spl_num_kpoints_[ikloc]]->initialize(); + kpoints_float_[it.i]->initialize(); #endif } @@ -301,12 +302,11 @@ void K_point_set::find_band_occupancies() } } } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp parallel for for (int j = 0; j < ctx_.num_bands(); j++) { - this->get(ik)->band_occupancy(j, ispn, ctx_.max_occupancy()); + this->get(it.i)->band_occupancy(j, ispn, ctx_.max_occupancy()); } } } @@ -325,32 +325,30 @@ void K_point_set::find_band_occupancies() auto emax = std::numeric_limits::lowest(); #pragma omp parallel for reduction(min:emin) reduction(max:emax) - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - emin = std::min(emin, this->get(ik)->band_energy(0, ispn)); - emax = std::max(emax, this->get(ik)->band_energy(ctx_.num_bands() - 1, ispn)); + emin = std::min(emin, this->get(it.i)->band_energy(0, ispn)); + emax = std::max(emax, this->get(it.i)->band_energy(ctx_.num_bands() - 1, ispn)); } } comm().allreduce(&emin, 1); comm().allreduce(&emax, 1); - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); /* computes N(ef; f) = \sum_{i,k} f(ef - e_{k,i}) */ auto compute_ne = [&](double ef, auto&& f) { double ne{0}; - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { double tmp{0}; #pragma omp parallel reduction(+ : tmp) for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp for for (int j = 0; j < splb.local_size(); j++) { - tmp += f(ef - this->get(ik)->band_energy(splb[j], ispn)) * ctx_.max_occupancy(); + tmp += f(ef - this->get(it.i)->band_energy(splb.global_index(j), ispn)) * ctx_.max_occupancy(); } } - ne += tmp * kpoints_[ik]->weight(); + ne += tmp * kpoints_[it.i]->weight(); } ctx_.comm().allreduce(&ne, 1); return ne; @@ -393,13 +391,12 @@ void K_point_set::find_band_occupancies() energy_fermi_ = bisection_search(F, emin, emax, 
tol); } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp parallel for for (int j = 0; j < ctx_.num_bands(); j++) { - auto o = f(energy_fermi_ - this->get(ik)->band_energy(j, ispn)) * ctx_.max_occupancy(); - this->get(ik)->band_occupancy(j, ispn, o); + auto o = f(energy_fermi_ - this->get(it.i)->band_energy(j, ispn)) * ctx_.max_occupancy(); + this->get(it.i)->band_occupancy(j, ispn, o); } } } @@ -454,16 +451,15 @@ double K_point_set::valence_eval_sum() const { double eval_sum{0}; - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - auto const& kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto const& kp = this->get(it.i); double tmp{0}; #pragma omp parallel for reduction(+:tmp) for (int j = 0; j < splb.local_size(); j++) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - tmp += kp->band_energy(splb[j], ispn) * kp->band_occupancy(splb[j], ispn); + tmp += kp->band_energy(splb.global_index(j), ispn) * kp->band_occupancy(splb.global_index(j), ispn); } } eval_sum += kp->weight() * tmp; @@ -503,16 +499,15 @@ double K_point_set::entropy_sum() const auto f = smearing::entropy(ctx_.smearing(), ctx_.smearing_width()); - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - auto const& kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto const& kp = this->get(it.i); double tmp{0}; #pragma omp parallel for reduction(+:tmp) for (int j = 0; j < splb.local_size(); j++) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - tmp += ctx_.max_occupancy() * f(energy_fermi_ - kp->band_energy(splb[j], ispn)); + tmp += ctx_.max_occupancy() * f(energy_fermi_ - kp->band_energy(splb.global_index(j), ispn)); } } s_sum += kp->weight() * tmp; @@ -552,8 +547,8 @@ void K_point_set::print_info() pout << std::endl << utils::hbar(80, '-') << std::endl; } - for (int ikloc = 0; ikloc < spl_num_kpoints().local_size(); ikloc++) { - int ik = spl_num_kpoints(ikloc); + for (auto it : spl_num_kpoints()) { + int ik = it.i; pout << std::setw(4) << ik << utils::ffmt(9, 4) << kpoints_[ik]->vk()[0] << utils::ffmt(9, 4) << kpoints_[ik]->vk()[1] << utils::ffmt(9, 4) << kpoints_[ik]->vk()[2] << utils::ffmt(17, 6) << kpoints_[ik]->weight() << std::setw(11) << kpoints_[ik]->num_gkvec(); @@ -579,7 +574,7 @@ void K_point_set::save(std::string const& name__) const ctx_.comm().barrier(); for (int ik = 0; ik < num_kpoints(); ik++) { /* check if this ranks stores the k-point */ - if (ctx_.comm_k().rank() == spl_num_kpoints_.local_rank(ik)) { + if (ctx_.comm_k().rank() == spl_num_kpoints_.location(typename kp_index_t::global(ik)).ib) { this->get(ik)->save(name__, ik); } /* wait for all */ diff --git a/src/k_point/k_point_set.hpp b/src/k_point/k_point_set.hpp index 653a9a4865..725f0f183b 100644 --- a/src/k_point/k_point_set.hpp +++ b/src/k_point/k_point_set.hpp @@ -52,7 +52,7 @@ class K_point_set #endif /// Split index of 
k-points. - sddk::splindex spl_num_kpoints_; + sddk::splindex_chunk spl_num_kpoints_; /// Fermi energy which is searched in find_band_occupancies(). double energy_fermi_{0}; @@ -144,11 +144,10 @@ class K_point_set void update() { /* update k-points */ - for (int ikloc = 0; ikloc < spl_num_kpoints().local_size(); ikloc++) { - int ik = spl_num_kpoints(ikloc); - kpoints_[ik]->update(); + for (auto it : spl_num_kpoints_) { + kpoints_[it.i]->update(); #if defined(USE_FP32) - kpoints_float_[ik]->update(); + kpoints_float_[it.i]->update(); #endif } } @@ -168,9 +167,8 @@ class K_point_set int max_num_gkvec() const { int max_num_gkvec{0}; - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - max_num_gkvec = std::max(max_num_gkvec, kpoints_[ik]->num_gkvec()); + for (auto it : spl_num_kpoints_) { + max_num_gkvec = std::max(max_num_gkvec, kpoints_[it.i]->num_gkvec()); } comm().allreduce(&max_num_gkvec, 1); return max_num_gkvec; @@ -208,9 +206,9 @@ class K_point_set return static_cast(kpoints_.size()); } - inline int spl_num_kpoints(int ikloc) const + inline auto spl_num_kpoints(kp_index_t::local ikloc__) const { - return spl_num_kpoints_[ikloc]; + return spl_num_kpoints_.global_index(ikloc__); } inline double energy_fermi() const @@ -251,13 +249,13 @@ class K_point_set /// Send G+k vectors of k-point jk to a given rank. /** Other ranks receive an empty Gvec placeholder */ - inline fft::Gvec get_gkvec(int jk__, int rank__) + inline fft::Gvec get_gkvec(kp_index_t::global jk__, int rank__) { /* rank in the k-point communicator */ int my_rank = comm().rank(); /* rank that stores jk */ - int jrank = spl_num_kpoints().local_rank(jk__); + int jrank = spl_num_kpoints().location(jk__).ib; /* need this to pass communicator */ fft::Gvec gkvec(ctx_.comm_band()); @@ -276,7 +274,7 @@ class K_point_set template<> inline K_point* K_point_set::get(int ik__) const { - assert(ik__ >= 0 && ik__ < (int)kpoints_.size()); + RTE_ASSERT(ik__ >= 0 && ik__ < (int)kpoints_.size()); return kpoints_[ik__].get(); } @@ -284,7 +282,7 @@ template<> inline K_point* K_point_set::get(int ik__) const { #if defined(USE_FP32) - assert(ik__ >= 0 && ik__ < (int)kpoints_float_.size()); + RTE_ASSERT(ik__ >= 0 && ik__ < (int)kpoints_float_.size()); return kpoints_float_[ik__].get(); #else RTE_THROW("not compiled with FP32 support"); diff --git a/src/linalg/dmatrix.cpp b/src/linalg/dmatrix.cpp index 94075f844c..31a6a47a3e 100644 --- a/src/linalg/dmatrix.cpp +++ b/src/linalg/dmatrix.cpp @@ -29,17 +29,17 @@ namespace la { template dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__, sddk::memory_t mem_type__) - : sddk::matrix(sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), - blacs_grid__.rank_row(), bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), - blacs_grid__.rank_col(), bs_col__).local_size(), mem_type__) + : sddk::matrix(sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size(), mem_type__) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), 
blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__,bs_col__)) @@ -51,17 +51,17 @@ template dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__) : sddk::matrix(ptr__, - sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), - bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), - bs_col__).local_size()) + sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size()) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__, bs_col__)) @@ -76,8 +76,8 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, sddk::memory_t mem_type__) , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { } @@ -88,8 +88,8 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, sddk::memory_pool& mp__, std , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { } @@ -101,8 +101,8 @@ dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__) , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { init(); } @@ -110,17 +110,17 @@ dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__) template dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__, sddk::memory_pool& mp__) - : sddk::matrix(sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), - bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), - bs_col__).local_size(), mp__) + : sddk::matrix(sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + 
sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size(), mp__) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__, bs_col__)) @@ -131,13 +131,15 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid template void dmatrix::set(int ir0__, int jc0__, int mr__, int nc__, T* ptr__, int ld__) { - sddk::splindex spl_r0(ir0__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); - sddk::splindex spl_r1(ir0__ + mr__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), - bs_row_); + sddk::splindex_block_cyclic<> spl_r0(ir0__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); + sddk::splindex_block_cyclic<> spl_r1(ir0__ + mr__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); - sddk::splindex spl_c0(jc0__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); - sddk::splindex spl_c1(jc0__ + nc__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), - bs_col_); + sddk::splindex_block_cyclic<> spl_c0(jc0__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); + sddk::splindex_block_cyclic<> spl_c1(jc0__ + nc__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); int m0 = spl_r0.local_size(); int m1 = spl_r1.local_size(); @@ -147,10 +149,10 @@ void dmatrix::set(int ir0__, int jc0__, int mr__, int nc__, T* ptr__, int ld_ std::vector map_col(n1 - n0); for (int i = 0; i < m1 - m0; i++) { - map_row[i] = spl_r1[m0 + i] - ir0__; + map_row[i] = spl_r1.global_index(m0 + i) - ir0__; } for (int j = 0; j < n1 - n0; j++) { - map_col[j] = spl_c1[n0 + j] - jc0__; + map_col[j] = spl_c1.global_index(n0 + j) - jc0__; } //#pragma omp parallel for @@ -165,10 +167,10 @@ template void dmatrix::set(const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) = val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) = val; } } } @@ -177,10 +179,10 @@ template void dmatrix::add(const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) += val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) += val; } } } @@ -189,10 +191,10 @@ template void dmatrix::add(real_type beta__, const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if 
(blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) = (*this)(r.local_index, c.local_index) * beta__ + val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) = (*this)(r.index_local, c.index_local) * beta__ + val; } } } @@ -202,11 +204,11 @@ void dmatrix::make_real_diag(int n__) { for (int i = 0; i < n__; i++) { auto r = spl_row_.location(i); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(i); - if (blacs_grid_->rank_col() == c.rank) { - T v = (*this)(r.local_index, c.local_index); - (*this)(r.local_index, c.local_index) = std::real(v); + if (blacs_grid_->rank_col() == c.ib) { + T v = (*this)(r.index_local, c.index_local); + (*this)(r.index_local, c.index_local) = std::real(v); } } } @@ -220,10 +222,10 @@ sddk::mdarray dmatrix::get_diag(int n__) for (int i = 0; i < n__; i++) { auto r = spl_row_.location(i); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(i); - if (blacs_grid_->rank_col() == c.rank) { - d[i] = (*this)(r.local_index, c.local_index); + if (blacs_grid_->rank_col() == c.ib) { + d[i] = (*this)(r.index_local, c.index_local); } } } diff --git a/src/linalg/dmatrix.hpp b/src/linalg/dmatrix.hpp index 2cfbbacece..08b7f99f08 100644 --- a/src/linalg/dmatrix.hpp +++ b/src/linalg/dmatrix.hpp @@ -68,10 +68,10 @@ class dmatrix: public sddk::matrix BLACS_grid const* blacs_grid_{nullptr}; /// Split index of matrix rows. - sddk::splindex spl_row_; + sddk::splindex_block_cyclic<> spl_row_; /// Split index of matrix columns. - sddk::splindex spl_col_; + sddk::splindex_block_cyclic<> spl_col_; /// ScaLAPACK matrix descriptor. ftn_int descriptor_[9]; @@ -153,13 +153,13 @@ class dmatrix: public sddk::matrix /// Return local number of rows for a given MPI rank. inline int num_rows_local(int rank) const { - return spl_row_.local_size(rank); + return spl_row_.local_size(block_id(rank)); } /// Return global row index in the range [0, num_rows) by the local index in the range [0, num_rows_local). inline int irow(int irow_loc) const { - return spl_row_[irow_loc]; + return spl_row_.global_index(irow_loc); } inline int num_cols() const @@ -175,13 +175,13 @@ class dmatrix: public sddk::matrix inline int num_cols_local(int rank) const { - return spl_col_.local_size(rank); + return spl_col_.local_size(block_id(rank)); } /// Inindex of column in global matrix. 
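NOTE (usage sketch, not part of the diff): the dmatrix accessors above reduce to two split-index queries: local_size(block_id(r)) answers "how many rows does rank r store", and global_index(j) answers "which global row is local row j". dmatrix pairs one block-cyclic index over rows with one over columns to realize the ScaLAPACK layout. Sketch (grid parameters hypothetical; the last query assumes a non-empty local block):

    sddk::splindex_block_cyclic<> spl_row(nrow, n_blocks(nranks_row), block_id(my_row), bs_row);
    sddk::splindex_block_cyclic<> spl_col(ncol, n_blocks(nranks_col), block_id(my_col), bs_col);
    int nloc0 = spl_row.local_size(block_id(0)); /* rows stored by row-rank 0 */
    int g     = spl_row.global_index(0);         /* global index of the first local row */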
inline int icol(int icol_loc) const { - return spl_col_[icol_loc]; + return spl_col_.global_index(icol_loc); } inline int const* descriptor() const @@ -221,11 +221,15 @@ class dmatrix: public sddk::matrix { int m0, m1, n0, n1; if (blacs_grid_ != nullptr) { - sddk::splindex spl_r0(ir0__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); - sddk::splindex spl_r1(ir0__ + nr__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); + sddk::splindex_block_cyclic<> spl_r0(ir0__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); + sddk::splindex_block_cyclic<> spl_r1(ir0__ + nr__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); - sddk::splindex spl_c0(ic0__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); - sddk::splindex spl_c1(ic0__ + nc__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); + sddk::splindex_block_cyclic<> spl_c0(ic0__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); + sddk::splindex_block_cyclic<> spl_c1(ic0__ + nc__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); m0 = spl_r0.local_size(); m1 = spl_r1.local_size(); @@ -382,10 +386,10 @@ class dmatrix: public sddk::matrix T cs{0}; if (blacs_grid_ != nullptr) { - sddk::splindex spl_row(m__, this->blacs_grid().num_ranks_row(), - this->blacs_grid().rank_row(), this->bs_row()); - sddk::splindex spl_col(n__, this->blacs_grid().num_ranks_col(), - this->blacs_grid().rank_col(), this->bs_col()); + sddk::splindex_block_cyclic<> spl_row(m__, n_blocks(this->blacs_grid().num_ranks_row()), + block_id(this->blacs_grid().rank_row()), this->bs_row()); + sddk::splindex_block_cyclic<> spl_col(n__, n_blocks(this->blacs_grid().num_ranks_col()), + block_id(this->blacs_grid().rank_col()), this->bs_col()); for (int i = 0; i < spl_col.local_size(); i++) { for (int j = 0; j < spl_row.local_size(); j++) { cs += (*this)(j, i); diff --git a/src/mixer/mixer_functions.cpp b/src/mixer/mixer_functions.cpp index caf8367317..0157cbc999 100644 --- a/src/mixer/mixer_functions.cpp +++ b/src/mixer/mixer_functions.cpp @@ -73,8 +73,8 @@ FunctionProperties> periodic_function_property() y.rg().value(i) = xi * -s + yi * c; } if (x.ctx().full_potential()) { - for (int ialoc = 0; ialoc < x.ctx().unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = x.ctx().unit_cell().spl_num_atoms(ialoc); + for (auto it : x.ctx().unit_cell().spl_num_atoms()) { + int ia = it.i; auto& x_f_mt = x.mt()[ia]; auto& y_f_mt = y.mt()[ia]; #pragma omp for schedule(static) nowait @@ -226,9 +226,8 @@ FunctionProperties> paw_density_function_property() auto scale_func = [](double alpha, PAW_density& x) -> void { - for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) { - int ipaw = x.unit_cell().spl_num_paw_atoms(i); - int ia = x.unit_cell().paw_atom_index(ipaw); + for (auto it : x.unit_cell().spl_num_paw_atoms()) { + int ia = x.unit_cell().paw_atom_index(it.i); for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) { x.ae_density(j, ia) *= alpha; x.ps_density(j, ia) *= alpha; @@ -238,9 +237,8 @@ FunctionProperties> paw_density_function_property() auto copy_function = [](PAW_density const& x, PAW_density& y) -> void { - for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) { - int ipaw = x.unit_cell().spl_num_paw_atoms(i); - int ia = x.unit_cell().paw_atom_index(ipaw); + for (auto it : x.unit_cell().spl_num_paw_atoms()) { + int ia 
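
The dmatrix migration above is one mechanical substitution: raw rank counts and rank ids become the strong types n_blocks and block_id, operator[] becomes global_index(), and the struct returned by location() renames rank/local_index to ib/index_local. A minimal sketch of the new block-cyclic index in isolation; the sizes, the block size of 16 and the rank variable are illustrative only, and the template defaults are assumed:

    // 100 rows dealt out in blocks of 16 over 4 ranks, viewed from "rank".
    sddk::splindex_block_cyclic<> spl(100, n_blocks(4), block_id(rank), 16);
    for (int j = 0; j < spl.local_size(); j++) {
        int jglob = spl.global_index(j);   // local storage slot -> global row
    }
    auto loc = spl.location(25);           // global row -> owner and local slot
    bool stored_here = (loc.ib == rank);   // use loc.index_local if it is
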
diff --git a/src/mixer/mixer_functions.cpp b/src/mixer/mixer_functions.cpp
index caf8367317..0157cbc999 100644
--- a/src/mixer/mixer_functions.cpp
+++ b/src/mixer/mixer_functions.cpp
@@ -73,8 +73,8 @@ FunctionProperties> periodic_function_property()
             y.rg().value(i) = xi * -s + yi * c;
         }
         if (x.ctx().full_potential()) {
-            for (int ialoc = 0; ialoc < x.ctx().unit_cell().spl_num_atoms().local_size(); ialoc++) {
-                int ia = x.ctx().unit_cell().spl_num_atoms(ialoc);
+            for (auto it : x.ctx().unit_cell().spl_num_atoms()) {
+                int ia = it.i;
                 auto& x_f_mt = x.mt()[ia];
                 auto& y_f_mt = y.mt()[ia];
 #pragma omp for schedule(static) nowait
@@ -226,9 +226,8 @@ FunctionProperties> paw_density_function_property()
     auto scale_func = [](double alpha, PAW_density& x) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 x.ae_density(j, ia) *= alpha;
                 x.ps_density(j, ia) *= alpha;
@@ -238,9 +237,8 @@ FunctionProperties> paw_density_function_property()
     auto copy_function = [](PAW_density const& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 sddk::copy(x.ae_density(j, ia), y.ae_density(j, ia));
                 sddk::copy(x.ps_density(j, ia), y.ps_density(j, ia));
@@ -250,9 +248,8 @@ FunctionProperties> paw_density_function_property()
     auto axpy_function = [](double alpha, PAW_density const& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 y.ae_density(j, ia) = x.ae_density(j, ia) * alpha + y.ae_density(j, ia);
                 y.ps_density(j, ia) = x.ps_density(j, ia) * alpha + y.ps_density(j, ia);
@@ -262,9 +259,8 @@ FunctionProperties> paw_density_function_property()
     auto rotate_function = [](double c, double s, PAW_density& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 x.ae_density(j, ia) = x.ae_density(j, ia) * c + s * y.ae_density(j, ia);
                 y.ae_density(j, ia) = y.ae_density(j, ia) * c - s * x.ae_density(j, ia);
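
Every mixer functor above moves from an indexed loop over local_size() to the new iteration protocol: a split index is itself a range, and its iterator yields an object whose .i member is the global index and whose .li member is the local one. Schematically, where unit_cell, global_data and local_buffer are placeholders for whatever the loop body touches:

    // Visit only the atoms stored on this rank.
    for (auto it : unit_cell.spl_num_atoms()) {
        global_data[it.i] += local_buffer[it.li];   // global vs. local indexing
    }
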
diff --git a/src/nlcglib/adaptor.cpp b/src/nlcglib/adaptor.cpp
index 5fa0ff38e0..c0c75241e5 100644
--- a/src/nlcglib/adaptor.cpp
+++ b/src/nlcglib/adaptor.cpp
@@ -66,7 +66,7 @@ make_vector(std::vector const& wfct, Simulation_context const& ctx, K
     int num_spins = ctx.num_spins();
     int nb = ctx.num_bands();
     for (auto i = 0u; i < wfct.size(); ++i) {
-        auto gidk = kset.spl_num_kpoints(i); // global k-point index
+        auto gidk = kset.spl_num_kpoints().global_index(typename kp_index_t::local(i)); // global k-point index
         for (int ispn = 0; ispn < num_spins; ++ispn) {
             auto& array = wfct[i]->pw_coeffs(wf::spin_index(ispn));
             int lda = array.size(0);
@@ -105,24 +105,20 @@ Energy::Energy(K_point_set& kset, Density& density, Potential& potential)
     , density_(density)
     , potential_(potential)
 {
-    // intialize hphi and sphi and allocate (device) memory
     int nk = kset.spl_num_kpoints().local_size();
     auto& ctx = kset.ctx();
-    // auto& mpd = ctx.mem_pool(ctx.preferred_memory_t());
     hphis_.resize(nk);
-    // sphis.resize(nk);
     cphis_.resize(nk);
-    for (int i = 0; i < nk; ++i) {
-        auto global_kpoint_index = kset.spl_num_kpoints(i);
-        auto& kp = *kset.get(global_kpoint_index);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         sddk::memory_t preferred_memory_t = ctx.processing_unit_memory_t();
         auto num_mag_dims = wf::num_mag_dims(ctx.num_mag_dims());
         auto num_bands = wf::num_bands(ctx.num_bands());
         // make a new wf for Hamiltonian apply...
-        hphis_[i] =
+        hphis_[it.li] =
             std::make_shared>(kp.gkvec_sptr(), num_mag_dims, num_bands, preferred_memory_t);
-        cphis_[i] = &kp.spinor_wave_functions();
-        hphis_[i]->allocate(sddk::memory_t::host);
+        cphis_[it.li] = &kp.spinor_wave_functions();
+        hphis_[it.li]->allocate(sddk::memory_t::host);
     }
     // need to allocate wavefunctions on GPU
 }
@@ -133,7 +129,6 @@ Energy::compute()
     auto& ctx = kset_.ctx();
     int num_spins = ctx.num_spins();
     int num_bands = ctx.num_bands();
-    int nk = kset_.spl_num_kpoints().local_size();
 
     density_.generate(kset_, ctx.use_symmetry(), true /* add core */, true /* transform to rg */);
 
@@ -145,15 +140,15 @@ Energy::compute()
     auto proc_mem_t = ctx.processing_unit_memory_t();
 
     // apply Hamiltonian
-    for (int i = 0; i < nk; ++i) {
-        auto& kp = *kset_.get(kset_.spl_num_kpoints(i));
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         std::vector band_energies(num_bands);
 
-        auto mem_guard = cphis_[i]->memory_guard(proc_mem_t, wf::copy_to::device);
-        auto mem_guard_h = hphis_[i]->memory_guard(proc_mem_t, wf::copy_to::host);
+        auto mem_guard = cphis_[it.li]->memory_guard(proc_mem_t, wf::copy_to::device);
+        auto mem_guard_h = hphis_[it.li]->memory_guard(proc_mem_t, wf::copy_to::host);
 
         auto null_ptr_wfc = std::shared_ptr>();
-        apply_hamiltonian(H0, kp, *hphis_[i], kp.spinor_wave_functions(), null_ptr_wfc);
+        apply_hamiltonian(H0, kp, *hphis_[it.li], kp.spinor_wave_functions(), null_ptr_wfc);
         // compute band energies = diag()
         for (int ispn = 0; ispn < num_spins; ++ispn) {
             for (int jj = 0; jj < num_bands; ++jj) {
@@ -162,7 +157,7 @@ Energy::compute()
                 wf::band_range bandr{jj, jj + 1};
                 wf::inner(ctx.spla_context(), proc_mem_t, wf::spin_range(ispn),
                           /* bra */ kp.spinor_wave_functions(), bandr,
-                          /* ket */ *hphis_[i], bandr,
+                          /* ket */ *hphis_[it.li], bandr,
                           /*result*/ dmat, 0, 0);
                 kp.band_energy(jj, ispn, dmat(0, 0).real());
             }
@@ -210,22 +205,19 @@ Energy::get_C(nlcglib::memory_type memory = nlcglib::memory_type::none)
 std::shared_ptr Energy::get_fn()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     int nbands = kset_.ctx().num_bands();
     std::vector> fn;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             std::vector fn_local(nbands);
             for (int i = 0; i < nbands; ++i) {
                 fn_local[i] = kp.band_occupancy(i, ispn);
             }
             fn.push_back(std::move(fn_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i, ispn);
         }
     }
     return std::make_shared(fn, kindices, kset_.comm().native());
@@ -260,22 +252,19 @@ Energy::set_fn(const std::vector>& keys, const std::vector
 std::shared_ptr Energy::get_ek()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     int nbands = kset_.ctx().num_bands();
     std::vector> ek;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             std::vector ek_local(nbands);
             for (int i = 0; i < nbands; ++i) {
                 ek_local[i] = kp.band_energy(i, ispn);
             }
             ek.push_back(std::move(ek_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(ek, kindices, kset_.comm().native());
@@ -284,14 +273,11 @@ Energy::get_ek()
 std::shared_ptr Energy::get_gkvec_ekin()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     std::vector> gkvec_cart;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             int gkvec_count = kp.gkvec().count();
             auto& gkvec = kp.gkvec();
@@ -300,7 +286,7 @@ Energy::get_gkvec_ekin()
                 gkvec_local[i] = gkvec.gkvec_cart(i).length();
             }
             gkvec_cart.push_back(std::move(gkvec_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(gkvec_cart, kindices, kset_.comm().native());
@@ -309,19 +295,16 @@ Energy::get_gkvec_ekin()
 std::shared_ptr Energy::get_kpoint_weights()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     std::vector weights;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         // also return weights for every spin index
         for (int ispn = 0; ispn < ns; ++ispn) {
             weights.push_back(kp.weight());
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(weights, kindices, kset_.comm().native());
diff --git a/src/nlcglib/inverse_overlap.hpp b/src/nlcglib/inverse_overlap.hpp
index 5c0ea84a98..d74d32579b 100644
--- a/src/nlcglib/inverse_overlap.hpp
+++ b/src/nlcglib/inverse_overlap.hpp
@@ -296,7 +296,7 @@ S_k::apply(sddk::mdarray& Y, sddk::mdarray
-    q_op_.rmatmul(R, bphi, this->ispn_, pm, 1, 0);
+    q_op_.rmatmul(R, bphi, this->ispn_, pm, 1.0, 0.0);
 
     sddk::auto_copy(Y, X, pu);
diff --git a/src/nlcglib/overlap.hpp b/src/nlcglib/overlap.hpp
index 97e0cfe2b3..1800d4b842 100644
--- a/src/nlcglib/overlap.hpp
+++ b/src/nlcglib/overlap.hpp
@@ -60,12 +60,10 @@ template
 Overlap_operators::Overlap_operators(const K_point_set& kset, Simulation_context& ctx, const Q_operator& q_op)
 {
-    int nk = kset.spl_num_kpoints().local_size();
-    for (int ik_loc = 0; ik_loc < nk; ++ik_loc) {
-        int ik = kset.spl_num_kpoints(ik_loc);
-        auto& kp = *kset.get(ik);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         for (int ispn = 0; ispn < ctx.num_spins(); ++ispn) {
-            key_t key{ik, ispn};
+            key_t key{it.i.get(), ispn};
             data_[key] = std::make_shared(ctx, q_op, kp.beta_projectors(), ispn);
         }
     }
diff --git a/src/nlcglib/ultrasoft_precond.hpp b/src/nlcglib/ultrasoft_precond.hpp
index ea142a9898..737ede7333 100644
--- a/src/nlcglib/ultrasoft_precond.hpp
+++ b/src/nlcglib/ultrasoft_precond.hpp
@@ -59,12 +59,10 @@ class UltrasoftPrecond : public nlcglib::UltrasoftPrecondBase
 inline UltrasoftPrecond::UltrasoftPrecond(K_point_set const& kset, Simulation_context& ctx, Q_operator const& q_op)
 {
-    int nk = kset.spl_num_kpoints().local_size();
-    for (int ik_loc = 0; ik_loc < nk; ++ik_loc) {
-        int ik = kset.spl_num_kpoints(ik_loc);
-        auto& kp = *kset.get(ik);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         for (int ispn = 0; ispn < ctx.num_spins(); ++ispn) {
-            key_t key{ik, ispn};
+            key_t key{it.i.get(), ispn};
             data_[key] = std::make_shared(ctx, q_op, ispn, kp.beta_projectors(), kp.gkvec());
         }
     }
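
In the nlcglib adaptors the global index carried by the iterator is a strong_type rather than a plain int: k-point lookups accept it.i directly, while the plain-integer nlcglib key pairs unwrap it with .get(). The pattern condensed, with num_spins and the loop body as stand-ins:

    for (auto it : kset.spl_num_kpoints()) {
        auto& kp = *kset.get(it.i);          // strong-typed global k-point index
        for (int ispn = 0; ispn < num_spins; ++ispn) {
            key_t key{it.i.get(), ispn};     // .get() yields the raw integer
        }
    }
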
--- a/src/potential/paw_potential.cpp
+++ b/src/potential/paw_potential.cpp
@@ -36,16 +36,16 @@ void Potential::init_PAW()
     bool const is_global{true};
     paw_potential_ = std::make_unique>(unit_cell_, is_global);
 
-    paw_ae_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
+    paw_ae_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
             [this](int ia){return lmax_t(2 * this->unit_cell_.atom(ia).type().indexr().lmax());});
-    paw_ps_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
+    paw_ps_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
             [this](int ia){return lmax_t(2 * this->unit_cell_.atom(ia).type().indexr().lmax());});
 
     /* initialize dij matrix */
     paw_dij_.resize(unit_cell_.num_paw_atoms());
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        int ia = unit_cell_.paw_atom_index(i);
+        int ia = unit_cell_.paw_atom_index(paw_atom_index_t::global(i));
         paw_dij_[i] = sddk::mdarray(unit_cell_.atom(ia).mt_basis_size(),
                                     unit_cell_.atom(ia).mt_basis_size(), ctx_.num_mag_dims() + 1);
     }
@@ -64,16 +64,15 @@ void Potential::generate_PAW_effective_potential(Density const& density)
     paw_hartree_total_energy_ = 0.0;
 
     /* calculate xc and hartree for atoms */
-    for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-        int ia = unit_cell_.paw_atom_index(unit_cell_.spl_num_paw_atoms(i));
-        paw_hartree_total_energy_ += calc_PAW_local_potential(ia, density.paw_ae_density(ia),
-                density.paw_ps_density(ia));
+    for (auto it : unit_cell_.spl_num_paw_atoms()) {
+        paw_hartree_total_energy_ += calc_PAW_local_potential(it.i, density.paw_ae_density(it.i),
+                density.paw_ps_density(it.i));
     }
     comm_.allreduce(&paw_hartree_total_energy_, 1);
     paw_potential_->sync();
 
-    std::vector*> ae_comp;
-    std::vector*> ps_comp;
+    std::vector*> ae_comp;
+    std::vector*> ps_comp;
     for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
         ae_comp.push_back(&paw_potential_->ae_component(j));
         ps_comp.push_back(&paw_potential_->ps_component(j));
@@ -95,20 +94,19 @@ void Potential::generate_PAW_effective_potential(Density const& density)
     /* calculate PAW Dij matrix */
 #pragma omp parallel for
-    for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-        int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-        int ia = unit_cell_.paw_atom_index(ia_paw);
-        calc_PAW_local_Dij(ia, paw_dij_[ia_paw]);
+    for (auto it : unit_cell_.spl_num_paw_atoms()) {
+        auto ia = unit_cell_.paw_atom_index(it.i);
+        calc_PAW_local_Dij(ia, paw_dij_[it.i]);
     }
 
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        auto location = unit_cell_.spl_num_paw_atoms().location(i);
-        comm_.bcast(paw_dij_[i].at(sddk::memory_t::host), paw_dij_[i].size(), location.rank);
+        auto location = unit_cell_.spl_num_paw_atoms().location(typename paw_atom_index_t::global(i));
+        comm_.bcast(paw_dij_[i].at(sddk::memory_t::host), paw_dij_[i].size(), location.ib);
     }
 
     /* add paw Dij to uspp Dij */
 #pragma omp parallel for
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        int ia = unit_cell_.paw_atom_index(i);
+        auto ia = unit_cell_.paw_atom_index(typename paw_atom_index_t::global(i));
         auto& atom = unit_cell_.atom(ia);
 
         for (int imagn = 0; imagn < ctx_.num_mag_dims() + 1; imagn++) {
diff --git a/src/potential/poisson.cpp b/src/potential/poisson.cpp
index d45f6866a0..dc9a739786 100644
--- a/src/potential/poisson.cpp
+++ b/src/potential/poisson.cpp
@@ -248,27 +248,26 @@ void Potential::poisson(Periodic_function const& rho)
         }
     }
 
-    for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
-        int nmtp = unit_cell_.atom(ia).num_mt_points();
+    for (auto it : unit_cell_.spl_num_atoms()) {
+        int nmtp = unit_cell_.atom(it.i).num_mt_points();
 
         std::vector vlm(ctx_.lmmax_pot());
-        SHT::convert(ctx_.lmax_pot(), &vmtlm(0, ia), &vlm[0]);
+        SHT::convert(ctx_.lmax_pot(), &vmtlm(0, it.i), &vlm[0]);
 
         #pragma omp parallel for default(shared)
         for (int lm = 0; lm < ctx_.lmmax_pot(); lm++) {
             int l = l_by_lm_[lm];
 
             for (int ir = 0; ir < nmtp; ir++) {
-                hartree_potential_->mt()[ia](lm, ir) += vlm[lm] * rRl(ir, l, unit_cell_.atom(ia).type_id());
+                hartree_potential_->mt()[it.i](lm, ir) += vlm[lm] * rRl(ir, l, unit_cell_.atom(it.i).type_id());
             }
         }
         /* save electronic part of the potential at the point of origin */
 #ifdef __VHA_AUX
-        vh_el_(ia) = y00 * hartree_potential_->mt()[ia](0, 0) +
-            unit_cell_.atom(ia).zn() / unit_cell_.atom(ia).radial_grid(0);
+        vh_el_(it.i) = y00 * hartree_potential_->mt()[it.i](0, 0) +
+            unit_cell_.atom(it.i).zn() / unit_cell_.atom(it.i).radial_grid(0);
 #else
-        vh_el_(ia) = y00 * hartree_potential_->mt()[ia](0, 0);
+        vh_el_(it.i) = y00 * hartree_potential_->mt()[it.i](0, 0);
 #endif
     }
     ctx_.comm().allgather(vh_el_.at(sddk::memory_t::host), unit_cell_.spl_num_atoms().local_size(),
@@ -292,14 +291,13 @@ void Potential::poisson(Periodic_function const& rho)
     /* add nucleus potential and contribution to Hartree energy */
     if (ctx_.full_potential()) {
         double evha_nuc{0};
-        for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-            int ia = unit_cell_.spl_num_atoms(ialoc);
-            auto& atom = unit_cell_.atom(ia);
+        for (auto it : unit_cell_.spl_num_atoms()) {
+            auto& atom = unit_cell_.atom(it.i);
             Spline srho(atom.radial_grid());
             for (int ir = 0; ir < atom.num_mt_points(); ir++) {
                 double r = atom.radial_grid(ir);
-                hartree_potential_->mt()[ia](0, ir) -= atom.zn() / r / y00;
-                srho(ir) = rho.mt()[ia](0, ir) * r;
+                hartree_potential_->mt()[it.i](0, ir) -= atom.zn() / r / y00;
+                srho(ir) = rho.mt()[it.i](0, ir) * r;
             }
             evha_nuc -= atom.zn() * srho.interpolate().integrate(0) / y00;
         }
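
generate_PAW_effective_potential() above shows the owner-computes idiom under the new index: each rank fills only the D-matrices it owns, then every matrix is broadcast from its owner, whose rank location() now reports in the .ib field. Reduced to a skeleton, where uc, comm and compute_dij are placeholders:

    for (auto it : uc.spl_num_paw_atoms()) {
        compute_dij(paw_dij[it.i.get()]);    // local work on owned atoms only
    }
    for (int i = 0; i < uc.num_paw_atoms(); i++) {
        auto loc = uc.spl_num_paw_atoms().location(paw_atom_index_t::global(i));
        comm.bcast(paw_dij[i].at(sddk::memory_t::host), paw_dij[i].size(), loc.ib);
    }
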
diff --git a/src/potential/potential.hpp b/src/potential/potential.hpp
index 1db837cba1..67c2092b2f 100644
--- a/src/potential/potential.hpp
+++ b/src/potential/potential.hpp
@@ -120,10 +120,10 @@ class Potential : public Field4D
     std::unique_ptr> paw_potential_;
 
     /// Exchange-correlation energy density of PAW atoms pseudodensity.
-    std::unique_ptr> paw_ps_exc_;
+    std::unique_ptr> paw_ps_exc_;
 
     /// Exchange-correlation energy density of PAW atoms all-electron density.
-    std::unique_ptr> paw_ae_exc_;
+    std::unique_ptr> paw_ae_exc_;
 
     /// Contribution to D-operator matrix from the PAW atoms.
     std::vector> paw_dij_;
@@ -165,8 +165,8 @@ class Potential : public Field4D
         sddk::mdarray, 2> qmt(ctx_.lmmax_rho(), unit_cell_.num_atoms());
         qmt.zero();
 
-        for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-            int ia = unit_cell_.spl_num_atoms(ialoc);
+        for (auto it : unit_cell_.spl_num_atoms()) {
+            auto ia = it.i;
 
             auto qmt_re = poisson_vmt(unit_cell_.atom(ia), rho__.mt()[ia],
                 const_cast&>(hartree_potential_->mt()[ia]));
@@ -667,9 +667,8 @@ class Potential : public Field4D
         /* compute contribution from the core */
         double ecore{0};
         #pragma omp parallel for reduction(+:ecore)
-        for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-            int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-            int ia = unit_cell_.paw_atom_index(ia_paw);
+        for (auto it : unit_cell_.spl_num_paw_atoms()) {
+            auto ia = unit_cell_.paw_atom_index(it.i);
 
             auto& atom = unit_cell_.atom(ia);
@@ -701,10 +700,9 @@ class Potential : public Field4D
     {
         double e{0};
         #pragma omp parallel for reduction(+:e)
-        for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-            int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-            int ia = unit_cell_.paw_atom_index(ia_paw);
-            e += calc_PAW_one_elec_energy(unit_cell_.atom(ia), density__.density_matrix(ia), paw_dij_[ia_paw]);
+        for (auto it : unit_cell_.spl_num_paw_atoms()) {
+            auto ia = unit_cell_.paw_atom_index(it.i);
+            e += calc_PAW_one_elec_energy(unit_cell_.atom(ia), density__.density_matrix(ia), paw_dij_[it.i]);
         }
         comm_.allreduce(&e, 1);
         return e;
@@ -737,9 +735,9 @@ class Potential : public Field4D
         return *dveff_;
     }
 
-    auto const& effective_potential_mt(int ialoc) const
+    auto const& effective_potential_mt(atom_index_t::local ialoc) const
     {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
+        auto ia = unit_cell_.spl_num_atoms().global_index(ialoc);
         return this->scalar().mt()[ia];
     }
 
@@ -758,9 +756,9 @@ class Potential : public Field4D
         return *hartree_potential_;
     }
 
-    auto const& hartree_potential_mt(int ialoc) const
+    auto const& hartree_potential_mt(atom_index_t::local ialoc) const
     {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
+        auto ia = unit_cell_.spl_num_atoms().global_index(ialoc);
         return hartree_potential_->mt()[ia];
     }
diff --git a/src/potential/xc.cpp b/src/potential/xc.cpp
index ddcb8fb099..a735f95583 100644
--- a/src/potential/xc.cpp
+++ b/src/potential/xc.cpp
@@ -154,7 +154,7 @@ void Potential::xc_rg_nonmagnetic(Density const& density__)
     #pragma omp parallel
     {
         /* split local size between threads */
-        sddk::splindex spl_t(num_points, omp_get_num_threads(), omp_get_thread_num());
+        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()), block_id(omp_get_thread_num()));
         /* if this is an LDA functional */
         if (ixc.is_lda()) {
             ixc.get_lda(spl_t.local_size(), &rho.value(spl_t.global_offset()),
@@ -351,7 +351,7 @@ void Potential::xc_rg_magnetic(Density const& density__)
     #pragma omp parallel
     {
         /* split local size between threads */
-        sddk::splindex spl_t(num_points, omp_get_num_threads(), omp_get_thread_num());
+        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()), block_id(omp_get_thread_num()));
         /* if this is an LDA functional */
         if (ixc.is_lda()) {
            ixc.get_lda(spl_t.local_size(), &rho_up.value(spl_t.global_offset()),
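
The two xc_rg hunks above reuse the same split index to divide work between OpenMP threads rather than MPI ranks: every thread constructs its own view of the split and takes one contiguous block of the local grid points. The idiom in isolation, with num_points and process as placeholders:

    #include <omp.h>

    #pragma omp parallel
    {
        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()),
                                     block_id(omp_get_thread_num()));
        // this thread covers [global_offset(), global_offset() + local_size())
        process(spl_t.global_offset(), spl_t.local_size());
    }
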
diff --git a/src/potential/xc_mt.cpp b/src/potential/xc_mt.cpp
index a9655d2b6a..d334a6ef76 100644
--- a/src/potential/xc_mt.cpp
+++ b/src/potential/xc_mt.cpp
@@ -319,26 +319,25 @@ void Potential::xc_mt(Density const& density__)
     PROFILE("sirius::Potential::xc_mt");
 
     #pragma omp parallel for
-    for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
-        auto& rgrid = unit_cell_.atom(ia).radial_grid();
+    for (auto it : unit_cell_.spl_num_atoms()) {
+        auto& rgrid = unit_cell_.atom(it.i).radial_grid();
         std::vector rho(ctx_.num_mag_dims() + 1);
         std::vector vxc(ctx_.num_mag_dims() + 1);
-        rho[0] = &density__.rho().mt()[ia];
-        vxc[0] = &xc_potential_->mt()[ia];
+        rho[0] = &density__.rho().mt()[it.i];
+        vxc[0] = &xc_potential_->mt()[it.i];
         for (int j = 0; j < ctx_.num_mag_dims(); j++) {
-            rho[j + 1] = &density__.mag(j).mt()[ia];
-            vxc[j + 1] = &effective_magnetic_field(j).mt()[ia];
+            rho[j + 1] = &density__.mag(j).mt()[it.i];
+            vxc[j + 1] = &effective_magnetic_field(j).mt()[it.i];
         }
-        sirius::xc_mt(rgrid, *sht_, xc_func_, ctx_.num_mag_dims(), rho, vxc, &xc_energy_density_->mt()[ia]);
+        sirius::xc_mt(rgrid, *sht_, xc_func_, ctx_.num_mag_dims(), rho, vxc, &xc_energy_density_->mt()[it.i]);
 
         /* z, x, y order */
         std::array comp_map = {2, 0, 1};
         /* add auxiliary magnetic field antiparallel to starting magnetization */
         for (int j = 0; j < ctx_.num_mag_dims(); j++) {
             for (int ir = 0; ir < rgrid.num_points(); ir++) {
-                effective_magnetic_field(j).mt()[ia](0, ir) -=
-                    aux_bf_(j, ia) * ctx_.unit_cell().atom(ia).vector_field()[comp_map[j]];
+                effective_magnetic_field(j).mt()[it.i](0, ir) -=
+                    aux_bf_(j, it.i) * ctx_.unit_cell().atom(it.i).vector_field()[comp_map[j]];
             }
         }
     } // ialoc
diff --git a/src/radial/radial_integrals.cpp b/src/radial/radial_integrals.cpp
index b461b6f9d0..b59f4b56be 100644
--- a/src/radial/radial_integrals.cpp
+++ b/src/radial/radial_integrals.cpp
@@ -96,8 +96,8 @@ void Radial_integrals_aug::generate()
         }
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
 
             Spherical_Bessel_functions jl(2 * lmax_beta, atom_type.radial_grid(), grid_q_[iq]);
 
@@ -154,8 +154,8 @@ void Radial_integrals_rho_pseudo::generate()
         Spline rho(atom_type.radial_grid(), atom_type.ps_total_charge_density());
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(0, atom_type.radial_grid(), grid_q_[iq]);
 
             values_(iat)(iq) = sirius::inner(jl[0], rho, 0, atom_type.num_mt_points()) / fourpi;
@@ -182,8 +182,8 @@ void Radial_integrals_rho_core_pseudo::generate()
         Spline ps_core(atom_type.radial_grid(), atom_type.ps_core_charge_density());
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(0, atom_type.radial_grid(), grid_q_[iq]);
 
             if (jl_deriv) {
@@ -216,8 +216,8 @@ void Radial_integrals_beta::generate()
         }
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(unit_cell_.lmax(), atom_type.radial_grid(), grid_q_[iq]);
             for (int idxrf = 0; idxrf < nrb; idxrf++) {
                 int l = atom_type.indexr(idxrf).am.l();
@@ -313,8 +313,8 @@ void Radial_integrals_vloc::generate()
         auto rg = atom_type.radial_grid().segment(np);
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spline s(rg);
             double g = grid_q_[iq];
diff --git a/src/radial/radial_integrals.hpp b/src/radial/radial_integrals.hpp
index 2b0211642b..b54b70a949 100644
--- a/src/radial/radial_integrals.hpp
+++ b/src/radial/radial_integrals.hpp
@@ -43,7 +43,7 @@ class Radial_integrals_base
     Radial_grid grid_q_;
 
     /// Split index of q-points.
-    sddk::splindex spl_q_;
+    sddk::splindex_block<> spl_q_;
 
     /// Array with integrals.
     sddk::mdarray, N> values_;
@@ -64,8 +64,8 @@ class Radial_integrals_base
         qmax_ = qmax__ + std::max(10.0, qmax__ * 0.1);
         grid_q_ = Radial_grid_lin(static_cast(np__ * qmax_), 0, qmax_);
-        spl_q_ = sddk::splindex(grid_q_.num_points(), unit_cell_.comm().size(),
-            unit_cell_.comm().rank());
+        spl_q_ = sddk::splindex_block<>(grid_q_.num_points(), n_blocks(unit_cell_.comm().size()),
+            block_id(unit_cell_.comm().rank()));
     }
 
     /// Get starting index iq and delta dq for the q-point on the linear grid.
@@ -247,17 +247,17 @@ class Radial_integrals_rho_pseudo : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).ps_total_charge_density().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
@@ -291,17 +291,17 @@ class Radial_integrals_rho_core_pseudo : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).ps_core_charge_density().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
@@ -402,17 +402,17 @@ class Radial_integrals_vloc : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).local_potential().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
diff --git a/src/strong_type.hpp b/src/strong_type.hpp
index 4661638cac..52879220f7 100644
--- a/src/strong_type.hpp
+++ b/src/strong_type.hpp
@@ -1,7 +1,7 @@
 #ifndef __STRONG_TYPE_HPP__
 #define __STRONG_TYPE_HPP__
 
-namespace sirius {
+//namespace sirius {
 
 template
 class strong_type
@@ -52,6 +52,6 @@ class strong_type
     }
 };
 
-}
+//}
 
 #endif
diff --git a/src/symmetry/symmetrize_forces.hpp b/src/symmetry/symmetrize_forces.hpp
index 6d14722dba..2d134a6432 100644
--- a/src/symmetry/symmetrize_forces.hpp
+++ b/src/symmetry/symmetrize_forces.hpp
@@ -47,11 +47,11 @@ symmetrize_forces(Unit_cell const& uc__, sddk::mdarray& f__)
         for (int ia = 0; ia < uc__.num_atoms(); ia++) {
             r3::vector force_ia(&f__(0, ia));
             int ja = sym[isym].spg_op.sym_atom[ia];
-            auto location = uc__.spl_num_atoms().location(ja);
-            if (location.rank == uc__.comm().rank()) {
+            auto location = uc__.spl_num_atoms().location(typename atom_index_t::global(ja));
+            if (location.ib == uc__.comm().rank()) {
                 auto force_ja = dot(Rc, force_ia);
                 for (int x : {0, 1, 2}) {
-                    sym_forces(x, location.local_index) += force_ja[x];
+                    sym_forces(x, location.index_local) += force_ja[x];
                 }
             }
         }
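
The strong_type change above, which moves the class out of namespace sirius, is what lets n_blocks, block_id and the typed atom and k-point indices cross into the sddk:: interfaces. A strong type is an integer that does not convert implicitly, so a block count can no longer be passed where a block id is expected. A sketch of how such aliases could be declared, assuming the usual value-type plus tag template shape; the tag names here are invented:

    using n_blocks = strong_type<int, struct n_blocks_tag>;
    using block_id = strong_type<int, struct block_id_tag>;

    n_blocks nb(mpi_size);    // explicit construction only
    block_id ib(mpi_rank);
    int raw = ib.get();       // explicit unwrap, as in it.i.get() above
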
diff --git a/src/symmetry/symmetrize_mt_function.hpp b/src/symmetry/symmetrize_mt_function.hpp
index 054eafebcd..60ee682056 100644
--- a/src/symmetry/symmetrize_mt_function.hpp
+++ b/src/symmetry/symmetrize_mt_function.hpp
@@ -29,9 +29,10 @@
 namespace sirius {
 
+template
 inline void
 symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& comm__, int num_mag_dims__,
-        std::vector*> frlm__)
+        std::vector*> frlm__)
 {
     PROFILE("sirius::symmetrize_mt_function");
 
@@ -46,7 +47,7 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
     int lmax = utils::lmax(lmmax);
 
     /* split atoms between MPI ranks */
-    sddk::splindex spl_atoms(frlm.atoms().size(), comm__.size(), comm__.rank());
+    sddk::splindex_block spl_atoms(frlm.atoms().size(), n_blocks(comm__.size()), block_id(comm__.rank()));
 
     /* space for real Rlm rotation matrix */
     sddk::mdarray rotm(lmmax, lmmax);
@@ -67,9 +68,9 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
         /* compute Rlm rotation matrix */
         sht::rotation_matrix(lmax, sym__[i].spg_op.euler_angles, sym__[i].spg_op.proper, rotm);
 
-        for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) {
+        for (auto it : spl_atoms) {
             /* get global index of the atom */
-            int ia = frlm.atoms()[spl_atoms[ialoc]];
+            int ia = it.i;
             int lmmax_ia = frlm[ia].angular_domain_size();
             int nrmax_ia = frlm.unit_cell().atom(ia).num_mt_points();
             int ja = sym__[i].spg_op.inv_sym_atom[ia];
@@ -83,14 +84,14 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
             /* always symmetrize the scalar component */
             for (int ir = 0; ir < nrmax_ia; ir++) {
                 for (int lm = 0; lm < lmmax_ia; lm++) {
-                    fsym_loc(lm, ir, 0, ialoc) += ftmp(lm, ir, 0);
+                    fsym_loc(lm, ir, 0, it.li) += ftmp(lm, ir, 0);
                 }
             }
             /* apply S part to [0, 0, z] collinear vector */
             if (num_mag_dims__ == 1) {
                 for (int ir = 0; ir < nrmax_ia; ir++) {
                     for (int lm = 0; lm < lmmax_ia; lm++) {
-                        fsym_loc(lm, ir, 1, ialoc) += ftmp(lm, ir, 1) * S(2, 2);
+                        fsym_loc(lm, ir, 1, it.li) += ftmp(lm, ir, 1) * S(2, 2);
                     }
                 }
             }
@@ -100,7 +101,7 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
                 for (int j : {0, 1, 2}) {
                     for (int ir = 0; ir < nrmax_ia; ir++) {
                         for (int lm = 0; lm < lmmax_ia; lm++) {
-                            fsym_loc(lm, ir, 1 + k, ialoc) += ftmp(lm, ir, 1 + j) * S(k, j);
+                            fsym_loc(lm, ir, 1 + k, it.li) += ftmp(lm, ir, 1 + j) * S(k, j);
                         }
                     }
                 }
diff --git a/src/traits.hpp b/src/traits.hpp
index 4e76bfc409..c33452b087 100644
--- a/src/traits.hpp
+++ b/src/traits.hpp
@@ -1,5 +1,5 @@
-#ifndef TRAITS_H
-#define TRAITS_H
+#ifndef __TRAITS_HPP__
+#define __TRAITS_HPP__
 
 namespace sirius {
 
@@ -14,4 +14,4 @@ using identity_t = typename identity::type;
 
 } // namespace sirius
 
-#endif /* TRAITS_H */
+#endif /* __TRAITS_HPP__ */
diff --git a/src/typedefs.hpp b/src/typedefs.hpp
index cedb083bbe..3ca8e90c9a 100644
--- a/src/typedefs.hpp
+++ b/src/typedefs.hpp
@@ -26,7 +26,6 @@
 #define __TYPEDEFS_HPP__
 
 #include
-//#include
 #include
 #include
 #include
diff --git a/src/unit_cell/atom.hpp b/src/unit_cell/atom.hpp
index ef9aa0adff..d1f2731472 100644
--- a/src/unit_cell/atom.hpp
+++ b/src/unit_cell/atom.hpp
@@ -151,7 +151,7 @@ class Atom
             RTE_THROW("not yet mpi parallel");
         }
 
-        sddk::splindex spl_lm(lmmax, comm__.size(), comm__.rank());
+        sddk::splindex_block<> spl_lm(lmmax, n_blocks(comm__.size()), block_id(comm__.rank()));
 
         auto l_by_lm = utils::l_by_lm(lmax_pot_);
diff --git a/src/unit_cell/basis_functions_index.hpp b/src/unit_cell/basis_functions_index.hpp
index 9ccad9ef75..e928e1645d 100644
--- a/src/unit_cell/basis_functions_index.hpp
+++ b/src/unit_cell/basis_functions_index.hpp
@@ -72,9 +72,6 @@ class basis_functions_index
 
     sddk::mdarray index_by_lm_order_;
 
-    /// Maximum l of the radial basis functions.
-    int lmax_{-1};
-
     int offset_lo_{-1};
 
     std::vector offset_;
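
The unit_cell and potential headers finish the migration by typing the index domains themselves: atom_index_t::local for on-rank positions and atom_index_t::global (with the paw_atom_index_t analogues) for positions in the full list, and global_index() and location() convert between the two. Both directions schematically, where spl, comm, mt_data and use are placeholders:

    atom_index_t::global ia(5);              // position in the full atom list
    auto loc = spl.location(ia);             // owning block plus local slot
    if (loc.ib == comm.rank()) {
        use(mt_data[loc.index_local]);       // safe: this rank stores atom 5
    }
    auto ia0 = spl.global_index(atom_index_t::local(0));   // local -> global
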
diff --git a/src/unit_cell/unit_cell.cpp b/src/unit_cell/unit_cell.cpp
index 369e84c539..e277786bef 100644
--- a/src/unit_cell/unit_cell.cpp
+++ b/src/unit_cell/unit_cell.cpp
@@ -531,13 +531,12 @@ Unit_cell::generate_radial_functions(std::ostream& out__)
 {
     PROFILE("sirius::Unit_cell::generate_radial_functions");
 
-    for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) {
-        int ic = spl_num_atom_symmetry_classes(icloc);
-        atom_symmetry_class(ic).generate_radial_functions(parameters_.valence_relativity());
+    for (auto it : spl_num_atom_symmetry_classes()) {
+        atom_symmetry_class(it.i).generate_radial_functions(parameters_.valence_relativity());
     }
 
     for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) {
-        int rank = spl_num_atom_symmetry_classes().local_rank(ic);
+        int rank = spl_num_atom_symmetry_classes().location(typename atom_symmetry_class_index_t::global(ic)).ib;
         atom_symmetry_class(ic).sync_radial_functions(comm_, rank);
     }
 
@@ -547,9 +546,8 @@ Unit_cell::generate_radial_functions(std::ostream& out__)
         pout << std::endl << "Linearization energies" << std::endl;
     }
 
-    for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) {
-        int ic = spl_num_atom_symmetry_classes(icloc);
-        atom_symmetry_class(ic).write_enu(pout);
+    for (auto it : spl_num_atom_symmetry_classes()) {
+        atom_symmetry_class(it.i).write_enu(pout);
     }
 
     RTE_OUT(out__) << pout.flush(0);
 }
@@ -566,13 +564,12 @@ Unit_cell::generate_radial_integrals()
     PROFILE("sirius::Unit_cell::generate_radial_integrals");
 
     try {
-        for (int icloc = 0; icloc < spl_num_atom_symmetry_classes().local_size(); icloc++) {
-            int ic = spl_num_atom_symmetry_classes(icloc);
-            atom_symmetry_class(ic).generate_radial_integrals(parameters_.valence_relativity());
+        for (auto it : spl_num_atom_symmetry_classes()) {
+            atom_symmetry_class(it.i).generate_radial_integrals(parameters_.valence_relativity());
         }
 
         for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) {
-            int rank = spl_num_atom_symmetry_classes().local_rank(ic);
+            int rank = spl_num_atom_symmetry_classes().location(typename atom_symmetry_class_index_t::global(ic)).ib;
             atom_symmetry_class(ic).sync_radial_integrals(comm_, rank);
         }
     } catch(std::exception const& e) {
@@ -582,13 +579,12 @@ Unit_cell::generate_radial_integrals()
     }
 
     try {
-        for (int ialoc = 0; ialoc < spl_num_atoms_.local_size(); ialoc++) {
-            int ia = spl_num_atoms_[ialoc];
-            atom(ia).generate_radial_integrals(parameters_.processing_unit(), mpi::Communicator::self());
+        for (auto it : spl_num_atoms_) {
+            atom(it.i).generate_radial_integrals(parameters_.processing_unit(), mpi::Communicator::self());
         }
 
         for (int ia = 0; ia < num_atoms(); ia++) {
-            int rank = spl_num_atoms().local_rank(ia);
+            int rank = spl_num_atoms().location(typename atom_index_t::global(ia)).ib;
             atom(ia).sync_radial_integrals(comm_, rank);
         }
     } catch(std::exception const& e) {
@@ -665,7 +661,7 @@ Unit_cell::initialize()
     PROFILE("sirius::Unit_cell::initialize");
 
     /* split number of atom between all MPI ranks */
-    spl_num_atoms_ = sddk::splindex(num_atoms(), comm_.size(), comm_.rank());
+    spl_num_atoms_ = sddk::splindex_block(num_atoms(), n_blocks(comm_.size()), block_id(comm_.rank()));
 
     /* initialize atom types */
     for (int iat = 0; iat < num_atom_types(); iat++) {
@@ -873,8 +869,8 @@ Unit_cell::update()
 
     get_symmetry();
 
-    spl_num_atom_symmetry_classes_ = sddk::splindex(num_atom_symmetry_classes(),
-        comm_.size(), comm_.rank());
+    spl_num_atom_symmetry_classes_ = sddk::splindex_block(num_atom_symmetry_classes(),
+        n_blocks(comm_.size()), block_id(comm_.rank()));
 
     volume_mt_ = 0.0;
     if (parameters_.full_potential()) {
@@ -957,7 +953,8 @@ Unit_cell::init_paw()
         }
     }
 
-    spl_num_paw_atoms_ = sddk::splindex(num_paw_atoms(), comm_.size(), comm_.rank());
+    spl_num_paw_atoms_ = sddk::splindex_block(num_paw_atoms(), n_blocks(comm_.size()),
+        block_id(comm_.rank()));
 }
 
 std::pair>
diff --git a/src/unit_cell/unit_cell.hpp b/src/unit_cell/unit_cell.hpp
index bff0c0cb03..116def490b 100644
--- a/src/unit_cell/unit_cell.hpp
+++ b/src/unit_cell/unit_cell.hpp
@@ -58,16 +58,16 @@ class Unit_cell
     std::vector> atoms_;
 
     /// Split index of atoms.
-    sddk::splindex spl_num_atoms_;
+    sddk::splindex_block spl_num_atoms_;
 
     /// Global index of atom by index of PAW atom.
     std::vector paw_atom_index_;
 
     /// Split index of PAW atoms.
-    sddk::splindex spl_num_paw_atoms_;
+    sddk::splindex_block spl_num_paw_atoms_;
 
     /// Split index of atom symmetry classes.
-    sddk::splindex spl_num_atom_symmetry_classes_;
+    sddk::splindex_block spl_num_atom_symmetry_classes_;
 
     /// Bravais lattice vectors in column order.
     /** The following convention is used to transform fractional coordinates to Cartesian:
@@ -174,13 +174,13 @@ class Unit_cell
         return spl_num_paw_atoms_;
     }
 
-    inline int spl_num_paw_atoms(int idx__) const
+    inline auto spl_num_paw_atoms(paw_atom_index_t::local idx__) const
     {
-        return spl_num_paw_atoms_[idx__];
+        return spl_num_paw_atoms_.global_index(idx__);
     }
 
     /// Return global index of atom by the index in the list of PAW atoms.
-    inline int paw_atom_index(int ipaw__) const
+    inline int paw_atom_index(paw_atom_index_t::global ipaw__) const
     {
         return paw_atom_index_[ipaw__];
     }
@@ -513,21 +513,11 @@ class Unit_cell
         return spl_num_atoms_;
     }
 
-    inline auto spl_num_atoms(int i) const
-    {
-        return static_cast(spl_num_atoms_[i]);
-    }
-
     inline auto const& spl_num_atom_symmetry_classes() const
     {
         return spl_num_atom_symmetry_classes_;
     }
 
-    inline auto spl_num_atom_symmetry_classes(int i) const
-    {
-        return static_cast(spl_num_atom_symmetry_classes_[i]);
-    }
-
     inline double volume_mt() const
     {
         return volume_mt_;