From 687ccaab934322fe0126fb5978a7e0de49e106e1 Mon Sep 17 00:00:00 2001 From: Anton Kozhevnikov Date: Thu, 13 Jul 2023 13:44:32 +0200 Subject: [PATCH] [feature] new split index implementation (#862) new split index implementation --- apps/tests/test_allgather.cpp | 4 +- apps/tests/test_wf_ortho.cpp | 7 +- apps/tests/test_wf_trans.cpp | 16 +- apps/unit_tests/test_splindex.cpp | 44 +- python_module/py_sirius.cpp | 2 +- src/SDDK/splindex.hpp | 635 +++++++++--------- src/SDDK/wave_functions.hpp | 44 +- src/api/sirius_api.cpp | 13 +- src/band/band.cpp | 5 +- src/band/band.hpp | 30 +- src/band/diag_full_potential.cpp | 9 +- src/band/residuals.hpp | 10 +- src/band/solve.cpp | 5 +- src/density/density.cpp | 84 ++- src/density/density.hpp | 17 +- src/dft/dft_ground_state.cpp | 12 +- src/dft/energy.cpp | 7 +- src/fft/fft.hpp | 2 +- src/fft/gvec.cpp | 8 +- src/fft/gvec.hpp | 6 +- src/function3d/field4d.hpp | 2 +- src/function3d/paw_field4d.hpp | 8 +- src/function3d/periodic_function.hpp | 9 +- src/function3d/spheric_function_set.hpp | 83 ++- src/geometry/force.cpp | 30 +- src/geometry/non_local_functor.hpp | 7 +- src/geometry/stress.cpp | 15 +- src/hamiltonian/hamiltonian.cpp | 19 +- src/hamiltonian/hamiltonian_k.cpp | 58 +- src/hamiltonian/non_local_operator.hpp | 5 +- src/hamiltonian/non_local_operator_base.hpp | 20 +- .../hubbard_occupancies_derivatives.cpp | 4 +- src/k_point/generate_fv_states.cpp | 15 +- src/k_point/k_point.cpp | 39 +- src/k_point/k_point_set.cpp | 79 +-- src/k_point/k_point_set.hpp | 26 +- src/linalg/dmatrix.cpp | 98 +-- src/linalg/dmatrix.hpp | 32 +- src/mixer/mixer_functions.cpp | 24 +- src/nlcglib/adaptor.cpp | 65 +- src/nlcglib/inverse_overlap.hpp | 2 +- src/nlcglib/overlap.hpp | 8 +- src/nlcglib/ultrasoft_precond.hpp | 8 +- src/potential/paw_potential.cpp | 30 +- src/potential/poisson.cpp | 24 +- src/potential/potential.hpp | 28 +- src/potential/xc.cpp | 4 +- src/potential/xc_mt.cpp | 19 +- src/radial/radial_integrals.cpp | 20 +- src/radial/radial_integrals.hpp | 24 +- src/strong_type.hpp | 4 +- src/symmetry/symmetrize_forces.hpp | 6 +- src/symmetry/symmetrize_mt_function.hpp | 15 +- src/traits.hpp | 6 +- src/typedefs.hpp | 1 - src/unit_cell/atom.hpp | 2 +- src/unit_cell/basis_functions_index.hpp | 3 - src/unit_cell/unit_cell.cpp | 35 +- src/unit_cell/unit_cell.hpp | 22 +- 59 files changed, 898 insertions(+), 961 deletions(-) diff --git a/apps/tests/test_allgather.cpp b/apps/tests/test_allgather.cpp index ad790ccf80..4efa77a0d8 100644 --- a/apps/tests/test_allgather.cpp +++ b/apps/tests/test_allgather.cpp @@ -7,10 +7,10 @@ void test_allgather() int N = 11; std::vector vec(N, 0.0); - sddk::splindex spl(N, mpi::Communicator::world().size(), mpi::Communicator::world().rank()); + sddk::splindex_block<> spl(N, n_blocks(mpi::Communicator::world().size()), block_id(mpi::Communicator::world().rank())); for (int i = 0; i < spl.local_size(); i++) { - vec[spl[i]] = mpi::Communicator::world().rank() + 1.0; + vec[spl.global_index(i)] = mpi::Communicator::world().rank() + 1.0; } { diff --git a/apps/tests/test_wf_ortho.cpp b/apps/tests/test_wf_ortho.cpp index 1828630dad..cf1936b1cf 100644 --- a/apps/tests/test_wf_ortho.cpp +++ b/apps/tests/test_wf_ortho.cpp @@ -39,10 +39,9 @@ void test_wf_ortho(BLACS_grid const& blacs_grid__, double cutoff__, int num_band for (int igloc = 0; igloc < gvec->count(); igloc++) { phi.pw_coeffs(igloc, s, wf::band_index(i)) = utils::random>(); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - 
for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) = utils::random>(); + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) = utils::random>(); } } } diff --git a/apps/tests/test_wf_trans.cpp b/apps/tests/test_wf_trans.cpp index 2949087152..dc81715ed3 100644 --- a/apps/tests/test_wf_trans.cpp +++ b/apps/tests/test_wf_trans.cpp @@ -37,10 +37,9 @@ void test_wf_trans(la::BLACS_grid const& blacs_grid__, double cutoff__, int num_ for (int igloc = 0; igloc < gvec->count(); igloc++) { phi.pw_coeffs(igloc, s, wf::band_index(i)) = utils::random>(); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) = utils::random>(); + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) = utils::random>(); } } } @@ -86,12 +85,11 @@ void test_wf_trans(la::BLACS_grid const& blacs_grid__, double cutoff__, int num_ phi.pw_coeffs(igloc, s, wf::band_index(i)) - psi.pw_coeffs(igloc, s, wf::band_index(num_bands__ - i - 1))); } - for (int ialoc = 0; ialoc < phi.spl_num_atoms().local_size(); ialoc++) { - int ia = phi.spl_num_atoms()[ialoc]; - for (int xi = 0; xi < num_mt_coeffs[ia]; xi++) { + for (auto it : phi.spl_num_atoms()) { + for (int xi = 0; xi < num_mt_coeffs[it.i]; xi++) { diff += std::abs( - phi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(i)) - - psi.mt_coeffs(xi, wf::atom_index(ialoc), s, wf::band_index(num_bands__ - i - 1))); + phi.mt_coeffs(xi, it.li, s, wf::band_index(i)) - + psi.mt_coeffs(xi, it.li, s, wf::band_index(num_bands__ - i - 1))); } } } diff --git a/apps/unit_tests/test_splindex.cpp b/apps/unit_tests/test_splindex.cpp index 52e6809860..dfd0150441 100644 --- a/apps/unit_tests/test_splindex.cpp +++ b/apps/unit_tests/test_splindex.cpp @@ -8,10 +8,10 @@ int test1() { for (int num_ranks = 1; num_ranks < 20; num_ranks++) { for (int N = 1; N < 1130; N++) { - splindex spl(N, num_ranks, 0); + splindex_block<> spl(N, n_blocks(num_ranks), block_id(0)); int sz = 0; for (int i = 0; i < num_ranks; i++) { - sz += (int)spl.local_size(i); + sz += spl.local_size(block_id(i)); } if (sz != N) { std::stringstream s; @@ -20,21 +20,21 @@ int test1() s << "computed global index size: " << sz << std::endl; s << "number of ranks: " << num_ranks << std::endl; for (int i = 0; i < num_ranks; i++) { - s << "i, local_size(i): " << i << ", " << spl.local_size(i) << std::endl; + s << "i, local_size(i): " << i << ", " << spl.local_size(block_id(i)) << std::endl; } throw std::runtime_error(s.str()); } for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = (int)spl.local_index(i); - if (i != (int)spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != (int)spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test1: wrong index." 
<< std::endl; s << "global index size: " << N << std::endl; s << "number of ranks: " << num_ranks << std::endl; s << "global index: " << i << std::endl; s << "rank, offset: " << rank << ", " << offset << std::endl; - s << "computed global index: " << spl.global_index(offset, rank) << std::endl; + s << "computed global index: " << spl.global_index(offset, block_id(rank)) << std::endl; throw std::runtime_error(s.str()); } } @@ -48,22 +48,28 @@ int test2() for (int bs = 1; bs < 17; bs++) { for (int num_ranks = 1; num_ranks < 13; num_ranks++) { for (int N = 1; N < 1113; N++) { - splindex spl(N, num_ranks, 0, bs); + sddk::splindex_block_cyclic<> spl(N, n_blocks(num_ranks), block_id(0), bs); int sz = 0; for (int i = 0; i < num_ranks; i++) { - sz += (int)spl.local_size(i); + sz += (int)spl.local_size(block_id(i)); } if (sz != N) { std::stringstream s; - s << "test2: wrong sum of local sizes" << std::endl; + s << "test2: wrong sum of local sizes" << std::endl + << "N : " << N << std::endl + << "num_ranks :" << num_ranks << std::endl + << "block size : " << bs << std::endl; + for (int i = 0; i < num_ranks; i++) { + s << "rank, local_size : " << i << ", " << spl.local_size(block_id(i)) << std::endl; + } throw std::runtime_error(s.str()); } for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = (int)spl.local_index(i); - if (i != (int)spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != (int)spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test2: wrong index" << std::endl; s << "bs = " << bs << std::endl @@ -72,7 +78,7 @@ int test2() << "idx = " << i << std::endl << "rank = " << rank << std::endl << "offset = " << offset << std::endl - << "computed index = " << spl.global_index(offset, rank) << std::endl; + << "computed index = " << spl.global_index(offset, block_id(rank)) << std::endl; throw std::runtime_error(s.str()); } } @@ -86,14 +92,14 @@ int test3() { for (int num_ranks = 1; num_ranks < 20; num_ranks++) { for (int N = 1; N < 1130; N++) { - splindex spl_tmp(N, num_ranks, 0); + splindex_block<> spl_tmp(N, n_blocks(num_ranks), block_id(0)); - splindex spl(N, num_ranks, 0, spl_tmp.counts()); + splindex_chunk<> spl(N, n_blocks(num_ranks), block_id(0), spl_tmp.counts()); for (int i = 0; i < N; i++) { - int rank = spl.local_rank(i); - int offset = spl.local_index(i); - if (i != spl.global_index(offset, rank)) { + int rank = spl.location(block_id(i)).ib; + int offset = spl.location(block_id(i)).index_local; + if (i != spl.global_index(offset, block_id(rank))) { std::stringstream s; s << "test3: wrong index" << std::endl; throw std::runtime_error(s.str()); diff --git a/python_module/py_sirius.cpp b/python_module/py_sirius.cpp index 05c95aad05..d789712b15 100644 --- a/python_module/py_sirius.cpp +++ b/python_module/py_sirius.cpp @@ -409,7 +409,7 @@ PYBIND11_MODULE(py_sirius, m) [](K_point_set& ks, int i) -> K_point& { if (i >= ks.spl_num_kpoints().local_size()) throw pybind11::index_error("out of bounds"); - return *ks.get(ks.spl_num_kpoints(i)); + return *ks.get(ks.spl_num_kpoints().global_index(typename kp_index_t::local(i))); }, py::return_value_policy::reference_internal) .def("__len__", [](K_point_set const& ks) { return ks.spl_num_kpoints().local_size(); }) diff --git a/src/SDDK/splindex.hpp b/src/SDDK/splindex.hpp index 026b6b71d9..cc3d808ed7 100644 --- a/src/SDDK/splindex.hpp +++ b/src/SDDK/splindex.hpp @@ -25,26 +25,57 @@ #ifndef __SPLINDEX_HPP__ #define 
__SPLINDEX_HPP__ -#include -#include -#include -#include -#include +#include "strong_type.hpp" +#include "utils/rte.hpp" +#include + +/// Basic index type. +template +struct basic_index_t { + typedef T value_type; + typedef T global; + typedef T local; +}; -namespace sddk { +/// K-point index type. +struct kp_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; -/// Type of split index. -enum class splindex_t -{ - /// Block distribution. - block, - /// Block-cyclic distribution. - block_cyclic, - /// Custom distribution in continuous chunks of arbitrary size. - chunk +struct atom_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +struct atom_type_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; }; -/// Type of index domain. +struct atom_symmetry_class_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +struct paw_atom_index_t { + typedef int value_type; + typedef strong_type global; + typedef strong_type local; +}; + +/// Number of blocks to which the global index is split. +using n_blocks = strong_type; +/// ID of the block. +/** The id of the block has the range [0, n_blocks) */ +using block_id = strong_type; + +namespace sddk { + enum class index_domain_t { /// Global index. @@ -54,456 +85,420 @@ enum class index_domain_t }; /// Base class for split index. -template -class splindex_base +template > +class splindex { + public: + using value_type = typename Index_t::value_type; protected: - /// Rank of the block with local fraction of the global index. - int rank_{-1}; + /// Number of blocks over which the global index is distributed. + n_blocks n_blocks_{-1}; - /// Number of ranks over which the global index is distributed. - int num_ranks_{-1}; + /// Index of the block with local fraction of the global index. + block_id block_id_{-1}; - /// size of the global index - T global_index_size_; + /// Size (aka length) of the global index. + value_type size_{-1}; - /// Default constructor. - splindex_base() - { - } - - /// Pair of describing the location of a global index. + /// Pair of describing the location of a global index element. struct location_t { - T local_index; - int rank; - location_t(T local_index__, int rank__) - : local_index(local_index__) - , rank(rank__) + /// Local index inside a block. + typename Index_t::local index_local; + /// Index of the block. + block_id ib; + /// Constructor. + location_t(typename Index_t::local index_local__, block_id ib__) + : index_local(index_local__) + , ib(ib__) { } }; public: - /// Rank id. - inline int rank() const - { - return rank_; - } - /// Number of ranks that are participating in the distribution of an index. - inline int num_ranks() const - { - return num_ranks_; - } - - inline T global_index_size() const + /// Default constructor. + splindex() { - return global_index_size_; } - static inline T block_size(T size__, int num_ranks__) - { - return size__ / num_ranks__ + std::min(T(1), size__ % num_ranks__); - } -}; - -/// Split index. -template -class splindex : public splindex_base -{ -}; - -/// Specialization for the block distribution. -template -class splindex : public splindex_base -{ - private: - T block_size_; - - void init(T global_index_size__, int num_ranks__, int rank__) + /// Constructor. + /** Check and set index size, number of blocks and block id. 
*/ + splindex(value_type size__, n_blocks n_blocks__, block_id block_id__) { - this->global_index_size_ = global_index_size__; - - if (num_ranks__ < 0) { + if (size__ < 0) { std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; + s << "wrong size : " << size__; throw std::runtime_error(s.str()); } - this->num_ranks_ = num_ranks__; + this->size_ = size__; - if (rank__ < 0 || rank__ >= num_ranks__) { + if (n_blocks__.get() < 0) { std::stringstream s; - s << "wrong rank: " << rank__; + s << "wrong number of blocks : " << n_blocks__.get(); throw std::runtime_error(s.str()); } - this->rank_ = rank__; + this->n_blocks_ = n_blocks__; - block_size_ = this->block_size(global_index_size__, num_ranks__); + if (block_id__.get() < 0 || block_id__.get() >= n_blocks__.get()) { + std::stringstream s; + s << "wrong rank block id : " << block_id__.get(); + throw std::runtime_error(s.str()); + } + this->block_id_ = block_id__; } - public: - /// Default constructor - splindex() + virtual ~splindex() { } - /// Constructor. - splindex(T global_index_size__, int num_ranks__, int rank__) - { - init(global_index_size__, num_ranks__, rank__); - } + /// Return local size of the split index for a given block. + virtual value_type local_size(block_id block_id__) const = 0; - /// Return "local index, rank" pair for a global index. - inline typename splindex_base::location_t location(T idxglob__) const - { - assert(idxglob__ < this->global_index_size_); + /// Return location (block_id and local offset) of the global index. + virtual location_t location(typename Index_t::global idx__) const = 0; - int rank = int(idxglob__ / block_size_); - T idxloc = idxglob__ - rank * block_size_; + /// Return global index by block id and local index. + virtual typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const = 0; - return typename splindex_base::location_t(idxloc, rank); + /// Return local size for the current block. + value_type local_size() const + { + return this->local_size(this->block_id_); } - /// Return local size of the split index for an arbitrary rank. - inline T local_size(int rank__) const + /// Return global index of an element by local index and block id. + inline auto global_index(typename splindex::location_t loc__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); - - if (this->global_index_size_ == 0) { - return 0; - } - - int n = static_cast(this->global_index_size_ / block_size_); - if (rank__ < n) { - return block_size_; - } else if (rank__ == n) { - return this->global_index_size_ - rank__ * block_size_; - } else { - return 0; - } + return this->global_index(loc__.index_local, loc__.ib); } - /// Return local size of the split index for a current rank. - inline T local_size() const + inline auto global_index(typename Index_t::local idxloc__) const { - return local_size(this->rank_); + return this->global_index(idxloc__, this->block_id_); } - /// Return rank which holds the element with the given global index. - inline int local_rank(T idxglob__) const + /// Return total length of the index (global number of elements). + inline auto size() const noexcept { - return location(idxglob__).rank; + return size_; } - /// Return local index of the element for the rank which handles the given global index. - inline T local_index(T idxglob__) const + /// Compute size of the block from global index size and number of blocks. 
+ static inline auto block_size(value_type size__, n_blocks n_blocks__) { - return location(idxglob__).local_index; + return size__ / n_blocks__ + std::min(value_type(1), size__ % n_blocks__); } +}; - /// Return global index of an element by local index and rank. - inline T global_index(T idxloc__, int rank__) const - { - assert(rank__ >= 0 && rank__ < this->num_ranks_); +template +class splindex_iterator_t +{ + private: + splindex const* idx_{nullptr}; + public: + using difference_type = std::ptrdiff_t; + typename Index_t::local li; - if (local_size(rank__) == 0) { - return std::numeric_limits::max(); - } + splindex_iterator_t& operator=(splindex_iterator_t const& lhs_) = default; - assert(idxloc__ < local_size(rank__)); + splindex_iterator_t(splindex const& idx__) + : idx_{&idx__} + , li{0} + { + } - return rank__ * block_size_ + idxloc__; + inline bool operator!=(splindex_iterator_t const& rhs__) + { + return this->li != rhs__.li; } - inline T global_offset() const + inline splindex_iterator_t& operator++() { - return global_index(0, this->rank_); + this->li++; + return *this; } - inline T global_offset(int rank__) const + inline splindex_iterator_t operator++(int) { - return global_index(0, rank__); + splindex_iterator_t tmp(this->idx()); + this->li++; + return tmp; } - inline T operator[](T idxloc__) const + inline auto operator*() { - return global_index(idxloc__, this->rank_); + struct { + typename Index_t::global i; + typename Index_t::local li; } ret{idx_->global_index(this->li), this->li}; + return ret; } - inline std::vector offsets() const + inline difference_type operator-(splindex_iterator_t const& rhs__) const { - std::vector v(this->num_ranks_); - for (int i = 0; i < this->num_ranks_; i++) { - v[i] = global_offset(i); - } - return v; + return li - rhs__.li; } - inline std::vector counts() const + inline splindex_iterator_t& operator+=(difference_type rhs__) { - std::vector v(this->num_ranks_); - for (int i = 0; i < this->num_ranks_; i++) { - v[i] = local_size(i); - } - return v; + li += rhs__; + return *this; } }; -/// Specialization for the block-cyclic distribution. -template -class splindex : public splindex_base +template > +class splindex_block : public splindex { + public: + using value_type = typename splindex::value_type; private: - /// cyclic block size of the distribution - int block_size_{-1}; + /// Local index size of a given block. + value_type block_size_; + public: + splindex_block() + { + } + /// Constructor. + splindex_block(value_type size__, n_blocks n_blocks__, block_id block_id__) + : splindex(size__, n_blocks__, block_id__) + { + this->block_size_ = this->block_size(size__, n_blocks__); + } - // Check and initialize variables. - void init(T global_index_size__, int num_ranks__, int rank__, int block_size__) + using splindex::local_size; + + /// Return local size of the split index for a given block. 
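// Worked example of the block-size rule defined just above, as a minimal sketch
// (the include path and the free function name are illustrative, not part of the patch):
#include "SDDK/splindex.hpp"
#include <cassert>

inline void block_size_example()
{
    /* 11 elements over 4 blocks: 11/4 + min(1, 11%4) = 3, i.e. the global index
       is covered by blocks of local sizes 3, 3, 3 and 2 */
    assert(sddk::splindex_block<>::block_size(11, n_blocks(4)) == 3);
}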
+ inline value_type local_size(block_id block_id__) const { - this->global_index_size_ = global_index_size__; + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - if (num_ranks__ < 0) { - std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; - throw std::runtime_error(s.str()); + if (this->size_ == 0) { + return 0; } - this->num_ranks_ = num_ranks__; - if (rank__ < 0 || rank__ >= num_ranks__) { - std::stringstream s; - s << "wrong rank: " << rank__; - throw std::runtime_error(s.str()); + auto n = static_cast(this->size_ / block_size_); + if (block_id__ < n) { + return block_size_; + } else { + return std::max(0, this->size_ - block_id__ * block_size_); } - this->rank_ = rank__; + } - if (block_size__ <= 0) { - std::stringstream s; - s << "wrong block size: " << block_size__; - throw std::runtime_error(s.str()); - } - block_size_ = block_size__; + /// Return "local index, rank" pair for a global index. + inline typename splindex::location_t location(typename Index_t::global idx__) const + { + RTE_ASSERT(idx__ < this->size_); + + auto ib = static_cast(idx__ / this->block_size_); + value_type idxloc = idx__ - ib * this->block_size_; + + return typename splindex::location_t(typename Index_t::local(idxloc), block_id(ib)); } - public: - struct iterator - { - T idxloc_; - T idxglob_; - T num_blocks_min_; - int block_size_; - int rank_; - int num_ranks_; - - iterator(T idxglob__, int num_ranks__, int block_size__) - : idxglob_(idxglob__) - , num_ranks_(num_ranks__) - , block_size_(block_size__) - { - /* number of full blocks */ - T num_blocks = idxglob__ / block_size_; - num_blocks_min_ = num_blocks / num_ranks_; - idxloc_ = num_blocks_min_ * block_size_ + idxglob_ % block_size_; - rank_ = static_cast(num_blocks % num_ranks_); - } + using splindex::global_index; - bool operator!=(iterator const& rhs__) const - { - return idxglob_ != rhs__.idxglob_; - } + /// Return global index of an element by local index and block id. + inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const + { + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - iterator& operator++() - { - idxglob_++; - idxloc_++; - if (idxloc_ % block_size_ == 0) { - rank_++; - if (rank_ % num_ranks_ == 0) { - num_blocks_min_++; - rank_ = 0; - } - idxloc_ = num_blocks_min_ * block_size_;// + idxglob_ % block_size_; - } + if (this->local_size(block_id__) == 0) { + return typename Index_t::global(-1); } - }; - /// Default constructor - splindex() - { + RTE_ASSERT(idxloc__ < local_size(block_id__)); + + return typename Index_t::global(this->block_size_ * block_id__ + idxloc__); } - /// Constructor with implicit cyclic block size - splindex(T global_index_size__, int num_ranks__, int rank__, int bs__) + inline auto global_offset() const { - init(global_index_size__, num_ranks__, rank__, bs__); + return this->global_index(typename Index_t::local(0), this->block_id_); } - iterator at(T idxglob__) const + inline auto global_offset(block_id iblock__) const { - return iterator(idxglob__, this->num_ranks_, this->block_size_); + return this->global_index(typename Index_t::local(0), iblock__); } - /// Return "local index, rank" pair for a global index. 
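// A minimal usage sketch of splindex_block, mirroring test_allgather.cpp above;
// rank/size stand in for an MPI communicator and the function name is illustrative:
#include "SDDK/splindex.hpp"
#include <cassert>
#include <vector>

inline void splindex_block_example(int rank, int size)
{
    const int N = 11;
    sddk::splindex_block<> spl(N, n_blocks(size), block_id(rank));
    std::vector<double> vec(N, 0.0);
    /* write only the local fraction of the global array */
    for (int i = 0; i < spl.local_size(); i++) {
        vec[spl.global_index(i)] = rank + 1.0;
    }
    /* map a global index back to its owner: loc.ib is the owning block id,
       loc.index_local the offset inside that block */
    auto loc = spl.location(N / 2);
    assert(spl.global_index(loc.index_local, loc.ib) == N / 2);
}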
- inline typename splindex_base::location_t location(T idxglob__) const + inline auto counts() const { - assert(idxglob__ < this->global_index_size_); - - /* number of full blocks */ - T num_blocks = idxglob__ / block_size_; - - /* local index */ - T idxloc = (num_blocks / this->num_ranks_) * block_size_ + idxglob__ % block_size_; - - /* corresponding rank */ - int rank = static_cast(num_blocks % this->num_ranks_); + std::vector v(this->n_blocks_); + for (int i = 0; i < this->n_blocks_; i++) { + v[i] = local_size(block_id(i)); + } + return v; + } +}; - return typename splindex_base::location_t(idxloc, rank); +template > +class splindex_block_cyclic : public splindex +{ + public: + using value_type = typename splindex::value_type; + private: + /// Cyclic block size. + value_type block_size_; + public: + splindex_block_cyclic() + { } + /// Constructor. + splindex_block_cyclic(value_type size__, n_blocks n_blocks__, block_id block_id__, value_type block_size__) + : splindex(size__, n_blocks__, block_id__) + , block_size_{block_size__} + { + } + + using splindex::local_size; - /// Return local size of the split index for an arbitrary rank. - inline T local_size(int rank__) const + /// Return local size of the split index for a given block. + inline value_type local_size(block_id block_id__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + if (this->size_ == 0) { + return 0; + } /* number of full blocks */ - T num_blocks = this->global_index_size_ / block_size_; - - T n = (num_blocks / this->num_ranks_) * block_size_; + auto num_blocks = this->size() / this->block_size_; - int rank_offs = static_cast(num_blocks % this->num_ranks_); + auto n = (num_blocks / this->n_blocks_) * this->block_size_; + auto rank_offs = static_cast(num_blocks % this->n_blocks_); - if (rank__ < rank_offs) { - n += block_size_; - } else if (rank__ == rank_offs) { - n += this->global_index_size_ % block_size_; + if (block_id__ < rank_offs) { + n += this->block_size_; + } else if (block_id__ == rank_offs) { + n += this->size_ % this->block_size_; } return n; } - /// Return local size of the split index for a current rank. - inline T local_size() const + /// Return "local index, rank" pair for a global index. + inline typename splindex::location_t location(typename Index_t::global idx__) const { - return local_size(this->rank_); - } + RTE_ASSERT(idx__ < this->size_); - /// Return rank which holds the element with the given global index. - inline int local_rank(T idxglob__) const - { - return location(idxglob__).rank; - } + /* number of full blocks */ + auto num_blocks = idx__ / this->block_size_; - /// Return local index of the element for the rank which handles the given global index. - inline T local_index(T idxglob__) const - { - return location(idxglob__).local_index; + /* local index */ + value_type idxloc = (num_blocks / this->n_blocks_) * block_size_ + idx__ % this->block_size_; + + /* corresponding rank */ + auto ib = static_cast(num_blocks % this->n_blocks_); + + return typename splindex::location_t(typename Index_t::local(idxloc), block_id(ib)); } - /// Get a global index by local index of a rank. - inline T global_index(T idxloc__, int rank__) const + using splindex::global_index; + + /// Return global index of an element by local index and block id. 
+ inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const { - assert(rank__ >= 0 && rank__ < this->num_ranks_); - assert(idxloc__ < local_size(rank__)); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + RTE_ASSERT(idxloc__ < local_size(block_id__)); - T nb = idxloc__ / block_size_; + auto nb = idxloc__ / this->block_size_; - return (nb * this->num_ranks_ + rank__) * block_size_ + idxloc__ % block_size_; + return typename Index_t::global((nb * this->n_blocks_ + block_id__) * this->block_size_ + + idxloc__ % this->block_size_); } - /// Get global index of this rank. - inline T operator[](T idxloc__) const - { - return global_index(idxloc__, this->rank_); - } }; -/// Specialization for the block distribution. -template -class splindex : public splindex_base +/// Externally defined block distribution. +template > +class splindex_chunk : public splindex { + public: + using value_type = typename splindex::value_type; private: - std::vector> global_index_; - std::vector::location_t> locations_; + std::vector> global_index_; + std::vector::location_t> locations_; public: /// Default constructor. - splindex() + splindex_chunk() { } /// Constructor with specific partitioning. - splindex(T global_index_size__, int num_ranks__, int rank__, std::vector const& counts__) - { - this->global_index_size_ = global_index_size__; - - if (num_ranks__ < 0) { - std::stringstream s; - s << "wrong number of ranks: " << num_ranks__; - throw std::runtime_error(s.str()); - } - this->num_ranks_ = num_ranks__; - - if (rank__ < 0 || rank__ >= num_ranks__) { - std::stringstream s; - s << "wrong rank: " << rank__; - throw std::runtime_error(s.str()); - } - this->rank_ = rank__; - - for (int r = 0; r < num_ranks__; r++) { - global_index_.push_back(std::vector()); - for (int i = 0; i < counts__[r]; i++) { - global_index_.back().push_back(static_cast(locations_.size())); - locations_.push_back(typename splindex_base::location_t(i, r)); + splindex_chunk(value_type size__, n_blocks n_blocks__, block_id block_id__, std::vector const counts__) + : splindex(size__, n_blocks__, block_id__) + { + for (int r = 0; r < n_blocks__.get(); r++) { + global_index_.push_back(std::vector()); + for (value_type i = 0; i < counts__[r]; i++) { + global_index_.back().push_back(static_cast(locations_.size())); + locations_.push_back(typename splindex::location_t(typename Index_t::local(i), block_id(r))); } } - assert(static_cast(locations_.size()) == global_index_size__); + RTE_ASSERT(static_cast(locations_.size()) == this->size()); } - inline T local_size(int rank__) const - { - assert(rank__ >= 0); - assert(rank__ < this->num_ranks_); - return static_cast(global_index_[rank__].size()); - } + using splindex::local_size; - inline T local_size() const + inline value_type local_size(block_id block_id__) const { - return local_size(this->rank_); - } + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); - inline int local_rank(T idxglob__) const - { - return locations_[idxglob__].rank; + if (this->size_ == 0) { + return 0; + } + return static_cast(global_index_[block_id__].size()); } - inline T local_index(T idxglob__) const + inline typename splindex::location_t location(typename Index_t::global idx__) const { - return locations_[idxglob__].local_index; + return locations_[idx__]; } - inline T global_index(T idxloc__, int rank__) const - { - if (local_size(rank__) == 0) { - return std::numeric_limits::max(); - } + using splindex::global_index; - assert(idxloc__ < 
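// The round trip checked in test_splindex.cpp (test2) for the block-cyclic case,
// condensed into a sketch with illustrative sizes:
#include "SDDK/splindex.hpp"
#include <cassert>

inline void splindex_block_cyclic_example()
{
    const int N = 113, bs = 16;
    sddk::splindex_block_cyclic<> spl(N, n_blocks(4), block_id(0), bs);
    for (int i = 0; i < N; i++) {
        /* (local index, block id) identifies each element uniquely */
        auto loc = spl.location(i);
        assert(spl.global_index(loc.index_local, loc.ib) == i);
    }
}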
local_size(rank__)); - - return global_index_[rank__][idxloc__]; - } - - inline T operator[](T idxloc__) const + inline typename Index_t::global global_index(typename Index_t::local idxloc__, block_id block_id__) const { - return global_index(idxloc__, this->rank_); + RTE_ASSERT(block_id__ >= 0 && block_id__ < this->n_blocks_); + RTE_ASSERT(idxloc__ < local_size(block_id__)); + + return typename Index_t::global(global_index_[block_id__][idxloc__]); } - inline T global_offset() const + inline auto global_offset() const { - return global_index(0, this->rank_); + return this->global_index(typename Index_t::local(0), this->block_id_); } }; +template +auto begin_global(splindex const& a__) +{ + return typename Index_t::global(0); +} + +template +auto end_global(splindex const& a__) +{ + return typename Index_t::global(a__.size()); +} + +template +auto begin(splindex const& a__) +{ + splindex_iterator_t it(a__); + it.li = typename Index_t::local(0); + return it; +} + +template +auto end(splindex const& a__) +{ + splindex_iterator_t it(a__); + it.li = typename Index_t::local(a__.local_size()); + return it; +} + } // namespace sddk #endif // __SPLINDEX_HPP__ diff --git a/src/SDDK/wave_functions.hpp b/src/SDDK/wave_functions.hpp index e202622253..28f6e0c25a 100644 --- a/src/SDDK/wave_functions.hpp +++ b/src/SDDK/wave_functions.hpp @@ -113,10 +113,8 @@ auto checksum_gpu(std::complex const* wf__, int ld__, int num_rows_loc__, int /// Namespace for the wave-functions. namespace wf { -using sirius::strong_type; - using spin_index = strong_type; -using atom_index = strong_type; +//using atom_index = strong_type; using band_index = strong_type; using num_bands = strong_type; @@ -494,7 +492,7 @@ class Wave_functions_mt : public Wave_functions_base /// Total number of atoms. int num_atoms_{0}; /// Distribution of atoms between MPI ranks. - sddk::splindex spl_num_atoms_; + sddk::splindex_block spl_num_atoms_; /// Local size of muffin-tin coefficients for each rank. /** Each rank stores local fraction of atoms. Each atom has a set of MT coefficients. */ mpi::block_data_descriptor mt_coeffs_distr_; @@ -509,10 +507,12 @@ class Wave_functions_mt : public Wave_functions_base static int get_local_num_mt_coeffs(std::vector num_mt_coeffs__, mpi::Communicator const& comm__) { int num_atoms = static_cast(num_mt_coeffs__.size()); - sddk::splindex spl_atoms(num_atoms, comm__.size(), comm__.rank()); - auto it_begin = num_mt_coeffs__.begin() + spl_atoms.global_offset(); - auto it_end = it_begin + spl_atoms.local_size(); - return std::accumulate(it_begin, it_end, 0); + sddk::splindex_block spl_atoms(num_atoms, n_blocks(comm__.size()), block_id(comm__.rank())); + int result{0}; + for (auto it : spl_atoms) { + result += num_mt_coeffs__[it.i]; + } + return result; } /// Construct without muffin-tin part. 
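// The free begin()/end() defined above make the local fraction of a split index
// iterable in a range-based for loop; dereferencing yields the global index (i)
// and the local index (li). A minimal sketch with the strongly-typed atom index
// (the function name and sizes are illustrative):
#include "SDDK/splindex.hpp"
#include <cassert>

inline void iterate_local_fraction(int rank, int size, int num_atoms)
{
    sddk::splindex_block<atom_index_t> spl(num_atoms, n_blocks(size), block_id(rank));
    for (auto it : spl) {
        /* it.i : atom_index_t::global, it.li : atom_index_t::local */
        assert(spl.global_index(it.li).get() == it.i.get());
    }
}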
@@ -520,7 +520,7 @@ class Wave_functions_mt : public Wave_functions_base sddk::memory_t default_mem__, int num_pw__) : Wave_functions_base(num_pw__, 0, num_md__, num_wf__, default_mem__) , comm_{comm__} - , spl_num_atoms_{sddk::splindex(num_atoms_, comm_.size(), comm_.rank())} + , spl_num_atoms_{sddk::splindex_block(num_atoms_, n_blocks(comm_.size()), block_id(comm_.rank()))} { } @@ -537,13 +537,13 @@ class Wave_functions_mt : public Wave_functions_base default_mem__) , comm_{comm__} , num_atoms_{static_cast(num_mt_coeffs__.size())} - , spl_num_atoms_{sddk::splindex(num_atoms_, comm_.size(), comm_.rank())} + , spl_num_atoms_{sddk::splindex_block(num_atoms_, n_blocks(comm_.size()), block_id(comm_.rank()))} , num_mt_coeffs_{num_mt_coeffs__} { mt_coeffs_distr_ = mpi::block_data_descriptor(comm_.size()); for (int ia = 0; ia < num_atoms_; ia++) { - int rank = spl_num_atoms_.local_rank(ia); + auto rank = spl_num_atoms_.location(atom_index_t::global(ia)).ib; if (rank == comm_.rank()) { offset_in_local_mt_coeffs_.push_back(mt_coeffs_distr_.counts[rank]); } @@ -555,14 +555,14 @@ class Wave_functions_mt : public Wave_functions_base /// Return reference to the coefficient by atomic orbital index, atom, spin and band indices. inline auto& - mt_coeffs(int xi__, atom_index ia__, spin_index ispn__, band_index i__) + mt_coeffs(int xi__, atom_index_t::local ia__, spin_index ispn__, band_index i__) { return this->data_[ispn__.get()](this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], i__.get()); } /// Return const reference to the coefficient by atomic orbital index, atom, spin and band indices. inline auto const& - mt_coeffs(int xi__, atom_index ia__, spin_index ispn__, band_index i__) const + mt_coeffs(int xi__, atom_index_t::local ia__, spin_index ispn__, band_index i__) const { return this->data_[ispn__.get()](this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], i__.get()); } @@ -571,14 +571,14 @@ class Wave_functions_mt : public Wave_functions_base /// Return const pointer to the coefficient by atomic orbital index, atom, spin and band indices. inline std::complex const* - at(sddk::memory_t mem__, int xi__, atom_index ia__, spin_index s__, band_index b__) const + at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) const { return this->data_[s__.get()].at(mem__, this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], b__.get()); } /// Return pointer to the coefficient by atomic orbital index, atom, spin and band indices. inline auto - at(sddk::memory_t mem__, int xi__, atom_index ia__, spin_index s__, band_index b__) + at(sddk::memory_t mem__, int xi__, atom_index_t::local ia__, spin_index s__, band_index b__) { return this->data_[s__.get()].at(mem__, this->num_pw_ + xi__ + offset_in_local_mt_coeffs_[ia__.get()], b__.get()); } @@ -837,7 +837,7 @@ class Wave_functions_fft : public Wave_functions_base /// Pointer to FFT-friendly G+k vector deistribution. std::shared_ptr gkvec_fft_; /// Split number of wave-functions between column communicator. - sddk::splindex spl_num_wf_; + sddk::splindex_block<> spl_num_wf_; /// Pointer to the original wave-functions. 
Wave_functions* wf_{nullptr}; /// Spin-index of the wave-function component @@ -866,7 +866,7 @@ class Wave_functions_fft : public Wave_functions_base std::vector colsplit(comm_col.size() + 1); colsplit[0] = 0; for (int i = 0; i < comm_col.size(); i++) { - colsplit[i + 1] = colsplit[i] + spl_num_wf_.local_size(i); + colsplit[i + 1] = colsplit[i] + spl_num_wf_.local_size(block_id(i)); } std::vector owners(gkvec_fft_->gvec().comm().size()); @@ -930,8 +930,8 @@ class Wave_functions_fft : public Wave_functions_base /* send and receive dimensions */ mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size()); for (int j = 0; j < comm_col.size(); j++) { - sd.counts[j] = spl_num_wf_.local_size(j) * row_distr.counts[comm_col.rank()]; - rd.counts[j] = spl_num_wf_.local_size(comm_col.rank()) * row_distr.counts[j]; + sd.counts[j] = spl_num_wf_.local_size(block_id(j)) * row_distr.counts[comm_col.rank()]; + rd.counts[j] = spl_num_wf_.local_size(block_id(comm_col.rank())) * row_distr.counts[j]; } sd.calc_offsets(); rd.calc_offsets(); @@ -1003,8 +1003,8 @@ class Wave_functions_fft : public Wave_functions_base /* send and receive dimensions */ mpi::block_data_descriptor sd(comm_col.size()), rd(comm_col.size()); for (int j = 0; j < comm_col.size(); j++) { - sd.counts[j] = spl_num_wf_.local_size(comm_col.rank()) * row_distr.counts[j]; - rd.counts[j] = spl_num_wf_.local_size(j) * row_distr.counts[comm_col.rank()]; + sd.counts[j] = spl_num_wf_.local_size(block_id(comm_col.rank())) * row_distr.counts[j]; + rd.counts[j] = spl_num_wf_.local_size(block_id(j)) * row_distr.counts[comm_col.rank()]; } sd.calc_offsets(); rd.calc_offsets(); @@ -1069,7 +1069,7 @@ class Wave_functions_fft : public Wave_functions_base , shuffle_flag_{shuffle_flag___} { auto& comm_col = gkvec_fft_->comm_ortho_fft(); - spl_num_wf_ = sddk::splindex(br__.size(), comm_col.size(), comm_col.rank()); + spl_num_wf_ = sddk::splindex_block<>(br__.size(), n_blocks(comm_col.size()), block_id(comm_col.rank())); this->num_mt_ = 0; this->num_md_ = wf::num_mag_dims(0); this->num_sc_ = wf::num_spins(1); diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp index 8edbff7163..b075dbd244 100644 --- a/src/api/sirius_api.cpp +++ b/src/api/sirius_api.cpp @@ -3302,11 +3302,11 @@ sirius_get_wave_functions(void* const* ks_handler__, double const* vkl__, int co ks.comm().bcast(&spin, 1, dest_rank); /* rank where k-point vkl resides on the SIRIUS side */ - int src_rank = ks.spl_num_kpoints().local_rank(jk); + int src_rank = ks.spl_num_kpoints().location(typename kp_index_t::global(jk)).ib; if (ks.comm().rank() == src_rank || ks.comm().rank() == dest_rank) { /* send G+k copy to destination rank (where host code receives the data) */ - auto gkvec = ks.get_gkvec(jk, dest_rank); + auto gkvec = ks.get_gkvec(typename kp_index_t::global(jk), dest_rank); sddk::mdarray, 2> wf; if (ks.comm().rank() == dest_rank) { @@ -3963,7 +3963,7 @@ sirius_get_gkvec_arrays(void* const* ks_handler__, int* ik__, int* num_gkvec__, auto kp = ks.get(*ik__ - 1); /* get rank that stores a given k-point */ - int rank = ks.spl_num_kpoints().local_rank(*ik__ - 1); + int rank = ks.spl_num_kpoints().location(kp_index_t::global(*ik__ - 1)).ib; auto& comm_k = ks.ctx().comm_k(); @@ -5229,7 +5229,7 @@ sirius_get_rg_values(void* const* handler__, char const* label__, int const* gri for (int rank = 0; rank < fft_comm.size(); rank++) { /* slab of FFT grid for a given rank */ - sddk::mdarray buf(f->spfft().dim_x(), f->spfft().dim_y(), spl_z.local_size(rank)); + sddk::mdarray 
buf(f->spfft().dim_x(), f->spfft().dim_y(), spl_z.local_size(block_id(rank))); if (rank == fft_comm.rank()) { std::copy(&f->value(0), &f->value(0) + f->spfft().local_slice_size(), &buf[0]); } @@ -5247,9 +5247,10 @@ sirius_get_rg_values(void* const* handler__, char const* label__, int const* gri for (int iz = 0; iz < nz; iz++) { /* global z coordinate inside FFT box */ int z = local_box_origin(2, r) + iz - 1; /* Fortran counts from 1 */ - if (z >= spl_z.global_offset(rank) && z < spl_z.global_offset(rank) + spl_z.local_size(rank)) { + if (z >= spl_z.global_offset(block_id(rank)) && z < spl_z.global_offset(block_id(rank)) + + spl_z.local_size(block_id(rank))) { /* make z local for SIRIUS FFT partitioning */ - z -= spl_z.global_offset(rank); + z -= spl_z.global_offset(block_id(rank)); for (int iy = 0; iy < ny; iy++) { /* global y coordinate inside FFT box */ int y = local_box_origin(1, r) + iy - 1; /* Fortran counts from 1 */ diff --git a/src/band/band.cpp b/src/band/band.cpp index 881fa35432..01cd7a5c93 100644 --- a/src/band/band.cpp +++ b/src/band/band.cpp @@ -53,9 +53,8 @@ Band::initialize_subspace(K_point_set& kset__, Hamiltonian0& H0__) const N = unit_cell_.num_ps_atomic_wf().first; } - for (int ikloc = 0; ikloc < kset__.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset__.spl_num_kpoints(ikloc); - auto kp = kset__.get(ik); + for (auto it: kset__.spl_num_kpoints()) { + auto kp = kset__.get(it.i); auto Hk = H0__(*kp); if (ctx_.gamma_point() && (ctx_.so_correction() == false)) { ::sirius::initialize_subspace(Hk, N); diff --git a/src/band/band.hpp b/src/band/band.hpp index 15ff268cf3..5fb4652f02 100644 --- a/src/band/band.hpp +++ b/src/band/band.hpp @@ -88,10 +88,12 @@ class Band // TODO: Band class is lightweight and in principle can be converted /* copy old N - num_locked x N - num_locked distributed matrix */ if (N__ > 0) { - sddk::splindex spl_row(N__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); if (mtrx_old__) { if (spl_row.local_size()) { @@ -135,10 +137,12 @@ class Band // TODO: Band class is lightweight and in principle can be converted } if (ctx_.print_checksum()) { - sddk::splindex spl_row(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); auto cs = mtrx__.checksum(N__ + n__ - num_locked__, N__ + n__ - num_locked__); if (ctx_.comm_band().rank() == 0) { utils::print_checksum("subspace_mtrx", cs, RTE_OUT(std::cout)); @@ -150,10 +154,12 @@ class Band // TODO: Band class is lightweight and in principle 
can be converted /* save new matrix */ if (mtrx_old__) { - sddk::splindex spl_row(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_row(), mtrx__.blacs_grid().rank_row(), mtrx__.bs_row()); - sddk::splindex spl_col(N__ + n__ - num_locked__, - mtrx__.blacs_grid().num_ranks_col(), mtrx__.blacs_grid().rank_col(), mtrx__.bs_col()); + sddk::splindex_block_cyclic<> spl_row(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_row()), block_id(mtrx__.blacs_grid().rank_row()), + mtrx__.bs_row()); + sddk::splindex_block_cyclic<> spl_col(N__ + n__ - num_locked__, + n_blocks(mtrx__.blacs_grid().num_ranks_col()), block_id(mtrx__.blacs_grid().rank_col()), + mtrx__.bs_col()); if (spl_row.local_size()) { #pragma omp parallel for schedule(static) diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp index 2415dcfdfb..ef2b56744a 100644 --- a/src/band/diag_full_potential.cpp +++ b/src/band/diag_full_potential.cpp @@ -300,11 +300,10 @@ void Band::diag_full_potential_first_variation_davidson(Hamiltonian_k& H /* add pure local orbitals to the basis staring from ncomp index */ if (nlo) { - for (int ialoc = 0; ialoc < phi_extra_new->spl_num_atoms().local_size(); ialoc++) { - int ia = phi_extra_new->spl_num_atoms()[ialoc]; - for (int xi = 0; xi < unit_cell_.atom(ia).mt_lo_basis_size(); xi++) { - phi_extra_new->mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), - wf::band_index(offset_lo[ia] + xi + ncomp)) = 1.0; + for (auto it : phi_extra_new->spl_num_atoms()) { + for (int xi = 0; xi < unit_cell_.atom(it.i).mt_lo_basis_size(); xi++) { + phi_extra_new->mt_coeffs(xi, it.li, wf::spin_index(0), + wf::band_index(offset_lo[it.i] + xi + ncomp)) = 1.0; } } } diff --git a/src/band/residuals.hpp b/src/band/residuals.hpp index 0034458ade..4a7bd1ef05 100644 --- a/src/band/residuals.hpp +++ b/src/band/residuals.hpp @@ -340,11 +340,13 @@ residuals(Simulation_context& ctx__, sddk::memory_t mem__, wf::spin_range sr__, auto pos_src = evec__.spl_col().location(ev_idx[j]); auto pos_dest = evec_tmp.spl_col().location(j); /* do MPI send / receive */ - if (pos_src.rank == evec__.blacs_grid().comm_col().rank() && num_rows_local) { - evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.local_index), num_rows_local, pos_dest.rank, ev_idx[j]); + if (pos_src.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) { + evec__.blacs_grid().comm_col().isend(&evec__(0, pos_src.index_local), num_rows_local, pos_dest.ib, + ev_idx[j]); } - if (pos_dest.rank == evec__.blacs_grid().comm_col().rank() && num_rows_local) { - evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.local_index), num_rows_local, pos_src.rank, ev_idx[j]); + if (pos_dest.ib == evec__.blacs_grid().comm_col().rank() && num_rows_local) { + evec__.blacs_grid().comm_col().recv(&evec_tmp(0, pos_dest.index_local), num_rows_local, pos_src.ib, + ev_idx[j]); } } } diff --git a/src/band/solve.cpp b/src/band/solve.cpp index 7de53db43c..af9932c201 100644 --- a/src/band/solve.cpp +++ b/src/band/solve.cpp @@ -154,9 +154,8 @@ Band::solve(K_point_set& kset__, Hamiltonian0& H0__, double itsol_tol__) cons int num_dav_iter{0}; /* solve secular equation and generate wave functions */ - for (int ikloc = 0; ikloc < kset__.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset__.spl_num_kpoints(ikloc); - auto kp = kset__.get(ik); + for (auto it : kset__.spl_num_kpoints()) { + auto kp = kset__.get(it.i); auto Hk = H0__(*kp); if (ctx_.full_potential()) { diff --git a/src/density/density.cpp b/src/density/density.cpp index 
5b1398797f..b6839905fd 100644 --- a/src/density/density.cpp +++ b/src/density/density.cpp @@ -391,10 +391,10 @@ Density::initial_density_full_pot() /* initialize the magnetization */ if (ctx_.num_mag_dims()) { - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - r3::vector v = unit_cell_.atom(ia).vector_field(); - double len = v.length(); + for (auto it : unit_cell_.spl_num_atoms()) { + int ia = it.i; + auto v = unit_cell_.atom(ia).vector_field(); + auto len = v.length(); int nmtp = unit_cell_.atom(ia).num_mt_points(); Spline rho_s(unit_cell_.atom(ia).type().radial_grid()); @@ -436,7 +436,7 @@ void Density::init_density_matrix_for_paw() { for (int ipaw = 0; ipaw < unit_cell_.num_paw_atoms(); ipaw++) { - int ia = unit_cell_.paw_atom_index(ipaw); + int ia = unit_cell_.paw_atom_index(paw_atom_index_t::global(ipaw)); auto& dm = density_matrix(ia); dm.zero(); @@ -479,10 +479,10 @@ Density::init_density_matrix_for_paw() } void -Density::generate_paw_atom_density(int ialoc__) +Density::generate_paw_atom_density(paw_atom_index_t::local ialoc__) { - int ia_paw = ctx_.unit_cell().spl_num_paw_atoms(ialoc__); - int ia = ctx_.unit_cell().paw_atom_index(ia_paw); + auto ia_paw = ctx_.unit_cell().spl_num_paw_atoms(ialoc__); + auto ia = ctx_.unit_cell().paw_atom_index(ia_paw); auto& atom_type = ctx_.unit_cell().atom(ia).type(); @@ -570,8 +570,8 @@ Density::generate_paw_loc_density() PROFILE("sirius::Density::generate_paw_loc_density"); #pragma omp parallel for - for (int ialoc = 0; ialoc < unit_cell_.spl_num_paw_atoms().local_size(); ialoc++) { - generate_paw_atom_density(ialoc); + for (auto it : unit_cell_.spl_num_paw_atoms()) { + generate_paw_atom_density(it.li); } } @@ -713,7 +713,7 @@ Density::add_k_point_contribution_rg(K_point* kp__, std::arrayband_occupancy(j, ispn) * kp__->weight() / omega; auto inp_wf = wf_fft__[ispn].pw_coeffs_spfft(wf_mem, wf::band_index(i)); @@ -732,7 +732,7 @@ Density::add_k_point_contribution_rg(K_point* kp__, std::arrayband_occupancy(j, 0) * kp__->weight() / omega; auto wf_mem_up = wf_fft__[0].on_device() ? 
sddk::memory_t::device : sddk::memory_t::host; @@ -793,17 +793,18 @@ add_k_point_contribution_dm_fplapw(Simulation_context const& ctx__, K_point c /* add |psi_j> n_j >(z) * kp__.band_occupancy(j, ispn) * kp__.weight(); + wf2(xi, j, ispn) = static_cast>(z) * kp__.band_occupancy(j, ispn) * + kp__.weight(); } } } @@ -841,7 +842,7 @@ add_k_point_contribution_dm_pwpp_collinear(Simulation_context& ctx__, K_point bp_coeffs__, kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); /* use communicator of the k-point to split band index */ - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); if (nbnd_loc) { // TODO: this part can also be moved to GPU @@ -861,7 +862,7 @@ add_k_point_contribution_dm_pwpp_collinear(Simulation_context& ctx__, K_point for (int i = 0; i < nbnd_loc; i++) { /* global index of band */ - int j = spl_nbnd[i]; + auto j = spl_nbnd.global_index(i); for (int xi = 0; xi < nbf; xi++) { bp1(xi, i) = beta_psi(offs + xi, j); @@ -891,7 +892,7 @@ add_k_point_contribution_dm_pwpp_noncollinear(Simulation_context& ctx__, K_point /* total number of occupied bands */ int nbnd = kp__.num_occupied_bands(); - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); /* auxiliary arrays */ @@ -909,7 +910,7 @@ add_k_point_contribution_dm_pwpp_noncollinear(Simulation_context& ctx__, K_point #pragma omp parallel for schedule(static) for (int i = 0; i < nbnd_loc; i++) { - int j = spl_nbnd[i]; + auto j = spl_nbnd.global_index(i); for (int m = 0; m < nbeta; m++) { bp1(m, i, ispn) = beta_psi(m, j); @@ -1105,11 +1106,10 @@ Density::generate(K_point_set const& ks__, bool symmetrize__, bool add_core__, b /* find the core states */ generate_core_charge_density(); /* add core contribution */ - for (int ialoc = 0; ialoc < (int)unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++) { - rho().mt()[ia](0, ir) += - unit_cell_.atom(ia).symmetry_class().ae_core_charge_density(ir) / y00; + for (auto it : unit_cell_.spl_num_atoms()) { + for (int ir = 0; ir < unit_cell_.atom(it.i).num_mt_points(); ir++) { + rho().mt()[it.i](0, ir) += + unit_cell_.atom(it.i).symmetry_class().ae_core_charge_density(ir) / y00; } } } @@ -1267,9 +1267,8 @@ Density::generate_valence(K_point_set const& ks__) auto mem = ctx_.processing_unit() == sddk::device_t::CPU ? 
sddk::memory_t::host : sddk::memory_t::device; /* start the main loop over k-points */ - for (int ikloc = 0; ikloc < ks__.spl_num_kpoints().local_size(); ikloc++) { - int ik = ks__.spl_num_kpoints(ikloc); - auto kp = ks__.get(ik); + for (auto it : ks__.spl_num_kpoints()) { + auto kp = ks__.get(it.i); std::array, 2> wf_fft; @@ -1645,9 +1644,8 @@ Density::generate_valence_mt() sddk::mdarray rf_pairs(unit_cell_.max_num_mt_points(), max_num_rf_pairs); sddk::mdarray dlm(ctx_.lmmax_rho(), unit_cell_.max_num_mt_points(), ctx_.num_mag_dims() + 1); - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - auto& atom_type = unit_cell_.atom(ia).type(); + for (auto it : unit_cell_.spl_num_atoms()) { + auto& atom_type = unit_cell_.atom(it.i).type(); int nmtp = atom_type.num_mt_points(); int num_rf_pairs = atom_type.mt_radial_basis_size() * (atom_type.mt_radial_basis_size() + 1) / 2; @@ -1655,15 +1653,15 @@ Density::generate_valence_mt() PROFILE_START("sirius::Density::generate|sum_zdens"); switch (ctx_.num_mag_dims()) { case 3: { - reduce_density_matrix<3>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<3>(atom_type, density_matrix(it.i), mt_density_matrix); break; } case 1: { - reduce_density_matrix<1>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<1>(atom_type, density_matrix(it.i), mt_density_matrix); break; } case 0: { - reduce_density_matrix<0>(atom_type, density_matrix(ia), mt_density_matrix); + reduce_density_matrix<0>(atom_type, density_matrix(it.i), mt_density_matrix); break; } } @@ -1676,9 +1674,9 @@ Density::generate_valence_mt() for (int idxrf1 = 0; idxrf1 <= idxrf2; idxrf1++) { /* off-diagonal pairs are taken two times: d_{12}*f_1*f_2 + d_{21}*f_2*f_1 = d_{12}*2*f_1*f_2 */ int n = (idxrf1 == idxrf2) ? 
1 : 2; - for (int ir = 0; ir < unit_cell_.atom(ia).num_mt_points(); ir++) { - rf_pairs(ir, offs + idxrf1) = n * unit_cell_.atom(ia).symmetry_class().radial_function(ir, idxrf1) * - unit_cell_.atom(ia).symmetry_class().radial_function(ir, idxrf2); + for (int ir = 0; ir < unit_cell_.atom(it.i).num_mt_points(); ir++) { + rf_pairs(ir, offs + idxrf1) = n * unit_cell_.atom(it.i).symmetry_class().radial_function(ir, idxrf1) * + unit_cell_.atom(it.i).symmetry_class().radial_function(ir, idxrf2); } } } @@ -1693,21 +1691,21 @@ Density::generate_valence_mt() switch (ctx_.num_mag_dims()) { case 3: { /* copy x component */ - std::copy(&dlm(0, 0, 2), &dlm(0, 0, 2) + sz, &mag(1).mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 2), &dlm(0, 0, 2) + sz, &mag(1).mt()[it.i](0, 0)); /* copy y component */ - std::copy(&dlm(0, 0, 3), &dlm(0, 0, 3) + sz, &mag(2).mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 3), &dlm(0, 0, 3) + sz, &mag(2).mt()[it.i](0, 0)); } case 1: { for (int ir = 0; ir < nmtp; ir++) { for (int lm = 0; lm < ctx_.lmmax_rho(); lm++) { - rho().mt()[ia](lm, ir) = dlm(lm, ir, 0) + dlm(lm, ir, 1); - mag(0).mt()[ia](lm, ir) = dlm(lm, ir, 0) - dlm(lm, ir, 1); + rho().mt()[it.i](lm, ir) = dlm(lm, ir, 0) + dlm(lm, ir, 1); + mag(0).mt()[it.i](lm, ir) = dlm(lm, ir, 0) - dlm(lm, ir, 1); } } break; } case 0: { - std::copy(&dlm(0, 0, 0), &dlm(0, 0, 0) + sz, &rho().mt()[ia](0, 0)); + std::copy(&dlm(0, 0, 0), &dlm(0, 0, 0) + sz, &rho().mt()[it.i](0, 0)); } } } diff --git a/src/density/density.hpp b/src/density/density.hpp index 117c1a1465..379dda8f35 100644 --- a/src/density/density.hpp +++ b/src/density/density.hpp @@ -251,7 +251,7 @@ class Density : public Field4D PAW_density, Hubbard_matrix>> mixer_; /// Generate atomic densities in the case of PAW. - void generate_paw_atom_density(int iapaw__); + void generate_paw_atom_density(paw_atom_index_t::local iapaw__); /// Initialize PAW density matrix. 
void init_density_matrix_for_paw(); @@ -311,13 +311,14 @@ class Density : public Field4D { PROFILE("sirius::Density::generate_core_charge_density"); - for (int icloc = 0; icloc < unit_cell_.spl_num_atom_symmetry_classes().local_size(); icloc++) { - int ic = unit_cell_.spl_num_atom_symmetry_classes(icloc); - unit_cell_.atom_symmetry_class(ic).generate_core_charge_density(ctx_.core_relativity()); + auto& spl_idx = unit_cell_.spl_num_atom_symmetry_classes(); + + for (auto it : spl_idx) { + unit_cell_.atom_symmetry_class(it.i).generate_core_charge_density(ctx_.core_relativity()); } - for (int ic = 0; ic < unit_cell_.num_atom_symmetry_classes(); ic++) { - int rank = unit_cell_.spl_num_atom_symmetry_classes().local_rank(ic); + for (auto ic = begin_global(spl_idx); ic != end_global(spl_idx); ic++) { + auto rank = spl_idx.location(ic).ib; unit_cell_.atom_symmetry_class(ic).sync_core_charge_density(ctx_.comm(), rank); } } @@ -442,9 +443,9 @@ class Density : public Field4D return *rho_pseudo_core_; } - inline auto const& density_mt(int ialoc) const + inline auto const& density_mt(atom_index_t::local ialoc__) const { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); + auto ia = ctx_.unit_cell().spl_num_atoms().global_index(ialoc__); return rho().mt()[ia]; } diff --git a/src/dft/dft_ground_state.cpp b/src/dft/dft_ground_state.cpp index 67a4cf8834..fd0e8f072a 100644 --- a/src/dft/dft_ground_state.cpp +++ b/src/dft/dft_ground_state.cpp @@ -79,9 +79,8 @@ DFT_ground_state::energy_kin_sum_pw() const { double ekin{0}; - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); #pragma omp parallel for schedule(static) reduction(+:ekin) for (int igloc = 0; igloc < kp->num_gkvec_loc(); igloc++) { @@ -281,12 +280,11 @@ DFT_ground_state::find(double density_tol__, double energy_tol__, double iter_so ctx_.cfg().parameters().precision_hs("fp64"); ctx_.cfg().lock(); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); + for (auto it : kset_.spl_num_kpoints()) { for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) { - wf::copy(sddk::memory_t::host, kset_.get(ik)->spinor_wave_functions(), + wf::copy(sddk::memory_t::host, kset_.get(it.i)->spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, ctx_.num_bands()), - kset_.get(ik)->spinor_wave_functions(), wf::spin_index(ispn), + kset_.get(it.i)->spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, ctx_.num_bands())); } } diff --git a/src/dft/energy.cpp b/src/dft/energy.cpp index 77c9995c0c..48863017ff 100644 --- a/src/dft/energy.cpp +++ b/src/dft/energy.cpp @@ -106,10 +106,9 @@ energy_enuc(Simulation_context const& ctx, Potential const& potential) auto& unit_cell = ctx.unit_cell(); double enuc{0}; if (ctx.full_potential()) { - for (int ialoc = 0; ialoc < unit_cell.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell.spl_num_atoms(ialoc); - int zn = unit_cell.atom(ia).zn(); - enuc -= 0.5 * zn * potential.vh_el(ia); + for (auto it : unit_cell.spl_num_atoms()) { + int zn = unit_cell.atom(it.i).zn(); + enuc -= 0.5 * zn * potential.vh_el(it.i); } ctx.comm().allreduce(&enuc, 1); } diff --git a/src/fft/fft.hpp b/src/fft/fft.hpp index 24fdc2fd44..be15f08b97 100644 --- a/src/fft/fft.hpp +++ b/src/fft/fft.hpp @@ -232,7 +232,7 @@ inline size_t spfft_grid_size_local(T const& spfft__) * using block distribution. 
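// Sketch of the global-iteration idiom used in generate_core_charge_density()
// above: begin_global()/end_global() walk every element of the index, and
// location(ic).ib gives the block (MPI rank) that owns element ic; the actual
// synchronization call is only indicated by a comment:
#include "SDDK/splindex.hpp"

inline void visit_all_elements(sddk::splindex_block<atom_symmetry_class_index_t> const& spl)
{
    for (auto ic = begin_global(spl); ic != end_global(spl); ic++) {
        auto rank = spl.location(ic).ib;
        /* e.g. broadcast the data of element ic from `rank` to all other ranks */
        static_cast<void>(rank);
    }
}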
*/ inline auto split_z_dimension(int size_z__, mpi::Communicator const& comm_fft__) { - return sddk::splindex(size_z__, comm_fft__.size(), comm_fft__.rank()); + return sddk::splindex_block<>(size_z__, n_blocks(comm_fft__.size()), block_id(comm_fft__.rank())); } } // namespace fft diff --git a/src/fft/gvec.cpp b/src/fft/gvec.cpp index 1e10e67cbe..30529f9617 100644 --- a/src/fft/gvec.cpp +++ b/src/fft/gvec.cpp @@ -671,14 +671,14 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) a2a_recv_ = mpi::block_data_descriptor(comm_.size()); /* split G-vector shells between ranks in cyclic order */ - spl_num_gsh_ = sddk::splindex(gvec_.num_shells(), comm_.size(), comm_.rank(), 1); + spl_num_gsh_ = sddk::splindex_block_cyclic<>(gvec_.num_shells(), n_blocks(comm_.size()), block_id(comm_.rank()), 1); /* each rank sends a fraction of its local G-vectors to other ranks */ /* count this fraction */ for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - a2a_send_.counts[spl_num_gsh_.local_rank(igsh)]++; + a2a_send_.counts[spl_num_gsh_.location(igsh).ib]++; } a2a_send_.calc_offsets(); /* sanity check: total number of elements to send is equal to the local number of G-vector */ @@ -692,7 +692,7 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) int ig = gvec_.gvec_offset(r) + igloc; /* index of the G-vector shell */ int igsh = gvec_.shell(ig); - if (spl_num_gsh_.local_rank(igsh) == comm_.rank()) { + if (spl_num_gsh_.location(igsh).ib == comm_.rank()) { a2a_recv_.counts[r]++; } } @@ -714,7 +714,7 @@ Gvec_shells::Gvec_shells(Gvec const& gvec__) int ig = gvec_.gvec_offset(r) + igloc; int igsh = gvec_.shell(ig); auto G = gvec_.gvec(ig); - if (spl_num_gsh_.local_rank(igsh) == comm_.rank()) { + if (spl_num_gsh_.location(igsh).ib == comm_.rank()) { for (int x : {0, 1, 2}) { gvec_remapped_(x, a2a_recv_.offsets[r] + counts[r]) = G[x]; } diff --git a/src/fft/gvec.hpp b/src/fft/gvec.hpp index a51316bf38..d07dbd4dc8 100644 --- a/src/fft/gvec.hpp +++ b/src/fft/gvec.hpp @@ -920,7 +920,7 @@ class Gvec_shells mpi::block_data_descriptor a2a_recv_; /// Split global index of G-shells between MPI ranks. - sddk::splindex spl_num_gsh_; + sddk::splindex_block_cyclic<> spl_num_gsh_; /// List of G-vectors in the remapped storage. sddk::mdarray gvec_remapped_; @@ -996,7 +996,7 @@ class Gvec_shells for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - int r = spl_num_gsh_.local_rank(igsh); + int r = spl_num_gsh_.location(igsh).ib; send_buf[a2a_send_.offsets[r] + counts[r]] = data__[igloc]; counts[r]++; } @@ -1023,7 +1023,7 @@ class Gvec_shells for (int igloc = 0; igloc < gvec_.count(); igloc++) { int ig = gvec_.offset() + igloc; int igsh = gvec_.shell(ig); - int r = spl_num_gsh_.local_rank(igsh); + int r = spl_num_gsh_.location(igsh).ib; data__[igloc] = recv_buf[a2a_send_.offsets[r] + counts[r]]; counts[r]++; } diff --git a/src/function3d/field4d.hpp b/src/function3d/field4d.hpp index 46348bab90..4ffbeb71e2 100644 --- a/src/function3d/field4d.hpp +++ b/src/function3d/field4d.hpp @@ -144,7 +144,7 @@ class Field4D auto mt_components() { - std::vector*> result; + std::vector*> result; result.push_back(&components_[0]->mt()); switch (ctx_.num_mag_dims()) { case 1: { diff --git a/src/function3d/paw_field4d.hpp b/src/function3d/paw_field4d.hpp index 3f608491a9..255758fc19 100644 --- a/src/function3d/paw_field4d.hpp +++ b/src/function3d/paw_field4d.hpp @@ -38,9 +38,9 @@ class PAW_field4D /// Unit cell. 
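NOTE (usage sketch, not part of the diff): Gvec_shells above distributes G-vector shells cyclically, one shell at a time, and asks location(igsh).ib for the owning rank. A self-contained sketch of that ownership query; sizes are illustrative:

    #include "SDDK/splindex.hpp"
    #include <vector>

    std::vector<int> shells_per_rank(int num_shells, int num_ranks, int rank)
    {
        /* last argument: cyclic block size of 1, as in the Gvec_shells hunk */
        sddk::splindex_block_cyclic<> spl(num_shells, n_blocks(num_ranks), block_id(rank), 1);
        std::vector<int> counts(num_ranks, 0);
        for (int igsh = 0; igsh < num_shells; igsh++) {
            counts[spl.location(igsh).ib]++; /* .ib: rank that stores this shell */
        }
        return counts;
    }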
Unit_cell const& uc_; /// All-electron part. - std::array, 4> ae_components_; + std::array, 4> ae_components_; /// Pseudo-potential part. - std::array, 4> ps_components_; + std::array, 4> ps_components_; /* copy constructor is forbidden */ PAW_field4D(PAW_field4D const& src__) = delete; /* copy assignment operator is forbidden */ @@ -57,9 +57,9 @@ class PAW_field4D auto ptr = (is_global__) ? nullptr : &uc__.spl_num_paw_atoms(); for (int j = 0; j < uc__.parameters().num_mag_dims() + 1; j++) { - ae_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), + ae_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), [&uc__](int ia){return lmax_t(2 * uc__.atom(ia).type().indexr().lmax());}, ptr); - ps_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), + ps_components_[j] = Spheric_function_set(uc__, uc__.paw_atoms(), [&uc__](int ia){return lmax_t(2 * uc__.atom(ia).type().indexr().lmax());}, ptr); } } diff --git a/src/function3d/periodic_function.hpp b/src/function3d/periodic_function.hpp index de1deb304e..0352aaf9b6 100644 --- a/src/function3d/periodic_function.hpp +++ b/src/function3d/periodic_function.hpp @@ -64,7 +64,7 @@ class Periodic_function Smooth_periodic_function rg_component_; /// Muffin-tin part of the periodic function. - Spheric_function_set mt_component_; + Spheric_function_set mt_component_; /// Alias to G-vectors. fft::Gvec const& gvec_; @@ -88,7 +88,7 @@ class Periodic_function /// Constructor for interstitial and muffin-tin parts (FP-LAPW case). Periodic_function(Simulation_context const& ctx__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr, + sddk::splindex_block const* spl_atoms__ = nullptr, smooth_periodic_function_ptr_t const* rg_ptr__ = nullptr, spheric_function_set_ptr_t const* mt_ptr__ = nullptr) : ctx_(ctx__) @@ -162,9 +162,8 @@ class Periodic_function if (ctx_.full_potential()) { mt_val = std::vector(unit_cell_.num_atoms(), 0); - for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) { - int ia = unit_cell_.spl_num_atoms(ialoc); - mt_val[ia] = mt_component_[ia].component(0).integrate(2) * fourpi * y00; + for (auto it : unit_cell_.spl_num_atoms()) { + mt_val[it.i] = mt_component_[it.i].component(0).integrate(2) * fourpi * y00; } comm_.allreduce(&mt_val[0], unit_cell_.num_atoms()); diff --git a/src/function3d/spheric_function_set.hpp b/src/function3d/spheric_function_set.hpp index f9e18217e7..5f99f6ea52 100644 --- a/src/function3d/spheric_function_set.hpp +++ b/src/function3d/spheric_function_set.hpp @@ -8,7 +8,7 @@ namespace sirius { using lmax_t = strong_type; -template +template class Spheric_function_set { private: @@ -18,7 +18,7 @@ class Spheric_function_set std::vector atoms_; /// Split the number of atoms between MPI ranks. /** If the pointer is null, spheric functions set is treated as global, without MPI distribution */ - sddk::splindex const* spl_atoms_{nullptr}; + sddk::splindex_block const* spl_atoms_{nullptr}; /// List of spheric functions. std::vector> func_; @@ -41,9 +41,8 @@ class Spheric_function_set }; if (spl_atoms_) { - for (int i = 0; i < spl_atoms_->local_size(); i++) { - int ia = atoms_[(*spl_atoms_)[i]]; - set_func(ia); + for (auto it : (*spl_atoms_)) { + set_func(atoms_[it.i]); } } else { for (int ia : atoms_) { @@ -59,7 +58,7 @@ class Spheric_function_set /// Constructor for all atoms. 
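NOTE (usage sketch, not part of the diff): Spheric_function_set gains a second template parameter fixing the strong index type of its split index, so PAW sets (paw_atom_index_t, as in PAW_field4D above) and full-cell sets (atom_index_t, as in Periodic_function above) become distinct types. A hedged instantiation sketch; the template argument lists are partly elided in this listing, so the exact parameters are inferred:

    /* lmax callback mirroring the PAW_field4D hunk above; uc is a Unit_cell */
    auto lmax_func = [&uc](int ia) { return lmax_t(2 * uc.atom(ia).type().indexr().lmax()); };

    /* full-cell set, distributed over the atom split index */
    Spheric_function_set<double, atom_index_t> f_all(uc, lmax_func, &uc.spl_num_atoms());

    /* PAW subset, distributed over the PAW-atom split index */
    Spheric_function_set<double, paw_atom_index_t> f_paw(uc, uc.paw_atoms(), lmax_func,
                                                         &uc.spl_num_paw_atoms());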
Spheric_function_set(Unit_cell const& unit_cell__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr, + sddk::splindex_block const* spl_atoms__ = nullptr, spheric_function_set_ptr_t const* sptr__ = nullptr) : unit_cell_{&unit_cell__} , spl_atoms_{spl_atoms__} @@ -68,7 +67,7 @@ class Spheric_function_set atoms_.resize(unit_cell__.num_atoms()); std::iota(atoms_.begin(), atoms_.end(), 0); if (spl_atoms_) { - if (spl_atoms_->global_index_size() != unit_cell__.num_atoms()) { + if (spl_atoms_->size() != unit_cell__.num_atoms()) { RTE_THROW("wrong split atom index"); } } @@ -77,14 +76,14 @@ class Spheric_function_set /// Constructor for a subset of atoms. Spheric_function_set(Unit_cell const& unit_cell__, std::vector atoms__, std::function lmax__, - sddk::splindex const* spl_atoms__ = nullptr) + sddk::splindex_block const* spl_atoms__ = nullptr) : unit_cell_{&unit_cell__} , atoms_{atoms__} , spl_atoms_{spl_atoms__} , all_atoms_{false} { if (spl_atoms_) { - if (spl_atoms_->global_index_size() != static_cast(atoms__.size())) { + if (spl_atoms_->size() != static_cast(atoms__.size())) { RTE_THROW("wrong split atom index"); } } @@ -125,16 +124,16 @@ class Spheric_function_set /// Synchronize global function. /** Assuming that each MPI rank was handling part of the global spherical function, broadcast data * from each rank. As a result, each rank stores a full and identical copy of global spherical function. */ - inline void sync(sddk::splindex const& spl_atoms__) + inline void sync(sddk::splindex_block const& spl_atoms__) { - for (int i = 0; i < spl_atoms__.global_index_size(); i++) { - auto loc = spl_atoms__.location(i); + for (int i = 0; i < spl_atoms__.size(); i++) { + auto loc = spl_atoms__.location(typename I::global(i)); int ia = atoms_[i]; - unit_cell_->comm().bcast(func_[ia].at(sddk::memory_t::host), static_cast(func_[ia].size()), loc.rank); + unit_cell_->comm().bcast(func_[ia].at(sddk::memory_t::host), static_cast(func_[ia].size()), loc.ib); } } - Spheric_function_set& operator+=(Spheric_function_set const& rhs__) + Spheric_function_set& operator+=(Spheric_function_set const& rhs__) { for (int ia = 0; ia < unit_cell_->num_atoms(); ia++) { if (func_[ia].size() && rhs__[ia].size()) { @@ -144,33 +143,33 @@ class Spheric_function_set return *this; } - template - friend F - inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__); + template + friend T_ + inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__); - template + template friend void - copy(Spheric_function_set const& src__, Spheric_function_set& dest__); + copy(Spheric_function_set const& src__, Spheric_function_set& dest__); - template + template friend void - copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__); + copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__); - template + template friend void - copy(spheric_function_set_ptr_t src__, Spheric_function_set const& dest__); + copy(spheric_function_set_ptr_t src__, Spheric_function_set const& dest__); - template + template friend void - scale(F alpha__, Spheric_function_set& x__); + scale(T_ alpha__, Spheric_function_set& x__); - template + template friend void - axpy(F alpha__, Spheric_function_set const& x__, Spheric_function_set& y__); + axpy(T_ alpha__, Spheric_function_set const& x__, Spheric_function_set& y__); }; -template -inline T inner(Spheric_function_set const& f1__, Spheric_function_set const& f2__) +template +inline T inner(Spheric_function_set const& f1__, 
Spheric_function_set const& f2__) { auto ptr = (f1__.spl_atoms_) ? f1__.spl_atoms_ : f2__.spl_atoms_; @@ -185,13 +184,13 @@ inline T inner(Spheric_function_set const& f1__, Spheric_function_set cons if (ptr) { for (int i = 0; i < ptr->local_size(); i++) { - int ia = f1__.atoms_[(*ptr)[i]]; + int ia = f1__.atoms_[(*ptr).global_index(typename I::local(i))]; result += inner(f1__[ia], f2__[ia]); } } else { - sddk::splindex spl_atoms(f1__.atoms_.size(), comm.size(), comm.rank()); + sddk::splindex_block spl_atoms(f1__.atoms_.size(), n_blocks(comm.size()), block_id(comm.rank())); for (int i = 0; i < spl_atoms.local_size(); i++) { - int ia = f1__.atoms_[spl_atoms[i]]; + int ia = f1__.atoms_[spl_atoms.global_index(typename I::local(i))]; result += inner(f1__[ia], f2__[ia]); } } @@ -201,9 +200,9 @@ inline T inner(Spheric_function_set const& f1__, Spheric_function_set cons /// Copy from Spheric_function_set to external pointer. /** External pointer is assumed to be global. */ -template +template inline void -copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) +copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) { auto p = dest__.ptr; for (auto ia : src__.atoms()) { @@ -229,9 +228,9 @@ copy(Spheric_function_set const& src__, spheric_function_set_ptr_t dest__) /// Copy from external pointer to Spheric_function_set. /** External pointer is assumed to be global. */ -template +template inline void -copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) +copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) { auto p = src__.ptr; for (auto ia : dest__.atoms()) { @@ -250,9 +249,9 @@ copy(spheric_function_set_ptr_t const src__, Spheric_function_set& dest__) } } -template +template inline void -copy(Spheric_function_set const& src__, Spheric_function_set& dest__) +copy(Spheric_function_set const& src__, Spheric_function_set& dest__) { for (int ia = 0; ia < src__.unit_cell_->num_atoms(); ia++) { if (src__.func_[ia].size()) { @@ -261,9 +260,9 @@ copy(Spheric_function_set const& src__, Spheric_function_set& dest__) } } -template +template inline void -scale(T alpha__, Spheric_function_set& x__) +scale(T alpha__, Spheric_function_set& x__) { for (int ia = 0; ia < x__.unit_cell_->num_atoms(); ia++) { if (x__.func_[ia].size()) { @@ -272,9 +271,9 @@ scale(T alpha__, Spheric_function_set& x__) } } -template +template inline void -axpy(T alpha__, Spheric_function_set const& x__, Spheric_function_set& y__) +axpy(T alpha__, Spheric_function_set const& x__, Spheric_function_set& y__) { for (int ia = 0; ia < x__.unit_cell_->num_atoms(); ia++) { if (x__.func_[ia].size()) { diff --git a/src/geometry/force.cpp b/src/geometry/force.cpp index 1612f4a947..01751c0cd6 100644 --- a/src/geometry/force.cpp +++ b/src/geometry/force.cpp @@ -53,8 +53,8 @@ void Force::calc_forces_nonloc_aux() auto& spl_num_kp = kset_.spl_num_kpoints(); - for (int ikploc = 0; ikploc < spl_num_kp.local_size(); ikploc++) { - auto* kp = kset_.get(spl_num_kp[ikploc]); + for (auto it : spl_num_kp) { + auto* kp = kset_.get(it.i); if (ctx_.gamma_point()) { add_k_point_contribution(*kp, forces_nonloc_); @@ -211,10 +211,9 @@ Force::calc_forces_ibs() } Hamiltonian0 H0(potential_, false); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto hk = H0(*kset_.get(ik)); - add_ibs_force(kset_.get(ik), hk, ffac, forces_ibs_); + for (auto it : kset_.spl_num_kpoints()) { + auto hk = H0(*kset_.get(it.i)); + 
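NOTE (usage sketch, not part of the diff): inner() above is the canonical distributed reduction in this patch: each rank sums only the atoms of its own block, then a collective reduction makes the result identical everywhere. Minimal sketch; per_atom_term and the surrounding objects are hypothetical, the allreduce overload is the one used throughout this patch:

    double total{0};
    for (auto it : unit_cell.spl_num_atoms()) {
        total += per_atom_term(it.i); /* contribution of a locally stored atom */
    }
    comm.allreduce(&total, 1); /* every rank now holds the full sum */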
add_ibs_force(kset_.get(it.i), hk, ffac, forces_ibs_); } ctx_.comm().allreduce(&forces_ibs_(0, 0), (int)forces_ibs_.size()); symmetrize_forces(ctx_.unit_cell(), forces_ibs_); @@ -228,11 +227,10 @@ Force::calc_forces_rho() forces_rho_ = sddk::mdarray(3, ctx_.unit_cell().num_atoms()); forces_rho_.zero(); - for (int ialoc = 0; ialoc < ctx_.unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); - auto g = gradient(density_.density_mt(ialoc)); + for (auto it : ctx_.unit_cell().spl_num_atoms()) { + auto g = gradient(density_.density_mt(it.li)); for (int x = 0; x < 3; x++) { - forces_rho_(x, ia) = inner(potential_.effective_potential_mt(ialoc), g[x]); + forces_rho_(x, it.i) = inner(potential_.effective_potential_mt(it.li), g[x]); } } ctx_.comm().allreduce(&forces_rho_(0, 0), (int)forces_rho_.size()); @@ -247,11 +245,10 @@ Force::calc_forces_hf() forces_hf_ = sddk::mdarray(3, ctx_.unit_cell().num_atoms()); forces_hf_.zero(); - for (int ialoc = 0; ialoc < ctx_.unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = ctx_.unit_cell().spl_num_atoms(ialoc); - auto g = gradient(potential_.hartree_potential_mt(ialoc)); + for (auto it : ctx_.unit_cell().spl_num_atoms()) { + auto g = gradient(potential_.hartree_potential_mt(it.li)); for (int x = 0; x < 3; x++) { - forces_hf_(x, ia) = ctx_.unit_cell().atom(ia).zn() * g[x](0, 0) * y00; + forces_hf_(x, it.i) = ctx_.unit_cell().atom(it.i).zn() * g[x](0, 0) * y00; } } ctx_.comm().allreduce(&forces_hf_(0, 0), (int)forces_hf_.size()); @@ -273,10 +270,9 @@ Force::calc_forces_hubbard() Q_operator q_op(ctx_); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { + for (auto it : kset_.spl_num_kpoints()) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + auto kp = kset_.get(it.i); // kp->beta_projectors().prepare(); auto mg1 = kp->spinor_wave_functions().memory_guard(ctx_.processing_unit_memory_t(), wf::copy_to::device); auto mg2 = kp->hubbard_wave_functions_S().memory_guard(ctx_.processing_unit_memory_t(), wf::copy_to::device); diff --git a/src/geometry/non_local_functor.hpp b/src/geometry/non_local_functor.hpp index 665d1bb800..e2f519e1b3 100644 --- a/src/geometry/non_local_functor.hpp +++ b/src/geometry/non_local_functor.hpp @@ -85,14 +85,11 @@ void add_k_point_contribution_nonlocal(Simulation_context& ctx__, Beta_projector int nbnd = kp__.num_occupied_bands(ispn); /* inner product of beta gradient and WF */ - // auto bp_base_phi_chunk = bp_base__.template inner(ctx__.processing_unit_memory_t(), icnk, - // kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); - auto bp_base_phi_chunk = inner_prod_beta( ctx__.spla_context(), mt, ctx__.host_memory_t(), sddk::is_device_memory(mt), beta_coeffs_base, kp__.spinor_wave_functions(), wf::spin_index(ispn), wf::band_range(0, nbnd)); - sddk::splindex spl_nbnd(nbnd, kp__.comm().size(), kp__.comm().rank()); + sddk::splindex_block<> spl_nbnd(nbnd, n_blocks(kp__.comm().size()), block_id(kp__.comm().rank())); int nbnd_loc = spl_nbnd.local_size(); @@ -112,7 +109,7 @@ void add_k_point_contribution_nonlocal(Simulation_context& ctx__, Beta_projector sddk::matrix& beta_phi_chunk) { /* gather everything = - 2 Re[ occ(k,n) weight(k) beta_phi*(i,n) [Dij - E(n)Qij] beta_base_phi(j,n) ]*/ for (int ibnd_loc = 0; ibnd_loc < nbnd_loc; ibnd_loc++) { - int ibnd = spl_nbnd[ibnd_loc]; + int ibnd = spl_nbnd.global_index(ibnd_loc); auto d1 = main_two_factor * kp__.band_occupancy(ibnd, ispn) * kp__.weight(); auto z2 = dij - 
static_cast>(kp__.band_energy(ibnd, ispn) * qij); diff --git a/src/geometry/stress.cpp b/src/geometry/stress.cpp index 4dc10d332e..6f5d834a72 100644 --- a/src/geometry/stress.cpp +++ b/src/geometry/stress.cpp @@ -50,9 +50,8 @@ Stress::calc_stress_nonloc_aux() print_memory_usage(ctx_.out(), FILE_LINE); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); auto mem = ctx_.processing_unit_memory_t(); auto mg = kp->spinor_wave_functions().memory_guard(mem, wf::copy_to::device); Beta_projectors_strain_deriv bp_strain_deriv(ctx_, kp->gkvec()); @@ -131,9 +130,8 @@ Stress::calc_stress_hubbard() if (is_device_memory(ctx_.processing_unit_memory_t())) { dn.allocate(ctx_.processing_unit_memory_t()); } - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); dn.zero(); if (is_device_memory(ctx_.processing_unit_memory_t())) { dn.zero(ctx_.processing_unit_memory_t()); @@ -652,9 +650,8 @@ Stress::calc_stress_kin_aux() { stress_kin_.zero(); - for (int ikloc = 0; ikloc < kset_.spl_num_kpoints().local_size(); ikloc++) { - int ik = kset_.spl_num_kpoints(ikloc); - auto kp = kset_.get(ik); + for (auto it : kset_.spl_num_kpoints()) { + auto kp = kset_.get(it.i); double fact = kp->gkvec().reduced() ? 2.0 : 1.0; fact *= kp->weight(); diff --git a/src/hamiltonian/hamiltonian.cpp b/src/hamiltonian/hamiltonian.cpp index fd5bc2e282..c16c7c3ed3 100644 --- a/src/hamiltonian/hamiltonian.cpp +++ b/src/hamiltonian/hamiltonian.cpp @@ -157,10 +157,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector, 3> zm(unit_cell_.max_mt_basis_size(), unit_cell_.max_mt_basis_size(), ctx_.num_mag_dims()); - for (int ialoc = 0; ialoc < psi__.spl_num_atoms().local_size(); ialoc++) { - int ia = psi__.spl_num_atoms()[ialoc]; + for (auto it : psi__.spl_num_atoms()) { + auto ia = it.i; auto& atom = unit_cell_.atom(ia); - //int offset = psi__.offset_mt_coeffs(ialoc); int mt_basis_size = atom.type().mt_basis_size(); zm.zero(); @@ -183,9 +182,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector */ la::wrap(la::lib_t::blas).hemm( 'L', 'U', mt_basis_size, ctx_.num_fv_states(), &la::constant>::one(), - zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), + zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), psi__.ld(), &la::constant>::zero(), - &bpsi__[0].mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), bpsi__[0].ld()); + &bpsi__[0].mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), bpsi__[0].ld()); /* compute bwf = (B_x - iB_y)|wf_j> */ if (bpsi__.size() == 3) { @@ -204,9 +203,9 @@ Hamiltonian0::apply_bmt(wf::Wave_functions& psi__, std::vector>::one(), - zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), + zm.at(sddk::memory_t::host), zm.ld(), &psi__.mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), psi__.ld(), &la::constant>::zero(), - &bpsi__[2].mt_coeffs(0, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(0)), bpsi__[2].ld()); + &bpsi__[2].mt_coeffs(0, it.li, wf::spin_index(0), wf::band_index(0)), bpsi__[2].ld()); } } } @@ -219,10 +218,10 @@ 
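NOTE (usage sketch, not part of the diff): in apply_bmt above, it.li is handed directly to mt_coeffs() where the old code constructed wf::atom_index(ialoc) by hand; the iterator already carries the correctly typed local index. Sketch, with the wave-function object and unit cell assumed to exist:

    for (auto it : psi.spl_num_atoms()) {
        auto& atom = unit_cell.atom(it.i);  /* global index selects the atom */
        auto z = psi.mt_coeffs(0, it.li,    /* typed local index selects local storage */
                               wf::spin_index(0), wf::band_index(0));
        (void)atom; (void)z;
    }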
Hamiltonian0::apply_so_correction(wf::Wave_functions& psi__, std::vector::get_h_o_diag_lapw() const auto const& uc = H0_.ctx().unit_cell(); - sddk::splindex spl_num_atoms(uc.num_atoms(), kp_.comm().size(), kp_.comm().rank()); + sddk::splindex_block spl_num_atoms(uc.num_atoms(), n_blocks(kp_.comm().size()), block_id(kp_.comm().rank())); int nlo{0}; - for (int ialoc = 0; ialoc < spl_num_atoms.local_size(); ialoc++) { - int ia = spl_num_atoms[ialoc]; - nlo += uc.atom(ia).mt_lo_basis_size(); + for (auto it : spl_num_atoms) { + nlo += uc.atom(it.i).mt_lo_basis_size(); } auto h_diag = (what & 1) ? sddk::mdarray(kp_.num_gkvec_loc() + nlo, 1) : sddk::mdarray(); @@ -283,11 +282,10 @@ Hamiltonian_k::get_h_o_diag_lapw() const } nlo = 0; - for (int ialoc = 0; ialoc < spl_num_atoms.local_size(); ialoc++) { - int ia = spl_num_atoms[ialoc]; - auto& atom = uc.atom(ia); + for (auto it : spl_num_atoms) { + auto& atom = uc.atom(it.i); auto& type = atom.type(); - auto& hmt = H0_.hmt(ia); + auto& hmt = H0_.hmt(it.i); #pragma omp parallel for for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) { int xi_lo = type.mt_aw_basis_size() + ilo; @@ -890,15 +888,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_hmt_apw_lo = [this, &ctx, &phi__, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt& h_apw_lo__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it: spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + int ia = it.li; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = this->H0_.hmt(ia); @@ -921,14 +919,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range o_apw_lo__.zero(sddk::memory_t::host, wf::spin_index(0), wf::band_range(0, b__.size())); #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int j = 0; j < b__.size(); j++) { for (int ilo = 0; ilo < nlo; ilo++) { @@ -952,15 +950,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range { /* lo-lo contribution */ #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it : spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + auto ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -977,12 +975,12 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range { /* lo-lo contribution */ #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + auto ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int ilo = 0; ilo < type.mt_lo_basis_size(); ilo++) { int xi_lo = type.mt_aw_basis_size() + ilo; @@ -1012,14 +1010,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, 
wf::band_range wf::Wave_functions_mt& halm_phi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < alm_phi__.spl_num_atoms().local_size(); ialoc++) { + for (auto it : alm_phi__.spl_num_atoms()) { int tid = omp_get_thread_num(); - int ia = atom_begin__ + alm_phi__.spl_num_atoms()[ialoc]; + int ia = atom_begin__ + it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -1036,15 +1034,15 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_hmt_lo_apw = [this, &ctx, la, mem, &b__, &spl_atoms](wf::Wave_functions_mt const& alm_phi__, wf::Wave_functions& hphi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { + for (auto it : spl_atoms) { int tid = omp_get_thread_num(); - int ia = spl_atoms[ialoc]; + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; auto& hmt = H0_.hmt(ia); @@ -1062,14 +1060,14 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range auto apply_omt_lo_apw = [this, &ctx, mem, &b__, &spl_atoms](wf::Wave_functions_mt const& alm_phi__, wf::Wave_functions& ophi__) { #pragma omp parallel for - for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) { - int ia = spl_atoms[ialoc]; + for (auto it : spl_atoms) { + int ia = it.i; auto& atom = ctx.unit_cell().atom(ia); auto& type = atom.type(); int naw = type.mt_aw_basis_size(); int nlo = type.mt_lo_basis_size(); - auto aidx = wf::atom_index(ialoc); + auto aidx = it.li; for (int ilo = 0; ilo < nlo; ilo++) { int xi_lo = naw + ilo; @@ -1213,7 +1211,7 @@ Hamiltonian_k::apply_fv_h_o(bool apw_only__, bool phi_is_lo__, wf::band_range /* loop over blocks of atoms */ for (auto na : utils::split_in_blocks(ctx.unit_cell().num_atoms(), 64)) { - sddk::splindex spl_atoms(na, comm.size(), comm.rank()); + sddk::splindex_block<> spl_atoms(na, n_blocks(comm.size()), block_id(comm.rank())); /* actual number of AW radial functions in a block of atoms */ int num_mt_aw{0}; diff --git a/src/hamiltonian/non_local_operator.hpp b/src/hamiltonian/non_local_operator.hpp index 0d813faeb5..fea1bd1f34 100644 --- a/src/hamiltonian/non_local_operator.hpp +++ b/src/hamiltonian/non_local_operator.hpp @@ -27,13 +27,10 @@ #include "SDDK/omp.hpp" #include "SDDK/memory.hpp" -// #include "beta_projectors/beta_projectors.hpp" -// #include "beta_projectors/beta_projectors_base.hpp" -// #include "beta_projectors/beta_projectors_strain_deriv.hpp" #include "non_local_operator_base.hpp" #include "context/simulation_context.hpp" #include "hubbard/hubbard_matrix.hpp" -#include "traits.hpp" +//#include "traits.hpp" #include "utils/rte.hpp" namespace sirius { diff --git a/src/hamiltonian/non_local_operator_base.hpp b/src/hamiltonian/non_local_operator_base.hpp index 9c62df4d43..bcf2db19a2 100644 --- a/src/hamiltonian/non_local_operator_base.hpp +++ b/src/hamiltonian/non_local_operator_base.hpp @@ -74,17 +74,17 @@ class Non_local_operator /// Apply beta projectors from one atom in a chunk of beta projectors to all wave-functions. 
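NOTE (usage sketch, not part of the diff): the apply() overload declared below now takes the strong type atom_index_t::local for its atom argument instead of the previous wf::atom_index. A hedged call sketch mirroring the Hubbard-derivatives hunk further down; every surrounding object is assumed to exist:

    /* i: local atom counter inside beta-projector chunk ichunk */
    q_op.apply(mem, ichunk, atom_index_t::local(i), 0, phi,
               wf::band_range(0, nawf), beta_coeffs, beta_phi);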
template std::enable_if_t, F>::value, void> - apply(sddk::memory_t mem__, int chunk__, wf::atom_index ia__, int ispn_block__, wf::Wave_functions& op_phi__, - wf::band_range br__, const beta_projectors_coeffs_t& beta_coeffs__, sddk::matrix& beta_phi__); + apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__, wf::Wave_functions& op_phi__, + wf::band_range br__, beta_projectors_coeffs_t const& beta_coeffs__, sddk::matrix& beta_phi__); /// computes α B*Q + β out template - void lmatmul(sddk::matrix& out, const sddk::matrix& B__, int ispn_block__, sddk::memory_t mem_t, + void lmatmul(sddk::matrix& out, sddk::matrix const& B__, int ispn_block__, sddk::memory_t mem_t, identity_t alpha = F{1}, identity_t beta = F{0}) const; /// computes α Q*B + β out template - void rmatmul(sddk::matrix& out, const sddk::matrix& B__, int ispn_block__, sddk::memory_t mem_t, + void rmatmul(sddk::matrix& out, sddk::matrix const& B__, int ispn_block__, sddk::memory_t mem_t, identity_t alpha = F{1}, identity_t beta = F{0}) const; template >::value>> @@ -207,9 +207,9 @@ Non_local_operator::apply(sddk::memory_t mem__, int chunk__, int ispn_block__ template template std::enable_if_t, F>::value, void> -Non_local_operator::apply(sddk::memory_t mem__, int chunk__, wf::atom_index ia__, int ispn_block__, +Non_local_operator::apply(sddk::memory_t mem__, int chunk__, atom_index_t::local ia__, int ispn_block__, wf::Wave_functions& op_phi__, wf::band_range br__, - const beta_projectors_coeffs_t& beta_coeffs__, sddk::matrix& beta_phi__) + beta_projectors_coeffs_t const& beta_coeffs__, sddk::matrix& beta_phi__) { if (is_null_) { return; @@ -275,8 +275,8 @@ Non_local_operator::lmatmul(sddk::matrix& out, const sddk::matrix& B__, } // check shapes - assert(out.size(0) == B__.size(0) && static_cast(out.size(1)) == this->size_); - assert(static_cast(B__.size(1)) == this->size_); + RTE_ASSERT(out.size(0) == B__.size(0) && static_cast(out.size(1)) == this->size_); + RTE_ASSERT(static_cast(B__.size(1)) == this->size_); int num_atoms = uc.num_atoms(); @@ -314,8 +314,8 @@ Non_local_operator::rmatmul(sddk::matrix& out, const sddk::matrix& B__, } // check shapes - assert(static_cast(out.size(0)) == this->size_ && out.size(1) == B__.size(1)); - assert(static_cast(B__.size(0)) == this->size_); + RTE_ASSERT(static_cast(out.size(0)) == this->size_ && out.size(1) == B__.size(1)); + RTE_ASSERT(static_cast(B__.size(0)) == this->size_); int num_atoms = uc.num_atoms(); diff --git a/src/hubbard/hubbard_occupancies_derivatives.cpp b/src/hubbard/hubbard_occupancies_derivatives.cpp index 5c1f08fed2..da9b7373c3 100644 --- a/src/hubbard/hubbard_occupancies_derivatives.cpp +++ b/src/hubbard/hubbard_occupancies_derivatives.cpp @@ -297,9 +297,9 @@ Hubbard::compute_occupancies_derivatives(K_point& kp__, Q_operator Q < beta | phi_atomic > */ phi_atomic_tmp->zero(mt, wf::spin_index(0), wf::band_range(0, nawf)); if (ctx_.unit_cell().atom(ja).type().augment()) { - q_op__.apply(mt, ichunk, wf::atom_index(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), + q_op__.apply(mt, ichunk, atom_index_t::local(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), bp_grad_coeffs, beta_phi_atomic); - q_op__.apply(mt, ichunk, wf::atom_index(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), + q_op__.apply(mt, ichunk, atom_index_t::local(i), 0, *phi_atomic_tmp, wf::band_range(0, nawf), bp_coeffs, grad_beta_phi_atomic); } diff --git a/src/k_point/generate_fv_states.cpp b/src/k_point/generate_fv_states.cpp index 18eaad46a4..5bbf55f47f 100644 --- 
a/src/k_point/generate_fv_states.cpp +++ b/src/k_point/generate_fv_states.cpp @@ -97,18 +97,17 @@ void K_point::generate_fv_states() auto out_ptr = &fv_states_->pw_coeffs(0, wf::spin_index(0), wf::band_index(i)); std::copy(in_ptr, in_ptr + gkvec().count(), out_ptr); - for (int ialoc = 0; ialoc < alm_fv_slab.spl_num_atoms().local_size(); ialoc++) { - int ia = alm_fv_slab.spl_num_atoms()[ialoc]; - int num_mt_aw = uc.atom(ia).type().mt_aw_basis_size(); + for (auto it : alm_fv_slab.spl_num_atoms()) { + int num_mt_aw = uc.atom(it.i).type().mt_aw_basis_size(); /* aw part of the muffin-tin coefficients */ for (int xi = 0; xi < num_mt_aw; xi++) { - fv_states_->mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)) = - alm_fv_slab.mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)); + fv_states_->mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)) = + alm_fv_slab.mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)); } /* lo part of muffin-tin coefficients */ - for (int xi = 0; xi < uc.atom(ia).type().mt_lo_basis_size(); xi++) { - fv_states_->mt_coeffs(num_mt_aw + xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)) = - fv_eigen_vectors_slab().mt_coeffs(xi, wf::atom_index(ialoc), wf::spin_index(0), wf::band_index(i)); + for (int xi = 0; xi < uc.atom(it.i).type().mt_lo_basis_size(); xi++) { + fv_states_->mt_coeffs(num_mt_aw + xi, it.li, wf::spin_index(0), wf::band_index(i)) = + fv_eigen_vectors_slab().mt_coeffs(xi, it.li, wf::spin_index(0), wf::band_index(i)); } } } diff --git a/src/k_point/k_point.cpp b/src/k_point/k_point.cpp index 054259ce95..abb664b10b 100644 --- a/src/k_point/k_point.cpp +++ b/src/k_point/k_point.cpp @@ -353,11 +353,13 @@ K_point::generate_gkvec(double gk_cutoff__) ctx_.spfft_coarse().local_z_length(), gkvec_partition_->count(), SPFFT_INDEX_TRIPLETS, gv.at(sddk::memory_t::host)))); - sddk::splindex spl_ngk_row(num_gkvec(), num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_row(num_gkvec(), n_blocks(num_ranks_row_), block_id(rank_row_), + ctx_.cyclic_block_size()); num_gkvec_row_ = spl_ngk_row.local_size(); sddk::mdarray gkvec_row(3, num_gkvec_row_); - sddk::splindex spl_ngk_col(num_gkvec(), num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_col(num_gkvec(), n_blocks(num_ranks_col_), block_id(rank_col_), + ctx_.cyclic_block_size()); num_gkvec_col_ = spl_ngk_col.local_size(); sddk::mdarray gkvec_col(3, num_gkvec_col_); @@ -367,14 +369,14 @@ K_point::generate_gkvec(double gk_cutoff__) int ig = gkvec_->gvec_offset(rank) + igloc; auto loc_row = spl_ngk_row.location(ig); auto loc_col = spl_ngk_col.location(ig); - if (loc_row.rank == comm_row().rank()) { + if (loc_row.ib == comm_row().rank()) { for (int x : {0, 1, 2}) { - gkvec_row(x, loc_row.local_index) = gv(x, igloc); + gkvec_row(x, loc_row.index_local) = gv(x, igloc); } } - if (loc_col.rank == comm_col().rank()) { + if (loc_col.ib == comm_col().rank()) { for (int x : {0, 1, 2}) { - gkvec_col(x, loc_col.local_index) = gv(x, igloc); + gkvec_col(x, loc_col.index_local) = gv(x, igloc); } } } @@ -453,11 +455,12 @@ K_point::get_fv_eigen_vectors(sddk::mdarray, 2>& fv_evec__) c for (int ia = 0; ia < ctx_.unit_cell().num_atoms(); ia++) { /* number of atom local orbitals */ int nlo = ctx_.unit_cell().atom(ia).mt_lo_basis_size(); - auto loc = fv_eigen_vectors_slab_->spl_num_atoms().location(ia); - if (loc.rank == this->comm().rank()) { + auto loc = 
fv_eigen_vectors_slab_->spl_num_atoms().location(typename atom_index_t::global(ia)); + if (loc.ib == this->comm().rank()) { for (int xi = 0; xi < nlo; xi++) { fv_evec__(this->num_gkvec() + offs + xi, ist) = - fv_eigen_vectors_slab_->mt_coeffs(xi, wf::atom_index(loc.local_index), wf::spin_index(0), wf::band_index(ist)); + fv_eigen_vectors_slab_->mt_coeffs(xi, atom_index_t::local(loc.index_local), wf::spin_index(0), + wf::band_index(ist)); } } offs += nlo; @@ -763,18 +766,20 @@ void K_point::generate_gklo_basis() { /* find local number of row G+k vectors */ - sddk::splindex spl_ngk_row(num_gkvec(), num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_row(num_gkvec(), n_blocks(num_ranks_row_), block_id(rank_row_), + ctx_.cyclic_block_size()); num_gkvec_row_ = spl_ngk_row.local_size(); /* find local number of column G+k vectors */ - sddk::splindex spl_ngk_col(num_gkvec(), num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_ngk_col(num_gkvec(), n_blocks(num_ranks_col_), block_id(rank_col_), + ctx_.cyclic_block_size()); num_gkvec_col_ = spl_ngk_col.local_size(); if (ctx_.full_potential()) { - sddk::splindex spl_nlo_row(num_gkvec() + unit_cell_.mt_lo_basis_size(), - num_ranks_row_, rank_row_, ctx_.cyclic_block_size()); - sddk::splindex spl_nlo_col(num_gkvec() + unit_cell_.mt_lo_basis_size(), - num_ranks_col_, rank_col_, ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_nlo_row(num_gkvec() + unit_cell_.mt_lo_basis_size(), + n_blocks(num_ranks_row_), block_id(rank_row_), ctx_.cyclic_block_size()); + sddk::splindex_block_cyclic<> spl_nlo_col(num_gkvec() + unit_cell_.mt_lo_basis_size(), + n_blocks(num_ranks_col_), block_id(rank_col_), ctx_.cyclic_block_size()); lo_basis_descriptor lo_desc; @@ -797,10 +802,10 @@ K_point::generate_gklo_basis() lo_desc.order = static_cast(order); lo_desc.idxrf = static_cast(idxrf); - if (spl_nlo_row.local_rank(num_gkvec() + idx) == rank_row_) { + if (spl_nlo_row.location(num_gkvec() + idx).ib == rank_row_) { lo_basis_descriptors_row_.push_back(lo_desc); } - if (spl_nlo_col.local_rank(num_gkvec() + idx) == rank_col_) { + if (spl_nlo_col.location(num_gkvec() + idx).ib == rank_col_) { lo_basis_descriptors_col_.push_back(lo_desc); } diff --git a/src/k_point/k_point_set.cpp b/src/k_point/k_point_set.cpp index 15d7a310b0..a2a6ee629a 100644 --- a/src/k_point/k_point_set.cpp +++ b/src/k_point/k_point_set.cpp @@ -33,26 +33,25 @@ void K_point_set::sync_band() sddk::mdarray data(ctx_.num_bands(), ctx_.num_spinors(), num_kpoints(), get_memory_pool(sddk::memory_t::host), "K_point_set::sync_band.data"); + data.zero(); int nb = ctx_.num_bands() * ctx_.num_spinors(); #pragma omp parallel - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; - auto kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto kp = this->get(it.i); switch (what) { case sync_band_t::energy: { - std::copy(&kp->band_energies_(0, 0), &kp->band_energies_(0, 0) + nb, &data(0, 0, ik)); + std::copy(&kp->band_energies_(0, 0), &kp->band_energies_(0, 0) + nb, &data(0, 0, it.i)); break; } case sync_band_t::occupancy: { - std::copy(&kp->band_occupancies_(0, 0), &kp->band_occupancies_(0, 0) + nb, &data(0, 0, ik)); + std::copy(&kp->band_occupancies_(0, 0), &kp->band_occupancies_(0, 0) + nb, &data(0, 0, it.i)); break; } } } - comm().allgather(data.at(sddk::memory_t::host), nb * spl_num_kpoints_.local_size(), - nb * spl_num_kpoints_.global_offset()); + 
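NOTE (usage sketch, not part of the diff): K_point_set below stores its k-points in a splindex_chunk, which accepts an explicit per-rank list of chunk sizes; the counts of a plain block distribution can seed it, as in K_point_set::initialize. Sketch (template argument inferred from the typed accesses in this patch, sizes illustrative):

    int const nkp{17};
    sddk::splindex_block<> tmp(nkp, n_blocks(nranks), block_id(rank));
    sddk::splindex_chunk<kp_index_t> spl(nkp, n_blocks(nranks), block_id(rank), tmp.counts());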
comm().allreduce(data.at(sddk::memory_t::host), static_cast(data.size())); #pragma omp parallel for for (int ik = 0; ik < num_kpoints(); ik++) { @@ -139,16 +138,18 @@ void K_point_set::initialize(std::vector const& counts) PROFILE("sirius::K_point_set::initialize"); /* distribute k-points along the 1-st dimension of the MPI grid */ if (counts.empty()) { - sddk::splindex spl_tmp(num_kpoints(), comm().size(), comm().rank()); - spl_num_kpoints_ = sddk::splindex(num_kpoints(), comm().size(), comm().rank(), spl_tmp.counts()); + sddk::splindex_block<> spl_tmp(num_kpoints(), n_blocks(comm().size()), block_id(comm().rank())); + spl_num_kpoints_ = sddk::splindex_chunk(num_kpoints(), n_blocks(comm().size()), + block_id(comm().rank()), spl_tmp.counts()); } else { - spl_num_kpoints_ = sddk::splindex(num_kpoints(), comm().size(), comm().rank(), counts); + spl_num_kpoints_ = sddk::splindex_chunk(num_kpoints(), n_blocks(comm().size()), + block_id(comm().rank()), counts); } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - kpoints_[spl_num_kpoints_[ikloc]]->initialize(); + for (auto it : spl_num_kpoints_) { + kpoints_[it.i]->initialize(); #if defined(USE_FP32) - kpoints_float_[spl_num_kpoints_[ikloc]]->initialize(); + kpoints_float_[it.i]->initialize(); #endif } @@ -301,12 +302,11 @@ void K_point_set::find_band_occupancies() } } } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp parallel for for (int j = 0; j < ctx_.num_bands(); j++) { - this->get(ik)->band_occupancy(j, ispn, ctx_.max_occupancy()); + this->get(it.i)->band_occupancy(j, ispn, ctx_.max_occupancy()); } } } @@ -325,32 +325,30 @@ void K_point_set::find_band_occupancies() auto emax = std::numeric_limits::lowest(); #pragma omp parallel for reduction(min:emin) reduction(max:emax) - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - emin = std::min(emin, this->get(ik)->band_energy(0, ispn)); - emax = std::max(emax, this->get(ik)->band_energy(ctx_.num_bands() - 1, ispn)); + emin = std::min(emin, this->get(it.i)->band_energy(0, ispn)); + emax = std::max(emax, this->get(it.i)->band_energy(ctx_.num_bands() - 1, ispn)); } } comm().allreduce(&emin, 1); comm().allreduce(&emax, 1); - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); /* computes N(ef; f) = \sum_{i,k} f(ef - e_{k,i}) */ auto compute_ne = [&](double ef, auto&& f) { double ne{0}; - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { double tmp{0}; #pragma omp parallel reduction(+ : tmp) for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp for for (int j = 0; j < splb.local_size(); j++) { - tmp += f(ef - this->get(ik)->band_energy(splb[j], ispn)) * ctx_.max_occupancy(); + tmp += f(ef - this->get(it.i)->band_energy(splb.global_index(j), ispn)) * ctx_.max_occupancy(); } } - ne += tmp * kpoints_[ik]->weight(); + ne += tmp * kpoints_[it.i]->weight(); } ctx_.comm().allreduce(&ne, 1); return ne; @@ -393,13 +391,12 @@ void K_point_set::find_band_occupancies() energy_fermi_ = bisection_search(F, emin, emax, 
tol); } - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - int ik = spl_num_kpoints_[ikloc]; + for (auto it : spl_num_kpoints_) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { #pragma omp parallel for for (int j = 0; j < ctx_.num_bands(); j++) { - auto o = f(energy_fermi_ - this->get(ik)->band_energy(j, ispn)) * ctx_.max_occupancy(); - this->get(ik)->band_occupancy(j, ispn, o); + auto o = f(energy_fermi_ - this->get(it.i)->band_energy(j, ispn)) * ctx_.max_occupancy(); + this->get(it.i)->band_occupancy(j, ispn, o); } } } @@ -454,16 +451,15 @@ double K_point_set::valence_eval_sum() const { double eval_sum{0}; - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - auto const& kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto const& kp = this->get(it.i); double tmp{0}; #pragma omp parallel for reduction(+:tmp) for (int j = 0; j < splb.local_size(); j++) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - tmp += kp->band_energy(splb[j], ispn) * kp->band_occupancy(splb[j], ispn); + tmp += kp->band_energy(splb.global_index(j), ispn) * kp->band_occupancy(splb.global_index(j), ispn); } } eval_sum += kp->weight() * tmp; @@ -503,16 +499,15 @@ double K_point_set::entropy_sum() const auto f = smearing::entropy(ctx_.smearing(), ctx_.smearing_width()); - sddk::splindex splb(ctx_.num_bands(), ctx_.comm_band().size(), ctx_.comm_band().rank()); + sddk::splindex_block<> splb(ctx_.num_bands(), n_blocks(ctx_.comm_band().size()), block_id(ctx_.comm_band().rank())); - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - auto const& kp = this->get(ik); + for (auto it : spl_num_kpoints_) { + auto const& kp = this->get(it.i); double tmp{0}; #pragma omp parallel for reduction(+:tmp) for (int j = 0; j < splb.local_size(); j++) { for (int ispn = 0; ispn < ctx_.num_spinors(); ispn++) { - tmp += ctx_.max_occupancy() * f(energy_fermi_ - kp->band_energy(splb[j], ispn)); + tmp += ctx_.max_occupancy() * f(energy_fermi_ - kp->band_energy(splb.global_index(j), ispn)); } } s_sum += kp->weight() * tmp; @@ -552,8 +547,8 @@ void K_point_set::print_info() pout << std::endl << utils::hbar(80, '-') << std::endl; } - for (int ikloc = 0; ikloc < spl_num_kpoints().local_size(); ikloc++) { - int ik = spl_num_kpoints(ikloc); + for (auto it : spl_num_kpoints()) { + int ik = it.i; pout << std::setw(4) << ik << utils::ffmt(9, 4) << kpoints_[ik]->vk()[0] << utils::ffmt(9, 4) << kpoints_[ik]->vk()[1] << utils::ffmt(9, 4) << kpoints_[ik]->vk()[2] << utils::ffmt(17, 6) << kpoints_[ik]->weight() << std::setw(11) << kpoints_[ik]->num_gkvec(); @@ -579,7 +574,7 @@ void K_point_set::save(std::string const& name__) const ctx_.comm().barrier(); for (int ik = 0; ik < num_kpoints(); ik++) { /* check if this ranks stores the k-point */ - if (ctx_.comm_k().rank() == spl_num_kpoints_.local_rank(ik)) { + if (ctx_.comm_k().rank() == spl_num_kpoints_.location(typename kp_index_t::global(ik)).ib) { this->get(ik)->save(name__, ik); } /* wait for all */ diff --git a/src/k_point/k_point_set.hpp b/src/k_point/k_point_set.hpp index 653a9a4865..725f0f183b 100644 --- a/src/k_point/k_point_set.hpp +++ b/src/k_point/k_point_set.hpp @@ -52,7 +52,7 @@ class K_point_set #endif /// Split index of 
k-points. - sddk::splindex spl_num_kpoints_; + sddk::splindex_chunk spl_num_kpoints_; /// Fermi energy which is searched in find_band_occupancies(). double energy_fermi_{0}; @@ -144,11 +144,10 @@ class K_point_set void update() { /* update k-points */ - for (int ikloc = 0; ikloc < spl_num_kpoints().local_size(); ikloc++) { - int ik = spl_num_kpoints(ikloc); - kpoints_[ik]->update(); + for (auto it : spl_num_kpoints_) { + kpoints_[it.i]->update(); #if defined(USE_FP32) - kpoints_float_[ik]->update(); + kpoints_float_[it.i]->update(); #endif } } @@ -168,9 +167,8 @@ class K_point_set int max_num_gkvec() const { int max_num_gkvec{0}; - for (int ikloc = 0; ikloc < spl_num_kpoints_.local_size(); ikloc++) { - auto ik = spl_num_kpoints_[ikloc]; - max_num_gkvec = std::max(max_num_gkvec, kpoints_[ik]->num_gkvec()); + for (auto it : spl_num_kpoints_) { + max_num_gkvec = std::max(max_num_gkvec, kpoints_[it.i]->num_gkvec()); } comm().allreduce(&max_num_gkvec, 1); return max_num_gkvec; @@ -208,9 +206,9 @@ class K_point_set return static_cast(kpoints_.size()); } - inline int spl_num_kpoints(int ikloc) const + inline auto spl_num_kpoints(kp_index_t::local ikloc__) const { - return spl_num_kpoints_[ikloc]; + return spl_num_kpoints_.global_index(ikloc__); } inline double energy_fermi() const @@ -251,13 +249,13 @@ class K_point_set /// Send G+k vectors of k-point jk to a given rank. /** Other ranks receive an empty Gvec placeholder */ - inline fft::Gvec get_gkvec(int jk__, int rank__) + inline fft::Gvec get_gkvec(kp_index_t::global jk__, int rank__) { /* rank in the k-point communicator */ int my_rank = comm().rank(); /* rank that stores jk */ - int jrank = spl_num_kpoints().local_rank(jk__); + int jrank = spl_num_kpoints().location(jk__).ib; /* need this to pass communicator */ fft::Gvec gkvec(ctx_.comm_band()); @@ -276,7 +274,7 @@ class K_point_set template<> inline K_point* K_point_set::get(int ik__) const { - assert(ik__ >= 0 && ik__ < (int)kpoints_.size()); + RTE_ASSERT(ik__ >= 0 && ik__ < (int)kpoints_.size()); return kpoints_[ik__].get(); } @@ -284,7 +282,7 @@ template<> inline K_point* K_point_set::get(int ik__) const { #if defined(USE_FP32) - assert(ik__ >= 0 && ik__ < (int)kpoints_float_.size()); + RTE_ASSERT(ik__ >= 0 && ik__ < (int)kpoints_float_.size()); return kpoints_float_[ik__].get(); #else RTE_THROW("not compiled with FP32 support"); diff --git a/src/linalg/dmatrix.cpp b/src/linalg/dmatrix.cpp index 94075f844c..31a6a47a3e 100644 --- a/src/linalg/dmatrix.cpp +++ b/src/linalg/dmatrix.cpp @@ -29,17 +29,17 @@ namespace la { template dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__, sddk::memory_t mem_type__) - : sddk::matrix(sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), - blacs_grid__.rank_row(), bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), - blacs_grid__.rank_col(), bs_col__).local_size(), mem_type__) + : sddk::matrix(sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size(), mem_type__) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), 
blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__,bs_col__)) @@ -51,17 +51,17 @@ template dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__) : sddk::matrix(ptr__, - sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), - bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), - bs_col__).local_size()) + sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size()) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__, bs_col__)) @@ -76,8 +76,8 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, sddk::memory_t mem_type__) , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { } @@ -88,8 +88,8 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, sddk::memory_pool& mp__, std , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { } @@ -101,8 +101,8 @@ dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__) , num_cols_(num_cols__) , bs_row_(1) , bs_col_(1) - , spl_row_(num_rows_, 1, 0, bs_row_) - , spl_col_(num_cols_, 1, 0, bs_col_) + , spl_row_(num_rows_, n_blocks(1), block_id(0), bs_row_) + , spl_col_(num_cols_, n_blocks(1), block_id(0), bs_col_) { init(); } @@ -110,17 +110,17 @@ dmatrix::dmatrix(T* ptr__, int num_rows__, int num_cols__) template dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__, sddk::memory_pool& mp__) - : sddk::matrix(sddk::splindex(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), - bs_row__).local_size(), - sddk::splindex(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), - bs_col__).local_size(), mp__) + : sddk::matrix(sddk::splindex_block_cyclic<>(num_rows__, n_blocks(blacs_grid__.num_ranks_row()), + block_id(blacs_grid__.rank_row()), bs_row__).local_size(), + 
sddk::splindex_block_cyclic<>(num_cols__, n_blocks(blacs_grid__.num_ranks_col()), + block_id(blacs_grid__.rank_col()), bs_col__).local_size(), mp__) , num_rows_(num_rows__) , num_cols_(num_cols__) , bs_row_(bs_row__) , bs_col_(bs_col__) , blacs_grid_(&blacs_grid__) - , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_) - , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_) + , spl_row_(num_rows_, n_blocks(blacs_grid__.num_ranks_row()), block_id(blacs_grid__.rank_row()), bs_row_) + , spl_col_(num_cols_, n_blocks(blacs_grid__.num_ranks_col()), block_id(blacs_grid__.rank_col()), bs_col_) , spla_dist_(spla::MatrixDistribution::create_blacs_block_cyclic_from_mapping( blacs_grid__.comm().native(), blacs_grid__.rank_map().data(), blacs_grid__.num_ranks_row(), blacs_grid__.num_ranks_col(), bs_row__, bs_col__)) @@ -131,13 +131,15 @@ dmatrix::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid template void dmatrix::set(int ir0__, int jc0__, int mr__, int nc__, T* ptr__, int ld__) { - sddk::splindex spl_r0(ir0__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); - sddk::splindex spl_r1(ir0__ + mr__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), - bs_row_); + sddk::splindex_block_cyclic<> spl_r0(ir0__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); + sddk::splindex_block_cyclic<> spl_r1(ir0__ + mr__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); - sddk::splindex spl_c0(jc0__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); - sddk::splindex spl_c1(jc0__ + nc__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), - bs_col_); + sddk::splindex_block_cyclic<> spl_c0(jc0__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); + sddk::splindex_block_cyclic<> spl_c1(jc0__ + nc__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); int m0 = spl_r0.local_size(); int m1 = spl_r1.local_size(); @@ -147,10 +149,10 @@ void dmatrix::set(int ir0__, int jc0__, int mr__, int nc__, T* ptr__, int ld_ std::vector map_col(n1 - n0); for (int i = 0; i < m1 - m0; i++) { - map_row[i] = spl_r1[m0 + i] - ir0__; + map_row[i] = spl_r1.global_index(m0 + i) - ir0__; } for (int j = 0; j < n1 - n0; j++) { - map_col[j] = spl_c1[n0 + j] - jc0__; + map_col[j] = spl_c1.global_index(n0 + j) - jc0__; } //#pragma omp parallel for @@ -165,10 +167,10 @@ template void dmatrix::set(const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) = val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) = val; } } } @@ -177,10 +179,10 @@ template void dmatrix::add(const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) += val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) += val; } } } @@ -189,10 +191,10 @@ template void dmatrix::add(real_type beta__, const int irow_glob, const int icol_glob, T val) { auto r = spl_row_.location(irow_glob); - if 
(blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(icol_glob); - if (blacs_grid_->rank_col() == c.rank) { - (*this)(r.local_index, c.local_index) = (*this)(r.local_index, c.local_index) * beta__ + val; + if (blacs_grid_->rank_col() == c.ib) { + (*this)(r.index_local, c.index_local) = (*this)(r.index_local, c.index_local) * beta__ + val; } } } @@ -202,11 +204,11 @@ void dmatrix::make_real_diag(int n__) { for (int i = 0; i < n__; i++) { auto r = spl_row_.location(i); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(i); - if (blacs_grid_->rank_col() == c.rank) { - T v = (*this)(r.local_index, c.local_index); - (*this)(r.local_index, c.local_index) = std::real(v); + if (blacs_grid_->rank_col() == c.ib) { + T v = (*this)(r.index_local, c.index_local); + (*this)(r.index_local, c.index_local) = std::real(v); } } } @@ -220,10 +222,10 @@ sddk::mdarray dmatrix::get_diag(int n__) for (int i = 0; i < n__; i++) { auto r = spl_row_.location(i); - if (blacs_grid_->rank_row() == r.rank) { + if (blacs_grid_->rank_row() == r.ib) { auto c = spl_col_.location(i); - if (blacs_grid_->rank_col() == c.rank) { - d[i] = (*this)(r.local_index, c.local_index); + if (blacs_grid_->rank_col() == c.ib) { + d[i] = (*this)(r.index_local, c.index_local); } } } diff --git a/src/linalg/dmatrix.hpp b/src/linalg/dmatrix.hpp index 2cfbbacece..08b7f99f08 100644 --- a/src/linalg/dmatrix.hpp +++ b/src/linalg/dmatrix.hpp @@ -68,10 +68,10 @@ class dmatrix: public sddk::matrix BLACS_grid const* blacs_grid_{nullptr}; /// Split index of matrix rows. - sddk::splindex spl_row_; + sddk::splindex_block_cyclic<> spl_row_; /// Split index of matrix columns. - sddk::splindex spl_col_; + sddk::splindex_block_cyclic<> spl_col_; /// ScaLAPACK matrix descriptor. ftn_int descriptor_[9]; @@ -153,13 +153,13 @@ class dmatrix: public sddk::matrix /// Return local number of rows for a given MPI rank. inline int num_rows_local(int rank) const { - return spl_row_.local_size(rank); + return spl_row_.local_size(block_id(rank)); } /// Return global row index in the range [0, num_rows) by the local index in the range [0, num_rows_local). inline int irow(int irow_loc) const { - return spl_row_[irow_loc]; + return spl_row_.global_index(irow_loc); } inline int num_cols() const @@ -175,13 +175,13 @@ class dmatrix: public sddk::matrix inline int num_cols_local(int rank) const { - return spl_col_.local_size(rank); + return spl_col_.local_size(block_id(rank)); } /// Inindex of column in global matrix. 
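NOTE (usage sketch, not part of the diff): the dmatrix accessors above reduce to two split-index queries: local_size(block_id(r)) answers "how many rows does rank r store", and global_index(j) answers "which global row is local row j". dmatrix pairs one block-cyclic index over rows with one over columns to realize the ScaLAPACK layout. Sketch (grid parameters hypothetical; the last query assumes a non-empty local block):

    sddk::splindex_block_cyclic<> spl_row(nrow, n_blocks(nranks_row), block_id(my_row), bs_row);
    sddk::splindex_block_cyclic<> spl_col(ncol, n_blocks(nranks_col), block_id(my_col), bs_col);
    int nloc0 = spl_row.local_size(block_id(0)); /* rows stored by row-rank 0 */
    int g     = spl_row.global_index(0);         /* global index of the first local row */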
inline int icol(int icol_loc) const { - return spl_col_[icol_loc]; + return spl_col_.global_index(icol_loc); } inline int const* descriptor() const @@ -221,11 +221,15 @@ class dmatrix: public sddk::matrix { int m0, m1, n0, n1; if (blacs_grid_ != nullptr) { - sddk::splindex spl_r0(ir0__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); - sddk::splindex spl_r1(ir0__ + nr__, blacs_grid().num_ranks_row(), blacs_grid().rank_row(), bs_row_); + sddk::splindex_block_cyclic<> spl_r0(ir0__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); + sddk::splindex_block_cyclic<> spl_r1(ir0__ + nr__, n_blocks(blacs_grid().num_ranks_row()), + block_id(blacs_grid().rank_row()), bs_row_); - sddk::splindex spl_c0(ic0__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); - sddk::splindex spl_c1(ic0__ + nc__, blacs_grid().num_ranks_col(), blacs_grid().rank_col(), bs_col_); + sddk::splindex_block_cyclic<> spl_c0(ic0__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); + sddk::splindex_block_cyclic<> spl_c1(ic0__ + nc__, n_blocks(blacs_grid().num_ranks_col()), + block_id(blacs_grid().rank_col()), bs_col_); m0 = spl_r0.local_size(); m1 = spl_r1.local_size(); @@ -382,10 +386,10 @@ class dmatrix: public sddk::matrix T cs{0}; if (blacs_grid_ != nullptr) { - sddk::splindex spl_row(m__, this->blacs_grid().num_ranks_row(), - this->blacs_grid().rank_row(), this->bs_row()); - sddk::splindex spl_col(n__, this->blacs_grid().num_ranks_col(), - this->blacs_grid().rank_col(), this->bs_col()); + sddk::splindex_block_cyclic<> spl_row(m__, n_blocks(this->blacs_grid().num_ranks_row()), + block_id(this->blacs_grid().rank_row()), this->bs_row()); + sddk::splindex_block_cyclic<> spl_col(n__, n_blocks(this->blacs_grid().num_ranks_col()), + block_id(this->blacs_grid().rank_col()), this->bs_col()); for (int i = 0; i < spl_col.local_size(); i++) { for (int j = 0; j < spl_row.local_size(); j++) { cs += (*this)(j, i); diff --git a/src/mixer/mixer_functions.cpp b/src/mixer/mixer_functions.cpp index caf8367317..0157cbc999 100644 --- a/src/mixer/mixer_functions.cpp +++ b/src/mixer/mixer_functions.cpp @@ -73,8 +73,8 @@ FunctionProperties> periodic_function_property() y.rg().value(i) = xi * -s + yi * c; } if (x.ctx().full_potential()) { - for (int ialoc = 0; ialoc < x.ctx().unit_cell().spl_num_atoms().local_size(); ialoc++) { - int ia = x.ctx().unit_cell().spl_num_atoms(ialoc); + for (auto it : x.ctx().unit_cell().spl_num_atoms()) { + int ia = it.i; auto& x_f_mt = x.mt()[ia]; auto& y_f_mt = y.mt()[ia]; #pragma omp for schedule(static) nowait @@ -226,9 +226,8 @@ FunctionProperties> paw_density_function_property() auto scale_func = [](double alpha, PAW_density& x) -> void { - for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) { - int ipaw = x.unit_cell().spl_num_paw_atoms(i); - int ia = x.unit_cell().paw_atom_index(ipaw); + for (auto it : x.unit_cell().spl_num_paw_atoms()) { + int ia = x.unit_cell().paw_atom_index(it.i); for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) { x.ae_density(j, ia) *= alpha; x.ps_density(j, ia) *= alpha; @@ -238,9 +237,8 @@ FunctionProperties> paw_density_function_property() auto copy_function = [](PAW_density const& x, PAW_density& y) -> void { - for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) { - int ipaw = x.unit_cell().spl_num_paw_atoms(i); - int ia = x.unit_cell().paw_atom_index(ipaw); + for (auto it : x.unit_cell().spl_num_paw_atoms()) { + int ia 
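
The dmatrix migration above is one mechanical substitution: raw rank counts and rank ids become the strong types n_blocks and block_id, operator[] becomes global_index(), and the struct returned by location() renames rank/local_index to ib/index_local. A minimal sketch of the new block-cyclic index in isolation; the sizes, the block size of 16 and the rank variable are illustrative only, and the template defaults are assumed:

    // 100 rows dealt out in blocks of 16 over 4 ranks, viewed from "rank".
    sddk::splindex_block_cyclic<> spl(100, n_blocks(4), block_id(rank), 16);
    for (int j = 0; j < spl.local_size(); j++) {
        int jglob = spl.global_index(j);   // local storage slot -> global row
    }
    auto loc = spl.location(25);           // global row -> owner and local slot
    bool stored_here = (loc.ib == rank);   // use loc.index_local if it is
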
diff --git a/src/mixer/mixer_functions.cpp b/src/mixer/mixer_functions.cpp
index caf8367317..0157cbc999 100644
--- a/src/mixer/mixer_functions.cpp
+++ b/src/mixer/mixer_functions.cpp
@@ -73,8 +73,8 @@ FunctionProperties> periodic_function_property()
             y.rg().value(i) = xi * -s + yi * c;
         }
         if (x.ctx().full_potential()) {
-            for (int ialoc = 0; ialoc < x.ctx().unit_cell().spl_num_atoms().local_size(); ialoc++) {
-                int ia = x.ctx().unit_cell().spl_num_atoms(ialoc);
+            for (auto it : x.ctx().unit_cell().spl_num_atoms()) {
+                int ia = it.i;
                 auto& x_f_mt = x.mt()[ia];
                 auto& y_f_mt = y.mt()[ia];
 #pragma omp for schedule(static) nowait
@@ -226,9 +226,8 @@ FunctionProperties> paw_density_function_property()
     auto scale_func = [](double alpha, PAW_density& x) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 x.ae_density(j, ia) *= alpha;
                 x.ps_density(j, ia) *= alpha;
@@ -238,9 +237,8 @@ FunctionProperties> paw_density_function_property()
     auto copy_function = [](PAW_density const& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 sddk::copy(x.ae_density(j, ia), y.ae_density(j, ia));
                 sddk::copy(x.ps_density(j, ia), y.ps_density(j, ia));
@@ -250,9 +248,8 @@ FunctionProperties> paw_density_function_property()
     auto axpy_function = [](double alpha, PAW_density const& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 y.ae_density(j, ia) = x.ae_density(j, ia) * alpha + y.ae_density(j, ia);
                 y.ps_density(j, ia) = x.ps_density(j, ia) * alpha + y.ps_density(j, ia);
@@ -262,9 +259,8 @@ FunctionProperties> paw_density_function_property()
     auto rotate_function = [](double c, double s, PAW_density& x, PAW_density& y) -> void
     {
-        for (int i = 0; i < x.unit_cell().spl_num_paw_atoms().local_size(); i++) {
-            int ipaw = x.unit_cell().spl_num_paw_atoms(i);
-            int ia = x.unit_cell().paw_atom_index(ipaw);
+        for (auto it : x.unit_cell().spl_num_paw_atoms()) {
+            int ia = x.unit_cell().paw_atom_index(it.i);
             for (int j = 0; j < x.unit_cell().parameters().num_mag_dims() + 1; j++) {
                 x.ae_density(j, ia) = x.ae_density(j, ia) * c + s * y.ae_density(j, ia);
                 y.ae_density(j, ia) = y.ae_density(j, ia) * c - s * x.ae_density(j, ia);
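
Every mixer functor above moves from an indexed loop over local_size() to the new iteration protocol: a split index is itself a range, and its iterator yields an object whose .i member is the global index and whose .li member is the local one. Schematically, where unit_cell, global_data and local_buffer are placeholders for whatever the loop body touches:

    // Visit only the atoms stored on this rank.
    for (auto it : unit_cell.spl_num_atoms()) {
        global_data[it.i] += local_buffer[it.li];   // global vs. local indexing
    }
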
diff --git a/src/nlcglib/adaptor.cpp b/src/nlcglib/adaptor.cpp
index 5fa0ff38e0..c0c75241e5 100644
--- a/src/nlcglib/adaptor.cpp
+++ b/src/nlcglib/adaptor.cpp
@@ -66,7 +66,7 @@ make_vector(std::vector const& wfct, Simulation_context const& ctx, K
     int num_spins = ctx.num_spins();
     int nb = ctx.num_bands();
     for (auto i = 0u; i < wfct.size(); ++i) {
-        auto gidk = kset.spl_num_kpoints(i); // global k-point index
+        auto gidk = kset.spl_num_kpoints().global_index(typename kp_index_t::local(i)); // global k-point index
         for (int ispn = 0; ispn < num_spins; ++ispn) {
             auto& array = wfct[i]->pw_coeffs(wf::spin_index(ispn));
             int lda = array.size(0);
@@ -105,24 +105,20 @@ Energy::Energy(K_point_set& kset, Density& density, Potential& potential)
     , density_(density)
     , potential_(potential)
 {
-    // intialize hphi and sphi and allocate (device) memory
     int nk = kset.spl_num_kpoints().local_size();
     auto& ctx = kset.ctx();
-    // auto& mpd = ctx.mem_pool(ctx.preferred_memory_t());
     hphis_.resize(nk);
-    // sphis.resize(nk);
     cphis_.resize(nk);
-    for (int i = 0; i < nk; ++i) {
-        auto global_kpoint_index = kset.spl_num_kpoints(i);
-        auto& kp = *kset.get(global_kpoint_index);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         sddk::memory_t preferred_memory_t = ctx.processing_unit_memory_t();
         auto num_mag_dims = wf::num_mag_dims(ctx.num_mag_dims());
         auto num_bands = wf::num_bands(ctx.num_bands());
         // make a new wf for Hamiltonian apply...
-        hphis_[i] =
+        hphis_[it.li] =
             std::make_shared>(kp.gkvec_sptr(), num_mag_dims, num_bands, preferred_memory_t);
-        cphis_[i] = &kp.spinor_wave_functions();
-        hphis_[i]->allocate(sddk::memory_t::host);
+        cphis_[it.li] = &kp.spinor_wave_functions();
+        hphis_[it.li]->allocate(sddk::memory_t::host);
     }
     // need to allocate wavefunctions on GPU
 }
@@ -133,7 +129,6 @@ Energy::compute()
     auto& ctx = kset_.ctx();
     int num_spins = ctx.num_spins();
     int num_bands = ctx.num_bands();
-    int nk = kset_.spl_num_kpoints().local_size();
 
     density_.generate(kset_, ctx.use_symmetry(), true /* add core */, true /* transform to rg */);
 
@@ -145,15 +140,15 @@ Energy::compute()
     auto proc_mem_t = ctx.processing_unit_memory_t();
 
     // apply Hamiltonian
-    for (int i = 0; i < nk; ++i) {
-        auto& kp = *kset_.get(kset_.spl_num_kpoints(i));
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         std::vector band_energies(num_bands);
 
-        auto mem_guard = cphis_[i]->memory_guard(proc_mem_t, wf::copy_to::device);
-        auto mem_guard_h = hphis_[i]->memory_guard(proc_mem_t, wf::copy_to::host);
+        auto mem_guard = cphis_[it.li]->memory_guard(proc_mem_t, wf::copy_to::device);
+        auto mem_guard_h = hphis_[it.li]->memory_guard(proc_mem_t, wf::copy_to::host);
 
         auto null_ptr_wfc = std::shared_ptr>();
-        apply_hamiltonian(H0, kp, *hphis_[i], kp.spinor_wave_functions(), null_ptr_wfc);
+        apply_hamiltonian(H0, kp, *hphis_[it.li], kp.spinor_wave_functions(), null_ptr_wfc);
         // compute band energies = diag()
         for (int ispn = 0; ispn < num_spins; ++ispn) {
             for (int jj = 0; jj < num_bands; ++jj) {
@@ -162,7 +157,7 @@ Energy::compute()
                 wf::band_range bandr{jj, jj + 1};
                 wf::inner(ctx.spla_context(), proc_mem_t, wf::spin_range(ispn),
                           /* bra */ kp.spinor_wave_functions(), bandr,
-                          /* ket */ *hphis_[i], bandr,
+                          /* ket */ *hphis_[it.li], bandr,
                           /*result*/ dmat, 0, 0);
                 kp.band_energy(jj, ispn, dmat(0, 0).real());
             }
@@ -210,22 +205,19 @@ Energy::get_C(nlcglib::memory_type memory = nlcglib::memory_type::none)
 std::shared_ptr Energy::get_fn()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     int nbands = kset_.ctx().num_bands();
     std::vector> fn;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             std::vector fn_local(nbands);
             for (int i = 0; i < nbands; ++i) {
                 fn_local[i] = kp.band_occupancy(i, ispn);
             }
             fn.push_back(std::move(fn_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i, ispn);
         }
     }
     return std::make_shared(fn, kindices, kset_.comm().native());
@@ -260,22 +252,19 @@ Energy::set_fn(const std::vector>& keys, const std::vector
 std::shared_ptr Energy::get_ek()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     int nbands = kset_.ctx().num_bands();
     std::vector> ek;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             std::vector ek_local(nbands);
             for (int i = 0; i < nbands; ++i) {
                 ek_local[i] = kp.band_energy(i, ispn);
             }
             ek.push_back(std::move(ek_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(ek, kindices, kset_.comm().native());
@@ -284,14 +273,11 @@ Energy::get_ek()
 std::shared_ptr Energy::get_gkvec_ekin()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     std::vector> gkvec_cart;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         for (int ispn = 0; ispn < ns; ++ispn) {
             int gkvec_count = kp.gkvec().count();
             auto& gkvec = kp.gkvec();
@@ -300,7 +286,7 @@ Energy::get_gkvec_ekin()
                 gkvec_local[i] = gkvec.gkvec_cart(i).length();
             }
             gkvec_cart.push_back(std::move(gkvec_local));
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(gkvec_cart, kindices, kset_.comm().native());
@@ -309,19 +295,16 @@ Energy::get_gkvec_ekin()
 std::shared_ptr Energy::get_kpoint_weights()
 {
-    auto nk = kset_.spl_num_kpoints().local_size();
     const int ns = kset_.ctx().num_spins();
     std::vector weights;
     std::vector> kindices;
-    for (int ik = 0; ik < nk; ++ik) {
-        // global k-point index
-        auto gidk = kset_.spl_num_kpoints(ik);
-        auto& kp = *kset_.get(gidk);
+    for (auto it : kset_.spl_num_kpoints()) {
+        auto& kp = *kset_.get(it.i);
         // also return weights for every spin index
         for (int ispn = 0; ispn < ns; ++ispn) {
             weights.push_back(kp.weight());
-            kindices.emplace_back(gidk, ispn);
+            kindices.emplace_back(it.i.get(), ispn);
         }
     }
     return std::make_shared(weights, kindices, kset_.comm().native());
diff --git a/src/nlcglib/inverse_overlap.hpp b/src/nlcglib/inverse_overlap.hpp
index 5c0ea84a98..d74d32579b 100644
--- a/src/nlcglib/inverse_overlap.hpp
+++ b/src/nlcglib/inverse_overlap.hpp
@@ -296,7 +296,7 @@ S_k::apply(sddk::mdarray& Y, sddk::mdarray
-    q_op_.rmatmul(R, bphi, this->ispn_, pm, 1, 0);
+    q_op_.rmatmul(R, bphi, this->ispn_, pm, 1.0, 0.0);
 
     sddk::auto_copy(Y, X, pu);
diff --git a/src/nlcglib/overlap.hpp b/src/nlcglib/overlap.hpp
index 97e0cfe2b3..1800d4b842 100644
--- a/src/nlcglib/overlap.hpp
+++ b/src/nlcglib/overlap.hpp
@@ -60,12 +60,10 @@ template
 Overlap_operators::Overlap_operators(const K_point_set& kset, Simulation_context& ctx, const Q_operator& q_op)
 {
-    int nk = kset.spl_num_kpoints().local_size();
-    for (int ik_loc = 0; ik_loc < nk; ++ik_loc) {
-        int ik = kset.spl_num_kpoints(ik_loc);
-        auto& kp = *kset.get(ik);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         for (int ispn = 0; ispn < ctx.num_spins(); ++ispn) {
-            key_t key{ik, ispn};
+            key_t key{it.i.get(), ispn};
             data_[key] = std::make_shared(ctx, q_op, kp.beta_projectors(), ispn);
         }
     }
diff --git a/src/nlcglib/ultrasoft_precond.hpp b/src/nlcglib/ultrasoft_precond.hpp
index ea142a9898..737ede7333 100644
--- a/src/nlcglib/ultrasoft_precond.hpp
+++ b/src/nlcglib/ultrasoft_precond.hpp
@@ -59,12 +59,10 @@ class UltrasoftPrecond : public nlcglib::UltrasoftPrecondBase
 inline UltrasoftPrecond::UltrasoftPrecond(K_point_set const& kset, Simulation_context& ctx, Q_operator const& q_op)
 {
-    int nk = kset.spl_num_kpoints().local_size();
-    for (int ik_loc = 0; ik_loc < nk; ++ik_loc) {
-        int ik = kset.spl_num_kpoints(ik_loc);
-        auto& kp = *kset.get(ik);
+    for (auto it : kset.spl_num_kpoints()) {
+        auto& kp = *kset.get(it.i);
         for (int ispn = 0; ispn < ctx.num_spins(); ++ispn) {
-            key_t key{ik, ispn};
+            key_t key{it.i.get(), ispn};
             data_[key] = std::make_shared(ctx, q_op, ispn, kp.beta_projectors(), kp.gkvec());
         }
     }
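
In the nlcglib adaptors the global index carried by the iterator is a strong_type rather than a plain int: k-point lookups accept it.i directly, while the plain-integer nlcglib key pairs unwrap it with .get(). The pattern condensed, with num_spins and the loop body as stand-ins:

    for (auto it : kset.spl_num_kpoints()) {
        auto& kp = *kset.get(it.i);          // strong-typed global k-point index
        for (int ispn = 0; ispn < num_spins; ++ispn) {
            key_t key{it.i.get(), ispn};     // .get() yields the raw integer
        }
    }
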
--- a/src/potential/paw_potential.cpp
+++ b/src/potential/paw_potential.cpp
@@ -36,16 +36,16 @@ void Potential::init_PAW()
     bool const is_global{true};
     paw_potential_ = std::make_unique>(unit_cell_, is_global);
 
-    paw_ae_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
+    paw_ae_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
             [this](int ia){return lmax_t(2 * this->unit_cell_.atom(ia).type().indexr().lmax());});
-    paw_ps_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
+    paw_ps_exc_ = std::make_unique>(unit_cell_, unit_cell_.paw_atoms(),
             [this](int ia){return lmax_t(2 * this->unit_cell_.atom(ia).type().indexr().lmax());});
 
     /* initialize dij matrix */
     paw_dij_.resize(unit_cell_.num_paw_atoms());
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        int ia = unit_cell_.paw_atom_index(i);
+        int ia = unit_cell_.paw_atom_index(paw_atom_index_t::global(i));
         paw_dij_[i] = sddk::mdarray(unit_cell_.atom(ia).mt_basis_size(),
                                     unit_cell_.atom(ia).mt_basis_size(), ctx_.num_mag_dims() + 1);
     }
@@ -64,16 +64,15 @@ void Potential::generate_PAW_effective_potential(Density const& density)
     paw_hartree_total_energy_ = 0.0;
 
     /* calculate xc and hartree for atoms */
-    for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-        int ia = unit_cell_.paw_atom_index(unit_cell_.spl_num_paw_atoms(i));
-        paw_hartree_total_energy_ += calc_PAW_local_potential(ia, density.paw_ae_density(ia),
-                density.paw_ps_density(ia));
+    for (auto it : unit_cell_.spl_num_paw_atoms()) {
+        paw_hartree_total_energy_ += calc_PAW_local_potential(it.i, density.paw_ae_density(it.i),
+                density.paw_ps_density(it.i));
     }
     comm_.allreduce(&paw_hartree_total_energy_, 1);
     paw_potential_->sync();
 
-    std::vector*> ae_comp;
-    std::vector*> ps_comp;
+    std::vector*> ae_comp;
+    std::vector*> ps_comp;
     for (int j = 0; j < ctx_.num_mag_dims() + 1; j++) {
         ae_comp.push_back(&paw_potential_->ae_component(j));
         ps_comp.push_back(&paw_potential_->ps_component(j));
@@ -95,20 +94,19 @@ void Potential::generate_PAW_effective_potential(Density const& density)
     /* calculate PAW Dij matrix */
 #pragma omp parallel for
-    for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-        int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-        int ia = unit_cell_.paw_atom_index(ia_paw);
-        calc_PAW_local_Dij(ia, paw_dij_[ia_paw]);
+    for (auto it : unit_cell_.spl_num_paw_atoms()) {
+        auto ia = unit_cell_.paw_atom_index(it.i);
+        calc_PAW_local_Dij(ia, paw_dij_[it.i]);
     }
 
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        auto location = unit_cell_.spl_num_paw_atoms().location(i);
-        comm_.bcast(paw_dij_[i].at(sddk::memory_t::host), paw_dij_[i].size(), location.rank);
+        auto location = unit_cell_.spl_num_paw_atoms().location(typename paw_atom_index_t::global(i));
+        comm_.bcast(paw_dij_[i].at(sddk::memory_t::host), paw_dij_[i].size(), location.ib);
     }
 
     /* add paw Dij to uspp Dij */
 #pragma omp parallel for
     for (int i = 0; i < unit_cell_.num_paw_atoms(); i++) {
-        int ia = unit_cell_.paw_atom_index(i);
+        auto ia = unit_cell_.paw_atom_index(typename paw_atom_index_t::global(i));
         auto& atom = unit_cell_.atom(ia);
 
         for (int imagn = 0; imagn < ctx_.num_mag_dims() + 1; imagn++) {
diff --git a/src/potential/poisson.cpp b/src/potential/poisson.cpp
index d45f6866a0..dc9a739786 100644
--- a/src/potential/poisson.cpp
+++ b/src/potential/poisson.cpp
@@ -248,27 +248,26 @@ void Potential::poisson(Periodic_function const& rho)
         }
     }
 
-    for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
-        int nmtp = unit_cell_.atom(ia).num_mt_points();
+    for (auto it : unit_cell_.spl_num_atoms()) {
+        int nmtp = unit_cell_.atom(it.i).num_mt_points();
 
         std::vector vlm(ctx_.lmmax_pot());
-        SHT::convert(ctx_.lmax_pot(), &vmtlm(0, ia), &vlm[0]);
+        SHT::convert(ctx_.lmax_pot(), &vmtlm(0, it.i), &vlm[0]);
 
         #pragma omp parallel for default(shared)
         for (int lm = 0; lm < ctx_.lmmax_pot(); lm++) {
             int l = l_by_lm_[lm];
 
             for (int ir = 0; ir < nmtp; ir++) {
-                hartree_potential_->mt()[ia](lm, ir) += vlm[lm] * rRl(ir, l, unit_cell_.atom(ia).type_id());
+                hartree_potential_->mt()[it.i](lm, ir) += vlm[lm] * rRl(ir, l, unit_cell_.atom(it.i).type_id());
             }
         }
         /* save electronic part of the potential at the point of origin */
 #ifdef __VHA_AUX
-        vh_el_(ia) = y00 * hartree_potential_->mt()[ia](0, 0) +
-            unit_cell_.atom(ia).zn() / unit_cell_.atom(ia).radial_grid(0);
+        vh_el_(it.i) = y00 * hartree_potential_->mt()[it.i](0, 0) +
+            unit_cell_.atom(it.i).zn() / unit_cell_.atom(it.i).radial_grid(0);
 #else
-        vh_el_(ia) = y00 * hartree_potential_->mt()[ia](0, 0);
+        vh_el_(it.i) = y00 * hartree_potential_->mt()[it.i](0, 0);
 #endif
     }
     ctx_.comm().allgather(vh_el_.at(sddk::memory_t::host), unit_cell_.spl_num_atoms().local_size(),
@@ -292,14 +291,13 @@ void Potential::poisson(Periodic_function const& rho)
     /* add nucleus potential and contribution to Hartree energy */
     if (ctx_.full_potential()) {
         double evha_nuc{0};
-        for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-            int ia = unit_cell_.spl_num_atoms(ialoc);
-            auto& atom = unit_cell_.atom(ia);
+        for (auto it : unit_cell_.spl_num_atoms()) {
+            auto& atom = unit_cell_.atom(it.i);
             Spline srho(atom.radial_grid());
             for (int ir = 0; ir < atom.num_mt_points(); ir++) {
                 double r = atom.radial_grid(ir);
-                hartree_potential_->mt()[ia](0, ir) -= atom.zn() / r / y00;
-                srho(ir) = rho.mt()[ia](0, ir) * r;
+                hartree_potential_->mt()[it.i](0, ir) -= atom.zn() / r / y00;
+                srho(ir) = rho.mt()[it.i](0, ir) * r;
             }
             evha_nuc -= atom.zn() * srho.interpolate().integrate(0) / y00;
         }
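
generate_PAW_effective_potential() above shows the owner-computes idiom under the new index: each rank fills only the D-matrices it owns, then every matrix is broadcast from its owner, whose rank location() now reports in the .ib field. Reduced to a skeleton, where uc, comm and compute_dij are placeholders:

    for (auto it : uc.spl_num_paw_atoms()) {
        compute_dij(paw_dij[it.i.get()]);    // local work on owned atoms only
    }
    for (int i = 0; i < uc.num_paw_atoms(); i++) {
        auto loc = uc.spl_num_paw_atoms().location(paw_atom_index_t::global(i));
        comm.bcast(paw_dij[i].at(sddk::memory_t::host), paw_dij[i].size(), loc.ib);
    }
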
diff --git a/src/potential/potential.hpp b/src/potential/potential.hpp
index 1db837cba1..67c2092b2f 100644
--- a/src/potential/potential.hpp
+++ b/src/potential/potential.hpp
@@ -120,10 +120,10 @@ class Potential : public Field4D
     std::unique_ptr> paw_potential_;
 
     /// Exchange-correlation energy density of PAW atoms pseudodensity.
-    std::unique_ptr> paw_ps_exc_;
+    std::unique_ptr> paw_ps_exc_;
 
     /// Exchange-correlation energy density of PAW atoms all-electron density.
-    std::unique_ptr> paw_ae_exc_;
+    std::unique_ptr> paw_ae_exc_;
 
     /// Contribution to D-operator matrix from the PAW atoms.
     std::vector> paw_dij_;
@@ -165,8 +165,8 @@ class Potential : public Field4D
         sddk::mdarray, 2> qmt(ctx_.lmmax_rho(), unit_cell_.num_atoms());
         qmt.zero();
 
-        for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-            int ia = unit_cell_.spl_num_atoms(ialoc);
+        for (auto it : unit_cell_.spl_num_atoms()) {
+            auto ia = it.i;
 
             auto qmt_re = poisson_vmt(unit_cell_.atom(ia), rho__.mt()[ia],
                 const_cast&>(hartree_potential_->mt()[ia]));
@@ -667,9 +667,8 @@ class Potential : public Field4D
         /* compute contribution from the core */
         double ecore{0};
         #pragma omp parallel for reduction(+:ecore)
-        for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-            int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-            int ia = unit_cell_.paw_atom_index(ia_paw);
+        for (auto it : unit_cell_.spl_num_paw_atoms()) {
+            auto ia = unit_cell_.paw_atom_index(it.i);
 
             auto& atom = unit_cell_.atom(ia);
@@ -701,10 +700,9 @@ class Potential : public Field4D
     {
         double e{0};
         #pragma omp parallel for reduction(+:e)
-        for (int i = 0; i < unit_cell_.spl_num_paw_atoms().local_size(); i++) {
-            int ia_paw = unit_cell_.spl_num_paw_atoms(i);
-            int ia = unit_cell_.paw_atom_index(ia_paw);
-            e += calc_PAW_one_elec_energy(unit_cell_.atom(ia), density__.density_matrix(ia), paw_dij_[ia_paw]);
+        for (auto it : unit_cell_.spl_num_paw_atoms()) {
+            auto ia = unit_cell_.paw_atom_index(it.i);
+            e += calc_PAW_one_elec_energy(unit_cell_.atom(ia), density__.density_matrix(ia), paw_dij_[it.i]);
         }
         comm_.allreduce(&e, 1);
         return e;
@@ -737,9 +735,9 @@ class Potential : public Field4D
         return *dveff_;
     }
 
-    auto const& effective_potential_mt(int ialoc) const
+    auto const& effective_potential_mt(atom_index_t::local ialoc) const
     {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
+        auto ia = unit_cell_.spl_num_atoms().global_index(ialoc);
         return this->scalar().mt()[ia];
     }
 
@@ -758,9 +756,9 @@ class Potential : public Field4D
         return *hartree_potential_;
     }
 
-    auto const& hartree_potential_mt(int ialoc) const
+    auto const& hartree_potential_mt(atom_index_t::local ialoc) const
     {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
+        auto ia = unit_cell_.spl_num_atoms().global_index(ialoc);
         return hartree_potential_->mt()[ia];
     }
diff --git a/src/potential/xc.cpp b/src/potential/xc.cpp
index ddcb8fb099..a735f95583 100644
--- a/src/potential/xc.cpp
+++ b/src/potential/xc.cpp
@@ -154,7 +154,7 @@ void Potential::xc_rg_nonmagnetic(Density const& density__)
     #pragma omp parallel
     {
         /* split local size between threads */
-        sddk::splindex spl_t(num_points, omp_get_num_threads(), omp_get_thread_num());
+        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()), block_id(omp_get_thread_num()));
         /* if this is an LDA functional */
         if (ixc.is_lda()) {
             ixc.get_lda(spl_t.local_size(), &rho.value(spl_t.global_offset()),
@@ -351,7 +351,7 @@ void Potential::xc_rg_magnetic(Density const& density__)
     #pragma omp parallel
     {
         /* split local size between threads */
-        sddk::splindex spl_t(num_points, omp_get_num_threads(), omp_get_thread_num());
+        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()), block_id(omp_get_thread_num()));
         /* if this is an LDA functional */
         if (ixc.is_lda()) {
            ixc.get_lda(spl_t.local_size(), &rho_up.value(spl_t.global_offset()),
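
The two xc_rg hunks above reuse the same split index to divide work between OpenMP threads rather than MPI ranks: every thread constructs its own view of the split and takes one contiguous block of the local grid points. The idiom in isolation, with num_points and process as placeholders:

    #include <omp.h>

    #pragma omp parallel
    {
        sddk::splindex_block<> spl_t(num_points, n_blocks(omp_get_num_threads()),
                                     block_id(omp_get_thread_num()));
        // this thread covers [global_offset(), global_offset() + local_size())
        process(spl_t.global_offset(), spl_t.local_size());
    }
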
diff --git a/src/potential/xc_mt.cpp b/src/potential/xc_mt.cpp
index a9655d2b6a..d334a6ef76 100644
--- a/src/potential/xc_mt.cpp
+++ b/src/potential/xc_mt.cpp
@@ -319,26 +319,25 @@ void Potential::xc_mt(Density const& density__)
     PROFILE("sirius::Potential::xc_mt");
 
     #pragma omp parallel for
-    for (int ialoc = 0; ialoc < unit_cell_.spl_num_atoms().local_size(); ialoc++) {
-        int ia = unit_cell_.spl_num_atoms(ialoc);
-        auto& rgrid = unit_cell_.atom(ia).radial_grid();
+    for (auto it : unit_cell_.spl_num_atoms()) {
+        auto& rgrid = unit_cell_.atom(it.i).radial_grid();
         std::vector rho(ctx_.num_mag_dims() + 1);
         std::vector vxc(ctx_.num_mag_dims() + 1);
-        rho[0] = &density__.rho().mt()[ia];
-        vxc[0] = &xc_potential_->mt()[ia];
+        rho[0] = &density__.rho().mt()[it.i];
+        vxc[0] = &xc_potential_->mt()[it.i];
         for (int j = 0; j < ctx_.num_mag_dims(); j++) {
-            rho[j + 1] = &density__.mag(j).mt()[ia];
-            vxc[j + 1] = &effective_magnetic_field(j).mt()[ia];
+            rho[j + 1] = &density__.mag(j).mt()[it.i];
+            vxc[j + 1] = &effective_magnetic_field(j).mt()[it.i];
         }
-        sirius::xc_mt(rgrid, *sht_, xc_func_, ctx_.num_mag_dims(), rho, vxc, &xc_energy_density_->mt()[ia]);
+        sirius::xc_mt(rgrid, *sht_, xc_func_, ctx_.num_mag_dims(), rho, vxc, &xc_energy_density_->mt()[it.i]);
 
         /* z, x, y order */
         std::array comp_map = {2, 0, 1};
         /* add auxiliary magnetic field antiparallel to starting magnetization */
         for (int j = 0; j < ctx_.num_mag_dims(); j++) {
             for (int ir = 0; ir < rgrid.num_points(); ir++) {
-                effective_magnetic_field(j).mt()[ia](0, ir) -=
-                    aux_bf_(j, ia) * ctx_.unit_cell().atom(ia).vector_field()[comp_map[j]];
+                effective_magnetic_field(j).mt()[it.i](0, ir) -=
+                    aux_bf_(j, it.i) * ctx_.unit_cell().atom(it.i).vector_field()[comp_map[j]];
             }
         }
     } // ialoc
diff --git a/src/radial/radial_integrals.cpp b/src/radial/radial_integrals.cpp
index b461b6f9d0..b59f4b56be 100644
--- a/src/radial/radial_integrals.cpp
+++ b/src/radial/radial_integrals.cpp
@@ -96,8 +96,8 @@ void Radial_integrals_aug::generate()
         }
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
 
             Spherical_Bessel_functions jl(2 * lmax_beta, atom_type.radial_grid(), grid_q_[iq]);
 
@@ -154,8 +154,8 @@ void Radial_integrals_rho_pseudo::generate()
         Spline rho(atom_type.radial_grid(), atom_type.ps_total_charge_density());
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(0, atom_type.radial_grid(), grid_q_[iq]);
 
             values_(iat)(iq) = sirius::inner(jl[0], rho, 0, atom_type.num_mt_points()) / fourpi;
@@ -182,8 +182,8 @@ void Radial_integrals_rho_core_pseudo::generate()
         Spline ps_core(atom_type.radial_grid(), atom_type.ps_core_charge_density());
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(0, atom_type.radial_grid(), grid_q_[iq]);
 
             if (jl_deriv) {
@@ -216,8 +216,8 @@ void Radial_integrals_beta::generate()
         }
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spherical_Bessel_functions jl(unit_cell_.lmax(), atom_type.radial_grid(), grid_q_[iq]);
             for (int idxrf = 0; idxrf < nrb; idxrf++) {
                 int l = atom_type.indexr(idxrf).am.l();
@@ -313,8 +313,8 @@ void Radial_integrals_vloc::generate()
         auto rg = atom_type.radial_grid().segment(np);
 
        #pragma omp parallel for
-        for (int iq_loc = 0; iq_loc < spl_q_.local_size(); iq_loc++) {
-            int iq = spl_q_[iq_loc];
+        for (auto it : spl_q_) {
+            int iq = it.i;
             Spline s(rg);
             double g = grid_q_[iq];
diff --git a/src/radial/radial_integrals.hpp b/src/radial/radial_integrals.hpp
index 2b0211642b..b54b70a949 100644
--- a/src/radial/radial_integrals.hpp
+++ b/src/radial/radial_integrals.hpp
@@ -43,7 +43,7 @@ class Radial_integrals_base
     Radial_grid grid_q_;
 
     /// Split index of q-points.
-    sddk::splindex spl_q_;
+    sddk::splindex_block<> spl_q_;
 
     /// Array with integrals.
     sddk::mdarray, N> values_;
@@ -64,8 +64,8 @@ class Radial_integrals_base
         qmax_ = qmax__ + std::max(10.0, qmax__ * 0.1);
         grid_q_ = Radial_grid_lin(static_cast(np__ * qmax_), 0, qmax_);
-        spl_q_ = sddk::splindex(grid_q_.num_points(), unit_cell_.comm().size(),
-            unit_cell_.comm().rank());
+        spl_q_ = sddk::splindex_block<>(grid_q_.num_points(), n_blocks(unit_cell_.comm().size()),
+            block_id(unit_cell_.comm().rank()));
     }
 
     /// Get starting index iq and delta dq for the q-point on the linear grid.
@@ -247,17 +247,17 @@ class Radial_integrals_rho_pseudo : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).ps_total_charge_density().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
@@ -291,17 +291,17 @@ class Radial_integrals_rho_core_pseudo : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).ps_core_charge_density().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
@@ -402,17 +402,17 @@ class Radial_integrals_vloc : public Radial_integrals_base<1>
     }
 
     /// Compute all values of the radial integrals.
-    inline sddk::mdarray values(std::vector& q__, mpi::Communicator const& comm__) const
+    inline auto values(std::vector& q__, mpi::Communicator const& comm__) const
     {
         int nq = static_cast(q__.size());
-        sddk::splindex splq(nq, comm__.size(), comm__.rank());
+        sddk::splindex_block<> splq(nq, n_blocks(comm__.size()), block_id(comm__.rank()));
         sddk::mdarray result(nq, unit_cell_.num_atom_types());
         result.zero();
         for (int iat = 0; iat < unit_cell_.num_atom_types(); iat++) {
             if (!unit_cell_.atom_type(iat).local_potential().empty()) {
                 #pragma omp parallel for
                 for (int iqloc = 0; iqloc < splq.local_size(); iqloc++) {
-                    int iq = splq[iqloc];
+                    auto iq = splq.global_index(iqloc);
                     if (ri_callback_) {
                         ri_callback_(iat + 1, 1, &q__[iq], &result(iq, iat));
                     } else {
diff --git a/src/strong_type.hpp b/src/strong_type.hpp
index 4661638cac..52879220f7 100644
--- a/src/strong_type.hpp
+++ b/src/strong_type.hpp
@@ -1,7 +1,7 @@
 #ifndef __STRONG_TYPE_HPP__
 #define __STRONG_TYPE_HPP__
 
-namespace sirius {
+//namespace sirius {
 
 template
 class strong_type
@@ -52,6 +52,6 @@ class strong_type
     }
 };
 
-}
+//}
 
 #endif
diff --git a/src/symmetry/symmetrize_forces.hpp b/src/symmetry/symmetrize_forces.hpp
index 6d14722dba..2d134a6432 100644
--- a/src/symmetry/symmetrize_forces.hpp
+++ b/src/symmetry/symmetrize_forces.hpp
@@ -47,11 +47,11 @@ symmetrize_forces(Unit_cell const& uc__, sddk::mdarray& f__)
         for (int ia = 0; ia < uc__.num_atoms(); ia++) {
             r3::vector force_ia(&f__(0, ia));
             int ja = sym[isym].spg_op.sym_atom[ia];
-            auto location = uc__.spl_num_atoms().location(ja);
-            if (location.rank == uc__.comm().rank()) {
+            auto location = uc__.spl_num_atoms().location(typename atom_index_t::global(ja));
+            if (location.ib == uc__.comm().rank()) {
                 auto force_ja = dot(Rc, force_ia);
                 for (int x : {0, 1, 2}) {
-                    sym_forces(x, location.local_index) += force_ja[x];
+                    sym_forces(x, location.index_local) += force_ja[x];
                 }
             }
         }
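
The strong_type change above, which moves the class out of namespace sirius, is what lets n_blocks, block_id and the typed atom and k-point indices cross into the sddk:: interfaces. A strong type is an integer that does not convert implicitly, so a block count can no longer be passed where a block id is expected. A sketch of how such aliases could be declared, assuming the usual value-type plus tag template shape; the tag names here are invented:

    using n_blocks = strong_type<int, struct n_blocks_tag>;
    using block_id = strong_type<int, struct block_id_tag>;

    n_blocks nb(mpi_size);    // explicit construction only
    block_id ib(mpi_rank);
    int raw = ib.get();       // explicit unwrap, as in it.i.get() above
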
diff --git a/src/symmetry/symmetrize_mt_function.hpp b/src/symmetry/symmetrize_mt_function.hpp
index 054eafebcd..60ee682056 100644
--- a/src/symmetry/symmetrize_mt_function.hpp
+++ b/src/symmetry/symmetrize_mt_function.hpp
@@ -29,9 +29,10 @@
 namespace sirius {
 
+template
 inline void
 symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& comm__, int num_mag_dims__,
-        std::vector*> frlm__)
+        std::vector*> frlm__)
 {
     PROFILE("sirius::symmetrize_mt_function");
 
@@ -46,7 +47,7 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
     int lmax = utils::lmax(lmmax);
 
     /* split atoms between MPI ranks */
-    sddk::splindex spl_atoms(frlm.atoms().size(), comm__.size(), comm__.rank());
+    sddk::splindex_block spl_atoms(frlm.atoms().size(), n_blocks(comm__.size()), block_id(comm__.rank()));
 
     /* space for real Rlm rotation matrix */
     sddk::mdarray rotm(lmmax, lmmax);
@@ -67,9 +68,9 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
         /* compute Rlm rotation matrix */
         sht::rotation_matrix(lmax, sym__[i].spg_op.euler_angles, sym__[i].spg_op.proper, rotm);
 
-        for (int ialoc = 0; ialoc < spl_atoms.local_size(); ialoc++) {
+        for (auto it : spl_atoms) {
             /* get global index of the atom */
-            int ia = frlm.atoms()[spl_atoms[ialoc]];
+            int ia = it.i;
             int lmmax_ia = frlm[ia].angular_domain_size();
             int nrmax_ia = frlm.unit_cell().atom(ia).num_mt_points();
             int ja = sym__[i].spg_op.inv_sym_atom[ia];
@@ -83,14 +84,14 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
             /* always symmetrize the scalar component */
             for (int ir = 0; ir < nrmax_ia; ir++) {
                 for (int lm = 0; lm < lmmax_ia; lm++) {
-                    fsym_loc(lm, ir, 0, ialoc) += ftmp(lm, ir, 0);
+                    fsym_loc(lm, ir, 0, it.li) += ftmp(lm, ir, 0);
                 }
             }
             /* apply S part to [0, 0, z] collinear vector */
             if (num_mag_dims__ == 1) {
                 for (int ir = 0; ir < nrmax_ia; ir++) {
                     for (int lm = 0; lm < lmmax_ia; lm++) {
-                        fsym_loc(lm, ir, 1, ialoc) += ftmp(lm, ir, 1) * S(2, 2);
+                        fsym_loc(lm, ir, 1, it.li) += ftmp(lm, ir, 1) * S(2, 2);
                     }
                 }
             }
@@ -100,7 +101,7 @@ symmetrize_mt_function(Crystal_symmetry const& sym__, mpi::Communicator const& c
                 for (int j : {0, 1, 2}) {
                     for (int ir = 0; ir < nrmax_ia; ir++) {
                         for (int lm = 0; lm < lmmax_ia; lm++) {
-                            fsym_loc(lm, ir, 1 + k, ialoc) += ftmp(lm, ir, 1 + j) * S(k, j);
+                            fsym_loc(lm, ir, 1 + k, it.li) += ftmp(lm, ir, 1 + j) * S(k, j);
                         }
                     }
                 }
diff --git a/src/traits.hpp b/src/traits.hpp
index 4e76bfc409..c33452b087 100644
--- a/src/traits.hpp
+++ b/src/traits.hpp
@@ -1,5 +1,5 @@
-#ifndef TRAITS_H
-#define TRAITS_H
+#ifndef __TRAITS_HPP__
+#define __TRAITS_HPP__
 
 namespace sirius {
 
@@ -14,4 +14,4 @@ using identity_t = typename identity::type;
 
 } // namespace sirius
 
-#endif /* TRAITS_H */
+#endif /* __TRAITS_HPP__ */
diff --git a/src/typedefs.hpp b/src/typedefs.hpp
index cedb083bbe..3ca8e90c9a 100644
--- a/src/typedefs.hpp
+++ b/src/typedefs.hpp
@@ -26,7 +26,6 @@
 #define __TYPEDEFS_HPP__
 
 #include
-//#include
 #include
 #include
 #include
diff --git a/src/unit_cell/atom.hpp b/src/unit_cell/atom.hpp
index ef9aa0adff..d1f2731472 100644
--- a/src/unit_cell/atom.hpp
+++ b/src/unit_cell/atom.hpp
@@ -151,7 +151,7 @@ class Atom
             RTE_THROW("not yet mpi parallel");
         }
 
-        sddk::splindex spl_lm(lmmax, comm__.size(), comm__.rank());
+        sddk::splindex_block<> spl_lm(lmmax, n_blocks(comm__.size()), block_id(comm__.rank()));
 
         auto l_by_lm = utils::l_by_lm(lmax_pot_);
diff --git a/src/unit_cell/basis_functions_index.hpp b/src/unit_cell/basis_functions_index.hpp
index 9ccad9ef75..e928e1645d 100644
--- a/src/unit_cell/basis_functions_index.hpp
+++ b/src/unit_cell/basis_functions_index.hpp
@@ -72,9 +72,6 @@ class basis_functions_index
 
     sddk::mdarray index_by_lm_order_;
 
-    /// Maximum l of the radial basis functions.
-    int lmax_{-1};
-
     int offset_lo_{-1};
 
     std::vector offset_;
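
The unit_cell and potential headers finish the migration by typing the index domains themselves: atom_index_t::local for on-rank positions and atom_index_t::global (with the paw_atom_index_t analogues) for positions in the full list, and global_index() and location() convert between the two. Both directions schematically, where spl, comm, mt_data and use are placeholders:

    atom_index_t::global ia(5);              // position in the full atom list
    auto loc = spl.location(ia);             // owning block plus local slot
    if (loc.ib == comm.rank()) {
        use(mt_data[loc.index_local]);       // safe: this rank stores atom 5
    }
    auto ia0 = spl.global_index(atom_index_t::local(0));   // local -> global
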
diff --git a/src/unit_cell/unit_cell.cpp b/src/unit_cell/unit_cell.cpp
index 369e84c539..e277786bef 100644
--- a/src/unit_cell/unit_cell.cpp
+++ b/src/unit_cell/unit_cell.cpp
@@ -531,13 +531,12 @@ Unit_cell::generate_radial_functions(std::ostream& out__)
 {
     PROFILE("sirius::Unit_cell::generate_radial_functions");
 
-    for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) {
-        int ic = spl_num_atom_symmetry_classes(icloc);
-        atom_symmetry_class(ic).generate_radial_functions(parameters_.valence_relativity());
+    for (auto it : spl_num_atom_symmetry_classes()) {
+        atom_symmetry_class(it.i).generate_radial_functions(parameters_.valence_relativity());
     }
 
     for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) {
-        int rank = spl_num_atom_symmetry_classes().local_rank(ic);
+        int rank = spl_num_atom_symmetry_classes().location(typename atom_symmetry_class_index_t::global(ic)).ib;
         atom_symmetry_class(ic).sync_radial_functions(comm_, rank);
     }
 
@@ -547,9 +546,8 @@ Unit_cell::generate_radial_functions(std::ostream& out__)
         pout << std::endl << "Linearization energies" << std::endl;
     }
 
-    for (int icloc = 0; icloc < (int)spl_num_atom_symmetry_classes().local_size(); icloc++) {
-        int ic = spl_num_atom_symmetry_classes(icloc);
-        atom_symmetry_class(ic).write_enu(pout);
+    for (auto it : spl_num_atom_symmetry_classes()) {
+        atom_symmetry_class(it.i).write_enu(pout);
     }
 
     RTE_OUT(out__) << pout.flush(0);
 }
@@ -566,13 +564,12 @@ Unit_cell::generate_radial_integrals()
     PROFILE("sirius::Unit_cell::generate_radial_integrals");
 
     try {
-        for (int icloc = 0; icloc < spl_num_atom_symmetry_classes().local_size(); icloc++) {
-            int ic = spl_num_atom_symmetry_classes(icloc);
-            atom_symmetry_class(ic).generate_radial_integrals(parameters_.valence_relativity());
+        for (auto it : spl_num_atom_symmetry_classes()) {
+            atom_symmetry_class(it.i).generate_radial_integrals(parameters_.valence_relativity());
         }
 
         for (int ic = 0; ic < num_atom_symmetry_classes(); ic++) {
-            int rank = spl_num_atom_symmetry_classes().local_rank(ic);
+            int rank = spl_num_atom_symmetry_classes().location(typename atom_symmetry_class_index_t::global(ic)).ib;
             atom_symmetry_class(ic).sync_radial_integrals(comm_, rank);
         }
     } catch(std::exception const& e) {
@@ -582,13 +579,12 @@ Unit_cell::generate_radial_integrals()
     }
 
     try {
-        for (int ialoc = 0; ialoc < spl_num_atoms_.local_size(); ialoc++) {
-            int ia = spl_num_atoms_[ialoc];
-            atom(ia).generate_radial_integrals(parameters_.processing_unit(), mpi::Communicator::self());
+        for (auto it : spl_num_atoms_) {
+            atom(it.i).generate_radial_integrals(parameters_.processing_unit(), mpi::Communicator::self());
         }
 
         for (int ia = 0; ia < num_atoms(); ia++) {
-            int rank = spl_num_atoms().local_rank(ia);
+            int rank = spl_num_atoms().location(typename atom_index_t::global(ia)).ib;
             atom(ia).sync_radial_integrals(comm_, rank);
         }
     } catch(std::exception const& e) {
@@ -665,7 +661,7 @@ Unit_cell::initialize()
     PROFILE("sirius::Unit_cell::initialize");
 
     /* split number of atom between all MPI ranks */
-    spl_num_atoms_ = sddk::splindex(num_atoms(), comm_.size(), comm_.rank());
+    spl_num_atoms_ = sddk::splindex_block(num_atoms(), n_blocks(comm_.size()), block_id(comm_.rank()));
 
     /* initialize atom types */
     for (int iat = 0; iat < num_atom_types(); iat++) {
@@ -873,8 +869,8 @@ Unit_cell::update()
 
     get_symmetry();
 
-    spl_num_atom_symmetry_classes_ = sddk::splindex(num_atom_symmetry_classes(),
-        comm_.size(), comm_.rank());
+    spl_num_atom_symmetry_classes_ = sddk::splindex_block(num_atom_symmetry_classes(),
+        n_blocks(comm_.size()), block_id(comm_.rank()));
 
     volume_mt_ = 0.0;
     if (parameters_.full_potential()) {
@@ -957,7 +953,8 @@ Unit_cell::init_paw()
         }
     }
 
-    spl_num_paw_atoms_ = sddk::splindex(num_paw_atoms(), comm_.size(), comm_.rank());
+    spl_num_paw_atoms_ = sddk::splindex_block(num_paw_atoms(), n_blocks(comm_.size()),
+        block_id(comm_.rank()));
 }
 
 std::pair>
diff --git a/src/unit_cell/unit_cell.hpp b/src/unit_cell/unit_cell.hpp
index bff0c0cb03..116def490b 100644
--- a/src/unit_cell/unit_cell.hpp
+++ b/src/unit_cell/unit_cell.hpp
@@ -58,16 +58,16 @@ class Unit_cell
     std::vector> atoms_;
 
     /// Split index of atoms.
-    sddk::splindex spl_num_atoms_;
+    sddk::splindex_block spl_num_atoms_;
 
     /// Global index of atom by index of PAW atom.
     std::vector paw_atom_index_;
 
     /// Split index of PAW atoms.
-    sddk::splindex spl_num_paw_atoms_;
+    sddk::splindex_block spl_num_paw_atoms_;
 
     /// Split index of atom symmetry classes.
-    sddk::splindex spl_num_atom_symmetry_classes_;
+    sddk::splindex_block spl_num_atom_symmetry_classes_;
 
     /// Bravais lattice vectors in column order.
     /** The following convention is used to transform fractional coordinates to Cartesian:
@@ -174,13 +174,13 @@ class Unit_cell
         return spl_num_paw_atoms_;
     }
 
-    inline int spl_num_paw_atoms(int idx__) const
+    inline auto spl_num_paw_atoms(paw_atom_index_t::local idx__) const
     {
-        return spl_num_paw_atoms_[idx__];
+        return spl_num_paw_atoms_.global_index(idx__);
     }
 
     /// Return global index of atom by the index in the list of PAW atoms.
-    inline int paw_atom_index(int ipaw__) const
+    inline int paw_atom_index(paw_atom_index_t::global ipaw__) const
     {
         return paw_atom_index_[ipaw__];
     }
@@ -513,21 +513,11 @@ class Unit_cell
         return spl_num_atoms_;
     }
 
-    inline auto spl_num_atoms(int i) const
-    {
-        return static_cast(spl_num_atoms_[i]);
-    }
-
     inline auto const& spl_num_atom_symmetry_classes() const
     {
         return spl_num_atom_symmetry_classes_;
     }
 
-    inline auto spl_num_atom_symmetry_classes(int i) const
-    {
-        return static_cast(spl_num_atom_symmetry_classes_[i]);
-    }
-
     inline double volume_mt() const
     {
         return volume_mt_;