From f7dc83ccb4f82974f0c5d2647c65f674369acac7 Mon Sep 17 00:00:00 2001
From: Philip Fackler <facklerpw@ornl.gov>
Date: Mon, 25 Sep 2023 11:04:30 -0400
Subject: [PATCH 1/3] Refactored everything needed for test_RotatedSPOsT

---
 src/Particle/CMakeLists.txt                   |    4 +
 src/Particle/InitMolecularSystemT.cpp         |  314 +++
 src/Particle/InitMolecularSystemT.h           |   79 +
 src/Particle/LongRange/StructFactT.cpp        |    2 +-
 src/Particle/LongRange/StructFactT.h          |    9 +-
 src/Particle/MCWalkerConfigurationT.cpp       |  313 +++
 src/Particle/MCWalkerConfigurationT.h         |  244 ++
 src/Particle/ParticleIO/LatticeIO.cpp         |  210 ++
 src/Particle/ParticleIO/LatticeIO.h           |   13 +
 src/Particle/ParticleIO/XMLParticleIO.cpp     |  398 +++
 src/Particle/ParticleIO/XMLParticleIO.h       |   36 +
 src/Particle/ParticleSetPoolT.cpp             |  278 ++
 src/Particle/ParticleSetPoolT.h               |  155 ++
 src/Particle/ParticleSetT.BC.cpp              |  194 ++
 src/Particle/ParticleSetT.cpp                 |  212 ++
 src/Particle/ParticleSetT.h                   |   12 +-
 src/Particle/ParticleSetTraits.h              |    1 +
 src/Particle/ReptileT.h                       |  350 +++
 src/Particle/SampleStackT.cpp                 |   81 +
 src/Particle/SampleStackT.h                   |   84 +
 src/Particle/SimulationCellT.h                |    5 +-
 .../BsplineFactory/BsplineReaderBase.h        |    1 +
 .../BsplineFactory/BsplineReaderBaseT.cpp     |  259 ++
 .../BsplineFactory/BsplineReaderBaseT.h       |  228 ++
 .../BsplineFactory/BsplineSetT.h              |   10 +-
 .../HybridRepCenterOrbitalsT.cpp              |   23 +
 .../BsplineFactory/HybridRepCenterOrbitalsT.h |  819 ++++++
 .../BsplineFactory/HybridRepCplxT.h           |  292 ++
 .../BsplineFactory/HybridRepRealT.h           |  303 ++
 .../BsplineFactory/HybridRepSetReader.h       |    1 +
 .../BsplineFactory/HybridRepSetReaderT.h      |  492 ++++
 .../BsplineFactory/SplineC2COMPTargetT.cpp    | 2438 +++++++++--------
 .../BsplineFactory/SplineC2COMPTargetT.h      |  615 +++--
 .../BsplineFactory/SplineC2CT.cpp             | 1479 +++++-----
 .../BsplineFactory/SplineC2CT.h               |  429 +--
 ...TOMPTarget.cpp => SplineC2ROMPTargetT.cpp} |  126 +-
 ...eC2RTOMPTarget.h => SplineC2ROMPTargetT.h} |   83 +-
 .../BsplineFactory/SplineC2RT.cpp             | 2333 ++++++++--------
 .../BsplineFactory/SplineC2RT.h               |  399 +--
 .../BsplineFactory/SplineR2RT.cpp             |   79 +-
 .../BsplineFactory/SplineR2RT.h               |   41 +-
 .../BsplineFactory/SplineSetReader.h          |    4 +
 .../BsplineFactory/SplineSetReaderT.h         |  322 +++
 .../BsplineFactory/createBsplineReaderT.cpp   |  331 +++
 .../BsplineFactory/createBsplineReaderT.h     |   59 +
 src/QMCWaveFunctions/CMakeLists.txt           |   29 +-
 src/QMCWaveFunctions/EinsplineSetBuilderT.cpp | 1815 ++++++++++++
 src/QMCWaveFunctions/EinsplineSetBuilderT.h   |  334 +++
 src/QMCWaveFunctions/OrbitalSetTraits.h       |    1 +
 src/QMCWaveFunctions/SPOSetT.h                |    1 +
 src/QMCWaveFunctions/SpinorSetT.cpp           |    2 +-
 src/QMCWaveFunctions/SpinorSetT.h             |    3 +-
 src/QMCWaveFunctions/tests/CMakeLists.txt     |    1 +
 .../tests/test_RotatedSPOsT.cpp               | 1024 +++++++
 src/mpi/mpi_datatype.h                        |    2 +
 55 files changed, 13639 insertions(+), 3733 deletions(-)
 create mode 100644 src/Particle/InitMolecularSystemT.cpp
 create mode 100644 src/Particle/InitMolecularSystemT.h
 create mode 100644 src/Particle/MCWalkerConfigurationT.cpp
 create mode 100644 src/Particle/MCWalkerConfigurationT.h
 create mode 100644 src/Particle/ParticleSetPoolT.cpp
 create mode 100644 src/Particle/ParticleSetPoolT.h
 create mode 100644 src/Particle/ParticleSetT.BC.cpp
 create mode 100644 src/Particle/ReptileT.h
 create mode 100644 src/Particle/SampleStackT.cpp
 create mode 100644 src/Particle/SampleStackT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.cpp
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.cpp
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/HybridRepCplxT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/HybridRepRealT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/HybridRepSetReaderT.h
 rename src/QMCWaveFunctions/BsplineFactory/{SplineC2RTOMPTarget.cpp => SplineC2ROMPTargetT.cpp} (96%)
 rename src/QMCWaveFunctions/BsplineFactory/{SplineC2RTOMPTarget.h => SplineC2ROMPTargetT.h} (82%)
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.cpp
 create mode 100644 src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h
 create mode 100644 src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
 create mode 100644 src/QMCWaveFunctions/EinsplineSetBuilderT.h
 create mode 100644 src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp

diff --git a/src/Particle/CMakeLists.txt b/src/Particle/CMakeLists.txt
index b6517626c1..9dc57daf01 100644
--- a/src/Particle/CMakeLists.txt
+++ b/src/Particle/CMakeLists.txt
@@ -14,9 +14,11 @@
 ####################################
 set(PARTICLE
     InitMolecularSystem.cpp
+    InitMolecularSystemT.cpp
     SimulationCell.cpp
     SimulationCellT.cpp
     ParticleSetPool.cpp
+    ParticleSetPoolT.cpp
     ParticleSet.cpp
     ParticleSetT.cpp
     PSdispatcher.cpp
@@ -28,9 +30,11 @@ set(PARTICLE
     MCCoords.cpp
     MCCoordsT.cpp
     MCWalkerConfiguration.cpp
+    MCWalkerConfigurationT.cpp
     WalkerConfigurations.cpp
     SpeciesSet.cpp
     SampleStack.cpp
+    SampleStackT.cpp
     createDistanceTableAA.cpp
     createDistanceTableAB.cpp
     createDistanceTableT.cpp
diff --git a/src/Particle/InitMolecularSystemT.cpp b/src/Particle/InitMolecularSystemT.cpp
new file mode 100644
index 0000000000..a4559fc288
--- /dev/null
+++ b/src/Particle/InitMolecularSystemT.cpp
@@ -0,0 +1,314 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2020 QMCPACK developers.
+//
+// File developed by: Jordan E. Vincent, University of Illinois at
+//                    Urbana-Champaign
+//                    Luke Shulenburger, lshulen@sandia.gov, Sandia National Laboratories
+//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Miguel Morales, moralessilva2@llnl.gov, Lawrence Livermore National Laboratory
+//                    Mark Dewing, markdewing@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "InitMolecularSystemT.h"
+
+#include "OhmmsData/AttributeSet.h"
+#include "Particle/DistanceTableT.h"
+#include "Particle/ParticleSetPoolT.h"
+#include "ParticleBase/RandomSeqGeneratorGlobal.h"
+
+namespace qmcplusplus
+{
+/// Name the element and keep a reference to the particle-set pool.
+template <typename T>
+InitMolecularSystemT<T>::InitMolecularSystemT(ParticleSetPoolT<T>& pset,
+                                              const char* aname)
+    : OhmmsElementBase(aname), ptclPool(pset)
+{
+}
+
+/// Read target/source/use_volume from cur and initialize the target set.
+template <typename T>
+bool
+InitMolecularSystemT<T>::put(xmlNodePtr cur)
+{
+    std::string target("e");
+    std::string source("i");
+    std::string volume("no");
+    OhmmsAttributeSet attribs;
+    attribs.add(target, "target");
+    attribs.add(source, "source");
+    attribs.add(volume, "use_volume");
+    attribs.put(cur);
+    ParticleSetT<T>* const els = ptclPool.getParticleSet(target);
+    if (!els) {
+        ERRORMSG("No target particle " << target << " exists.")
+        return false;
+    }
+    ParticleSetT<T>* const ions = ptclPool.getParticleSet(source);
+    if (!ions) {
+        ERRORMSG("No source particle " << source << " exists.")
+        return false;
+    }
+
+    app_log() << "<init source=\"" << source << "\" target=\"" << target
+              << "\">" << std::endl;
+    if (volume == "yes")
+        initWithVolume(ions, els);
+    else
+        initMolecule(ions, els);
+    // spins: uniform random values scaled by 2*pi
+    makeUniformRandom(els->spins);
+    els->spins *= 2 * M_PI;
+    app_log() << "</init>" << std::endl;
+    app_log().flush();
+    return true;
+}
+
+/// Place every electron at ions->R[0] plus a scaled Gaussian random vector.
+template <typename T>
+void
+InitMolecularSystemT<T>::initAtom(ParticleSetT<T>* ions, ParticleSetT<T>* els)
+{
+    const int num_els(els->getTotalNum());
+    // 3N-dimensional Gaussian
+    typename ParticleSetT<T>::ParticlePos gauss(els->getTotalNum());
+    makeGaussRandom(gauss);
+    // displacement scale: sqrt(N) / 2
+    RealType width =
+        std::sqrt(static_cast<RealType>(els->getTotalNum())) * 0.5;
+    for (int iel = 0; iel < num_els; ++iel)
+        els->R[iel] = ions->R[0] + width * gauss[iel];
+}
+
+/// An unpaired electron: the ion it belongs to and the radius to place it at.
+template <typename TReal>
+struct LoneElectronT
+{
+    using RealType = TReal;
+    int ID;              ///< index of the ion the electron is attached to
+    RealType BondLength; ///< separation used when placing the electron
+
+    LoneElectronT(int id, RealType bl) : ID{id}, BondLength{bl} {}
+};
+
+/// Place electrons around the ions: pairs near each ion first, then the lone
+/// electrons, then any excess around the geometric center of the ions.
+template <typename T>
+void
+InitMolecularSystemT<T>::initMolecule(
+    ParticleSetT<T>* ions, ParticleSetT<T>* els)
+{
+    if (ions->getTotalNum() == 1)
+        return initAtom(ions, els);
+
+    const int d_ii_ID = ions->addTable(*ions);
+    ions->update();
+    const typename ParticleSetT<T>::ParticleIndex& grID(ions->GroupID);
+    SpeciesSet& Species(ions->getSpeciesSet());
+    int Centers = ions->getTotalNum();
+    std::vector<int> Qtot(Centers), Qcore(Centers), Qval(Centers, 0);
+    // use charge as the core electrons first
+    int icharge = Species.addAttribute("charge");
+    // Assign default core charge
+    for (int iat = 0; iat < Centers; iat++)
+        Qtot[iat] = static_cast<int>(Species(icharge, grID[iat]));
+    // cutoff radius (Bohr); this is an arbitrary choice
+    RealType cutoff = 4.0;
+    typename ParticleSetT<T>::ParticlePos chi(els->getTotalNum());
+    // makeGaussRandom(chi);
+    makeSphereRandom(chi);
+    // the upper limit of the electron index with spin up
+    const int numUp = els->last(0);
+    // upper limit of the spin-down electron index; mind the no-spin-down case
+    const int numDown = els->last(els->groups() > 1 ? 1 : 0) - els->first(0);
+    // index of the next unused random vector in chi
+    int random_number_counter = 0;
+    int nup_tot = 0, ndown_tot = numUp;
+    std::vector<LoneElectronT<RealType>> loneQ;
+    RealType rmin = cutoff;
+    typename ParticleSetT<T>::SingleParticlePos cm;
+
+    const auto& dist = ions->getDistTableAA(d_ii_ID).getDistances();
+    // Step 1. Distribute even Q[iat] of atomic center iat. If Q[iat] is odd,
+    // put Q[iat]-1 and save the lone electron.
+    for (size_t iat = 0; iat < Centers; iat++) {
+        cm += ions->R[iat];
+        for (size_t jat = iat + 1; jat < Centers; ++jat) {
+            rmin = std::min(rmin, dist[jat][iat]);
+        }
+        // use 40% of the minimum bond
+        RealType sep = rmin * 0.4;
+        int v2 = Qtot[iat] / 2;
+        if (Qtot[iat] > v2 * 2) {
+            loneQ.push_back(LoneElectronT<RealType>(iat, sep));
+        }
+        for (int k = 0; k < v2; k++) {
+            // initialize electron positions in pairs
+            if (nup_tot < numUp)
+                els->R[nup_tot++] =
+                    ions->R[iat] + sep * chi[random_number_counter++];
+            if (ndown_tot < numDown)
+                els->R[ndown_tot++] =
+                    ions->R[iat] + sep * chi[random_number_counter++];
+        }
+    }
+
+    // Step 2. Distribute the electrons left alone
+    // mmorales: changed order of spin assignment to help with spin
+    // imbalances in molecules at large distances.
+    // Not guaranteed to work, but should help in most cases
+    // as long as atoms in molecules are defined sequentially
+    typename std::vector<LoneElectronT<RealType>>::iterator it(loneQ.begin());
+    typename std::vector<LoneElectronT<RealType>>::iterator it_end(loneQ.end());
+    while (it != it_end && nup_tot != numUp && ndown_tot != numDown) {
+        if (nup_tot < numUp) {
+            els->R[nup_tot++] = ions->R[(*it).ID] +
+                (*it).BondLength * chi[random_number_counter++];
+            ++it;
+        }
+        if (ndown_tot < numDown && it != it_end) {
+            els->R[ndown_tot++] = ions->R[(*it).ID] +
+                (*it).BondLength * chi[random_number_counter++];
+            ++it;
+        }
+    }
+
+    // Step 3. Handle more than neutral electrons
+    // extra electrons around the geometric center
+    RealType cnorm = 1.0 / static_cast<RealType>(Centers);
+    RealType sep = rmin * 2;
+    cm = cnorm * cm;
+    if (nup_tot < numUp)
+        while (nup_tot < numUp)
+            els->R[nup_tot++] = cm + sep * chi[random_number_counter++];
+    if (ndown_tot < numDown)
+        while (ndown_tot < numDown)
+            els->R[ndown_tot++] = cm + sep * chi[random_number_counter++];
+
+    // safety check: every random number must be consumed exactly once
+    if (random_number_counter != chi.size())
+        throw std::runtime_error("initMolecule unexpected random number "
+                                 "consumption. Please report a bug!");
+
+    // put all the electrons in a unit box
+    if (els->getLattice().SuperCellEnum != SUPERCELL_OPEN) {
+        els->R.setUnit(PosUnit::Cartesian);
+        els->applyBC(els->R);
+        els->update(false);
+    }
+}
+
+/// component-wise minimum of two 3D points (helper; should be moved up)
+template <typename T>
+inline TinyVector<T, 3>
+lower_bound(const TinyVector<T, 3>& a, const TinyVector<T, 3>& b)
+{
+    using std::min;
+    return TinyVector<T, 3>(min(a[0], b[0]), min(a[1], b[1]), min(a[2], b[2]));
+}
+
+/// component-wise maximum of two 3D points (helper; should be moved up)
+template <typename T>
+inline TinyVector<T, 3>
+upper_bound(const TinyVector<T, 3>& a, const TinyVector<T, 3>& b)
+{
+    using std::max;
+    return TinyVector<T, 3>(max(a[0], b[0]), max(a[1], b[1]), max(a[2], b[2]));
+}
+
+/** Fill a reduced box around the ions with uniformly random electrons.
+ *
+ * The box spans the ions' range in lattice units, padded by a buffer and
+ * clamped to [0,1]; els->R is left in Cartesian units.
+ */
+template <typename T>
+void
+InitMolecularSystemT<T>::initWithVolume(
+    ParticleSetT<T>* ions, ParticleSetT<T>* els)
+{
+    // begin with the inverted range [1,0] so the ions determine both bounds
+    TinyVector<RealType, OHMMS_DIM> start(1.0);
+    TinyVector<RealType, OHMMS_DIM> end(0.0);
+    // ion positions in lattice units, after applying boundary conditions
+    typename ParticleSetT<T>::ParticlePos Ru(ions->getTotalNum());
+    Ru.setUnit(PosUnit::Lattice);
+    ions->applyBC(ions->R, Ru);
+    for (int iat = 0; iat < Ru.size(); iat++) {
+        start = lower_bound(Ru[iat], start);
+        end = upper_bound(Ru[iat], end);
+    }
+
+    TinyVector<RealType, OHMMS_DIM> shift;
+    Tensor<RealType, OHMMS_DIM> newbox(ions->getLattice().R);
+
+    const RealType buffer = 2.0; // buffer 2 bohr
+    // The special case keyed on getLattice().BoxBConds[idim] (full [0,1]
+    // range, zero shift) is disabled: every direction gets the reduced box.
+    for (int idim = 0; idim < OHMMS_DIM; ++idim) {
+        const RealType buffer_r =
+            buffer * ions->getLattice().OneOverLength[idim];
+        start[idim] = std::max((RealType)0.0, (start[idim] - buffer_r));
+        end[idim] = std::min((RealType)1.0, (end[idim] + buffer_r));
+        shift[idim] = start[idim] * ions->getLattice().Length[idim];
+        // handle singular case: test the extent end - start; this must be a
+        // pure comparison that leaves both bounds untouched.
+        // NOTE(review): the extent is in lattice units, buffer is in bohr.
+        if (std::abs(end[idim] - start[idim]) < buffer) {
+            start[idim] = std::max(0.0, start[idim] - buffer_r / 2.0);
+            end[idim] = std::min(1.0, end[idim] + buffer_r / 2.0);
+        }
+        newbox(idim, idim) =
+            (end[idim] - start[idim]) * ions->getLattice().Length[idim];
+    }
+
+    typename ParticleSetT<T>::ParticleLayout slattice(ions->getLattice());
+    slattice.set(newbox);
+
+    app_log() << "  InitMolecularSystem::initWithVolume " << std::endl;
+    app_log() << "  Effective Lattice shifted by  " << shift << std::endl;
+    app_log() << newbox << std::endl;
+
+    Ru.resize(els->getTotalNum());
+    makeUniformRandom(Ru);
+    for (int iat = 0; iat < Ru.size(); ++iat)
+        els->R[iat] = slattice.toCart(Ru[iat]) + shift;
+    els->R.setUnit(PosUnit::Cartesian);
+}
+
+/// Stream input is not implemented: nothing is read and true is returned.
+template <typename T>
+bool InitMolecularSystemT<T>::put(std::istream& is)
+{
+    return true;
+}
+
+/// Stream output is not implemented: nothing is written and true is returned.
+template <typename T>
+bool InitMolecularSystemT<T>::get(std::ostream& os) const
+{
+    return true;
+}
+
+/// Intentionally a no-op.
+template <typename T>
+void InitMolecularSystemT<T>::reset()
+{
+}
+
+template class InitMolecularSystemT<double>; // explicit instantiations
+template class InitMolecularSystemT<float>;
+template class InitMolecularSystemT<std::complex<double>>;
+template class InitMolecularSystemT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/Particle/InitMolecularSystemT.h b/src/Particle/InitMolecularSystemT.h
new file mode 100644
index 0000000000..3bfe148db5
--- /dev/null
+++ b/src/Particle/InitMolecularSystemT.h
@@ -0,0 +1,79 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of
+//                    Illinois at Urbana-Champaign
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_INITMOLECULARSYSTEMT_H
+#define QMCPLUSPLUS_INITMOLECULARSYSTEMT_H
+
+#include "OhmmsData/OhmmsElementBase.h"
+#include "ParticleSetTraits.h"
+
+#include <map>
+
+namespace qmcplusplus
+{
+template <typename T>
+class ParticleSetT;
+template <typename T>
+class ParticleSetPoolT;
+
+/** Engine to initialize the initial electronic structure for a molecular system
+ *
+ * Driven from an XML node through put(xmlNodePtr).
+ */
+template <typename T>
+class InitMolecularSystemT : public OhmmsElementBase
+{
+public:
+    using RealType = typename ParticleSetTraits<T>::RealType;
+
+    InitMolecularSystemT(ParticleSetPoolT<T>& pset, const char* aname = "mosystem");
+
+    /// stream output hook overriding OhmmsElementBase
+    bool get(std::ostream& os) const override;
+    /// stream input hook overriding OhmmsElementBase
+    bool put(std::istream& is) override;
+    /// configure from an XML node and initialize the target particle set
+    bool put(xmlNodePtr cur) override;
+    /// reset hook overriding OhmmsElementBase
+    void reset() override;
+
+    /** initialize els for an atom
+     *
+     * @param ions particle set of the ions
+     * @param els particle set whose positions are initialized
+     */
+    void initAtom(ParticleSetT<T>* ions, ParticleSetT<T>* els);
+
+    /** initialize els position for a molecule
+     *
+     * Use the valence of each ionic species on a sphere
+     */
+    void initMolecule(ParticleSetT<T>* ions, ParticleSetT<T>* els);
+
+    /** initialize els for the systems with a mixed boundary
+     *
+     * Use the bound of the ionic systems and uniform random positions within a
+     * reduced box
+     */
+    void initWithVolume(ParticleSetT<T>* ions, ParticleSetT<T>* els);
+
+private:
+    /** reference to the ParticleSetPoolT passed to the constructor; the
+     *  particle sets to work on are obtained from it
+     */
+    ParticleSetPoolT<T>& ptclPool;
+};
+} // namespace qmcplusplus
+#endif
diff --git a/src/Particle/LongRange/StructFactT.cpp b/src/Particle/LongRange/StructFactT.cpp
index 6f1dae8a9e..363d364c68 100644
--- a/src/Particle/LongRange/StructFactT.cpp
+++ b/src/Particle/LongRange/StructFactT.cpp
@@ -32,7 +32,7 @@ namespace qmcplusplus
 // Constructor - pass arguments to k_lists_' constructor
 template <typename T>
 StructFactT<T>::StructFactT(
-    const ParticleLayout& lattice, const KContainer& k_lists) :
+    const ParticleLayout& lattice, const KContainerT<T>& k_lists) :
     SuperCellEnum(SUPERCELL_BULK),
     k_lists_(k_lists),
     StorePerParticle(false),
diff --git a/src/Particle/LongRange/StructFactT.h b/src/Particle/LongRange/StructFactT.h
index 218b3adf31..e61ed50beb 100644
--- a/src/Particle/LongRange/StructFactT.h
+++ b/src/Particle/LongRange/StructFactT.h
@@ -28,7 +28,8 @@ namespace qmcplusplus
 {
 template <typename T>
 class ParticleSetT;
-class KContainer;
+template <typename T>
+class KContainerT;
 template <typename T>
 struct SKMultiWalkerMemT;
 
@@ -65,7 +66,7 @@ class StructFactT
      * At least in the batched version Structure factor is _NOT_ valid
      * after construction.
      */
-    StructFactT(const ParticleLayout& lattice, const KContainer& k_lists);
+    StructFactT(const ParticleLayout& lattice, const KContainerT<T>& k_lists);
     /// desructor
     ~StructFactT();
 
@@ -100,7 +101,7 @@ class StructFactT
     }
 
     /// accessor of k_lists_
-    const KContainer&
+    const KContainerT<T>&
     getKLists() const
     {
         return k_lists_;
@@ -119,7 +120,7 @@ class StructFactT
     resize(int nkpts, int num_species, int num_ptcls);
 
     /// K-Vector List.
-    const KContainer& k_lists_;
+    const KContainerT<T>& k_lists_;
     /** Whether intermediate data is stored per particle. default false
      * storing data per particle needs significant amount of memory but some
      * calculation may request it. storing data per particle specie is more
diff --git a/src/Particle/MCWalkerConfigurationT.cpp b/src/Particle/MCWalkerConfigurationT.cpp
new file mode 100644
index 0000000000..1f3fcaa1c0
--- /dev/null
+++ b/src/Particle/MCWalkerConfigurationT.cpp
@@ -0,0 +1,313 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jordan E. Vincent, University of Illinois at Urbana-Champaign
+//                    Bryan Clark, bclark@Princeton.edu, Princeton University
+//                    Ken Esler, kpesler@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at
+//                    Urbana-Champaign
+//                    Cynthia Gu, zg1@ornl.gov, Oak Ridge National Laboratory
+//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "MCWalkerConfigurationT.h"
+
+#include "LongRange/StructFact.h"
+#include "Message/CommOperators.h"
+#include "Message/Communicate.h"
+#include "Particle/HDFWalkerOutput.h"
+#include "Particle/MCSample.h"
+#include "Particle/ReptileT.h"
+#include "ParticleBase/RandomSeqGenerator.h"
+#include "Utilities/IteratorUtility.h"
+#include "hdf/HDFVersion.h"
+#include "hdf/hdf_hyperslab.h"
+
+#include <map>
+
+namespace qmcplusplus
+{
+/// Build the ParticleSetT base; reptile and Polymer start as null.
+template <typename T>
+MCWalkerConfigurationT<T>::MCWalkerConfigurationT(
+    const SimulationCellT<T>& simulation_cell,
+    const DynamicCoordinateKind kind)
+    : ParticleSetT<T>(simulation_cell, kind),
+      ReadyForPbyP(false), UpdateMode(Update_Walker),
+      reptile(nullptr),
+      Polymer(nullptr)
+{
+}
+
+/// Copy the ParticleSetT part, max sample count, walker offsets and Properties
+/// of mcw; the sample stack is cleared and reptile/Polymer start as null.
+template <typename T>
+MCWalkerConfigurationT<T>::MCWalkerConfigurationT(
+    const MCWalkerConfigurationT& mcw)
+    : ParticleSetT<T>(mcw), ReadyForPbyP(false), UpdateMode(Update_Walker),
+      reptile(nullptr), Polymer(nullptr) // reptile must not stay indeterminate
+{
+    samples.clearEnsemble();
+    samples.setMaxSamples(mcw.getMaxSamples());
+    setWalkerOffsets(mcw.getWalkerOffsets());
+    this->Properties = mcw.Properties;
+}
+
+/// defaulted destructor, defined out of line in this translation unit
+template <typename T> MCWalkerConfigurationT<T>::~MCWalkerConfigurationT() = default;
+
+/// Create n walkers; if none existed before, copy R and spins into each one.
+template <typename T>
+void
+MCWalkerConfigurationT<T>::createWalkers(int n)
+{
+    const bool had_no_walkers = (getActiveWalkers() == 0);
+    WalkerConfigurations::createWalkers(n, this->TotalNum);
+    if (had_no_walkers)
+        for (auto& w : walker_list_) {
+            w->R = this->R;
+            w->spins = this->spins;
+        }
+    resizeWalkerHistories();
+}
+
+/// Resize particles and walkers; with no prior walkers, copy R and spins to all.
+template <typename T>
+void
+MCWalkerConfigurationT<T>::resize(int numWalkers, int numPtcls)
+{
+    if (this->TotalNum && walker_list_.size())
+        app_warning()
+            << "MCWalkerConfiguration::resize cleans up the walker list."
+            << std::endl;
+    const bool had_no_walkers = (getActiveWalkers() == 0);
+    ParticleSetT<T>::resize(unsigned(numPtcls));
+    WalkerConfigurations::resize(numWalkers, this->TotalNum);
+    if (had_no_walkers)
+        for (auto& w : walker_list_) {
+            w->R = this->R;
+            w->spins = this->spins;
+        }
+}
+
+/** Obsolete: always throws.
+ * @param it the iterator of the first walker to work on (unused)
+ * @param tauinv  inverse of the time step (unused)
+ * @throws std::runtime_error on every call
+ *
+ * The retired implementation made a Metropolis move R + D + X:
+ *   makeGaussRandom(R); R *= tauinv; R += (*it)->R + (*it)->Drift;
+ */
+template <typename T>
+void
+MCWalkerConfigurationT<T>::sample(iterator it, RealType tauinv)
+{
+    const char* const what = "MCWalkerConfiguration::sample obsolete";
+    throw std::runtime_error(what);
+}
+
+/** Resize the Property container of this set and of all the walkers.
+ *  @param ncopy number of property copies; every walker Weight is set to 1
+ */
+template <typename T>
+void
+MCWalkerConfigurationT<T>::resetWalkerProperty(int ncopy)
+{
+    int num_props(this->PropertyList.size());
+    app_log() << "  Resetting Properties of the walkers " << ncopy << " x "
+              << num_props << std::endl;
+    try {
+        this->Properties.resize(ncopy, num_props);
+    }
+    catch (const std::domain_error& err) {
+        app_error() << err.what() << '\n'
+                    << "This is likely because some object has attempted to "
+                       "add walker properties\n"
+                    << " in excess of WALKER_MAX_PROPERTIES.\n"
+                    << "build with cmake ... "
+                       "-DWALKER_MAX_PROPERTIES=at_least_properties_required"
+                    << std::endl;
+        APP_ABORT("Fatal Exception");
+    }
+    for (auto& w : walker_list_) {
+        w->resizeProperty(ncopy, num_props);
+        w->Weight = 1.0;
+    }
+    resizeWalkerHistories();
+}
+
+/// Copy this set's PropertyHistory and PHindex into every walker, skipping
+/// whichever of the two is empty.
+template <typename T>
+void
+MCWalkerConfigurationT<T>::resizeWalkerHistories()
+{
+    // using std::vector<std::vector<RealType> > is too costly.
+    if (this->PropertyHistory.size())
+        for (auto& w : walker_list_)
+            w->PropertyHistory = this->PropertyHistory;
+    // same treatment for the history indices
+    if (this->PHindex.size())
+        for (auto& w : walker_list_)
+            w->PHindex = this->PHindex;
+}
+
+/** clear the SampleStack and set its maximum number of samples
+ * @param n number of samples per thread
+ */
+template <typename T>
+void MCWalkerConfigurationT<T>::setNumSamples(int n)
+{
+    auto& stack = samples;
+    stack.clearEnsemble();
+    stack.setMaxSamples(n);
+}
+
+/// save all the walkers of walker_list_ to SampleStack
+template <typename T>
+void
+MCWalkerConfigurationT<T>::saveEnsemble()
+{
+    const auto all_begin = walker_list_.begin();
+    saveEnsemble(all_begin, walker_list_.end());
+}
+
+/// append an MCSample of each walker in [first,last) to SampleStack
+template <typename T>
+void
+MCWalkerConfigurationT<T>::saveEnsemble(iterator first, iterator last)
+{
+    while (first != last) {
+        samples.appendSample(MCSample(**first));
+        ++first;
+    }
+}
+/// load sample iw from SampleStack into pset (forwards to the stack)
+template <typename T>
+void
+MCWalkerConfigurationT<T>::loadSample(ParticleSetT<T>& pset, size_t iw) const
+{
+    const auto& stack = samples;
+    stack.loadSample(pset, iw);
+}
+
+/// Rebuild walker_list_ from the SampleStack and then clear the stack;
+/// returns without changes when there is no sample to load.
+template <typename T>
+void
+MCWalkerConfigurationT<T>::loadEnsemble()
+{
+    using WP = WalkerProperties::Indexes;
+    const int count = std::min(samples.getMaxSamples(), samples.getNumSamples());
+    if (samples.empty() || count == 0)
+        return;
+    Walker_t::PropertyContainer_t prop(
+        1, this->PropertyList.size(), 1, WP::MAXPROPERTIES);
+    walker_list_.resize(count);
+    for (int is = 0; is < count; ++is) {
+        auto walker = std::make_unique<Walker_t>(this->TotalNum);
+        walker->Properties.copy(prop);
+        samples.getSample(is).convertToWalker(*walker);
+        walker_list_[is] = std::move(walker);
+    }
+    resizeWalkerHistories();
+    samples.clearEnsemble();
+}
+
+/// Convert the stored samples of all `others` to walkers and dump them to out.
+/// Returns false when there is no walker to dump, true after out.dump().
+template <typename T>
+bool
+MCWalkerConfigurationT<T>::dumpEnsemble(
+    std::vector<MCWalkerConfigurationT<T>*>& others, HDFWalkerOutput& out,
+    int np, int nBlock)
+{
+    WalkerConfigurations gathered;
+    for (auto* mcwc : others) {
+        const auto& stack(mcwc->getSampleStack());
+        const size_t count =
+            std::min(mcwc->getMaxSamples(), mcwc->numSamples());
+        for (int j = 0; j < count; ++j) {
+            const auto& sample = stack.getSample(j);
+            const size_t num_ptcls = sample.getNumPtcls();
+            auto walker = std::make_unique<Walker_t>(num_ptcls);
+            sample.convertToWalker(*walker);
+            gathered.push_back(std::move(walker));
+        }
+    }
+    const int nw_local = gathered.getActiveWalkers();
+    if (nw_local == 0)
+        return false;
+    // assumes the same amount of active walkers on all the MPI ranks
+    std::vector<int> offsets(np + 1, 0);
+    for (int ip = 0; ip < np; ++ip)
+        offsets[ip + 1] = offsets[ip] + nw_local;
+    gathered.setWalkerOffsets(offsets);
+    out.dump(gathered, nBlock);
+    return true;
+}
+
+/// maximum number of samples, as reported by the SampleStack
+template <typename T>
+int MCWalkerConfigurationT<T>::getMaxSamples() const
+{
+    return samples.getMaxSamples();
+}
+
+/// Rebuild walker_list_ from the samples of `others`; doclean clears them after.
+template <typename T>
+void
+MCWalkerConfigurationT<T>::loadEnsemble(
+    std::vector<MCWalkerConfigurationT<T>*>& others, bool doclean)
+{
+    using WP = WalkerProperties::Indexes;
+    std::vector<int> offsets(others.size() + 1, 0);
+    for (int i = 0; i < others.size(); ++i)
+        offsets[i + 1] = offsets[i] +
+            std::min(others[i]->getMaxSamples(), others[i]->numSamples());
+    const int total = offsets.back();
+    if (total) {
+        Walker_t::PropertyContainer_t prop(
+            1, this->PropertyList.size(), 1, WP::MAXPROPERTIES);
+        while (walker_list_.size())
+            pop_back();
+        walker_list_.resize(total);
+        for (int i = 0; i < others.size(); ++i) {
+            SampleStackT<T>& stack(others[i]->getSampleStack());
+            for (int j = 0, iw = offsets[i]; iw < offsets[i + 1]; ++j, ++iw) {
+                auto walker = std::make_unique<Walker_t>(this->TotalNum);
+                walker->Properties.copy(prop);
+                stack.getSample(j).convertToWalker(*walker);
+                walker_list_[iw] = std::move(walker);
+            }
+            if (doclean)
+                others[i]->clearEnsemble();
+        }
+    }
+    if (doclean)
+        resizeWalkerHistories();
+}
+
+/// clear the SampleStack (forwards to its clearEnsemble)
+template <typename T>
+void MCWalkerConfigurationT<T>::clearEnsemble()
+{
+    samples.clearEnsemble();
+}
+
+template class MCWalkerConfigurationT<double>; // explicit instantiations
+template class MCWalkerConfigurationT<float>;
+template class MCWalkerConfigurationT<std::complex<double>>;
+template class MCWalkerConfigurationT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/Particle/MCWalkerConfigurationT.h b/src/Particle/MCWalkerConfigurationT.h
new file mode 100644
index 0000000000..49a159e51d
--- /dev/null
+++ b/src/Particle/MCWalkerConfigurationT.h
@@ -0,0 +1,244 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jordan E. Vincent, University of Illinois at
+// Urbana-Champaign
+//                    Ken Esler, kpesler@gmail.com, University of Illinois at
+//                    Urbana-Champaign Jeremy McMinnis, jmcminis@gmail.com,
+//                    University of Illinois at Urbana-Champaign Jeongnim Kim,
+//                    jeongnim.kim@gmail.com, University of Illinois at
+//                    Urbana-Champaign Cynthia Gu, zg1@ornl.gov, Oak Ridge
+//                    National Laboratory Raymond Clay III,
+//                    j.k.rofling@gmail.com, Lawrence Livermore National
+//                    Laboratory Ye Luo, yeluo@anl.gov, Argonne National
+//                    Laboratory Mark A. Berrill, berrillma@ornl.gov, Oak Ridge
+//                    National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+/** @file MCWalkerConfigurationT.h
+ * @brief Declaration of MCWalkerConfigurationT
+ */
+#ifndef QMCPLUSPLUS_MCWALKERCONFIGURATIONT_H
+#define QMCPLUSPLUS_MCWALKERCONFIGURATIONT_H
+#include "Particle/ParticleSetT.h"
+#include "Particle/SampleStackT.h"
+#include "Particle/Walker.h"
+#include "Particle/WalkerConfigurations.h"
+#include "Utilities/IteratorUtility.h"
+
+namespace qmcplusplus
+{
+// Forward declaration
+class MultiChain;
+class HDFWalkerOutput;
+template <typename T>
+class ReptileT;
+
+/** A set of walkers that are to be advanced by Metropolis Monte Carlo.
+ *
+ * As a class derived from ParticleSetT, MCWalkerConfigurationT interacts with
+ * QMCHamiltonian and TrialWaveFunction as a ParticleSet, while QMCDrivers
+ * use it as multiple walkers whose configurations are advanced according
+ * to MC algorithms.
+ *
+ * Each walker is represented by Walker<PosVector_t> and
+ * MCWalkerConfigurationT contains a list of
+ * the walkers.  This class enables two possible moves:
+ * <ul>
+ * <li> move all the active walkers at once, similarly to molecular dynamics.
+ * Suitable for small and big systems with a small time step.
+ * <li> move a particle for each walker. Suitable for large systems.
+ *
+ * </ul>
+ */
+template <typename T>
+class MCWalkerConfigurationT :
+    public ParticleSetT<T>,
+    public WalkerConfigurations
+{
+public:
+    /**enumeration for update*/
+    enum
+    {
+        Update_All = 0, ///< move all the active walkers
+        Update_Walker, ///< move a walker by walker
+        Update_Particle ///< move a particle by particle
+    };
+
+    using Walker_t = WalkerConfigurations::Walker_t;
+    /// container type of the Properties of a Walker
+    using PropertyContainer_t = Walker_t::PropertyContainer_t;
+    /// container type of Walkers
+    using WalkerList_t = std::vector<std::unique_ptr<Walker_t>>;
+    /// FIX: a type alias of iterator for an object should not be for just one
+    /// of many objects it holds.
+    using iterator = WalkerList_t::iterator;
+    /// const_iterator of Walker container
+    using const_iterator = WalkerList_t::const_iterator;
+
+    using ReptileList_t = UPtrVector<ReptileT<T>>;
+
+    using RealType = typename ParticleSetT<T>::RealType;
+
+    /// constructor taking the simulation cell and the coordinate kind
+    MCWalkerConfigurationT(const SimulationCellT<T>& simulation_cell,
+        const DynamicCoordinateKind kind = DynamicCoordinateKind::DC_POS);
+
+    /// copy constructor: copy only ParticleSet
+    MCWalkerConfigurationT(const MCWalkerConfigurationT& mcw);
+    ~MCWalkerConfigurationT();
+    /** create numWalkers Walkers
+     *
+     * Append Walkers to WalkerList.
+     */
+    void
+    createWalkers(int numWalkers);
+    /// clean up the walker list and make a new list
+    void
+    resize(int numWalkers, int numPtcls);
+
+    /// clean up the walker list
+    using WalkerConfigurations::clear;
+    /// resize Walker::PropertyHistory and Walker::PHindex:
+    void
+    resizeWalkerHistories();
+
+    /// make random moves for all the walkers
+    // void sample(iterator first, iterator last, value_type tauinv);
+    /// make a random move for a walker
+    void
+    sample(iterator it, RealType tauinv);
+
+    /// return the number of particles per walker
+    inline int
+    getParticleNum() const
+    {
+        return this->R.size();
+    }
+    /**@}*/
+
+    /** set LocalEnergy
+     * @param e current average Local Energy
+     */
+    inline void
+    setLocalEnergy(RealType e)
+    {
+        LocalEnergy = e;
+    }
+
+    /** return LocalEnergy
+     */
+    inline RealType
+    getLocalEnergy() const
+    {
+        return LocalEnergy;
+    }
+
+    /// return the stored MultiChain pointer (no ownership transfer is visible here)
+    inline MultiChain*
+    getPolymer()
+    {
+        return Polymer;
+    }
+
+    /// store the given MultiChain pointer
+    inline void
+    setPolymer(MultiChain* chain)
+    {
+        Polymer = chain;
+    }
+
+    void
+    resetWalkerProperty(int ncopy = 1);
+
+    /// return the ReadyForPbyP flag
+    inline bool
+    updatePbyP() const
+    {
+        return ReadyForPbyP;
+    }
+
+    //@{save/load/clear function for optimization
+    /// number of samples currently reported by the sample stack
+    int
+    numSamples() const
+    {
+        return samples.getNumSamples();
+    }
+    /// set the number of max samples
+    void
+    setNumSamples(int n);
+    /// save the position of current walkers to SampleStack
+    void
+    saveEnsemble();
+    /// save the position of current walkers
+    void
+    saveEnsemble(iterator first, iterator last);
+    /// load a single sample from SampleStack
+    void
+    loadSample(ParticleSetT<T>& pset, size_t iw) const;
+    /// load SampleStack data to the current list of walker configurations
+    void
+    loadEnsemble();
+    /// load the SampleStacks of others to the current list of walker
+    /// configurations
+    void
+    loadEnsemble(
+        std::vector<MCWalkerConfigurationT<T>*>& others, bool doclean = true);
+    /** dump Samples to a file
+     * @param others MCWalkerConfigurations whose samples will be collected
+     * @param out engine to write the samples to state_0/walkers
+     * @param np number of processors
+     * @return true with non-zero samples
+     *
+     * CAUTION: The current implementation assumes the same amount of active
+     * walkers on all the MPI ranks.
+     */
+    static bool
+    dumpEnsemble(std::vector<MCWalkerConfigurationT<T>*>& others,
+        HDFWalkerOutput& out, int np, int nBlock);
+    /// clear the ensemble
+    void
+    clearEnsemble();
+
+    /// read-only access to the sample stack
+    const SampleStackT<T>&
+    getSampleStack() const
+    {
+        return samples;
+    }
+    /// mutable access to the sample stack
+    SampleStackT<T>&
+    getSampleStack()
+    {
+        return samples;
+    }
+
+    /// Transitional forwarding methods
+    int
+    getMaxSamples() const;
+    //@}
+
+protected:
+    /// true if the buffer is ready for particle-by-particle updates
+    bool ReadyForPbyP;
+    /// update-mode index
+    int UpdateMode;
+
+    RealType LocalEnergy; ///< value stored by setLocalEnergy()
+
+public:
+    /// a collection of reptiles contained in MCWalkerConfiguration.
+    ReptileList_t ReptileList;
+    ReptileT<T>* reptile; ///< raw pointer; NOTE(review): presumably non-owning - confirm
+
+    friend class MCPopulation;
+
+private:
+    MultiChain* Polymer; ///< raw pointer exchanged through getPolymer()/setPolymer()
+
+    SampleStackT<T> samples; ///< sample storage used by the ensemble functions
+};
+} // namespace qmcplusplus
+#endif
diff --git a/src/Particle/ParticleIO/LatticeIO.cpp b/src/Particle/ParticleIO/LatticeIO.cpp
index 2d8ea238e0..0fe1756969 100644
--- a/src/Particle/ParticleIO/LatticeIO.cpp
+++ b/src/Particle/ParticleIO/LatticeIO.cpp
@@ -233,6 +233,216 @@ bool LatticeParser::put(xmlNodePtr cur)
 }
 
 
+template <typename T>
+bool LatticeParserT<T>::put(xmlNodePtr cur)
+{
+  const int DIM             = ParticleLayout::SingleParticlePos::Size;
+  double a0                 = 1.0;  // "scale" parameter; multiplies lattice_in before ref_.set()
+  double rs                 = -1.0; // "rs" parameter; the HEG branch runs only when it ends up > 0
+  int nptcl                 = 0;
+  int nsh                   = 0; //for backwards compatibility w/ odd heg initialization style
+  int pol                   = 0;
+  using SingleParticleIndex = typename ParticleLayout::SingleParticleIndex; // NOTE(review): not used below
+  TinyVector<std::string, DIM> bconds("p");
+  // Purpose: read the "parameter" children of cur into ref_; returns whether a lattice (or rs) was given.
+  Tensor<OHMMS_PRECISION_FULL, DIM> lattice_in;
+  bool lattice_defined = false;
+  bool bconds_defined  = false;
+  int boxsum           = 0; // number of directions flagged periodic by "bconds"
+
+  app_summary() << std::endl;
+  app_summary() << " Lattice" << std::endl;
+  app_summary() << " -------" << std::endl;
+  cur = cur->xmlChildrenNode;
+  while (cur != NULL)
+  {
+    std::string cname((const char*)cur->name);
+    if (cname == "parameter")
+    {
+      const std::string aname(getXMLAttributeValue(cur, "name"));
+      if (aname == "scale")
+      {
+        putContent(a0, cur);
+      }
+      else if (aname == "lattice")
+      {
+        const std::string units_prop(getXMLAttributeValue(cur, "units"));
+        if (!units_prop.empty() && units_prop != "bohr")
+        {
+          std::ostringstream err_msg;
+          err_msg << "LatticeParser::put. Only atomic units (bohr) supported for lattice units. Input file uses: "
+                  << units_prop;
+          throw UniformCommunicateError(err_msg.str());
+        }
+
+        putContent(lattice_in, cur);
+        lattice_defined = true;
+        //putContent(ref_.R,cur);
+      }
+      else if (aname == "bconds")
+      {
+        putContent(bconds, cur);
+        bconds_defined = true;
+        for (int idir = 0; idir < DIM; idir++)
+        {
+          char b = bconds[idir][0]; // only the first character of each label is inspected
+          if (b == 'n' || b == 'N')
+          {
+            ref_.BoxBConds[idir] = false;
+          }
+          else if (b == 'p' || b == 'P')
+          {
+            ref_.BoxBConds[idir] = true;
+            boxsum++;
+          }
+          else
+          {
+            std::ostringstream err_msg;
+            err_msg << "LatticeParser::put. Unknown label '" + bconds[idir] +
+                    "' used for periodicity. Only 'p', 'P', 'n' and 'N' are valid!";
+            throw UniformCommunicateError(err_msg.str());
+          }
+
+          // Protect BCs which are not implemented.
+          if (idir > 0 && !ref_.BoxBConds[idir - 1] && ref_.BoxBConds[idir])
+          {
+            std::ostringstream err_msg;
+            err_msg
+                << "LatticeParser::put. In \"bconds\", non periodic directions must be placed after the periodic ones.";
+            throw UniformCommunicateError(err_msg.str());
+          }
+        }
+      }
+      else if (aname == "vacuum")
+      {
+        putContent(ref_.VacuumScale, cur);
+      }
+      else if (aname == "LR_dim_cutoff")
+      {
+        putContent(ref_.LR_dim_cutoff, cur);
+      }
+      else if (aname == "LR_handler")
+      {
+        std::string handler_type("opt_breakup");
+        //This chops whitespace so the simple str == comparisons work
+        putContent(handler_type, cur);
+        handler_type = lowerCase(handler_type);
+        if (handler_type == "ewald")
+          LRCoulombSingleton::this_lr_type = LRCoulombSingleton::EWALD;
+        else if (handler_type == "opt_breakup")
+          LRCoulombSingleton::this_lr_type = LRCoulombSingleton::ESLER;
+        else if (handler_type == "opt_breakup_original")
+          LRCoulombSingleton::this_lr_type = LRCoulombSingleton::NATOLI;
+        else if (handler_type == "ewald_strict2d")
+        {
+          LRCoulombSingleton::this_lr_type = LRCoulombSingleton::STRICT2D;
+          ref_.ndim                        = 2;
+        }
+        else if (handler_type == "ewald_quasi2d")
+          LRCoulombSingleton::this_lr_type = LRCoulombSingleton::QUASI2D;
+        else
+          throw UniformCommunicateError("LatticeParser::put. Long range breakup handler not recognized.");
+      }
+      else if (aname == "LR_tol")
+      {
+        putContent(ref_.LR_tol, cur);
+      }
+      else if (aname == "rs")
+      {
+        lattice_defined = true; // an "rs" parameter counts as a lattice definition
+        OhmmsAttributeSet rAttrib;
+        rAttrib.add(nptcl, "condition");
+        rAttrib.add(pol, "polarized");
+        rAttrib.add(nsh, "shell");
+        rAttrib.put(cur);
+        putContent(rs, cur);
+      }
+      else if (aname == "nparticles")
+      {
+        putContent(nptcl, cur);
+      }
+    }
+    cur = cur->next;
+  }
+
+  // checking boundary conditions
+  if (lattice_defined)
+  {
+    if (!bconds_defined)
+    {
+      app_log() << "  Lattice is specified but boundary conditions are not. Assuming PBC." << std::endl;
+      ref_.BoxBConds = true;
+    }
+  }
+  else if (boxsum == 0)
+    app_log() << "  Lattice is not specified for the Open BC. Add a huge box." << std::endl;
+  else
+    throw UniformCommunicateError("LatticeParser::put. Mixed boundary is supported only when a lattice is specified!");
+
+  //special heg processing
+  if (rs > 0.0)
+  {
+    HEGGrid<typename ParticleLayout::Scalar_t> heg(ref_);
+    if (pol == 0)
+    {
+      if (nsh > 0)
+        nptcl = 2 * heg.getNumberOfKpoints(nsh);
+      else
+        nsh = heg.getShellIndex(nptcl / 2);
+    }
+    else
+    { //             spin polarized
+      if (nsh > 0)
+        nptcl = heg.getNumberOfKpoints(nsh);
+      else
+        nsh = heg.getShellIndex(nptcl);
+    }
+    typename ParticleLayout::Scalar_t acubic = heg.getCellLength(nptcl, rs);
+    app_log() << "  " << OHMMS_DIM << "D HEG system"
+              << "\n     rs  = " << rs;
+    if (pol == 0)
+    {
+      app_log() << "\n     number of up particles = " << nptcl / 2 << "\n     number of dn particles = " << nptcl / 2;
+    }
+    else
+    {
+      app_log() << "\n     number of up particles = " << nptcl;
+    }
+    app_log() << "\n     filled kshells      = " << nsh << "\n     lattice constant    = " << acubic << " bohr"
+              << std::endl;
+    lattice_in = 0.0; // any "lattice" input is replaced by a diagonal cell of length acubic
+    for (int idim = 0; idim < DIM; idim++)
+      lattice_in(idim, idim) = acubic;
+    a0 = 1.0;
+  }
+  // in the HEG case a0 was just reset to 1.0, so the scaling below leaves that cell unchanged
+  if (lattice_defined)
+  {
+    lattice_in *= a0;
+    ref_.set(lattice_in);
+  }
+
+  if (ref_.SuperCellEnum != SUPERCELL_SLAB && LRCoulombSingleton::isQuasi2D())
+    throw UniformCommunicateError("LatticeParser::put. Quasi 2D Ewald only works with boundary condition 'p p n'!");
+
+  if (ref_.SuperCellEnum == SUPERCELL_OPEN)
+    ref_.WignerSeitzRadius = ref_.SimulationCellRadius;
+
+  std::string unit_name = "bohr";
+  app_log() << std::fixed;
+  app_log() << "  Simulation cell radius   = " << ref_.SimulationCellRadius << " " << unit_name << std::endl;
+  app_log() << "  Wigner-Seitz cell radius = " << ref_.WignerSeitzRadius << " " << unit_name << std::endl;
+  app_log() << std::endl;
+
+  return lattice_defined;
+}
+
+template class LatticeParserT<double>; // explicit instantiations, one per value type
+template class LatticeParserT<float>;
+template class LatticeParserT<std::complex<double>>;
+template class LatticeParserT<std::complex<float>>;
+
+
 bool LatticeXMLWriter::get(std::ostream& os) const
 {
   os << "<unitcell>" << std::endl;
diff --git a/src/Particle/ParticleIO/LatticeIO.h b/src/Particle/ParticleIO/LatticeIO.h
index a52e17858d..41e3da8790 100644
--- a/src/Particle/ParticleIO/LatticeIO.h
+++ b/src/Particle/ParticleIO/LatticeIO.h
@@ -17,6 +17,7 @@
 
 #include "OhmmsData/OhmmsElementBase.h"
 #include "Configuration.h"
+#include "ParticleSetTraits.h"
 
 namespace qmcplusplus
 {
@@ -31,6 +32,18 @@ class LatticeParser
 };
 
 
+template <typename T>
+class LatticeParserT
+{
+  using ParticleLayout = typename LatticeParticleTraits<T>::ParticleLayout;
+  ParticleLayout& ref_; // lattice written by put(); held by reference
+
+public:
+  LatticeParserT(ParticleLayout& lat) : ref_(lat) {}
+  bool put(xmlNodePtr cur); // parse the children of cur into ref_
+};
+
+
 class LatticeXMLWriter
 {
   using ParticleLayout = PtclOnLatticeTraits::ParticleLayout;
diff --git a/src/Particle/ParticleIO/XMLParticleIO.cpp b/src/Particle/ParticleIO/XMLParticleIO.cpp
index 26b9d658e7..e5c29cbc59 100644
--- a/src/Particle/ParticleIO/XMLParticleIO.cpp
+++ b/src/Particle/ParticleIO/XMLParticleIO.cpp
@@ -487,6 +487,404 @@ void XMLParticleParser::getPtclAttrib(xmlNodePtr cur, int in_offset, int copy_si
 }
 
 
+
+template <typename T>
+XMLParticleParserT<T>::XMLParticleParserT(Particle_t& aptcl) : ref_(aptcl)
+{
+  // have the referenced particle set fill ref_AttribList, which getPtclAttrib() queries later
+  ref_.createAttributeList(ref_AttribList);
+}
+
+/** process xmlnode &lt;particleset/&gt; which contains everything about the particle set to initialize
+ *@param cur the xmlnode to work on
+ *
+ */
+template <typename T>
+bool XMLParticleParserT<T>::readXML(xmlNodePtr cur)
+{
+  ReportEngine PRE("XMLParticleParser", "readXML");
+  // refuse to load into a particle set or species set that already holds data
+  if (ref_.getTotalNum())
+    throw UniformCommunicateError("The ParticleSet object to load XML input was not empty. Report a bug!");
+
+  SpeciesSet& tspecies(ref_.getSpeciesSet());
+  if (tspecies.size() != 0)
+    throw UniformCommunicateError("The SpeciesSet object to load XML input was not empty. Report a bug!");
+
+  // the total number of particles, once it is set non-zero, always check against it.
+  int nat = 0;
+  // the number of particles by group, once it is constructed, always check against it.
+  std::vector<int> nat_group;
+
+  std::string pname("none");
+  std::string randomizeR("no");
+  OhmmsAttributeSet pAttrib;
+  pAttrib.add(randomizeR, "random");
+  pAttrib.add(nat, "size");
+  pAttrib.add(pname, "name");
+  pAttrib.put(cur);
+
+  ref_.setName(pname.c_str());
+
+  if (nat != 0)
+  {
+    app_debug() << "Set the total size " << nat
+                << " by the 'size' attribute found in 'particleset' XML element node named '" << pname << "'."
+                << std::endl;
+  }
+
+  bool ionid_found = false;
+  { // parse all the 'group's to obtain or verify the total number of particles
+    //total count of the particles to be created
+    int ntot               = 0;
+    int num_non_zero_group = 0;
+    bool group_found       = false;
+
+    processChildren(cur, [&](const std::string& cname, const xmlNodePtr element) {
+      if (cname == "atom")
+        throw UniformCommunicateError("XML element node atom is no more supported");
+      else if (cname.find("ell") < cname.size()) //accept UnitCell, unitcell, supercell
+        throw UniformCommunicateError("Constructing cell inside particleset is illegal!");
+      else if (cname == "group")
+      {
+        group_found       = true;
+        std::string sname = getXMLAttributeValue(element, "name");
+        if (sname.empty())
+          throw UniformCommunicateError("'group' element node must include a name attribute!");
+        else
+        {
+          const int sid = tspecies.addSpecies(sname);
+          setSpeciesProperty(tspecies, sid, element);
+        }
+
+        int nat_per_group = 0;
+        OhmmsAttributeSet gAttrib;
+        gAttrib.add(nat_per_group, "size");
+        gAttrib.put(element);
+
+        nat_group.push_back(nat_per_group);
+        ntot += nat_per_group;
+        if (nat_per_group > 0)
+          num_non_zero_group++;
+      }
+      else if (cname == attrib_tag && getXMLAttributeValue(element, "name") == ionid_tag)
+        ionid_found = true;
+    });
+
+    if (!group_found)
+      throw UniformCommunicateError("No 'group' XML element node was found. Check XML input!");
+
+    if (nat != 0 && ntot != 0 && nat != ntot)
+    {
+      std::ostringstream msg;
+      msg << "The total number of particles deterimined previously was " << nat
+          << " but the sum of the sizes from all the 'group' XML element nodes is " << ntot
+          << ". Please check the 'particleset' XML element node!" << std::endl;
+      throw UniformCommunicateError(msg.str());
+    }
+
+    if (nat == 0 && ntot != 0)
+    {
+      nat = ntot;
+      app_debug() << "Set the total size " << nat << " by the sum of the 'size's on all the 'group' XML element nodes."
+                  << std::endl;
+    }
+
+    if (ntot > 0 && num_non_zero_group != nat_group.size())
+      throw UniformCommunicateError("Some 'group' XML element node doesn't contain a 'size' attribute! 'size = 0' is not allowed in the input. Make appropriate adjustments to the input or converter.");
+  }
+
+  { // parse all the 'attrib's to obtain or verify the total number of particles
+    processChildren(cur, [&](const std::string& cname, const xmlNodePtr element) {
+      if (cname == attrib_tag)
+      {
+        std::string sname = getXMLAttributeValue(element, "name");
+        if (sname.empty())
+          throw UniformCommunicateError("'" + ParticleTags::attrib_tag +
+                                        "' XML element node must include a name attribute!");
+
+        int size_att = 0;
+        OhmmsAttributeSet aAttrib;
+        aAttrib.add(size_att, "size");
+        aAttrib.put(element);
+
+        if (nat != 0 && size_att != 0 && nat != size_att)
+        {
+          std::ostringstream msg;
+          msg << "The total number of particles deterimined previously was " << nat
+              << " but the 'size' atttribute found on the '" << ParticleTags::attrib_tag
+              << "' XML element nodes named '" << sname << "' is " << size_att
+              << ". Please check the 'particleset' XML element node!" << std::endl;
+          throw UniformCommunicateError(msg.str());
+        }
+
+        if (nat == 0 && size_att != 0)
+        {
+          nat = size_att;
+          app_debug() << "Set the total size " << nat << " by the 'size' on the '" << ParticleTags::attrib_tag
+                      << "' XML element node named '" << sname << "'." << std::endl;
+        }
+      }
+    });
+  }
+
+  if (nat == 0)
+    throw UniformCommunicateError("Failed in figuring out the total number of particles. Check XML input!");
+
+  if (ionid_found)
+  { // parse ionid and construct input order to stored order
+    std::vector<int> map_storage_to_input(nat);
+    processChildren(cur, [&](const std::string& cname, const xmlNodePtr element) {
+      if (cname == attrib_tag && getXMLAttributeValue(element, "name") == ionid_tag)
+      {
+        std::string datatype = getXMLAttributeValue(element, datatype_tag);
+        if (datatype != stringtype_tag)
+          throw UniformCommunicateError("'ionid' only supports datatype=\"" + stringtype_tag + "\"");
+        std::vector<std::string> d_in(nat);
+        putContent(d_in, element);
+        bool input_ungrouped = false;
+        int storage_index    = 0;
+        for (int ig = 0; ig < nat_group.size(); ig++)
+        {
+          const auto& group_species_name = tspecies.getSpeciesName(ig);
+          int count_group_size           = 0;
+          for (int iat = 0; iat < nat; iat++)
+          {
+            const int element_index = tspecies.findSpecies(d_in[iat]);
+            if (element_index == tspecies.size())
+              throw UniformCommunicateError("Element " + d_in[iat] +
+                                            " doesn't match any species from 'group' XML element nodes.");
+            if (element_index == ig)
+            {
+              if (iat != storage_index)
+                input_ungrouped = true;
+              count_group_size++;
+              map_storage_to_input[storage_index++] = iat;
+            }
+          }
+
+          if (count_group_size == 0)
+            throw UniformCommunicateError("Element '" + group_species_name + "' not found in 'ionid'.");
+
+          if (nat_group[ig] == 0)
+            nat_group[ig] = count_group_size;
+          else if (nat_group[ig] != count_group_size)
+          {
+            std::ostringstream msg;
+            msg << "The number of particles of element '" << group_species_name << "' from 'group' XML elment node was "
+                << nat_group[ig] << " but 'ionid' contains " << count_group_size << " entries." << std::endl;
+            throw UniformCommunicateError(msg.str());
+          }
+        }
+
+        if (input_ungrouped)
+        {
+          app_log() << "  Input particle set is not grouped by species.  Remapping particle position indices "
+                       "internally."
+                    << std::endl;
+          app_debug() << "    Species : input particle index -> internal particle index" << std::endl;
+          for (int new_idx = 0; new_idx < map_storage_to_input.size(); new_idx++)
+          {
+            int old_idx = map_storage_to_input[new_idx];
+            if (new_idx != old_idx)
+            {
+              app_debug() << "    " << d_in[old_idx] << " : " << old_idx << " -> " << new_idx << std::endl;
+            }
+          }
+        }
+      }
+    });
+
+    checkGrouping(nat, nat_group);
+    ref_.create(nat_group);
+    // save map_storage_to_input
+    ref_.setMapStorageToInput(map_storage_to_input);
+    // read the other attribs one particle at a time: storage slot iat takes input entry map_storage_to_input[iat]
+    for (int iat = 0; iat < nat; iat++)
+    {
+      processChildren(cur, [&](const std::string& cname, const xmlNodePtr element) {
+        if (cname == attrib_tag && getXMLAttributeValue(element, "name") != ionid_tag)
+          getPtclAttrib(element, map_storage_to_input[iat], 1, iat);
+      });
+    }
+  }
+  else
+  {
+    // fix old input with positions outside 'group'
+    if (nat_group.size() == 1 && nat_group[0] == 0)
+      nat_group[0] = nat;
+
+    checkGrouping(nat, nat_group);
+    ref_.create(nat_group);
+
+    // obtain 'attrib' inside 'group'
+    size_t start = 0;
+    size_t ig    = 0;
+    processChildren(cur, [&](const std::string& cname, const xmlNodePtr child) {
+      if (cname == "group")
+      {
+        processChildren(child, [&](const std::string& cname, const xmlNodePtr element) {
+          if (cname == attrib_tag)
+            getPtclAttrib(element, 0, nat_group[ig], start);
+        });
+        start += nat_group[ig];
+        ig++;
+      }
+      else if (cname == attrib_tag)
+      {
+        if (nat_group.size() > 1)
+          throw UniformCommunicateError("An 'attrib' XML element node was found outside 'group'"
+                                        " without XML element node named 'ionid'."
+                                        " Cannot map particles to more than one species. Check XML input!");
+        getPtclAttrib(child, 0, nat, 0);
+      }
+    });
+  }
+
+  if (ref_.getLattice().SuperCellEnum)
+  {
+    if (randomizeR == "yes")
+    {
+      makeUniformRandom(ref_.R);
+      ref_.R.setUnit(PosUnit::Lattice);
+      ref_.convert2Cart(ref_.R);
+      makeUniformRandom(ref_.spins);
+      ref_.spins *= 2 * M_PI;
+    }
+    else // put them [0,1) in the cell
+      ref_.applyBC(ref_.R);
+  }
+
+  //this sets Mass, Z
+  ref_.resetGroups();
+  ref_.createSK();
+
+  return true;
+}
+
+template <typename T>
+void XMLParticleParserT<T>::checkGrouping(int nat, const std::vector<int>& nat_group) const
+{
+  app_debug() << "There are " << nat << " particles in " << nat_group.size() << " species containing:" << std::endl;
+  int counted = 0; // running sum of the per-species counts, compared against nat after the loop
+  for (int species = 0; species < nat_group.size(); ++species)
+  {
+    const auto& label = ref_.getSpeciesSet().getSpeciesName(species);
+    if (nat_group[species] == 0)
+      throw UniformCommunicateError("Element '" + label + "' was provided but never referenced.");
+    app_debug() << "    " << nat_group[species] << " '" << label << "'" << std::endl;
+    counted += nat_group[species];
+  }
+  if (counted != nat)
+    throw UniformCommunicateError("The total number of particles doesn't match the sum of the particle counts of all the species.");
+}
+
+/** process xmlnode to reset the properties of a particle set
+ * @param cur current node
+ * @return true, if successful
+ *
+ * This resets or adds new attributes to a particle set.
+ * It cannot modify the size of the particle set.
+ */
+template <typename T>
+bool XMLParticleParserT<T>::reset(xmlNodePtr cur)
+{
+  ReportEngine PRE("XMLParticleParser", "reset");
+  SpeciesSet& tspecies(ref_.getSpeciesSet());
+  // visit every direct child of cur; only 'group' elements are acted upon
+  for (xmlNodePtr node = cur->xmlChildrenNode; node != nullptr; node = node->next)
+  {
+    const std::string tag((const char*)node->name);
+    if (tag != "group")
+      continue;
+    std::string species_name;
+    OhmmsAttributeSet group_attribs;
+    group_attribs.add(species_name, "name");
+    group_attribs.put(node);
+    // groups whose name comes back empty are skipped
+    if (species_name.empty())
+      continue;
+    // addSpecies yields the species index that setSpeciesProperty is given
+    const int species_id = tspecies.addSpecies(species_name);
+    setSpeciesProperty(tspecies, species_id, node);
+  }
+  // disabled code below: planned refresh of Mass and Z from the species set
+  //  //@todo Will add a member function to ParticleSet to handle these
+  //  int massind=tspecies.addAttribute("mass");
+  //  for(int iat=0; iat<ref_.getTotalNum(); iat++)
+  //    ref_.Mass[iat]=tspecies(massind,ref_.GroupID[iat]);
+  //
+  //  int qind=tspecies.addAttribute("charge");
+  //  for(int iat=0; iat<ref_.getTotalNum(); iat++)
+  //    ref_.Z[iat]=tspecies(qind,ref_.GroupID[iat]);
+  //
+  return true;
+}
+
+template <typename T>
+void XMLParticleParserT<T>::getPtclAttrib(xmlNodePtr cur, int in_offset, int copy_size, int out_offset)
+{
+  std::string oname, otype;
+  int utype   = 0; // cast to PosUnit below
+  int size_in = 0; // parsed from 'size' but not used further in this function
+  OhmmsAttributeSet pAttrib;
+  pAttrib.add(otype, datatype_tag);  //datatype
+  pAttrib.add(oname, "name");        //name
+  pAttrib.add(utype, condition_tag); //condition
+  pAttrib.add(size_in, "size");      //size
+  pAttrib.put(cur);
+  if (oname.empty() || otype.empty())
+  {
+    // both name and datatype are required; the problem is logged and the node is ignored (no throw)
+    app_error() << "   Missing attrib/@name or attrib/@datatype " << std::endl;
+    app_error() << R"(     <attrib name="aname"  datatype="atype"/>)" << std::endl;
+    return;
+  }
+  int t_id = ref_AttribList.getAttribType(otype);
+  // 'ionid' is handled by readXML itself, so reaching here with it is treated as an error
+  if (oname == ionid_tag)
+    throw UniformCommunicateError("'ionid' should not be parsed by getPtclAttrib.");
+  else
+  {
+    //very permissive in that a unregistered attribute will be created and stored by ParticleSet
+    //cloning is not going to work
+    if (t_id == PA_IndexType)
+    {
+      ParticleIndex* obj = nullptr; // the typed null pointer selects the getAttribute overload
+      obj                = ref_AttribList.getAttribute(otype, oname, obj);
+      ParticleAttribXmlNode<ParticleIndex> a(*obj, static_cast<PosUnit>(utype));
+      a.put(cur, in_offset, copy_size, out_offset);
+    }
+    else if (t_id == PA_ScalarType)
+    {
+      ParticleScalar* obj = nullptr;
+      obj                 = ref_AttribList.getAttribute(otype, oname, obj);
+      ParticleAttribXmlNode<ParticleScalar> a(*obj, static_cast<PosUnit>(utype));
+      a.put(cur, in_offset, copy_size, out_offset);
+    }
+    else if (t_id == PA_PositionType)
+    {
+      ParticlePos* obj = nullptr;
+      obj              = ref_AttribList.getAttribute(otype, oname, obj);
+      ParticleAttribXmlNode<ParticlePos> a(*obj, static_cast<PosUnit>(utype));
+      a.put(cur, in_offset, copy_size, out_offset);
+    }
+    else if (t_id == PA_TensorType)
+    {
+      ParticleTensor* obj = nullptr;
+      obj                 = ref_AttribList.getAttribute(otype, oname, obj);
+      ParticleAttribXmlNode<ParticleTensor> a(*obj, static_cast<PosUnit>(utype));
+      a.put(cur, in_offset, copy_size, out_offset);
+    }
+  } // any other t_id falls through without reading anything
+}
+
+template class XMLParticleParserT<double>; // explicit instantiations, one per value type
+template class XMLParticleParserT<float>;
+template class XMLParticleParserT<std::complex<double>>;
+template class XMLParticleParserT<std::complex<float>>;
+
+
+
 XMLSaveParticle::XMLSaveParticle(Particle_t& pin) : ref_(pin) {}
 
 XMLSaveParticle::~XMLSaveParticle() {}
diff --git a/src/Particle/ParticleIO/XMLParticleIO.h b/src/Particle/ParticleIO/XMLParticleIO.h
index c05590ea3c..11a0c41a87 100644
--- a/src/Particle/ParticleIO/XMLParticleIO.h
+++ b/src/Particle/ParticleIO/XMLParticleIO.h
@@ -18,6 +18,7 @@
 #include "OhmmsData/OhmmsElementBase.h"
 #include "OhmmsData/RecordProperty.h"
 #include "Particle/ParticleSet.h"
+#include "Particle/ParticleSetT.h"
 
 namespace qmcplusplus
 {
@@ -139,6 +140,41 @@ class XMLParticleParser : public ParticleTags
   bool reset(xmlNodePtr cur);
 };
 
+template <typename T>
+class XMLParticleParserT : public ParticleTags
+{
+  using Particle_t     = ParticleSetT<T>;
+  using ParticleIndex  = typename Particle_t::ParticleIndex;
+  using ParticleScalar = typename Particle_t::ParticleScalar;
+  using ParticlePos    = typename Particle_t::ParticlePos;
+  using ParticleTensor = typename Particle_t::ParticleTensor;
+
+  Particle_t& ref_;              ///< particle set handled by this parser (presumably the constructor argument)
+  AttribListType ref_AttribList; ///< attribute list queried through getAttribute() in getPtclAttrib
+
+  /** read the data of a particle attribute (index, scalar, position or tensor typed)
+   *@param cur the xmlnode
+   *@param in_offset the location offset to read from XML element node body.
+   *@param copy_size the number of particle attributes to be read
+   *@param out_offset the current local count to which copy_size particle attributes are added.
+   */
+  void getPtclAttrib(xmlNodePtr cur, int in_offset, int copy_size, int out_offset);
+
+  void checkGrouping(int nat, const std::vector<int>& nat_group) const;
+
+public:
+  /**constructor
+   *@param aptcl the particleset to be initialized
+   */
+  XMLParticleParserT(Particle_t& aptcl);
+
+  bool readXML(xmlNodePtr cur);
+
+  /** reset the properties of a particle set
+   */
+  bool reset(xmlNodePtr cur);
+};
+
 class XMLSaveParticle : public ParticleTags, public RecordProperty
 {
   using Particle_t     = ParticleSet;
diff --git a/src/Particle/ParticleSetPoolT.cpp b/src/Particle/ParticleSetPoolT.cpp
new file mode 100644
index 0000000000..7100822214
--- /dev/null
+++ b/src/Particle/ParticleSetPoolT.cpp
@@ -0,0 +1,278 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2020 QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Raymond Clay III, j.k.rofling@gmail.com, Lawrence
+//                    Livermore National Laboratory Jeongnim Kim,
+//                    jeongnim.kim@gmail.com, University of Illinois at
+//                    Urbana-Champaign Mark A. Berrill, berrillma@ornl.gov, Oak
+//                    Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+/**@file ParticleSetPoolT.cpp
+ * @brief Implements the ParticleSetPoolT member functions.
+ */
+#include "ParticleSetPoolT.h"
+
+#include "LongRange/LRCoulombSingleton.h"
+#include "OhmmsData/AttributeSet.h"
+#include "OhmmsData/Libxml2Doc.h"
+#include "Particle/InitMolecularSystemT.h"
+#include "ParticleBase/RandomSeqGenerator.h"
+#include "ParticleIO/LatticeIO.h"
+#include "ParticleIO/XMLParticleIO.h"
+#include "Utilities/ProgressReportEngine.h"
+#include <Message/UniformCommunicateError.h>
+#include <PlatformSelector.hpp>
+
+namespace qmcplusplus
+{
+template <typename T>
+ParticleSetPoolT<T>::ParticleSetPoolT(Communicate* c, const char* aname) :
+    MPIObjectBase(c), simulation_cell_(std::make_unique<SimulationCellT<T>>())
+{
+    // The pool starts out with a default-constructed cell of its own.
+    this->myName = aname;
+    this->ClassName = "ParticleSetPool";
+}
+
+template <typename T>
+ParticleSetPoolT<T>::ParticleSetPoolT(ParticleSetPoolT&& other) noexcept :
+    MPIObjectBase(other.myComm), simulation_cell_(std::move(other.simulation_cell_)),
+    myPool(std::move(other.myPool)),
+    randomize_nodes(std::move(other.randomize_nodes)) // queued <init> nodes follow the pool so randomize() still frees them
+{
+    ClassName = other.ClassName;
+    myName = other.myName;
+}
+
+template <typename T>
+ParticleSetPoolT<T>::~ParticleSetPoolT() = default; // NOTE: does not xmlFreeNode() entries left in randomize_nodes
+
+template <typename T>
+ParticleSetT<T>*
+ParticleSetPoolT<T>::getParticleSet(const std::string& pname)
+{
+    // Non-owning lookup by name; an unknown name gives nullptr, not a throw.
+    const auto entry = myPool.find(pname);
+    const bool found = entry != myPool.end();
+    return found ? entry->second.get() : nullptr;
+}
+
+template <typename T>
+MCWalkerConfigurationT<T>*
+ParticleSetPoolT<T>::getWalkerSet(const std::string& pname)
+{
+    // Throws (never returns nullptr) when pname is absent from the pool or
+    // names a set that is not an MCWalkerConfigurationT<T>.
+    auto mc = dynamic_cast<MCWalkerConfigurationT<T>*>(getParticleSet(pname));
+    if (mc == nullptr)
+        throw std::runtime_error("ParticleSetPool::getWalkerSet missing " + pname);
+    return mc;
+}
+
+template <typename T>
+void
+ParticleSetPoolT<T>::addParticleSet(std::unique_ptr<ParticleSetT<T>>&& p)
+{
+    // Names are the map keys, so a second set with the same name is refused.
+    const auto& pname = p->getName();
+    if (myPool.find(pname) != myPool.end())
+        throw std::runtime_error(pname + " exists. Cannot be added again.");
+
+    LOGMSG("  Adding " << pname << " ParticleSet to the pool")
+    // Only sets built on this pool's own simulation cell may be adopted.
+    const bool same_cell = &p->getSimulationCell() == simulation_cell_.get();
+    if (!same_cell)
+        throw std::runtime_error(
+            "Bug detected! ParticleSetPool::addParticleSet requires p "
+            "created with the simulation "
+            "cell from ParticleSetPool.");
+    myPool.emplace(pname, std::move(p));
+}
+
+template <typename T>
+bool
+ParticleSetPoolT<T>::readSimulationCellXML(xmlNodePtr cur)
+{
+    ReportEngine PRE("ParticleSetPool", "putLattice");
+    // Parse the node straight into the lattice of the shared cell.
+    bool parsed = false;
+    try {
+        LatticeParserT<T> lattice_reader(simulation_cell_->lattice_);
+        parsed = lattice_reader.put(cur);
+    }
+    catch (const UniformCommunicateError& err) {
+        myComm->barrier_and_abort(err.what());
+    }
+    if (!parsed)
+        return false;
+
+    app_log() << "  Overwriting global supercell " << std::endl;
+    simulation_cell_->resetLRBox();
+    if (outputManager.isHighActive())
+        simulation_cell_->lattice_.print(app_log(), 2);
+    else
+        simulation_cell_->lattice_.print(app_summary(), 1);
+    return true;
+}
+
+/** process a <particleset> xml element
+ * @param cur current xmlNodePtr
+ * @return always true; a name that already exists only triggers a warning.
+ *
+ * Creates an MCWalkerConfigurationT for the named set, stores it in the
+ * pool and fills it from the xml node.
+ */
+template <typename T>
+bool
+ParticleSetPoolT<T>::put(xmlNodePtr cur)
+{
+    ReportEngine PRE("ParticleSetPool", "put");
+    std::string id("e");
+    std::string role("none");
+    std::string randomR("no");
+    std::string randomsrc;
+    std::string useGPU;
+    std::string spinor;
+    OhmmsAttributeSet pAttrib;
+    pAttrib.add(id, "id");
+    pAttrib.add(id, "name");
+    pAttrib.add(role, "role");
+    pAttrib.add(randomR, "random");
+    pAttrib.add(randomsrc, "randomsrc");
+    pAttrib.add(randomsrc, "random_source");
+    pAttrib.add(spinor, "spinor", {"no", "yes"});
+    pAttrib.add(useGPU, "gpu", CPUOMPTargetSelector::candidate_values);
+    pAttrib.put(cur);
+    // backward compatibility
+    if (id == "e" && role == "none")
+        role = "MC";
+
+    // Names are unique keys of the pool: a repeated section is skipped.
+    if (getParticleSet(id) != nullptr) {
+        app_warning() << "Particle set " << id
+                      << " is already created. Ignoring this section."
+                      << std::endl;
+        app_summary() << std::endl;
+        return true;
+    }
+
+    const bool use_offload = CPUOMPTargetSelector::selectPlatform(useGPU) ==
+        PlatformKind::OMPTARGET;
+    app_summary() << std::endl;
+    app_summary() << " Particle Set" << std::endl;
+    app_summary() << " ------------" << std::endl;
+    app_summary() << "  Name: " << id
+                  << "   Offload : " << (use_offload ? "yes" : "no")
+                  << std::endl;
+    app_summary() << std::endl;
+
+    // select OpenMP offload implementation in ParticleSet.
+    const auto coord_kind = use_offload ? DynamicCoordinateKind::DC_POS_OFFLOAD
+                                        : DynamicCoordinateKind::DC_POS;
+    // Create the set under unique_ptr ownership so it cannot leak if the
+    // insertion throws; pset stays valid while the pool owns the object.
+    auto owner = std::make_unique<MCWalkerConfigurationT<T>>(
+        *simulation_cell_, coord_kind);
+    ParticleSetT<T>* const pset = owner.get();
+    myPool.emplace(id, std::move(owner));
+
+    try {
+        XMLParticleParserT<T> pread(*pset);
+        pread.readXML(cur);
+    }
+    catch (const UniformCommunicateError& ue) {
+        myComm->barrier_and_abort(ue.what());
+    }
+
+    // if random_source is given, create a node <init target="" source=""/>
+    if (randomR == "yes" && !randomsrc.empty()) {
+        xmlNodePtr anode = xmlNewNode(nullptr, (const xmlChar*)"init");
+        xmlNewProp(anode, (const xmlChar*)"source",
+            (const xmlChar*)randomsrc.c_str());
+        xmlNewProp(
+            anode, (const xmlChar*)"target", (const xmlChar*)id.c_str());
+        randomize_nodes.push_back(anode);
+    }
+    pset->setName(id);
+    pset->setSpinor(spinor == "yes");
+    app_summary() << "  Particle set size: " << pset->getTotalNum()
+                  << "   Groups : " << pset->groups() << std::endl;
+    app_summary() << std::endl;
+    return true;
+}
+
+template <typename T>
+void
+ParticleSetPoolT<T>::randomize()
+{
+    app_log() << "ParticleSetPool::randomize " << randomize_nodes.size()
+              << " ParticleSet" << (randomize_nodes.size() == 1 ? "" : "s")
+              << "." << std::endl;
+    // Process and free every queued <init> node; a failure is reported last.
+    bool success = true;
+    for (xmlNodePtr node : randomize_nodes) {
+        InitMolecularSystemT<T> moinit(*this);
+        success &= moinit.put(node);
+        xmlFreeNode(node);
+    }
+    randomize_nodes.clear();
+    if (!success)
+        throw std::runtime_error("ParticleSetPool::randomize failed to randomize some Particlesets!");
+}
+
+template <typename T>
+bool
+ParticleSetPoolT<T>::get(std::ostream& os) const
+{
+    os << "ParticleSetPool has: " << std::endl << std::endl;
+    os.setf(std::ios::scientific, std::ios::floatfield);
+    os.precision(14);
+    for (const auto& entry : myPool) {
+        // print() is given 0 when debug output is active and 10 otherwise
+        const bool debug = outputManager.isDebugActive();
+        entry.second->print(os, debug ? 0 : 10 /* maxParticlesToPrint */);
+    }
+    return true;
+}
+
+template <typename T>
+void
+ParticleSetPoolT<T>::output_particleset_info(
+    Libxml2Document& doc, xmlNodePtr root)
+{
+    // A single "particles" child of root collects one "particle" entry,
+    // carrying name and size, for each set held by the pool.
+    xmlNodePtr particles_info = doc.addChild(root, "particles");
+    for (const auto& [key, pset] : myPool) {
+        xmlNodePtr particle = doc.addChild(particles_info, "particle");
+        doc.addChild(particle, "name", pset->getName());
+        doc.addChild(particle, "size", pset->getTotalNum());
+    }
+}
+
+/** reset calls update() on every set in the pool; it is used to initialize
+ *  and evaluate the distance tables */
+template <typename T>
+void
+ParticleSetPoolT<T>::reset()
+{
+    for (auto it = myPool.begin(); it != myPool.end(); ++it)
+        it->second->update();
+}
+
+// explicit instantiations: real and complex, double and single precision
+template class ParticleSetPoolT<double>;
+template class ParticleSetPoolT<float>;
+template class ParticleSetPoolT<std::complex<double>>;
+template class ParticleSetPoolT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/Particle/ParticleSetPoolT.h b/src/Particle/ParticleSetPoolT.h
new file mode 100644
index 0000000000..da72817dfc
--- /dev/null
+++ b/src/Particle/ParticleSetPoolT.h
@@ -0,0 +1,155 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of
+//                    Illinois at Urbana-Champaign Mark A. Berrill,
+//                    berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_PARTICLESETPOOLT_H
+#define QMCPLUSPLUS_PARTICLESETPOOLT_H
+
+#include "Message/MPIObjectBase.h"
+#include "OhmmsData/OhmmsElementBase.h"
+#include "Particle/MCWalkerConfigurationT.h"
+#include "ParticleSetT.h"
+#include "SimulationCellT.h"
+
+namespace qmcplusplus
+{
+/** @ingroup qmcapp
+ * @brief Manage a collection of ParticleSetT objects, keyed by name
+ *
+ * This object handles \<particleset\> elements and
+ * functions as a builder class for ParticleSet objects.
+ */
+template <typename T>
+class ParticleSetPoolT : public MPIObjectBase
+{
+public:
+    using PoolType =
+        std::map<std::string, const std::unique_ptr<ParticleSetT<T>>>;
+
+    /** constructor; c is handed to MPIObjectBase
+     * @param aname stored as the pool's myName (defaults to the xml tag)
+     */
+    ParticleSetPoolT(Communicate* c, const char* aname = "particleset");
+    ~ParticleSetPoolT();
+
+    ParticleSetPoolT(const ParticleSetPoolT&) = delete;
+    ParticleSetPoolT&
+    operator=(const ParticleSetPoolT&) = delete;
+    ParticleSetPoolT(ParticleSetPoolT&& pset) noexcept;
+    ParticleSetPoolT&
+    operator=(ParticleSetPoolT&&) = default;
+
+    bool
+    put(xmlNodePtr cur); ///< build and register one particle set from an xml node
+    bool
+    get(std::ostream& os) const; ///< print every pooled set to os
+    void
+    reset(); ///< call update() on every pooled set
+
+    void
+    output_particleset_info(Libxml2Document& doc, xmlNodePtr root);
+
+    /** initialize the supercell shared by all the particle sets
+     *
+     *  @return true when the lattice parser reports a lattice definition
+     *  (per the original note, the return value is never checked anywhere).
+     *  The parser works directly on simulation_cell_->lattice_; on success
+     *  resetLRBox() is called on the cell and the lattice is printed.
+     */
+    bool
+    readSimulationCellXML(xmlNodePtr cur);
+
+    /// return true, if the pool is empty
+    inline bool
+    empty() const
+    {
+        return myPool.empty();
+    }
+
+    /** add a ParticleSet to the pool with its ownership transferred
+     * p must be constructed with the simulation cell from
+     * this->simulation_cell_ and carry a new name; else std::runtime_error.
+     */
+    void
+    addParticleSet(std::unique_ptr<ParticleSetT<T>>&& p);
+
+    /** get a named ParticleSet
+     * @param pname name of the ParticleSet
+     * @return non-owning pointer to the ParticleSet stored under pname
+     *
+     * When the named ParticleSet is not in this object, return nullptr.
+     */
+    ParticleSetT<T>*
+    getParticleSet(const std::string& pname);
+
+    /** get a named MCWalkerConfiguration
+     * @param pname name of the MCWalkerConfiguration
+     * @return a MCWalkerConfiguration object with pname
+     *
+     * Throws std::runtime_error if pname is absent or not an MCWalkerConfiguration.
+     */
+    MCWalkerConfigurationT<T>*
+    getWalkerSet(const std::string& pname);
+
+    /** get the Pool object
+     */
+    inline const PoolType&
+    getPool() const
+    {
+        return myPool;
+    }
+
+    /// get simulation cell
+    const auto&
+    getSimulationCell() const
+    {
+        return *simulation_cell_;
+    }
+
+    /// set simulation cell by copy-assigning into the pool's own cell object
+    void
+    setSimulationCell(const SimulationCellT<T>& simulation_cell)
+    {
+        *simulation_cell_ = simulation_cell;
+    }
+
+    /** process and free the <init> nodes queued by put() for particle sets
+     * with random='yes' and a random source; throws if any of them fails
+     */
+    void
+    randomize();
+
+private:
+    /** global simulation cell
+     *
+     * updated by
+     * - readSimulationCellXML() parsing <simulationcell> element
+     * - setSimulationCell()
+     */
+    std::unique_ptr<SimulationCellT<T>> simulation_cell_;
+    /** List of ParticleSet owned
+     *
+     * Each ParticleSet has to have a unique name which is used as a key for the
+     * map.
+     */
+    PoolType myPool;
+    /** xml node for random initialization.
+     *
+     * randomize() process initializations just before starting qmc sections
+     */
+    std::vector<xmlNodePtr> randomize_nodes;
+};
+} // namespace qmcplusplus
+#endif
diff --git a/src/Particle/ParticleSetT.BC.cpp b/src/Particle/ParticleSetT.BC.cpp
new file mode 100644
index 0000000000..50c3f641e6
--- /dev/null
+++ b/src/Particle/ParticleSetT.BC.cpp
@@ -0,0 +1,194 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+
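+/**@file ParticleSetT.BC.cpp
+ * @brief definition of functions controlling Boundary Conditions
+ * NOTE(review): members here are defined on the non-template ParticleSet; ParticleSetT.cpp holds the ParticleSetT<T> versions - confirm this file is meant to be built. */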
+#include "Particle/ParticleSetT.h"
+#include "Particle/FastParticleOperators.h"
+#include "Concurrency/OpenMP.h"
+#include "LongRange/StructFactT.h"
+
+namespace qmcplusplus
+{
+/** Creating StructureFactor and filling the per-particle mass array
+ *
+ * Currently testing only 1 component for PBCs.
+ */
+void ParticleSet::createSK()
+{
+  if (structure_factor_)
+    throw std::runtime_error("Report bug! structure_factor_ has already been created. Unexpected call sequence.");
+
+  auto& cell_lattice = getLattice();
+  auto& lr_box       = getLRBox();
+  if (cell_lattice.explicitly_defined)
+    convert2Cart(R); // R must be Cartesian from here on
+
+  if (cell_lattice.SuperCellEnum != SUPERCELL_OPEN)
+  {
+    app_log() << "\n  Creating Structure Factor for periodic systems " << lr_box.LR_kc << std::endl;
+    structure_factor_ = std::make_unique<StructFact>(lr_box, simulation_cell_.getKLists());
+  }
+
+  // species get mass 1.0 when addAttribute returns the old attribute count
+  const int attrib_count = my_species_.numAttributes();
+  const int mass_index   = my_species_.addAttribute("mass");
+  if (mass_index == attrib_count)
+  {
+    app_log() << "  ParticleSet::createSK setting mass of  " << getName() << " to 1.0" << std::endl;
+    for (int ig = 0; ig < my_species_.getTotalNum(); ++ig)
+      my_species_(mass_index, ig) = 1.0;
+  }
+  for (int iat = 0; iat < GroupID.size(); iat++)
+    Mass[iat] = my_species_(mass_index, GroupID[iat]);
+
+  coordinates_->setAllParticlePos(R);
+}
+
+void ParticleSet::turnOnPerParticleSK()
+{
+  // Per-particle storage is switched on in the structure factor, so it must exist.
+  if (!structure_factor_)
+    throw std::runtime_error("ParticleSet::turnOnPerParticleSK trying to turn on per particle storage in "
+                             "structure_factor_ but structure_factor_ has not been created.");
+  structure_factor_->turnOnStorePerParticle(*this);
+}
+
+bool ParticleSet::getPerParticleSKState() const
+{
+  // Without a structure factor the answer is false.
+  if (!structure_factor_)
+    return false;
+  return structure_factor_->isStorePerParticle();
+}
+
+// Copy pin into pout, converting whenever the two carry different unit tags.
+void ParticleSet::convert(const ParticlePos& pin, ParticlePos& pout)
+{
+  using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+
+  const auto unit_in = pin.getUnit();
+  // same unit on both sides: a plain copy is all that is needed
+  if (unit_in == pout.getUnit())
+  {
+    pout = pin;
+    return;
+  }
+  // lattice input is mapped with R, any other input with G
+  if (unit_in == PosUnit::Lattice)
+    Converter::apply(pin, getLattice().R, pout, 0, pin.size());
+  else
+    Converter::apply(pin, getLattice().G, pout, 0, pin.size());
+}
+
+void ParticleSet::convert2Unit(const ParticlePos& pin, ParticlePos& pout)
+{
+  pout.setUnit(PosUnit::Lattice); // tag the output before filling it
+  if (pin.getUnit() != PosUnit::Lattice)
+    ConvertPosUnit<ParticlePos, Tensor_t, DIM>::apply(pin, getLattice().G, pout, 0, pin.size());
+  else
+    pout = pin;
+}
+
+void ParticleSet::convert2Cart(const ParticlePos& pin, ParticlePos& pout)
+{
+  pout.setUnit(PosUnit::Cartesian); // tag the output before filling it
+  if (pin.getUnit() != PosUnit::Cartesian)
+    ConvertPosUnit<ParticlePos, Tensor_t, DIM>::apply(pin, getLattice().R, pout, 0, pin.size());
+  else
+    pout = pin;
+}
+
+void ParticleSet::convert2Unit(ParticlePos& pinout)
+{
+  // In-place variant: a no-op when pinout is already in lattice units,
+  // otherwise the unit tag is switched and the data is transformed with G.
+  if (pinout.getUnit() != PosUnit::Lattice)
+  {
+    pinout.setUnit(PosUnit::Lattice);
+    ConvertPosUnit<ParticlePos, Tensor_t, DIM>::apply(pinout, getLattice().G, 0, pinout.size());
+  }
+}
+
+void ParticleSet::convert2Cart(ParticlePos& pinout)
+{
+  // In-place variant: a no-op when pinout is already Cartesian,
+  // otherwise the unit tag is switched and the data is transformed with R.
+  if (pinout.getUnit() != PosUnit::Cartesian)
+  {
+    pinout.setUnit(PosUnit::Cartesian);
+    ConvertPosUnit<ParticlePos, Tensor_t, DIM>::apply(pinout, getLattice().R, 0, pinout.size());
+  }
+}
+
+void ParticleSet::applyBC(const ParticlePos& pin, ParticlePos& pout) { this->applyBC(pin, pout, 0, pin.size()); } // forwards with first = 0, last = pin.size()
+
+void ParticleSet::applyBC(const ParticlePos& pin, ParticlePos& pout, int first, int last)
+{
+  // Dispatch to the ApplyBConds routine that matches the unit tags of
+  // the input and the output.
+  using BConds = ApplyBConds<ParticlePos, Tensor_t, DIM>;
+  const bool in_cart  = pin.getUnit() == PosUnit::Cartesian;
+  const bool in_unit  = pin.getUnit() == PosUnit::Lattice;
+  const bool out_cart = pout.getUnit() == PosUnit::Cartesian;
+  const bool out_unit = pout.getUnit() == PosUnit::Lattice;
+
+  // each side must be either Cartesian or Lattice; anything else fails
+  if (!(in_cart || in_unit) || !(out_cart || out_unit))
+    throw std::runtime_error("Unknown unit conversion");
+
+  if (in_cart && out_cart)
+    BConds::Cart2Cart(pin, getLattice().G, getLattice().R, pout, first, last);
+  else if (in_cart)
+    BConds::Cart2Unit(pin, getLattice().G, pout, first, last);
+  else if (out_cart)
+    BConds::Unit2Cart(pin, getLattice().R, pout, first, last);
+  else
+    BConds::Unit2Unit(pin, pout, first, last);
+}
+
+void ParticleSet::applyBC(ParticlePos& pos)
+{
+  using BConds = ApplyBConds<ParticlePos, Tensor_t, DIM>;
+
+  // In-place variant called with the range arguments 0 and TotalNum;
+  // positions not tagged as lattice units take the Cartesian routine.
+  if (pos.getUnit() != PosUnit::Lattice)
+    BConds::Cart2Cart(pos, getLattice().G, getLattice().R, 0, TotalNum);
+  else
+    BConds::Unit2Unit(pos, 0, TotalNum);
+}
+
+void ParticleSet::applyMinimumImage(ParticlePos& pinout)
+{
+  // Nothing is done for an open supercell (SUPERCELL_OPEN).
+  if (getLattice().SuperCellEnum != SUPERCELL_OPEN)
+    for (int idx = 0; idx < pinout.size(); ++idx)
+      getLattice().applyMinimumImage(pinout[idx]);
+}
+
+void ParticleSet::convert2UnitInBox(const ParticlePos& pin, ParticlePos& pout)
+{
+  pout.setUnit(PosUnit::Lattice); // convert2Unit sets this tag as well
+  this->convert2Unit(pin, pout);  // convert to crystalline unit
+  put2box(pout);
+}
+
+void ParticleSet::convert2CartInBox(const ParticlePos& pin, ParticlePos& pout)
+{
+  this->convert2UnitInBox(pin, pout); // crystalline unit, passed through put2box
+  this->convert2Cart(pout);           // in-place switch to Cartesian
+}
+} // namespace qmcplusplus
diff --git a/src/Particle/ParticleSetT.cpp b/src/Particle/ParticleSetT.cpp
index 5b78bed54e..bc5f7518ab 100644
--- a/src/Particle/ParticleSetT.cpp
+++ b/src/Particle/ParticleSetT.cpp
@@ -23,8 +23,10 @@
 
 #include "ParticleSetT.h"
 
+#include "Concurrency/OpenMP.h"
 #include "Particle/DistanceTableT.h"
 #include "Particle/DynamicCoordinatesBuilder.h"
+#include "Particle/FastParticleOperators.h"
 #include "Particle/LongRange/StructFactT.h"
 #include "Particle/createDistanceTableT.h"
 #include "ParticleBase/RandomSeqGeneratorGlobal.h"
@@ -1124,6 +1126,216 @@ ParticleSetT<T>::extractSKRefList(
     return sk_list;
 }
 
+/** Creating StructureFactor and filling the per-particle mass array
+ *
+ * Currently testing only 1 component for PBCs.
+ */
+template <typename T>
+void
+ParticleSetT<T>::createSK()
+{
+    if (structure_factor_)
+        throw std::runtime_error("Report bug! structure_factor_ has already "
+                                 "been created. Unexpected call sequence.");
+
+    auto& cell_lattice = getLattice();
+    auto& lr_box = getLRBox();
+    if (cell_lattice.explicitly_defined)
+        convert2Cart(R); // R must be Cartesian from here on
+
+    if (cell_lattice.SuperCellEnum != SUPERCELL_OPEN) {
+        app_log() << "\n  Creating Structure Factor for periodic systems "
+                  << lr_box.LR_kc << std::endl;
+        structure_factor_ = std::make_unique<StructFactT<T>>(
+            lr_box, simulation_cell_.getKLists());
+    }
+
+    // species get mass 1.0 when addAttribute returns the old attribute count
+    const int attrib_count = my_species_.numAttributes();
+    const int mass_index = my_species_.addAttribute("mass");
+    if (mass_index == attrib_count) {
+        app_log() << "  ParticleSet::createSK setting mass of  " << getName()
+                  << " to 1.0" << std::endl;
+        for (int ig = 0; ig < my_species_.getTotalNum(); ++ig)
+            my_species_(mass_index, ig) = 1.0;
+    }
+    for (int iat = 0; iat < GroupID.size(); iat++)
+        Mass[iat] = my_species_(mass_index, GroupID[iat]);
+
+    coordinates_->setAllParticlePos(R);
+}
+
+template <typename T>
+void
+ParticleSetT<T>::turnOnPerParticleSK()
+{
+    // Per-particle storage is switched on in the structure factor.
+    if (!structure_factor_)
+        throw std::runtime_error(
+            "ParticleSet::turnOnPerParticleSK trying to turn on per particle "
+            "storage in "
+            "structure_factor_ but structure_factor_ has not been created.");
+    structure_factor_->turnOnStorePerParticle(*this);
+}
+
+template <typename T>
+bool
+ParticleSetT<T>::getPerParticleSKState() const
+{
+    // Without a structure factor the answer is false.
+    if (!structure_factor_)
+        return false;
+    return structure_factor_->isStorePerParticle();
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert(const ParticlePos& pin, ParticlePos& pout)
+{
+    // Copy pin into pout, converting whenever the two arguments carry
+    // different unit tags.
+    using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+
+    const auto unit_in = pin.getUnit();
+    if (unit_in == pout.getUnit()) {
+        // same unit on both sides: a plain copy is all that is needed
+        pout = pin;
+        return;
+    }
+
+    // lattice input is mapped with R, any other input with G
+    if (unit_in == PosUnit::Lattice)
+        Converter::apply(pin, getLattice().R, pout, 0, pin.size());
+    else
+        Converter::apply(pin, getLattice().G, pout, 0, pin.size());
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2Unit(const ParticlePos& pin, ParticlePos& pout)
+{
+    using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+    pout.setUnit(PosUnit::Lattice); // tag the output before filling it
+    if (pin.getUnit() != PosUnit::Lattice)
+        Converter::apply(pin, getLattice().G, pout, 0, pin.size());
+    else
+        pout = pin;
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2Cart(const ParticlePos& pin, ParticlePos& pout)
+{
+    using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+    pout.setUnit(PosUnit::Cartesian); // tag the output before filling it
+    if (pin.getUnit() != PosUnit::Cartesian)
+        Converter::apply(pin, getLattice().R, pout, 0, pin.size());
+    else
+        pout = pin;
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2Unit(ParticlePos& pinout)
+{
+    // In-place variant: a no-op when pinout is already in lattice units,
+    // otherwise the unit tag is switched and the data is transformed with G.
+    if (pinout.getUnit() != PosUnit::Lattice) {
+        using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+        pinout.setUnit(PosUnit::Lattice);
+        Converter::apply(pinout, getLattice().G, 0, pinout.size());
+    }
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2Cart(ParticlePos& pinout)
+{
+    // In-place variant: a no-op when pinout is already Cartesian,
+    // otherwise the unit tag is switched and the data is transformed with R.
+    if (pinout.getUnit() != PosUnit::Cartesian) {
+        using Converter = ConvertPosUnit<ParticlePos, Tensor_t, DIM>;
+        pinout.setUnit(PosUnit::Cartesian);
+        Converter::apply(pinout, getLattice().R, 0, pinout.size());
+    }
+}
+
+template <typename T>
+void
+ParticleSetT<T>::applyBC(const ParticlePos& pin, ParticlePos& pout)
+{
+    this->applyBC(pin, pout, 0, pin.size()); // first = 0, last = pin.size()
+}
+
+template <typename T>
+void
+ParticleSetT<T>::applyBC(
+    const ParticlePos& pin, ParticlePos& pout, int first, int last)
+{
+    // Dispatch to the ApplyBConds routine that matches the unit tags of
+    // the input and the output.
+    using BConds = ApplyBConds<ParticlePos, Tensor_t, DIM>;
+
+    const bool in_cart = pin.getUnit() == PosUnit::Cartesian;
+    const bool in_unit = pin.getUnit() == PosUnit::Lattice;
+    const bool out_cart = pout.getUnit() == PosUnit::Cartesian;
+    const bool out_unit = pout.getUnit() == PosUnit::Lattice;
+
+    // each side must be either Cartesian or Lattice; anything else fails
+    if (!(in_cart || in_unit) || !(out_cart || out_unit))
+        throw std::runtime_error("Unknown unit conversion");
+
+    if (in_cart && out_cart)
+        BConds::Cart2Cart(
+            pin, getLattice().G, getLattice().R, pout, first, last);
+    else if (in_cart)
+        BConds::Cart2Unit(pin, getLattice().G, pout, first, last);
+    else if (out_cart)
+        BConds::Unit2Cart(pin, getLattice().R, pout, first, last);
+    else
+        BConds::Unit2Unit(pin, pout, first, last);
+}
+
+template <typename T>
+void
+ParticleSetT<T>::applyBC(ParticlePos& pos)
+{
+    using BConds = ApplyBConds<ParticlePos, Tensor_t, DIM>;
+    // In-place variant called with the range arguments 0 and TotalNum;
+    // positions not tagged as lattice units take the Cartesian routine.
+    if (pos.getUnit() != PosUnit::Lattice)
+        BConds::Cart2Cart(pos, getLattice().G, getLattice().R, 0, TotalNum);
+    else
+        BConds::Unit2Unit(pos, 0, TotalNum);
+}
+
+template <typename T>
+void
+ParticleSetT<T>::applyMinimumImage(ParticlePos& pinout)
+{
+    // Nothing is done for an open supercell (SUPERCELL_OPEN).
+    if (getLattice().SuperCellEnum != SUPERCELL_OPEN)
+        for (int idx = 0; idx < pinout.size(); ++idx)
+            getLattice().applyMinimumImage(pinout[idx]);
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2UnitInBox(const ParticlePos& pin, ParticlePos& pout)
+{
+    pout.setUnit(PosUnit::Lattice); // convert2Unit sets this tag as well
+    this->convert2Unit(pin, pout);  // convert to crystalline unit
+    put2box(pout);
+}
+
+template <typename T>
+void
+ParticleSetT<T>::convert2CartInBox(const ParticlePos& pin, ParticlePos& pout)
+{
+    this->convert2UnitInBox(pin, pout); // crystalline unit, via put2box
+    this->convert2Cart(pout);           // in-place switch to Cartesian
+}
+
 // explicit instantiations
 template class ParticleSetT<double>;
 template class ParticleSetT<float>;
diff --git a/src/Particle/ParticleSetT.h b/src/Particle/ParticleSetT.h
index 138b352616..906e092adb 100644
--- a/src/Particle/ParticleSetT.h
+++ b/src/Particle/ParticleSetT.h
@@ -74,6 +74,7 @@ class ParticleSetT : public OhmmsElementBase
 
     using Index_t = typename LatticeParticleTraits<T>::Index_t;
     using Scalar_t = typename LatticeParticleTraits<T>::Scalar_t;
+    using Tensor_t = typename LatticeParticleTraits<T>::Tensor_t;
     using ParticleLayout = typename LatticeParticleTraits<T>::ParticleLayout;
     using SingleParticlePos =
         typename LatticeParticleTraits<T>::SingleParticlePos;
@@ -84,6 +85,7 @@ class ParticleSetT : public OhmmsElementBase
         typename LatticeParticleTraits<T>::ParticleGradient;
     using ParticleLaplacian =
         typename LatticeParticleTraits<T>::ParticleLaplacian;
+    using ParticleTensor = typename LatticeParticleTraits<T>::ParticleTensor;
 
     /// walker type
     using Walker_t = Walker<ParticleSetTraits<T>, LatticeParticleTraits<T>>;
@@ -99,6 +101,8 @@ class ParticleSetT : public OhmmsElementBase
         quantum
     };
 
+    static constexpr auto DIM = ParticleSetTraits<T>::DIM;
+
     /// quantum_domain of the particles, default = classical
     quantum_domains quantum_domain;
 
@@ -124,14 +128,14 @@ class ParticleSetT : public OhmmsElementBase
     Index_t direction;
 
     /// Particle density in G-space for MPC interaction
-    std::vector<TinyVector<int, OHMMS_DIM>> DensityReducedGvecs;
+    std::vector<TinyVector<int, DIM>> DensityReducedGvecs;
     std::vector<ComplexType> Density_G;
-    Array<RealType, OHMMS_DIM> Density_r;
+    Array<RealType, DIM> Density_r;
 
     /// DFT potential
-    std::vector<TinyVector<int, OHMMS_DIM>> VHXCReducedGvecs;
+    std::vector<TinyVector<int, DIM>> VHXCReducedGvecs;
     std::vector<ComplexType> VHXC_G[2];
-    Array<RealType, OHMMS_DIM> VHXC_r[2];
+    Array<RealType, DIM> VHXC_r[2];
 
     /** name-value map of Walker Properties
      *
diff --git a/src/Particle/ParticleSetTraits.h b/src/Particle/ParticleSetTraits.h
index 3ea028b54f..299687aeec 100644
--- a/src/Particle/ParticleSetTraits.h
+++ b/src/Particle/ParticleSetTraits.h
@@ -71,6 +71,7 @@ struct LatticeParticleTraits
     using Index_t = int;
     using Scalar_t = FullPrecRealType;
     using Complex_t = FullPrecComplexType;
+    using Tensor_t = ParticleTensorType;
 
     using ParticleIndex = ParticleAttrib<Index_t>;
     using ParticleScalar = ParticleAttrib<Scalar_t>;
diff --git a/src/Particle/ReptileT.h b/src/Particle/ReptileT.h
new file mode 100644
index 0000000000..ada42b2712
--- /dev/null
+++ b/src/Particle/ReptileT.h
@@ -0,0 +1,350 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Raymond Clay III, j.k.rofling@gmail.com, Lawrence
+//                    Livermore National Laboratory Mark A. Berrill,
+//                    berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_REPTILET_H
+#define QMCPLUSPLUS_REPTILET_H
+
+#include "Configuration.h"
+#include "ParticleSetTraits.h"
+#include "QMCDrivers/DriftOperators.h"
+#include "QMCDrivers/WalkerProperties.h"
+#include "Walker.h"
+
+namespace qmcplusplus
+{
+template <typename T>
+class MCWalkerConfigurationT;
+
+template <typename T>
+class ReptileT
+{
+public:
+    using WP = WalkerProperties::Indexes;
+    using Walker_t = typename MCWalkerConfigurationT<T>::Walker_t;
+    // using Buffer_t = Walker_t::Buffer_t             ;
+    //     using Walker_t = MCWalkerConfiguration::Walker_t;
+    using WalkerIter_t = typename MCWalkerConfigurationT<T>::iterator;
+    using ReptileConfig_t = std::vector<typename Walker_t::ParticlePos>;
+    using IndexType = typename ParticleSetTraits<T>::IndexType;
+    using RealType = typename ParticleSetTraits<T>::RealType;
+
+    std::vector<IndexType> Action;
+    std::vector<IndexType> TransProb;
+
+    RealType forwardprob{};
+    RealType backwardprob{};
+    RealType forwardaction{};
+    RealType backwardaction{};
+
+    RealType tau{}; // zero until setTau(); read by printState/linearInterp
+
+    MCWalkerConfigurationT<T>& w;
+    WalkerIter_t repstart, repend;
+    IndexType direction, headindex, nbeads;
+    Walker_t* prophead;
+
+    inline ReptileT(
+        MCWalkerConfigurationT<T>& W, WalkerIter_t start, WalkerIter_t end) :
+        w(W),
+        repstart(start),
+        repend(end),
+        direction(1),
+        headindex(0),
+        nbeads(end - start),
+        prophead(nullptr)
+    {
+        // Register the action / trans-prob properties; keep their indices.
+        Action.resize(3);
+        Action[0] = w.addProperty("ActionBackward");
+        Action[1] = w.addProperty("ActionForward");
+        Action[2] = w.addProperty("ActionLocal");
+        TransProb.resize(2);
+        TransProb[0] = w.addProperty("TransProbBackward");
+        TransProb[1] = w.addProperty("TransProbForward");
+    }
+
+    ~ReptileT()
+    {
+    }
+
+    inline IndexType
+    size()
+    {
+        return nbeads;
+    }
+
+    inline Walker_t&
+    operator[](IndexType i)
+    {
+        return getWalker(getBeadIndex(i));
+    }
+
+    inline IndexType
+    wrapIndex(IndexType repindex)
+    {
+        return (repindex % nbeads + nbeads) % nbeads;
+    }
+
+    inline Walker_t&
+    getWalker(IndexType i)
+    {
+        WalkerIter_t bead = repstart + wrapIndex(i);
+        return **bead;
+    }
+
+    inline IndexType
+    getBeadIndex(IndexType i)
+    {
+        return wrapIndex(headindex + direction * i);
+    }
+    inline Walker_t&
+    getBead(IndexType i)
+    {
+        return getWalker(getBeadIndex(i));
+    }
+    inline Walker_t&
+    getHead()
+    {
+        return getWalker(getBeadIndex(0));
+    }
+    inline Walker_t&
+    getTail()
+    {
+        return getWalker(getBeadIndex(nbeads - 1));
+    }
+    inline Walker_t&
+    getNext()
+    {
+        return getWalker(getBeadIndex(nbeads - 2));
+    }
+    inline Walker_t&
+    getCenter()
+    {
+        return getWalker(getBeadIndex((nbeads - 1) / 2));
+    }
+    // inline void setProposedHead(){
+
+    inline void
+    flip()
+    {
+        // direction*=-1;
+        // headindex = getBeadIndex(nbeads-1);
+        headindex = wrapIndex(headindex - direction);
+        direction *= -1;
+    }
+
+    inline void
+    setDirection(IndexType dir)
+    {
+        direction = dir;
+    }
+
+    inline void
+    setBead(Walker_t& walker, IndexType i)
+    {
+        IndexType index = getBeadIndex(i);
+        Walker_t& newbead(getWalker(index));
+        newbead = walker; // This should be a hard copy
+    }
+
+    inline void
+    setHead(Walker_t& overwrite)
+    {
+        // overwrite last element.
+        headindex = getBeadIndex(nbeads - 1); // sets to position of tail.
+        Walker_t& newhead(getBead(0));
+        newhead = overwrite;
+    }
+    // This function does two things:  1.)  Moves the reptile forward 1
+    // step.  2.) Returns the new head.
+    inline Walker_t&
+    getNewHead()
+    {
+        // overwrite last element.
+        headindex = getBeadIndex(nbeads - 1); // sets to position of tail.
+        return getWalker(headindex);
+    }
+
+    void
+    saveAction(Walker_t& walker, IndexType d, RealType val, IndexType nPsi = 0)
+    {
+        // Store val in the slot getDirectionalAction() reads for the same d:
+        // Action[2] (local) when d == 0, else Action[(1 - d * direction) / 2].
+        const IndexType actionindex =
+            (d != 0) ? (1 - d * direction) / 2 : 2;
+        walker.Properties(nPsi, Action[actionindex]) = val;
+    }
+
+    RealType
+    getDirectionalAction(Walker_t& walker, IndexType d, IndexType nPsi = 0)
+    {
+        // IndexType repdirection=circbuffer.get_direction();
+        IndexType actionindex = 2;
+        if (d != 0)
+            actionindex = (1 - direction * d) / 2;
+
+        return walker.Properties(nPsi, Action[actionindex]);
+    }
+
+    RealType
+    getLinkAction(Walker_t& new_walker, Walker_t& old_walker, IndexType d,
+        IndexType nPsi = 0)
+    {
+        RealType af = getDirectionalAction(old_walker, +1, nPsi);
+        RealType ab = getDirectionalAction(new_walker, -1, nPsi);
+        RealType a0 = getDirectionalAction(old_walker, 0, nPsi) +
+            getDirectionalAction(new_walker, 0, nPsi);
+        return af + ab + a0;
+    }
+
+    void
+    saveTransProb(
+        Walker_t& walker, IndexType d, RealType val, IndexType nPsi = 0)
+    {
+        // IndexType repdirection=circbuffer.get_direction();
+        IndexType transindex = (1 - d * direction) / 2;
+        walker.Properties(nPsi, TransProb[transindex]) = val;
+    }
+
+    void
+    saveTransProb(ParticleSetT<T>& W, IndexType d, RealType val, IndexType nPsi = 0)
+    {
+        // IndexType repdirection=circbuffer.get_direction();
+        IndexType transindex = (1 - d * direction) / 2;
+        W.Properties(nPsi, TransProb[transindex]) = val;
+    }
+    RealType
+    getTransProb(Walker_t& walker, IndexType d, IndexType nPsi = 0)
+    {
+        // nPsi is the first Properties index, hence IndexType (not RealType).
+        IndexType transindex = (1 - d * direction) / 2;
+        return walker.Properties(nPsi, TransProb[transindex]);
+    }
+    RealType
+    getTransProb(ParticleSetT<T>& W, IndexType d, IndexType nPsi = 0)
+    {
+        // nPsi is the first Properties index, hence IndexType (not RealType).
+        IndexType transindex = (1 - d * direction) / 2;
+        return W.Properties(nPsi, TransProb[transindex]);
+    }
+
+    inline void
+    printState()
+    {
+        app_log() << "********PRINT REPTILE STATE*********\n";
+        app_log() << "Direction=" << direction << "  Headindex=" << headindex
+                  << "  tail=" << getBeadIndex(nbeads - 1)
+                  << "\n  next=" << getBeadIndex(nbeads - 2)
+                  << "  nbeads=" << nbeads << std::endl;
+        app_log() << "BeadIndex\tWrapIndex\tEnergy\tAction[0]\tAction[1]"
+                     "\tAction[2]\t\n";
+        for (int i = 0; i < nbeads; i++) {
+            app_log() << i << "\t" << getBeadIndex(i) << "\t"
+                      << getBead(i).Properties(WP::LOCALENERGY) << "\t"
+                      << getBead(i).Properties(Action[0]) << "\t"
+                      << getBead(i).Properties(Action[1]) << "\t"
+                      << getBead(i).Properties(Action[2]) << "\n";
+        }
+        app_log() << "POSITIONS===============:\n";
+        for (int i = 0; i < nbeads; i++) {
+            //  app_log()<<i<<"\t1"<<1<<"\t"<<getBead(i).R[0]<<"\n";
+            //  app_log()<<i<<"\t2"<<2<<"\t"<<getBead(i).R[1]<<"\n";
+            app_log() << "BEAD #" << i << " tau = " << tau * i << std::endl;
+            app_log() << getBead(i).R << std::endl;
+        }
+        app_log() << "GVECS===============:\n";
+        for (int i = 0; i < nbeads; i++) {
+            //      app_log()<<i<<"\t1"<<1<<"\t"<<getBead(i).G[0]<<"\n";
+            //      app_log()<<i<<"\t2"<<2<<"\t"<<getBead(i).G[1]<<"\n";
+            app_log() << "BEAD #" << i << " tau = " << tau * i << std::endl;
+            app_log() << getBead(i).G << std::endl;
+        }
+        app_log() << "************************************\n";
+    }
+    inline RealType
+    getTau()
+    {
+        return tau;
+    }
+    inline void
+    setTau(RealType t)
+    {
+        tau = t;
+    }
+
+    // Takes an imaginary time "t" and returns a 3N particle position vector
+    // for the time slice linearly interpolated between the two beads that
+    // bracket t.  If t/tau truncates to >= nbeads-1, the tail bead is
+    // returned as is; if it truncates to <= 0, the head bead is returned.
+    inline typename Walker_t::ParticlePos
+    linearInterp(RealType t)
+    {
+        IndexType nbead =
+            IndexType(t / tau); // Index of the bead just below t (truncated):
+                                // for t >= 0, nbead*tau <= t < (nbead+1)*tau
+        RealType beadfrac =
+            t / tau - nbead; // fractional position between bead nbead and nbead+1
+        if (nbead <= 0) {
+            typename ParticleSetT<T>::ParticlePos result = getHead().R;
+            return result;
+        }
+        else if (nbead >= nbeads - 1) {
+            typename ParticleSetT<T>::ParticlePos result = getTail().R;
+            return result;
+        }
+
+        else {
+            typename Walker_t::ParticlePos dR(getBead(nbead + 1).R),
+                interpR(getBead(nbead).R);
+            dR = dR - getBead(nbead).R;
+
+            interpR = getBead(nbead).R + beadfrac * dR;
+            return interpR;
+        }
+    }
+    inline ReptileConfig_t
+    getReptileSlicePositions(RealType tau, RealType beta)
+    {
+        IndexType nbeads_new = IndexType(beta / tau);
+        ReptileConfig_t new_reptile_coords(0);
+
+        for (IndexType i = 0; i < nbeads_new; i++)
+            new_reptile_coords.push_back(linearInterp(tau * i));
+
+        return new_reptile_coords;
+    }
+
+    inline void
+    setReptileSlicePositions(ReptileConfig_t& rept)
+    {
+        // Only a configuration with exactly one entry per bead is applied;
+        // any other size is ignored and the reptile is left untouched.
+        if (rept.size() != nbeads)
+            return;
+        for (int ibead = 0; ibead < nbeads; ibead++)
+            getBead(ibead).R = rept[ibead];
+    }
+
+    inline void
+    setReptileSlicePositions(typename Walker_t::ParticlePos R)
+    {
+        for (int i = 0; i < nbeads; i++)
+            getBead(i).R = R;
+    }
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/Particle/SampleStackT.cpp b/src/Particle/SampleStackT.cpp
new file mode 100644
index 0000000000..a40acd9bb9
--- /dev/null
+++ b/src/Particle/SampleStackT.cpp
@@ -0,0 +1,81 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2020  QMCPACK developers.
+//
+// File developed by: Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
+//
+// File created by: Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "SampleStackT.h"
+
+#include "Utilities/IteratorUtility.h"
+
+namespace qmcplusplus
+{
+
+/** allocate the SampleStack
+ * @param n number of samples per rank
+ * @param num_ranks number of ranks. Used to set global number of samples.
+ */
+template <typename T>
+void
+SampleStackT<T>::setMaxSamples(size_t n, size_t num_ranks)
+{
+    max_samples_ = n;
+    global_num_samples_ = n * num_ranks;
+    current_sample_count_ = std::min(current_sample_count_, max_samples_);
+    sample_vector_.resize(n, MCSample(0));
+}
+
+template <typename T>
+const MCSample&
+SampleStackT<T>::getSample(size_t i) const
+{
+    return sample_vector_[i];
+}
+
+template <typename T>
+void
+SampleStackT<T>::appendSample(MCSample&& sample)
+{
+    // Ignore samples beyond max_samples_ or beyond the allocated storage
+    // (empty until setMaxSamples() runs and again after clearEnsemble()).
+    if (current_sample_count_ <
+        std::min(max_samples_, sample_vector_.size()))
+        sample_vector_[current_sample_count_++] = std::move(sample);
+}
+
+/** load a single sample from SampleStack
+ */
+template <typename T>
+void
+SampleStackT<T>::loadSample(ParticleSetT<T>& pset, size_t iw) const
+{
+    pset.R = sample_vector_[iw].R;
+    pset.spins = sample_vector_[iw].spins;
+}
+
+template <typename T>
+void
+SampleStackT<T>::clearEnsemble()
+{
+    sample_vector_.clear();
+    current_sample_count_ = 0;
+}
+
+template <typename T>
+void
+SampleStackT<T>::resetSampleCount()
+{
+    current_sample_count_ = 0;
+}
+
+template class SampleStackT<double>;
+template class SampleStackT<float>;
+template class SampleStackT<std::complex<double>>;
+template class SampleStackT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/Particle/SampleStackT.h b/src/Particle/SampleStackT.h
new file mode 100644
index 0000000000..228a26e874
--- /dev/null
+++ b/src/Particle/SampleStackT.h
@@ -0,0 +1,84 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2020  QMCPACK developers.
+//
+// File developed by: Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
+//
+// File created by: Mark Dewing, mdewing@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_SAMPLE_STACKT_H
+#define QMCPLUSPLUS_SAMPLE_STACKT_H
+
+#include "Particle/MCSample.h"
+#include "Particle/ParticleSetT.h"
+#include "Particle/Walker.h"
+#include "Particle/WalkerConfigurations.h"
+
+#include <vector>
+
+namespace qmcplusplus
+{
+template <typename T>
+class SampleStackT
+{
+public:
+    using PropertySetType = typename ParticleSetTraits<T>::PropertySetType;
+
+    size_t
+    getMaxSamples() const
+    {
+        return max_samples_;
+    }
+
+    bool
+    empty() const
+    {
+        return sample_vector_.empty();
+    }
+
+    const MCSample&
+    getSample(size_t i) const;
+
+    //@{save/load/clear function for optimization
+    inline size_t
+    getNumSamples() const
+    {
+        return current_sample_count_;
+    }
+    /// set the number of max samples per rank.
+    void
+    setMaxSamples(size_t n, size_t number_of_ranks = 1);
+    /// Global number of samples is number of samples per rank * number of ranks
+    size_t
+    getGlobalNumSamples() const
+    {
+        return global_num_samples_;
+    }
+    /// load a single sample from SampleStack
+    void
+    loadSample(ParticleSetT<T>& pset, size_t iw) const;
+
+    void
+    appendSample(MCSample&& sample);
+
+    /// clear the ensemble
+    void
+    clearEnsemble();
+    //@}
+    ///  Set the sample count to zero but preserve the storage
+    void
+    resetSampleCount();
+
+private:
+    size_t max_samples_{10};
+    size_t current_sample_count_{0};
+    size_t global_num_samples_{max_samples_};
+
+    std::vector<MCSample> sample_vector_;
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/Particle/SimulationCellT.h b/src/Particle/SimulationCellT.h
index ff8240325a..7eb03fc3a4 100644
--- a/src/Particle/SimulationCellT.h
+++ b/src/Particle/SimulationCellT.h
@@ -17,7 +17,8 @@
 
 namespace qmcplusplus
 {
-class ParticleSetPool;
+template <typename T>
+class ParticleSetPoolT;
 
 template <typename T>
 class SimulationCellT
@@ -65,7 +66,7 @@ class SimulationCellT
     /// K-Vector List.
     KContainerT<T> k_lists_;
 
-    friend class ParticleSetPool;
+    friend class ParticleSetPoolT<T>;
 };
 } // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBase.h b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBase.h
index 2a69c08aaf..44b574fcd4 100644
--- a/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBase.h
+++ b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBase.h
@@ -23,6 +23,7 @@
 #include "mpi/collectives.h"
 #include "mpi/point2point.h"
 #include <einspline/bspline_base.h>
+#include "QMCWaveFunctions/EinsplineSetBuilder.h"
 
 namespace qmcplusplus
 {
diff --git a/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.cpp b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.cpp
new file mode 100644
index 0000000000..bf6c0c7fff
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.cpp
@@ -0,0 +1,259 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeongnim Kim, jeongnim.kim@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Paul R. C. Kent, kentpr@ornl.gov, Oak Ridge National
+//                    Laboratory Mark A. Berrill, berrillma@ornl.gov, Oak Ridge
+//                    National Laboratory Ye Luo, yeluo@anl.gov, Argonne
+//                    National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+/** @file BsplineReaderBaseT.cpp
+ *
+ * Implement super function
+ */
+#include "BsplineReaderBaseT.h"
+
+#include "Message/CommOperators.h"
+#include "OhmmsData/AttributeSet.h"
+#include "QMCWaveFunctions/EinsplineSetBuilderT.h"
+
+#include <array>
+#include <filesystem>
+
+namespace qmcplusplus
+{
+template <typename T>
+BsplineReaderBaseT<T>::BsplineReaderBaseT(EinsplineSetBuilderT<T>* e) :
+    mybuilder(e),
+    MeshSize(0),
+    checkNorm(true),
+    saveSplineCoefs(false),
+    rotate(true)
+{
+    myComm = mybuilder->getCommunicator();
+}
+
+template <typename T>
+void
+BsplineReaderBaseT<T>::get_psi_g(
+    int ti, int spin, int ib, Vector<std::complex<double>>& cG)
+{
+    int ncg = 0;
+    if (myComm->rank() == 0) {
+        std::string path = psi_g_path(ti, spin, ib);
+        mybuilder->H5File.read(cG, path);
+        ncg = cG.size();
+    }
+    myComm->bcast(ncg);
+    if (ncg != mybuilder->MaxNumGvecs) {
+        APP_ABORT("Failed : ncg != MaxNumGvecs");
+    }
+    myComm->bcast(cG);
+}
+
+template <typename T>
+BsplineReaderBaseT<T>::~BsplineReaderBaseT()
+{
+}
+
+inline std::string
+make_bandinfo_filename(const std::string& root, int spin, int twist,
+    const Tensor<int, 3>& tilematrix, int gid)
+{
+    std::ostringstream oo;
+    oo << root << ".tile_" << tilematrix(0, 0) << tilematrix(0, 1)
+       << tilematrix(0, 2) << tilematrix(1, 0) << tilematrix(1, 1)
+       << tilematrix(1, 2) << tilematrix(2, 0) << tilematrix(2, 1)
+       << tilematrix(2, 2) << ".spin_" << spin << ".tw_" << twist;
+    if (gid >= 0)
+        oo << ".g" << gid;
+    return oo.str();
+}
+
+inline std::string
+make_bandgroup_name(const std::string& root, int spin, int twist,
+    const Tensor<int, 3>& tilematrix, int first, int last)
+{
+    std::ostringstream oo;
+    oo << root << ".tile_" << tilematrix(0, 0) << tilematrix(0, 1)
+       << tilematrix(0, 2) << tilematrix(1, 0) << tilematrix(1, 1)
+       << tilematrix(1, 2) << tilematrix(2, 0) << tilematrix(2, 1)
+       << tilematrix(2, 2) << ".spin_" << spin << ".tw_" << twist << ".l"
+       << first << "u" << last;
+    return oo.str();
+}
+
+template <typename T>
+void
+BsplineReaderBaseT<T>::setCommon(xmlNodePtr cur)
+{
+    // check orbital normalization by default
+    std::string checkOrbNorm("yes");
+    std::string saveCoefs("no");
+    OhmmsAttributeSet a;
+    a.add(checkOrbNorm, "check_orb_norm");
+    a.add(saveCoefs, "save_coefs");
+    a.put(cur);
+
+    // allow user to turn off norm check with a warning
+    if (checkOrbNorm == "no") {
+        app_log() << "WARNING: disable orbital normalization check!"
+                  << std::endl;
+        checkNorm = false;
+    }
+    saveSplineCoefs = saveCoefs == "yes";
+}
+
+template <typename T>
+std::unique_ptr<SPOSetT<T>>
+BsplineReaderBaseT<T>::create_spline_set(int spin, xmlNodePtr cur)
+{
+    int ns(0);
+    std::string spo_object_name;
+    OhmmsAttributeSet a;
+    a.add(ns, "size");
+    a.add(spo_object_name, "name");
+    a.add(spo_object_name, "id");
+    a.put(cur);
+
+    if (ns == 0)
+        APP_ABORT_TRACE(__FILE__, __LINE__, "parameter/@size missing");
+
+    if (spo2band.empty())
+        spo2band.resize(mybuilder->states.size());
+
+    std::vector<BandInfo>& fullband = (*(mybuilder->FullBands[spin]));
+
+    if (spo2band[spin].empty()) {
+        spo2band[spin].reserve(fullband.size());
+        if (!mybuilder->states[spin])
+            mybuilder->states[spin] = std::make_unique<SPOSetInfo>();
+        mybuilder->clear_states(spin);
+        initialize_spo2band(
+            spin, fullband, *mybuilder->states[spin], spo2band[spin]);
+    }
+
+    BandInfoGroup vals;
+    vals.TwistIndex = fullband[0].TwistIndex;
+    vals.GroupID = 0;
+    vals.myName = make_bandgroup_name(mybuilder->getName(), spin,
+        mybuilder->twist_num_, mybuilder->TileMatrix, 0, ns);
+    vals.selectBands(fullband, 0, ns, false);
+
+    return create_spline_set(spo_object_name, spin, vals);
+}
+
+template <typename T>
+std::unique_ptr<SPOSetT<T>>
+BsplineReaderBaseT<T>::create_spline_set(
+    int spin, xmlNodePtr cur, SPOSetInputInfo& input_info)
+{
+    std::string spo_object_name;
+    OhmmsAttributeSet a;
+    a.add(spo_object_name, "name");
+    a.add(spo_object_name, "id");
+    a.put(cur);
+
+    if (spo2band.empty())
+        spo2band.resize(mybuilder->states.size());
+
+    std::vector<BandInfo>& fullband = (*(mybuilder->FullBands[spin]));
+
+    if (spo2band[spin].empty()) {
+        spo2band[spin].reserve(fullband.size());
+        if (!mybuilder->states[spin])
+            mybuilder->states[spin] = std::make_unique<SPOSetInfo>();
+        mybuilder->clear_states(spin);
+        initialize_spo2band(
+            spin, fullband, *mybuilder->states[spin], spo2band[spin]);
+    }
+
+    BandInfoGroup vals;
+    vals.TwistIndex = fullband[0].TwistIndex;
+    vals.GroupID = 0;
+    vals.myName = make_bandgroup_name(mybuilder->getName(), spin,
+        mybuilder->twist_num_, mybuilder->TileMatrix, input_info.min_index(),
+        input_info.max_index());
+    vals.selectBands(fullband, spo2band[spin][input_info.min_index()],
+        input_info.max_index() - input_info.min_index(), false);
+
+    return create_spline_set(spo_object_name, spin, vals);
+}
+
+/** build index tables to map a state to band with k-point folding
+ * @param bigspace full BandInfo constructed by EinsplineSetBuilder
+ * @param sposet SPOSetInfo owned by someone, most likely EinsplineSetBuilder
+ * @param spo2band spo2band[i] is the index in bigspace
+ *
+ * At gamma or arbitrary kpoints with complex wavefunctions, spo2band[i]==i
+ */
+template <typename T>
+void
+BsplineReaderBaseT<T>::initialize_spo2band(int spin,
+    const std::vector<BandInfo>& bigspace, SPOSetInfo& sposet,
+    std::vector<int>& spo2band)
+{
+    spo2band.reserve(bigspace.size());
+    int ns = 0;
+    for (int i = 0; i < bigspace.size(); ++i) {
+        spo2band.push_back(i);
+        SPOInfo a(ns, bigspace[i].Energy);
+        sposet.add(a);
+        ns++;
+        if (bigspace[i].MakeTwoCopies) {
+            spo2band.push_back(i);
+            SPOInfo b(ns, bigspace[i].Energy);
+            sposet.add(b);
+            ns++;
+        }
+    }
+
+    // write to a file
+    const Communicate* comm = myComm;
+    if (comm->rank())
+        return;
+
+    std::filesystem::path aname = make_bandinfo_filename(mybuilder->getName(),
+        spin, mybuilder->twist_num_, mybuilder->TileMatrix, comm->getGroupID());
+    aname += ".bandinfo.dat";
+
+    std::ofstream o(aname.c_str());
+    std::array<char, 1024> s;
+    ns = 0;
+    using PosType = QMCTraits::PosType;
+    o << "#  Band    State   TwistIndex BandIndex Energy      Kx      Ky      "
+         "Kz      K1      K2      K3    KmK "
+      << std::endl;
+    for (int i = 0; i < bigspace.size(); ++i) {
+        int ti = bigspace[i].TwistIndex;
+        int bi = bigspace[i].BandIndex;
+        double e = bigspace[i].Energy;
+        int nd = (bigspace[i].MakeTwoCopies) ? 2 : 1;
+        PosType k = mybuilder->PrimCell.k_cart(mybuilder->primcell_kpoints[ti]);
+        int s_size = std::snprintf(s.data(), s.size(),
+            "%8d %8d %8d %8d %12.6f %7.4f %7.4f %7.4f %7.4f %7.4f %7.4f %6d\n",
+            i, ns, ti, bi, e, k[0], k[1], k[2],
+            mybuilder->primcell_kpoints[ti][0],
+            mybuilder->primcell_kpoints[ti][1],
+            mybuilder->primcell_kpoints[ti][2], nd);
+        if (s_size < 0)
+            throw std::runtime_error("Error generating bandinfo");
+        o << s.data();
+        ns += nd;
+    }
+}
+
+template class BsplineReaderBaseT<double>;
+template class BsplineReaderBaseT<float>;
+template class BsplineReaderBaseT<std::complex<double>>;
+template class BsplineReaderBaseT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h
new file mode 100644
index 0000000000..5eab41dea5
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h
@@ -0,0 +1,228 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Jeongnim Kim, jeongnim.kim@gmail.com, University of
+// Illinois at Urbana-Champaign
+//                    Raymond Clay III, j.k.rofling@gmail.com, Lawrence
+//                    Livermore National Laboratory Ye Luo, yeluo@anl.gov,
+//                    Argonne National Laboratory Mark A. Berrill,
+//                    berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_BSPLINE_READER_BASET_H
+#define QMCPLUSPLUS_BSPLINE_READER_BASET_H
+
+#include "Containers/OhmmsPETE/TinyVector.h"
+#include "QMCWaveFunctions/BandInfo.h"
+#include "QMCWaveFunctions/SPOSetT.h"
+#include "mpi/collectives.h"
+#include "mpi/point2point.h"
+#include <einspline/bspline_base.h>
+
+namespace qmcplusplus
+{
+struct SPOSetInputInfo;
+template <typename T>
+class EinsplineSetBuilderT;
+
+/**
+ * Each SplineC2X needs a reader derived from BsplineReaderBaseT.
+ * This base class handles common chores
+ * - check_twists : read gvectors, set twists for folded bands if needed, and
+ * set the phase for the special K
+ * - set_grid : create the basic grid and boundary conditions for einspline
+ * Note that template is abused but it works.
+ */
+template <typename T>
+class BsplineReaderBaseT
+{
+public:
+    /// pointer to the EinsplineSetBuilder
+    EinsplineSetBuilderT<T>* mybuilder;
+    /// communicator
+    Communicate* myComm;
+    /// mesh size
+    TinyVector<int, 3> MeshSize;
+    /// check the norm of orbitals
+    bool checkNorm;
+    /// save spline coefficients to storage
+    bool saveSplineCoefs;
+    /// apply orbital rotations
+    bool rotate;
+    /// map from spo index to band index
+    std::vector<std::vector<int>> spo2band;
+
+    BsplineReaderBaseT(EinsplineSetBuilderT<T>* e);
+
+    virtual ~BsplineReaderBaseT();
+
+    /** read gvectors and set the mesh, and prepare for einspline
+     */
+    template <typename GT, typename BCT>
+    inline bool
+    set_grid(const TinyVector<int, 3>& halfg, GT* xyz_grid, BCT* xyz_bc)
+    {
+        // This sets MeshSize from the input file
+        bool havePsig = mybuilder->ReadGvectors_ESHDF();
+
+        // If this MeshSize is not initialized, use the meshsize set by the
+        // input based on FFT grid and meshfactor
+        if (MeshSize[0] == 0)
+            MeshSize = mybuilder->MeshSize;
+
+        app_log() << "  Using meshsize=" << MeshSize
+                  << "\n  vs input meshsize=" << mybuilder->MeshSize
+                  << std::endl;
+
+        for (int j = 0; j < 3; ++j) {
+            xyz_grid[j].start = 0.0;
+            xyz_grid[j].end = 1.0;
+            xyz_grid[j].num = MeshSize[j];
+
+            if (halfg[j]) {
+                xyz_bc[j].lCode = ANTIPERIODIC;
+                xyz_bc[j].rCode = ANTIPERIODIC;
+            }
+            else {
+                xyz_bc[j].lCode = PERIODIC;
+                xyz_bc[j].rCode = PERIODIC;
+            }
+
+            xyz_bc[j].lVal = 0.0;
+            xyz_bc[j].rVal = 0.0;
+        }
+        return havePsig;
+    }
+
+    /** initialize twist-related data for N orbitals
+     */
+    template <typename SPE>
+    inline void
+    check_twists(SPE* bspline, const BandInfoGroup& bandgroup)
+    {
+        // init(orbitalSet,bspline);
+        bspline->PrimLattice = mybuilder->PrimCell;
+        bspline->GGt =
+            dot(transpose(bspline->PrimLattice.G), bspline->PrimLattice.G);
+
+        int N = bandgroup.getNumDistinctOrbitals();
+        int numOrbs = bandgroup.getNumSPOs();
+
+        bspline->setOrbitalSetSize(numOrbs);
+        bspline->resizeStorage(N, N);
+
+        bspline->first_spo = bandgroup.getFirstSPO();
+        bspline->last_spo = bandgroup.getLastSPO();
+
+        int num = 0;
+        const std::vector<BandInfo>& cur_bands = bandgroup.myBands;
+        for (int iorb = 0; iorb < N; iorb++) {
+            int ti = cur_bands[iorb].TwistIndex;
+            bspline->kPoints[iorb] =
+                mybuilder->PrimCell.k_cart(-mybuilder->primcell_kpoints[ti]);
+            bspline->MakeTwoCopies[iorb] =
+                (num < (numOrbs - 1)) && cur_bands[iorb].MakeTwoCopies;
+            num += bspline->MakeTwoCopies[iorb] ? 2 : 1;
+        }
+
+        app_log() << "NumDistinctOrbitals " << N << " numOrbs = " << numOrbs
+                  << std::endl;
+
+        bspline->HalfG = 0;
+        TinyVector<int, 3> bconds =
+            mybuilder->TargetPtcl.getLattice().BoxBConds;
+        if (!bspline->isComplex()) {
+            // no k-point folding, single special k point (G, L ...)
+            TinyVector<double, 3> twist0 =
+                mybuilder->primcell_kpoints[bandgroup.TwistIndex];
+            for (int i = 0; i < 3; i++)
+                if (bconds[i] &&
+                    ((std::abs(std::abs(twist0[i]) - 0.5) < 1.0e-8)))
+                    bspline->HalfG[i] = 1;
+                else
+                    bspline->HalfG[i] = 0;
+            app_log() << "  TwistIndex = " << cur_bands[0].TwistIndex
+                      << " TwistAngle " << twist0 << std::endl;
+            app_log() << "   HalfG = " << bspline->HalfG << std::endl;
+        }
+        app_log().flush();
+    }
+
+    /** return the HDF5 path of psi_g for twist ti, spin and band ib
+     */
+    inline std::string
+    psi_g_path(int ti, int spin, int ib)
+    {
+        std::ostringstream path;
+        path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_"
+             << ib << "/psi_g";
+        return path.str();
+    }
+
+    /** return the HDF5 path of psi_r for twist ti, spin and band ib
+     */
+    inline std::string
+    psi_r_path(int ti, int spin, int ib)
+    {
+        std::ostringstream path;
+        path << "/electrons/kpoint_" << ti << "/spin_" << spin << "/state_"
+             << ib << "/psi_r";
+        return path.str();
+    }
+
+    /** read/bcast psi_g
+     * @param ti twist index
+     * @param spin spin index
+     * @param ib band index
+     * @param cG psi_g as stored in hdf5
+     */
+    void
+    get_psi_g(int ti, int spin, int ib, Vector<std::complex<double>>& cG);
+
+    /** create the actual spline sets
+     */
+    virtual std::unique_ptr<SPOSetT<T>>
+    create_spline_set(const std::string& my_name, int spin,
+        const BandInfoGroup& bandgroup) = 0;
+
+    /** setting common parameters
+     */
+    void
+    setCommon(xmlNodePtr cur);
+
+    /** create the spline after one of the kind is created */
+    std::unique_ptr<SPOSetT<T>>
+    create_spline_set(int spin, xmlNodePtr cur, SPOSetInputInfo& input_info);
+
+    /** create the spline set */
+    std::unique_ptr<SPOSetT<T>>
+    create_spline_set(int spin, xmlNodePtr cur);
+
+    /** Enable or disable the orbital norm check (sets checkNorm) */
+    inline void
+    setCheckNorm(bool new_checknorm)
+    {
+        checkNorm = new_checknorm;
+    }
+
+    /** Set the orbital rotation flag. Rotations are applied to balance the
+     * real/imaginary components. */
+    inline void
+    setRotate(bool new_rotate)
+    {
+        rotate = new_rotate;
+    }
+
+    void
+    initialize_spo2band(int spin, const std::vector<BandInfo>& bigspace,
+        SPOSetInfo& sposet, std::vector<int>& spo2band); // out: spo -> band
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/BsplineSetT.h b/src/QMCWaveFunctions/BsplineFactory/BsplineSetT.h
index 720d0bd5e9..9286624c92 100644
--- a/src/QMCWaveFunctions/BsplineFactory/BsplineSetT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/BsplineSetT.h
@@ -225,8 +225,11 @@ class BsplineSetT : public SPOSetT<T>
     }
 
     template <class BSPLINESPO>
-    friend struct SplineSetReader;
-    friend struct BsplineReaderBase;
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
+    template <typename>
+    friend class HybridRepSetReaderT;
 
 protected:
     static const int D = QMCTraits::DIM;
@@ -253,8 +256,5 @@ class BsplineSetT : public SPOSetT<T>
     std::vector<int> offset;
 };
 
-
-
-
 } // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.cpp b/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.cpp
new file mode 100644
index 0000000000..da978b3647
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.cpp
@@ -0,0 +1,23 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source License.
+// See LICENSE file in top directory for details.
+//
+// Copyright (c) 2021 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "HybridRepCenterOrbitalsT.h"
+
+namespace qmcplusplus
+{
+template class AtomicOrbitalsT<float>;
+template class AtomicOrbitalsT<double>;
+template class HybridRepCenterOrbitalsT<float, float>;
+template class HybridRepCenterOrbitalsT<float, double>;
+template class HybridRepCenterOrbitalsT<double, float>;
+template class HybridRepCenterOrbitalsT<double, double>;
+} // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h b/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h
new file mode 100644
index 0000000000..85bf667736
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h
@@ -0,0 +1,819 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_HYBRIDREP_CENTER_ORBITALST_H
+#define QMCPLUSPLUS_HYBRIDREP_CENTER_ORBITALST_H
+
+#include "Numerics/SmoothFunctions.hpp"
+#include "Numerics/SoaSphericalTensor.h"
+#include "Particle/DistanceTableT.h"
+#include "Particle/VirtualParticleSetT.h"
+#include "hdf/hdf_archive.h"
+#include "spline2/MultiBspline1D.hpp"
+
+namespace qmcplusplus
+{
+template <class BSPLINESPO>
+class HybridRepSetReaderT;
+
+template <typename T>
+class AtomicOrbitalsT
+{
+public:
+    static const int D = 3;
+    using AtomicSplineType = typename bspline_traits<T, 1>::SplineType;
+    using AtomicBCType = typename bspline_traits<T, 1>::BCType;
+    using AtomicSingleSplineType = UBspline_1d_d;
+    using PointType = TinyVector<T, D>;
+    using value_type = T;
+
+    using vContainer_type = aligned_vector<T>;
+
+private:
+    // near core cutoff
+    T rmin;
+    // far from core cutoff, rmin_sqrt>=rmin
+    T rmin_sqrt;
+    T cutoff, cutoff_buffer, spline_radius, non_overlapping_radius;
+    int spline_npoints, BaseN;
+    int NumBands, Npad;
+    PointType center_pos;
+    const int lmax, lm_tot;
+    SoaSphericalTensor<T> Ylm;
+    vContainer_type l_vals;
+    vContainer_type r_power_minus_l;
+    /// 1D spline of radial functions of all the orbitals
+    std::shared_ptr<MultiBspline1D<T>> SplineInst;
+
+    vContainer_type localV, localG, localL;
+
+public:
+    AtomicOrbitalsT(int Lmax) :
+        lmax(Lmax),
+        lm_tot((Lmax + 1) * (Lmax + 1)),
+        Ylm(Lmax)
+    {
+        r_power_minus_l.resize(lm_tot);
+        l_vals.resize(lm_tot);
+        for (int l = 0; l <= lmax; l++)
+            for (int m = -l; m <= l; m++)
+                l_vals[l * (l + 1) + m] = l;
+        rmin = std::exp(
+            std::log(std::numeric_limits<T>::min()) / std::max(Lmax, 1));
+        rmin = std::max(rmin, std::numeric_limits<T>::epsilon());
+        rmin_sqrt =
+            std::max(rmin, std::sqrt(std::numeric_limits<T>::epsilon()));
+    }
+
+    // accessing functions, const only
+    T
+    getCutoff() const
+    {
+        return cutoff;
+    }
+    T
+    getCutoffBuffer() const
+    {
+        return cutoff_buffer;
+    }
+    T
+    getSplineRadius() const
+    {
+        return spline_radius;
+    }
+    T
+    getNonOverlappingRadius() const
+    {
+        return non_overlapping_radius;
+    }
+    int
+    getSplineNpoints() const
+    {
+        return spline_npoints;
+    }
+    int
+    getLmax() const
+    {
+        return lmax;
+    }
+    const PointType&
+    getCenterPos() const
+    {
+        return center_pos;
+    }
+
+    inline void
+    resizeStorage(size_t Nb)
+    {
+        NumBands = Nb;
+        Npad = getAlignedSize<T>(Nb);
+        localV.resize(Npad * lm_tot);
+        localG.resize(Npad * lm_tot);
+        localL.resize(Npad * lm_tot);
+        create_spline();
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    {
+        chunked_bcast(comm, SplineInst->getSplinePtr());
+    }
+
+    void
+    gather_tables(Communicate* comm, std::vector<int>& offset)
+    {
+        gatherv(comm, SplineInst->getSplinePtr(), Npad, offset);
+    }
+
+    template <typename PT, typename VT>
+    inline void
+    set_info(const PT& R, const VT& cutoff_in, const VT& cutoff_buffer_in,
+        const VT& spline_radius_in, const VT& non_overlapping_radius_in,
+        const int spline_npoints_in)
+    {
+        center_pos[0] = R[0];
+        center_pos[1] = R[1];
+        center_pos[2] = R[2];
+        cutoff = cutoff_in;
+        cutoff_buffer = cutoff_buffer_in;
+        spline_radius = spline_radius_in;
+        spline_npoints = spline_npoints_in;
+        non_overlapping_radius = non_overlapping_radius_in;
+        BaseN = spline_npoints + 2;
+    }
+
+    inline void
+    create_spline()
+    {
+        AtomicBCType bc;
+        bc.lCode = FLAT;
+        bc.rCode = NATURAL;
+        Ugrid grid;
+        grid.start = 0.0;
+        grid.end = spline_radius;
+        grid.num = spline_npoints;
+        SplineInst = std::make_shared<MultiBspline1D<T>>();
+        SplineInst->create(grid, bc, lm_tot * Npad);
+    }
+
+    inline size_t
+    getSplineSizeInBytes() const
+    {
+        return SplineInst->sizeInByte();
+    }
+
+    inline void
+    flush_zero()
+    {
+        SplineInst->flush_zero();
+    }
+
+    inline void
+    set_spline(AtomicSingleSplineType* spline, int lm, int ispline)
+    {
+        SplineInst->copy_spline(spline, lm * Npad + ispline, 0, BaseN);
+    }
+
+    bool
+    read_splines(hdf_archive& h5f)
+    {
+        einspline_engine<AtomicSplineType> bigtable(SplineInst->getSplinePtr());
+        int lmax_in = 0, spline_npoints_in = 0;
+        T spline_radius_in;
+        if (!h5f.readEntry(lmax_in, "l_max") || lmax_in != lmax)
+            return false;
+        if (!h5f.readEntry(spline_radius_in, "spline_radius") ||
+            spline_radius_in != spline_radius)
+            return false;
+        if (!h5f.readEntry(spline_npoints_in, "spline_npoints") ||
+            spline_npoints_in != spline_npoints)
+            return false;
+        return h5f.readEntry(bigtable, "radial_spline");
+    }
+
+    bool
+    write_splines(hdf_archive& h5f)
+    {
+        bool success = true;
+        success = success && h5f.writeEntry(spline_radius, "spline_radius");
+        success = success && h5f.writeEntry(spline_npoints, "spline_npoints");
+        success = success && h5f.writeEntry(lmax, "l_max");
+        success = success && h5f.writeEntry(center_pos, "position");
+        einspline_engine<AtomicSplineType> bigtable(SplineInst->getSplinePtr());
+        success = success && h5f.writeEntry(bigtable, "radial_spline");
+        return success;
+    }
+
+    // evaluate only V
+    template <typename VV>
+    inline void
+    evaluate_v(const T& r, const PointType& dr, VV& myV)
+    {
+        if (r > std::numeric_limits<T>::epsilon())
+            Ylm.evaluateV(dr[0] / r, dr[1] / r, dr[2] / r);
+        else
+            Ylm.evaluateV(0, 0, 1);
+        const T* restrict Ylm_v = Ylm[0];
+
+        constexpr T czero(0);
+        T* restrict val = myV.data();
+        T* restrict local_val = localV.data();
+        std::fill(myV.begin(), myV.end(), czero);
+
+        SplineInst->evaluate(r, localV);
+
+        for (size_t lm = 0; lm < lm_tot; lm++) {
+#pragma omp simd aligned(val, local_val : QMC_SIMD_ALIGNMENT)
+            for (size_t ib = 0; ib < myV.size(); ib++)
+                val[ib] += Ylm_v[lm] * local_val[ib];
+            local_val += Npad;
+        }
+    }
+
+    template <typename DISPL, typename VM>
+    inline void
+    evaluateValues(const DISPL& Displacements, const int center_idx, const T& r,
+        VM& multi_myV)
+    {
+        if (r <= std::numeric_limits<T>::epsilon())
+            Ylm.evaluateV(0, 0, 1);
+        const T* restrict Ylm_v = Ylm[0];
+
+        const size_t m = multi_myV.cols();
+        constexpr T czero(0);
+        std::fill(multi_myV.begin(), multi_myV.end(), czero);
+        SplineInst->evaluate(r, localV);
+
+        for (int ivp = 0; ivp < Displacements.size(); ivp++) {
+            PointType dr = Displacements[ivp][center_idx];
+            if (r > std::numeric_limits<T>::epsilon())
+                Ylm.evaluateV(-dr[0] / r, -dr[1] / r, -dr[2] / r);
+
+            T* restrict val = multi_myV[ivp];
+            T* restrict local_val = localV.data();
+            for (size_t lm = 0; lm < lm_tot; lm++) {
+#pragma omp simd aligned(val, local_val : QMC_SIMD_ALIGNMENT)
+                for (size_t ib = 0; ib < m; ib++)
+                    val[ib] += Ylm_v[lm] * local_val[ib];
+                local_val += Npad;
+            }
+        }
+    }
+
+    // evaluate VGL: fill myV (value), myG (gradient), myL (laplacian) for r, dr
+    template <typename VV, typename GV>
+    inline void
+    evaluate_vgl(const T& r, const PointType& dr, VV& myV, GV& myG, VV& myL)
+    {
+        T drx, dry, drz, rhatx, rhaty, rhatz, rinv;
+        if (r > rmin) {
+            rinv = 1.0 / r;
+        }
+        else {
+            rinv = 0;
+        }
+        drx = dr[0];
+        dry = dr[1];
+        drz = dr[2];
+        rhatx = drx * rinv;
+        rhaty = dry * rinv;
+        rhatz = drz * rinv;
+
+        Ylm.evaluateVGL(drx, dry, drz);
+        const T* restrict Ylm_v = Ylm[0];
+        const T* restrict Ylm_gx = Ylm[1];
+        const T* restrict Ylm_gy = Ylm[2];
+        const T* restrict Ylm_gz = Ylm[3];
+
+        T* restrict g0 = myG.data(0);
+        T* restrict g1 = myG.data(1);
+        T* restrict g2 = myG.data(2);
+        constexpr T czero(0), cone(1), chalf(0.5);
+        std::fill(myV.begin(), myV.end(), czero);
+        std::fill(g0, g0 + Npad, czero);
+        std::fill(g1, g1 + Npad, czero);
+        std::fill(g2, g2 + Npad, czero);
+        std::fill(myL.begin(), myL.end(), czero);
+        T* restrict val = myV.data();
+        T* restrict lapl = myL.data();
+        T* restrict local_val = localV.data();
+        T* restrict local_grad = localG.data();
+        T* restrict local_lapl = localL.data();
+
+        SplineInst->evaluate_vgl(r, localV, localG, localL);
+
+        if (r > rmin_sqrt) {
+            // far from core
+            r_power_minus_l[0] = cone;
+            T r_power_temp = cone;
+            for (int l = 1; l <= lmax; l++) {
+                r_power_temp *= rinv;
+                for (int m = -l, lm = l * l; m <= l; m++, lm++)
+                    r_power_minus_l[lm] = r_power_temp;
+            }
+
+            for (size_t lm = 0; lm < lm_tot; lm++) {
+                const T& l_val = l_vals[lm];
+                const T& r_power = r_power_minus_l[lm];
+                const T Ylm_rescale = Ylm_v[lm] * r_power;
+                const T rhat_dot_G = (rhatx * Ylm_gx[lm] + rhaty * Ylm_gy[lm] +
+                                         rhatz * Ylm_gz[lm]) *
+                    r_power;
+#pragma omp simd aligned( \
+    val, g0, g1, g2, lapl, local_val, local_grad, local_lapl \
+    : QMC_SIMD_ALIGNMENT)
+                for (size_t ib = 0; ib < myV.size(); ib++) {
+                    const T local_v = local_val[ib];
+                    const T local_g = local_grad[ib];
+                    const T local_l = local_lapl[ib];
+                    // value
+                    const T Vpart = l_val * rinv * local_v;
+                    val[ib] += Ylm_rescale * local_v;
+
+                    // grad
+                    const T factor1 = local_g * Ylm_rescale;
+                    const T factor2 = local_v * r_power;
+                    const T factor3 = -Vpart * Ylm_rescale;
+                    g0[ib] += factor1 * rhatx + factor2 * Ylm_gx[lm] +
+                        factor3 * rhatx;
+                    g1[ib] += factor1 * rhaty + factor2 * Ylm_gy[lm] +
+                        factor3 * rhaty;
+                    g2[ib] += factor1 * rhatz + factor2 * Ylm_gz[lm] +
+                        factor3 * rhatz;
+
+                    // laplacian
+                    lapl[ib] +=
+                        (local_l + (local_g * (2 - l_val) - Vpart) * rinv) *
+                            Ylm_rescale +
+                        (local_g - Vpart) * rhat_dot_G;
+                }
+                local_val += Npad;
+                local_grad += Npad;
+                local_lapl += Npad;
+            }
+        }
+        else if (r > rmin) {
+            // the possibility of reaching here is very very low
+            std::cout
+                << "Warning: an electron is very close to an ion, distance="
+                << r << " be careful!" << std::endl;
+            // near core, kill divergence in the laplacian
+            r_power_minus_l[0] = cone;
+            T r_power_temp = cone;
+            for (int l = 1; l <= lmax; l++) {
+                r_power_temp *= rinv;
+                for (int m = -l, lm = l * l; m <= l; m++, lm++)
+                    r_power_minus_l[lm] = r_power_temp;
+            }
+
+            for (size_t lm = 0; lm < lm_tot; lm++) {
+                const T& l_val = l_vals[lm];
+                const T& r_power = r_power_minus_l[lm];
+                const T Ylm_rescale = Ylm_v[lm] * r_power;
+                const T rhat_dot_G = (Ylm_gx[lm] * rhatx + Ylm_gy[lm] * rhaty +
+                                         Ylm_gz[lm] * rhatz) *
+                    r_power * r;
+#pragma omp simd aligned( \
+    val, g0, g1, g2, lapl, local_val, local_grad, local_lapl \
+    : QMC_SIMD_ALIGNMENT)
+                for (size_t ib = 0; ib < myV.size(); ib++) {
+                    const T local_v = local_val[ib];
+                    const T local_g = local_grad[ib];
+                    const T local_l = local_lapl[ib];
+                    // value
+                    const T Vpart = Ylm_rescale * local_v;
+                    val[ib] += Vpart;
+
+                    // grad
+                    const T factor1 = local_g * Ylm_rescale;
+                    const T factor2 = local_v * r_power;
+                    const T factor3 = -l_val * Vpart * rinv;
+                    g0[ib] += factor1 * rhatx + factor2 * Ylm_gx[lm] +
+                        factor3 * rhatx;
+                    g1[ib] += factor1 * rhaty + factor2 * Ylm_gy[lm] +
+                        factor3 * rhaty;
+                    g2[ib] += factor1 * rhatz + factor2 * Ylm_gz[lm] +
+                        factor3 * rhatz;
+
+                    // laplacian
+                    lapl[ib] += local_l * (cone - chalf * l_val) *
+                        (3 * Ylm_rescale + rhat_dot_G);
+                }
+                local_val += Npad;
+                local_grad += Npad;
+                local_lapl += Npad;
+            }
+        }
+        else {
+            std::cout << "Warning: an electron is on top of an ion!"
+                      << std::endl;
+            // r <= rmin: value and laplacian take only the lm = 0 channel
+
+#pragma omp simd aligned(val, lapl, local_val, local_lapl : QMC_SIMD_ALIGNMENT)
+            for (size_t ib = 0; ib < myV.size(); ib++) {
+                // value
+                val[ib] = Ylm_v[0] * local_val[ib];
+
+                // laplacian
+                lapl[ib] = local_lapl[ib] * static_cast<T>(3) * Ylm_v[0];
+            }
+            local_val += Npad;
+            local_grad += Npad;
+            local_lapl += Npad;
+            if (lmax > 0) {
+                // channels lm = 1..3 (l = 1) are read below; they exist only if lmax >= 1
+                for (size_t lm = 1; lm < 4; lm++) {
+#pragma omp simd aligned(g0, g1, g2, local_grad : QMC_SIMD_ALIGNMENT)
+                    for (size_t ib = 0; ib < myV.size(); ib++) {
+                        const T local_g = local_grad[ib];
+                        // grad
+                        g0[ib] += local_g * Ylm_gx[lm];
+                        g1[ib] += local_g * Ylm_gy[lm];
+                        g2[ib] += local_g * Ylm_gz[lm];
+                    }
+                    local_grad += Npad;
+                }
+            }
+        }
+    }
+
+    template <typename VV, typename GV, typename HT>
+    void
+    evaluate_vgh(const T& r, const PointType& dr, VV& myV, GV& myG, HT& myH)
+    {
+        // Needed to do tensor product here
+        APP_ABORT("AtomicOrbitals::evaluate_vgh");
+    }
+};
+
+template <typename ST, typename VT>
+class HybridRepCenterOrbitalsT
+{
+public:
+    static const int D = 3;
+    using PointType = typename AtomicOrbitalsT<ST>::PointType;
+    using RealType = typename DistanceTableT<VT>::RealType;
+    using PosType = typename DistanceTableT<VT>::PosType;
+
+private:
+    /// atomic centers
+    std::vector<AtomicOrbitalsT<ST>> AtomicCenters;
+    /// table index
+    int myTableID;
+    /// mapping supercell to primitive cell
+    std::vector<int> Super2Prim;
+    /// r from distance table
+    RealType dist_r;
+    /// dr from distance table
+    PosType dist_dr;
+    /// for APBC
+    PointType r_image;
+    /// smooth function value
+    RealType f;
+    /// smooth function first derivative
+    RealType df_dr;
+    /// smooth function second derivative
+    RealType d2f_dr2;
+    /// smoothing schemes
+    enum class smoothing_schemes
+    {
+        CONSISTENT = 0,
+        SMOOTHALL,
+        SMOOTHPARTIAL
+    } smooth_scheme;
+    /// smoothing function
+    smoothing_functions smooth_func_id;
+
+public:
+    HybridRepCenterOrbitalsT()
+    {
+    }
+
+    void
+    set_info(const ParticleSetT<VT>& ions, ParticleSetT<VT>& els,
+        const std::vector<int>& mapping)
+    {
+        myTableID = els.addTable(ions, DTModes::NEED_VP_FULL_TABLE_ON_HOST);
+        Super2Prim = mapping;
+    }
+
+    inline void
+    resizeStorage(size_t Nb)
+    {
+        // resize every center and add up the bytes reported for its spline
+        size_t total_bytes = 0;
+        for (auto& center : AtomicCenters) {
+            center.resizeStorage(Nb);
+            total_bytes += center.getSplineSizeInBytes();
+        }
+        const size_t total_mb = total_bytes / (1 << 20);
+        app_log()
+            << "MEMORY " << total_mb << " MB allocated "
+            << "for the atomic radial splines in hybrid orbital representation"
+            << std::endl;
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    { // forward bcast_tables(comm) to every atomic center
+        for (auto& center : AtomicCenters)
+            center.bcast_tables(comm);
+    }
+
+    void
+    gather_atomic_tables(Communicate* comm, std::vector<int>& offset)
+    {
+        // nothing is gathered when the communicator has size 1
+        if (comm->size() != 1)
+            for (auto& center : AtomicCenters)
+                center.gather_tables(comm, offset);
+    }
+
+    inline void
+    flush_zero()
+    { // forward flush_zero() to every atomic center
+        for (auto& center : AtomicCenters)
+            center.flush_zero();
+    }
+
+    bool
+    read_splines(hdf_archive& h5f)
+    {
+        // Read "number_of_centers" and every "center_<ic>" group below
+        // "atomic_centers"; true only if all of it was found and matched.
+        // Each successful push is balanced by exactly one pop on every path.
+        size_t ncenter = 0;
+        try {
+            h5f.push("atomic_centers", false);
+        }
+        catch (...) {
+            // NOTE(review): assumes a failed push left nothing to pop -- confirm
+            return false;
+        }
+        bool success = h5f.readEntry(ncenter, "number_of_centers") &&
+            ncenter == AtomicCenters.size();
+        // read splines of each center, stopping at the first failure
+        for (int ic = 0; success && ic < AtomicCenters.size(); ic++) {
+            std::ostringstream gname;
+            gname << "center_" << ic;
+            try {
+                h5f.push(gname.str().c_str(), false);
+            }
+            catch (...) {
+                success = false;
+                break; // nothing was pushed for this center: do not pop
+            }
+            success = AtomicCenters[ic].read_splines(h5f);
+            h5f.pop();
+        }
+        h5f.pop();
+        return success;
+    }
+
+    bool
+    write_splines(hdf_archive& h5f)
+    { // writes number_of_centers and center_<ic> under "atomic_centers"
+        int ncenter = AtomicCenters.size();
+        try {
+            h5f.push("atomic_centers", true);
+        }
+        catch (...) {
+            return false; // NOTE(review): assumes a failed push left nothing to pop
+        }
+        bool success = h5f.writeEntry(ncenter, "number_of_centers");
+        // write splines of each center, stopping at the first failure
+        for (int ic = 0; success && ic < AtomicCenters.size(); ic++) {
+            std::ostringstream gname;
+            gname << "center_" << ic;
+            try {
+                h5f.push(gname.str().c_str(), true);
+            }
+            catch (...) {
+                success = false;
+                break; // nothing was pushed for this center: do not pop
+            }
+            success = AtomicCenters[ic].write_splines(h5f);
+            h5f.pop();
+        }
+        h5f.pop();
+        return success;
+    }
+
+    template <typename Cell>
+    inline int
+    get_bc_sign(
+        const PointType& r, const Cell& PrimLattice, TinyVector<int, D>& HalfG)
+    {
+        int bc_sign = 0;
+        PointType shift_unit = PrimLattice.toUnit(r - r_image);
+        for (int i = 0; i < D; i++) {
+            ST img = round(shift_unit[i]);
+            bc_sign += HalfG[i] * (int)img;
+        }
+        return bc_sign;
+    }
+
+    // evaluate only V
+    template <typename VV>
+    inline RealType
+    evaluate_v(const ParticleSetT<VT>& P, const int iat, VV& myV)
+    {
+        const auto& ei_dist = P.getDistTableAB(myTableID);
+        const int center_idx = ei_dist.get_first_neighbor(
+            iat, dist_r, dist_dr, P.getActivePtcl() == iat);
+        if (center_idx < 0)
+            abort();
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        if (dist_r < myCenter.getCutoff()) {
+            PointType dr(-dist_dr[0], -dist_dr[1], -dist_dr[2]);
+            r_image = myCenter.getCenterPos() + dr;
+            myCenter.evaluate_v(dist_r, dr, myV);
+            return smooth_function(
+                myCenter.getCutoffBuffer(), myCenter.getCutoff(), dist_r);
+        }
+        return RealType(-1);
+    }
+
+    /** check if the batched algorithm is safe to operate
+     * @param VP virtual particle set
+     * @return true if it is safe
+     *
+     * When the reference electron in the NLPP evaluation is farther from the
+     * reference center than its non-overlapping radius, some quadrature
+     * points may get their SPOs evaluated from the nearest center, which is
+     * not the reference center. The batched algorithm forces the evaluation on
+     * the reference center and introduces some error. In this case, the
+     * non-batched algorithm should be used.
+     */
+    bool
+    is_batched_safe(const VirtualParticleSetT<VT>& VP)
+    {
+        const int center_idx = VP.refSourcePtcl;
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        return VP.getRefPS().getDistTableAB(myTableID).getDistRow(
+                   VP.refPtcl)[center_idx] < myCenter.getNonOverlappingRadius();
+    }
+
+    // C2C, C2R cases
+    template <typename VM>
+    inline RealType
+    evaluateValuesC2X(const VirtualParticleSetT<VT>& VP, VM& multi_myV)
+    {
+        const int center_idx = VP.refSourcePtcl;
+        dist_r = VP.getRefPS().getDistTableAB(myTableID).getDistRow(
+            VP.refPtcl)[center_idx];
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        if (dist_r < myCenter.getCutoff()) {
+            myCenter.evaluateValues(
+                VP.getDistTableAB(myTableID).getDisplacements(), center_idx,
+                dist_r, multi_myV);
+            return smooth_function(
+                myCenter.getCutoffBuffer(), myCenter.getCutoff(), dist_r);
+        }
+        return RealType(-1);
+    }
+
+    // R2R case: also fills bc_signs with one sign per virtual particle
+    template <typename VM, typename Cell, typename SV>
+    inline RealType
+    evaluateValuesR2R(const VirtualParticleSetT<VT>& VP,
+        const Cell& PrimLattice, TinyVector<int, D>& HalfG, VM& multi_myV,
+        SV& bc_signs)
+    {
+        const int center_idx = VP.refSourcePtcl;
+        dist_r = VP.getRefPS().getDistTableAB(myTableID).getDistRow(
+            VP.refPtcl)[center_idx];
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        // not inside the cutoff: return -1 without touching the outputs
+        if (!(dist_r < myCenter.getCutoff()))
+            return RealType(-1);
+        const auto& displ = VP.getDistTableAB(myTableID).getDisplacements();
+        for (int ivp = 0; ivp < VP.getTotalNum(); ivp++) {
+            // get_bc_sign() reads r_image, so it has to be updated first
+            r_image = myCenter.getCenterPos() - displ[ivp][center_idx];
+            bc_signs[ivp] = get_bc_sign(VP.R[ivp], PrimLattice, HalfG);
+        }
+        myCenter.evaluateValues(displ, center_idx, dist_r, multi_myV);
+        return smooth_function(
+            myCenter.getCutoffBuffer(), myCenter.getCutoff(), dist_r);
+    }
+
+    // evaluate only VGL
+    template <typename VV, typename GV>
+    inline RealType
+    evaluate_vgl(
+        const ParticleSetT<VT>& P, const int iat, VV& myV, GV& myG, VV& myL)
+    {
+        const auto& ei_dist = P.getDistTableAB(myTableID);
+        const int center_idx = ei_dist.get_first_neighbor(
+            iat, dist_r, dist_dr, P.getActivePtcl() == iat);
+        if (center_idx < 0)
+            abort();
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        if (dist_r < myCenter.getCutoff()) {
+            PointType dr(-dist_dr[0], -dist_dr[1], -dist_dr[2]);
+            r_image = myCenter.getCenterPos() + dr;
+            myCenter.evaluate_vgl(dist_r, dr, myV, myG, myL);
+            return smooth_function(
+                myCenter.getCutoffBuffer(), myCenter.getCutoff(), dist_r);
+        }
+        return RealType(-1);
+    }
+
+    // evaluate only VGH
+    template <typename VV, typename GV, typename HT>
+    inline RealType
+    evaluate_vgh(
+        const ParticleSetT<VT>& P, const int iat, VV& myV, GV& myG, HT& myH)
+    {
+        const auto& ei_dist = P.getDistTableAB(myTableID);
+        const int center_idx = ei_dist.get_first_neighbor(
+            iat, dist_r, dist_dr, P.getActivePtcl() == iat);
+        if (center_idx < 0)
+            abort();
+        auto& myCenter = AtomicCenters[Super2Prim[center_idx]];
+        if (dist_r < myCenter.getCutoff()) {
+            PointType dr(-dist_dr[0], -dist_dr[1], -dist_dr[2]);
+            r_image = myCenter.getCenterPos() + dr;
+            myCenter.evaluate_vgh(dist_r, dr, myV, myG, myH);
+            return smooth_function(
+                myCenter.getCutoffBuffer(), myCenter.getCutoff(), dist_r);
+        }
+        return RealType(-1);
+    }
+
+    // interpolate buffer region, value only
+    template <typename VV>
+    inline void
+    interpolate_buffer_v(VV& psi, const VV& psi_AO) const
+    {
+        const RealType cone(1);
+        for (size_t i = 0; i < psi.size(); i++)
+            psi[i] = psi_AO[i] * f + psi[i] * (cone - f);
+    }
+
+    // interpolate buffer region, value, gradients and laplacian
+    template <typename VV, typename GV>
+    inline void
+    interpolate_buffer_vgl(VV& psi, GV& dpsi, VV& d2psi, const VV& psi_AO,
+        const GV& dpsi_AO, const VV& d2psi_AO) const
+    {
+        const RealType cone(1), ctwo(2);
+        const RealType rinv(1.0 / dist_r);
+        if (smooth_scheme == smoothing_schemes::CONSISTENT)
+            for (size_t i = 0; i < psi.size();
+                 i++) { // psi, dpsi, d2psi are all consistent
+                d2psi[i] = d2psi_AO[i] * f + d2psi[i] * (cone - f) +
+                    df_dr * rinv * ctwo * dot(dpsi[i] - dpsi_AO[i], dist_dr) +
+                    (psi_AO[i] - psi[i]) * (d2f_dr2 + ctwo * rinv * df_dr);
+                dpsi[i] = dpsi_AO[i] * f + dpsi[i] * (cone - f) +
+                    df_dr * rinv * dist_dr * (psi[i] - psi_AO[i]);
+                psi[i] = psi_AO[i] * f + psi[i] * (cone - f);
+            }
+        else if (smooth_scheme == smoothing_schemes::SMOOTHALL)
+            for (size_t i = 0; i < psi.size(); i++) {
+                d2psi[i] = d2psi_AO[i] * f + d2psi[i] * (cone - f);
+                dpsi[i] = dpsi_AO[i] * f + dpsi[i] * (cone - f);
+                psi[i] = psi_AO[i] * f + psi[i] * (cone - f);
+            }
+        else if (smooth_scheme == smoothing_schemes::SMOOTHPARTIAL)
+            for (size_t i = 0; i < psi.size();
+                 i++) { // dpsi, d2psi are consistent but psi is not.
+                d2psi[i] = d2psi_AO[i] * f + d2psi[i] * (cone - f) +
+                    df_dr * rinv * ctwo * dot(dpsi[i] - dpsi_AO[i], dist_dr);
+                dpsi[i] = dpsi_AO[i] * f + dpsi[i] * (cone - f);
+                psi[i] = psi_AO[i] * f + psi[i] * (cone - f);
+            }
+        else
+            throw std::runtime_error("Unknown smooth scheme!");
+    }
+
+    inline RealType
+    smooth_function(const ST& cutoff_buffer, const ST& cutoff, const RealType r)
+    {
+        const RealType cone(1);
+        if (r < cutoff_buffer)
+            return cone;
+        const RealType scale = cone / (cutoff - cutoff_buffer);
+        const RealType x = (r - cutoff_buffer) * scale;
+        f = smoothing(smooth_func_id, x, df_dr, d2f_dr2);
+        df_dr *= scale;
+        d2f_dr2 *= scale * scale;
+        return f;
+    }
+
+    template <class BSPLINESPO>
+    friend class HybridRepSetReaderT;
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepCplxT.h b/src/QMCWaveFunctions/BsplineFactory/HybridRepCplxT.h
new file mode 100644
index 0000000000..6f3dd504a9
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepCplxT.h
@@ -0,0 +1,292 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_HYBRIDREP_CPLXT_H
+#define QMCPLUSPLUS_HYBRIDREP_CPLXT_H
+
+#include "QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h"
+namespace qmcplusplus
+{
+/** hybrid representation orbitals combining B-spline orbitals on a grid and
+ * atomic centered orbitals.
+ * @tparam SPLINEBASE B-spline orbital class.
+ *
+ * Only works with SPLINEBASE class containing complex splines
+ */
+template <typename SPLINEBASE>
+class HybridRepCplxT :
+    public SPLINEBASE,
+    private HybridRepCenterOrbitalsT<typename SPLINEBASE::DataType,
+        typename SPLINEBASE::ValueType>
+{
+public:
+    using HYBRIDBASE = HybridRepCenterOrbitalsT<typename SPLINEBASE::DataType,
+        typename SPLINEBASE::ValueType>;
+    using ST = typename SPLINEBASE::DataType;
+    using PointType = typename SPLINEBASE::PointType;
+    using SingleSplineType = typename SPLINEBASE::SingleSplineType;
+    using RealType = typename SPLINEBASE::RealType;
+    // types for evaluation results
+    using typename SPLINEBASE::GGGVector;
+    using typename SPLINEBASE::GradMatrix;
+    using typename SPLINEBASE::GradType;
+    using typename SPLINEBASE::GradVector;
+    using typename SPLINEBASE::HessVector;
+    using typename SPLINEBASE::OffloadMWVGLArray;
+    using typename SPLINEBASE::ValueMatrix;
+    using typename SPLINEBASE::ValueType;
+    using typename SPLINEBASE::ValueVector;
+
+private:
+    ValueVector psi_AO, d2psi_AO;
+    GradVector dpsi_AO;
+    Matrix<ST, aligned_allocator<ST>> multi_myV;
+
+    using SPLINEBASE::HalfG;
+    using SPLINEBASE::myG;
+    using SPLINEBASE::myH;
+    using SPLINEBASE::myL;
+    using SPLINEBASE::myV;
+
+public:
+    HybridRepCplxT(const std::string& my_name) : SPLINEBASE(my_name)
+    {
+    }
+
+    std::string
+    getClassName() const final
+    {
+        return "Hybrid" + SPLINEBASE::getClassName();
+    }
+    std::string
+    getKeyword() const final
+    {
+        return "Hybrid" + SPLINEBASE::getKeyword();
+    }
+    bool
+    isOMPoffload() const final
+    {
+        return false;
+    }
+
+    std::unique_ptr<SPOSetT<ValueType>>
+    makeClone() const override
+    {
+        return std::make_unique<HybridRepCplxT>(*this);
+    }
+
+    inline void
+    resizeStorage(size_t n, size_t nvals)
+    {
+        SPLINEBASE::resizeStorage(n, nvals);
+        HYBRIDBASE::resizeStorage(myV.size());
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    {
+        SPLINEBASE::bcast_tables(comm);
+        HYBRIDBASE::bcast_tables(comm);
+    }
+
+    void
+    gather_tables(Communicate* comm)
+    {
+        SPLINEBASE::gather_tables(comm);
+        HYBRIDBASE::gather_atomic_tables(comm, SPLINEBASE::offset);
+    }
+
+    bool
+    read_splines(hdf_archive& h5f)
+    {
+        return HYBRIDBASE::read_splines(h5f) && SPLINEBASE::read_splines(h5f);
+    }
+
+    bool
+    write_splines(hdf_archive& h5f)
+    {
+        return HYBRIDBASE::write_splines(h5f) && SPLINEBASE::write_splines(h5f);
+    }
+
+    inline void
+    flush_zero()
+    {
+        // SPLINEBASE::flush_zero();
+        HYBRIDBASE::flush_zero();
+    }
+
+    void
+    evaluateValue(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi) override
+    {
+        const RealType smooth_factor = HYBRIDBASE::evaluate_v(P, iat, myV);
+        const RealType cone(1);
+        if (smooth_factor < 0) {
+            SPLINEBASE::evaluateValue(P, iat, psi);
+        }
+        else if (smooth_factor == cone) {
+            const PointType& r = P.activeR(iat);
+            SPLINEBASE::assign_v(r, myV, psi, 0, myV.size() / 2);
+        }
+        else {
+            const PointType& r = P.activeR(iat);
+            psi_AO.resize(psi.size());
+            SPLINEBASE::assign_v(r, myV, psi_AO, 0, myV.size() / 2);
+            SPLINEBASE::evaluateValue(P, iat, psi);
+            HYBRIDBASE::interpolate_buffer_v(psi, psi_AO);
+        }
+    }
+
+    void
+    evaluateDetRatios(const VirtualParticleSetT<ValueType>& VP,
+        ValueVector& psi, const ValueVector& psiinv,
+        std::vector<ValueType>& ratios) override
+    {
+        if (VP.isOnSphere()) {
+            // resize scratch space
+            psi_AO.resize(psi.size());
+            if (multi_myV.rows() < VP.getTotalNum())
+                multi_myV.resize(VP.getTotalNum(), myV.size());
+            const RealType smooth_factor =
+                HYBRIDBASE::evaluateValuesC2X(VP, multi_myV);
+            const RealType cone(1);
+            for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+                if (smooth_factor < 0)
+                    SPLINEBASE::evaluateValue(VP, iat, psi);
+                else if (smooth_factor == cone) {
+                    const PointType& r = VP.R[iat];
+                    Vector<ST, aligned_allocator<ST>> myV_one(
+                        multi_myV[iat], myV.size());
+                    SPLINEBASE::assign_v(r, myV_one, psi, 0, myV.size() / 2);
+                }
+                else {
+                    const PointType& r = VP.R[iat];
+                    Vector<ST, aligned_allocator<ST>> myV_one(
+                        multi_myV[iat], myV.size());
+                    SPLINEBASE::assign_v(r, myV_one, psi_AO, 0, myV.size() / 2);
+                    SPLINEBASE::evaluateValue(VP, iat, psi);
+                    HYBRIDBASE::interpolate_buffer_v(psi, psi_AO);
+                }
+                ratios[iat] = simd::dot(psi.data(), psiinv.data(), psi.size());
+            }
+        }
+        else {
+            for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+                evaluateValue(VP, iat, psi);
+                ratios[iat] = simd::dot(psi.data(), psiinv.data(), psi.size());
+            }
+        }
+    }
+
+    void
+    mw_evaluateDetRatios(
+        const RefVectorWithLeader<SPOSetT<ValueType>>& spo_list,
+        const RefVectorWithLeader<const VirtualParticleSetT<ValueType>>&
+            vp_list,
+        const RefVector<ValueVector>& psi_list,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        std::vector<std::vector<ValueType>>& ratios_list) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateDetRatios(
+            spo_list, vp_list, psi_list, invRow_ptr_list, ratios_list);
+    }
+
+    void
+    evaluateVGL(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, ValueVector& d2psi) override
+    {
+        const RealType smooth_factor =
+            HYBRIDBASE::evaluate_vgl(P, iat, myV, myG, myL);
+        const RealType cone(1);
+        if (smooth_factor < 0) {
+            SPLINEBASE::evaluateVGL(P, iat, psi, dpsi, d2psi);
+        }
+        else if (smooth_factor == cone) {
+            const PointType& r = P.activeR(iat);
+            SPLINEBASE::assign_vgl_from_l(r, psi, dpsi, d2psi);
+        }
+        else {
+            const PointType& r = P.activeR(iat);
+            psi_AO.resize(psi.size());
+            dpsi_AO.resize(psi.size());
+            d2psi_AO.resize(psi.size());
+            SPLINEBASE::assign_vgl_from_l(r, psi_AO, dpsi_AO, d2psi_AO);
+            SPLINEBASE::evaluateVGL(P, iat, psi, dpsi, d2psi);
+            HYBRIDBASE::interpolate_buffer_vgl(
+                psi, dpsi, d2psi, psi_AO, dpsi_AO, d2psi_AO);
+        }
+    }
+
+    void
+    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<ValueType>>& sa_list,
+        const RefVectorWithLeader<ParticleSetT<ValueType>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateVGL(
+            sa_list, P_list, iat, psi_v_list, dpsi_v_list, d2psi_v_list);
+    }
+
+    void
+    mw_evaluateVGLandDetRatioGrads(
+        const RefVectorWithLeader<SPOSetT<ValueType>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<ValueType>>& P_list, int iat,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+        std::vector<GradType>& grads) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateVGLandDetRatioGrads(
+            spo_list, P_list, iat, invRow_ptr_list, phi_vgl_v, ratios, grads);
+    }
+
+    void
+    evaluateVGH(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi) override
+    {
+        APP_ABORT("HybridRepCplx::evaluate_vgh not implemented!");
+        if (HYBRIDBASE::evaluate_vgh(P, iat, myV, myG, myH)) {
+            const PointType& r = P.activeR(iat);
+            SPLINEBASE::assign_vgh(
+                r, psi, dpsi, grad_grad_psi, 0, myV.size() / 2);
+        }
+        else
+            SPLINEBASE::evaluateVGH(P, iat, psi, dpsi, grad_grad_psi);
+    }
+
+    void
+    evaluateVGHGH(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
+        GGGVector& grad_grad_grad_psi) override
+    {
+        APP_ABORT("HybridRepCplx::evaluate_vghgh not implemented!");
+    }
+
+    void
+    evaluate_notranspose(const ParticleSetT<ValueType>& P, int first, int last,
+        ValueMatrix& logdet, GradMatrix& dlogdet, ValueMatrix& d2logdet) final
+    {
+        // bypass SPLINEBASE::evaluate_notranspose
+        BsplineSetT<ValueType>::evaluate_notranspose(
+            P, first, last, logdet, dlogdet, d2logdet);
+    }
+
+    template <class BSPLINESPO>
+    friend class HybridRepSetReaderT;
+    template <class BSPLINESPO>
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepRealT.h b/src/QMCWaveFunctions/BsplineFactory/HybridRepRealT.h
new file mode 100644
index 0000000000..eea06ea1d1
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepRealT.h
@@ -0,0 +1,303 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+//////////////////////////////////////////////////////////////////////////////////////
+
+/** @file HybridRepRealT.h
+ *
+ * Declares HybridRepRealT, the hybrid representation for real splines.
+ */
+#ifndef QMCPLUSPLUS_HYBRIDREP_REALT_H
+#define QMCPLUSPLUS_HYBRIDREP_REALT_H
+
+#include "QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h"
+namespace qmcplusplus
+{
+/** hybrid representation orbitals combining B-spline orbitals on a grid and
+ * atomic centered orbitals.
+ * @tparam SPLINEBASE B-spline orbital class.
+ *
+ * Only works with SPLINEBASE class containing real splines
+ */
+template <typename SPLINEBASE>
+class HybridRepRealT :
+    public SPLINEBASE,
+    private HybridRepCenterOrbitalsT<typename SPLINEBASE::DataType,
+        typename SPLINEBASE::ValueType>
+{
+public:
+    using HYBRIDBASE = HybridRepCenterOrbitalsT<typename SPLINEBASE::DataType,
+        typename SPLINEBASE::ValueType>;
+    using ST = typename SPLINEBASE::DataType;
+    using PointType = typename SPLINEBASE::PointType;
+    using SingleSplineType = typename SPLINEBASE::SingleSplineType;
+    using RealType = typename SPLINEBASE::RealType;
+    // types for evaluation results
+    using typename SPLINEBASE::GGGVector;
+    using typename SPLINEBASE::GradMatrix;
+    using typename SPLINEBASE::GradType;
+    using typename SPLINEBASE::GradVector;
+    using typename SPLINEBASE::HessVector;
+    using typename SPLINEBASE::OffloadMWVGLArray;
+    using typename SPLINEBASE::ValueMatrix;
+    using typename SPLINEBASE::ValueType;
+    using typename SPLINEBASE::ValueVector;
+
+private:
+    ValueVector psi_AO, d2psi_AO;
+    GradVector dpsi_AO;
+    Matrix<ST, aligned_allocator<ST>> multi_myV;
+
+    using SPLINEBASE::HalfG;
+    using SPLINEBASE::myG;
+    using SPLINEBASE::myH;
+    using SPLINEBASE::myL;
+    using SPLINEBASE::myV;
+    using SPLINEBASE::PrimLattice;
+
+public:
+    HybridRepRealT(const std::string& my_name) : SPLINEBASE(my_name)
+    {
+    }
+
+    std::string
+    getClassName() const final
+    {
+        return "Hybrid" + SPLINEBASE::getClassName();
+    }
+    std::string
+    getKeyword() const final
+    {
+        return "Hybrid" + SPLINEBASE::getKeyword();
+    }
+    bool
+    isOMPoffload() const final
+    {
+        return false;
+    }
+
+    std::unique_ptr<SPOSetT<ValueType>>
+    makeClone() const override
+    {
+        return std::make_unique<HybridRepRealT>(*this);
+    }
+
+    inline void
+    resizeStorage(size_t n, size_t nvals)
+    {
+        SPLINEBASE::resizeStorage(n, nvals);
+        HYBRIDBASE::resizeStorage(myV.size());
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    {
+        SPLINEBASE::bcast_tables(comm);
+        HYBRIDBASE::bcast_tables(comm);
+    }
+
+    void
+    gather_tables(Communicate* comm)
+    {
+        SPLINEBASE::gather_tables(comm);
+        HYBRIDBASE::gather_atomic_tables(comm, SPLINEBASE::offset);
+    }
+
+    inline void
+    flush_zero()
+    {
+        // SPLINEBASE::flush_zero();
+        HYBRIDBASE::flush_zero();
+    }
+
+    bool
+    read_splines(hdf_archive& h5f)
+    {
+        return HYBRIDBASE::read_splines(h5f) && SPLINEBASE::read_splines(h5f);
+    }
+
+    bool
+    write_splines(hdf_archive& h5f)
+    {
+        return HYBRIDBASE::write_splines(h5f) && SPLINEBASE::write_splines(h5f);
+    }
+
+    void
+    evaluateValue(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi) override
+    {
+        const RealType smooth_factor = HYBRIDBASE::evaluate_v(P, iat, myV);
+        const RealType cone(1);
+        if (smooth_factor < 0) {
+            SPLINEBASE::evaluateValue(P, iat, psi);
+        }
+        else if (smooth_factor == cone) {
+            const PointType& r = P.activeR(iat);
+            int bc_sign = HYBRIDBASE::get_bc_sign(r, PrimLattice, HalfG);
+            SPLINEBASE::assign_v(bc_sign, myV, psi, 0, myV.size());
+        }
+        else {
+            const PointType& r = P.activeR(iat);
+            psi_AO.resize(psi.size());
+            int bc_sign = HYBRIDBASE::get_bc_sign(r, PrimLattice, HalfG);
+            SPLINEBASE::assign_v(bc_sign, myV, psi_AO, 0, myV.size());
+            SPLINEBASE::evaluateValue(P, iat, psi);
+            HYBRIDBASE::interpolate_buffer_v(psi, psi_AO);
+        }
+    }
+
+    void
+    evaluateDetRatios(const VirtualParticleSetT<ValueType>& VP,
+        ValueVector& psi, const ValueVector& psiinv,
+        std::vector<ValueType>& ratios) override
+    {
+        if (VP.isOnSphere() && HYBRIDBASE::is_batched_safe(VP)) {
+            // resize scratch space
+            psi_AO.resize(psi.size());
+            if (multi_myV.rows() < VP.getTotalNum())
+                multi_myV.resize(VP.getTotalNum(), myV.size());
+            std::vector<int> bc_signs(VP.getTotalNum());
+            const RealType smooth_factor = HYBRIDBASE::evaluateValuesR2R(
+                VP, PrimLattice, HalfG, multi_myV, bc_signs);
+            const RealType cone(1);
+            for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+                if (smooth_factor < 0)
+                    SPLINEBASE::evaluateValue(VP, iat, psi);
+                else if (smooth_factor == cone) {
+                    Vector<ST, aligned_allocator<ST>> myV_one(
+                        multi_myV[iat], myV.size());
+                    SPLINEBASE::assign_v(
+                        bc_signs[iat], myV_one, psi, 0, myV.size());
+                }
+                else {
+                    Vector<ST, aligned_allocator<ST>> myV_one(
+                        multi_myV[iat], myV.size());
+                    SPLINEBASE::assign_v(
+                        bc_signs[iat], myV_one, psi_AO, 0, myV.size());
+                    SPLINEBASE::evaluateValue(VP, iat, psi);
+                    HYBRIDBASE::interpolate_buffer_v(psi, psi_AO);
+                }
+                ratios[iat] = simd::dot(psi.data(), psiinv.data(), psi.size());
+            }
+        }
+        else {
+            for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+                evaluateValue(VP, iat, psi);
+                ratios[iat] = simd::dot(psi.data(), psiinv.data(), psi.size());
+            }
+        }
+    }
+
+    void
+    mw_evaluateDetRatios(
+        const RefVectorWithLeader<SPOSetT<ValueType>>& spo_list,
+        const RefVectorWithLeader<const VirtualParticleSetT<ValueType>>&
+            vp_list,
+        const RefVector<ValueVector>& psi_list,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        std::vector<std::vector<ValueType>>& ratios_list) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateDetRatios(
+            spo_list, vp_list, psi_list, invRow_ptr_list, ratios_list);
+    }
+
+    void
+    evaluateVGL(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, ValueVector& d2psi) override
+    {
+        const RealType smooth_factor =
+            HYBRIDBASE::evaluate_vgl(P, iat, myV, myG, myL);
+        const RealType cone(1);
+        if (smooth_factor < 0) {
+            SPLINEBASE::evaluateVGL(P, iat, psi, dpsi, d2psi);
+        }
+        else if (smooth_factor == cone) {
+            const PointType& r = P.activeR(iat);
+            int bc_sign = HYBRIDBASE::get_bc_sign(r, PrimLattice, HalfG);
+            SPLINEBASE::assign_vgl_from_l(bc_sign, psi, dpsi, d2psi);
+        }
+        else {
+            const PointType& r = P.activeR(iat);
+            psi_AO.resize(psi.size());
+            dpsi_AO.resize(psi.size());
+            d2psi_AO.resize(psi.size());
+            int bc_sign = HYBRIDBASE::get_bc_sign(r, PrimLattice, HalfG);
+            SPLINEBASE::assign_vgl_from_l(bc_sign, psi_AO, dpsi_AO, d2psi_AO);
+            SPLINEBASE::evaluateVGL(P, iat, psi, dpsi, d2psi);
+            HYBRIDBASE::interpolate_buffer_vgl(
+                psi, dpsi, d2psi, psi_AO, dpsi_AO, d2psi_AO);
+        }
+    }
+
+    void
+    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<ValueType>>& sa_list,
+        const RefVectorWithLeader<ParticleSetT<ValueType>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateVGL(
+            sa_list, P_list, iat, psi_v_list, dpsi_v_list, d2psi_v_list);
+    }
+
+    void
+    mw_evaluateVGLandDetRatioGrads(
+        const RefVectorWithLeader<SPOSetT<ValueType>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<ValueType>>& P_list, int iat,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+        std::vector<GradType>& grads) const final
+    {
+        BsplineSetT<ValueType>::mw_evaluateVGLandDetRatioGrads(
+            spo_list, P_list, iat, invRow_ptr_list, phi_vgl_v, ratios, grads);
+    }
+
+    void
+    evaluateVGH(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi) override
+    {
+        APP_ABORT("HybridRepReal::evaluateVGH not implemented!");
+        if (HYBRIDBASE::evaluate_vgh(P, iat, myV, myG, myH)) {
+            const PointType& r = P.activeR(iat);
+            int bc_sign = HYBRIDBASE::get_bc_sign(r, PrimLattice, HalfG);
+            SPLINEBASE::assign_vgh(
+                bc_sign, psi, dpsi, grad_grad_psi, 0, myV.size());
+        }
+        else
+            SPLINEBASE::evaluateVGH(P, iat, psi, dpsi, grad_grad_psi);
+    }
+
+    void
+    evaluateVGHGH(const ParticleSetT<ValueType>& P, const int iat,
+        ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
+        GGGVector& grad_grad_grad_psi) override
+    { // not implemented: only reports the error through APP_ABORT
+        APP_ABORT("HybridRepReal::evaluateVGHGH not implemented!");
+    }
+
+    void
+    evaluate_notranspose(const ParticleSetT<ValueType>& P, int first, int last,
+        ValueMatrix& logdet, GradMatrix& dlogdet, ValueMatrix& d2logdet) final
+    {
+        // bypass SPLINEBASE::evaluate_notranspose
+        BsplineSetT<ValueType>::evaluate_notranspose(
+            P, first, last, logdet, dlogdet, d2logdet);
+    }
+
+    template <class BSPLINESPO>
+    friend class HybridRepSetReaderT;
+    template <class BSPLINESPO>
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
+};
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReader.h b/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReader.h
index 1e25e2ae11..a54219c80c 100644
--- a/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReader.h
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReader.h
@@ -21,6 +21,7 @@
 #include "Numerics/Quadrature.h"
 #include "Numerics/Bessel.h"
 #include "QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitals.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineSetReader.h"
 #include "OhmmsData/AttributeSet.h"
 #include "CPU/math.hpp"
 #include "Concurrency/OpenMP.h"
diff --git a/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReaderT.h b/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReaderT.h
new file mode 100644
index 0000000000..affb06638c
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/HybridRepSetReaderT.h
@@ -0,0 +1,492 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_HYBRIDREP_READERT_H
+#define QMCPLUSPLUS_HYBRIDREP_READERT_H
+
+#include "CPU/math.hpp"
+#include "Concurrency/OpenMP.h"
+#include "Numerics/Bessel.h"
+#include "Numerics/Quadrature.h"
+#include "OhmmsData/AttributeSet.h"
+#include "QMCWaveFunctions/BsplineFactory/HybridRepCenterOrbitalsT.h"
+#include "QMCWaveFunctions/BsplineFactory/HybridRepSetReader.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h"
+
+namespace qmcplusplus
+{
+
+/** General HybridRepSetReader to handle any unitcell
+ */
+template <typename SA>
+class HybridRepSetReaderT : public SplineSetReaderT<SA>
+{
+public:
+    using BaseReader = SplineSetReaderT<SA>;
+
+    using BaseReader::bspline;
+    using BaseReader::mybuilder;
+    using BaseReader::rotate_phase_i;
+    using BaseReader::rotate_phase_r;
+    using typename BaseReader::DataType;
+    using typename BaseReader::ValueType;
+
+    HybridRepSetReaderT(EinsplineSetBuilderT<ValueType>* e) : BaseReader(e)
+    {
+    }
+
+    /** read smoothing options, validate and build the atomic centers */
+    void
+    initialize_hybridrep_atomic_centers() override
+    {
+        OhmmsAttributeSet a;
+        std::string scheme_name("Consistent");
+        std::string s_function_name("LEKS2018");
+        a.add(scheme_name, "smoothing_scheme");
+        a.add(s_function_name, "smoothing_function");
+        a.put(mybuilder->XMLRoot);
+        // assign smooth_scheme
+        if (scheme_name == "Consistent")
+            this->bspline->smooth_scheme = SA::smoothing_schemes::CONSISTENT;
+        else if (scheme_name == "SmoothAll")
+            bspline->smooth_scheme = SA::smoothing_schemes::SMOOTHALL;
+        else if (scheme_name == "SmoothPartial")
+            bspline->smooth_scheme = SA::smoothing_schemes::SMOOTHPARTIAL;
+        else
+            APP_ABORT(
+                "initialize_hybridrep_atomic_centers wrong smoothing_scheme "
+                "name! Only allows Consistent, SmoothAll or "
+                "SmoothPartial.");
+
+        // assign smooth_function
+        if (s_function_name == "LEKS2018")
+            bspline->smooth_func_id = smoothing_functions::LEKS2018;
+        else if (s_function_name == "coscos")
+            bspline->smooth_func_id = smoothing_functions::COSCOS;
+        else if (s_function_name == "linear")
+            bspline->smooth_func_id = smoothing_functions::LINEAR;
+        else
+            APP_ABORT(
+                "initialize_hybridrep_atomic_centers wrong smoothing_function "
+                "name! Only allows LEKS2018, coscos or linear.");
+        app_log() << "Hybrid orbital representation uses " << scheme_name
+                  << " smoothing scheme and " << s_function_name
+                  << " smoothing function." << std::endl;
+
+        bspline->set_info(*(mybuilder->SourcePtcl), mybuilder->TargetPtcl,
+            mybuilder->Super2Prim);
+        auto& centers = bspline->AtomicCenters;
+        auto& ACInfo = mybuilder->AtomicCentersInfo;
+        // load atomic center info only when it is not initialized
+        if (centers.size() == 0) {
+            bool success = true;
+            app_log() << "Reading atomic center info for hybrid representation"
+                      << std::endl;
+            for (int center_idx = 0; center_idx < ACInfo.Ncenters;
+                 center_idx++) {
+                const int my_GroupID = ACInfo.GroupID[center_idx];
+                if (ACInfo.cutoff[center_idx] < 0) {
+                    app_error() << "Hybrid orbital representation needs "
+                                   "parameter 'cutoff_radius' for atom "
+                                << center_idx << std::endl;
+                    success = false;
+                }
+
+                if (ACInfo.inner_cutoff[center_idx] < 0) {
+                    const double inner_cutoff =
+                        std::max(ACInfo.cutoff[center_idx] - 0.3, 0.0);
+                    app_log() << "Hybrid orbital representation setting "
+                                 "'inner_cutoff' to "
+                              << inner_cutoff << " for group " << my_GroupID
+                              << " as atom " << center_idx << std::endl;
+                    // overwrite the inner_cutoff of all the atoms of the same
+                    // species
+                    for (int id = 0; id < ACInfo.Ncenters; id++)
+                        if (my_GroupID == ACInfo.GroupID[id])
+                            ACInfo.inner_cutoff[id] = inner_cutoff;
+                }
+                else if (ACInfo.inner_cutoff[center_idx] >
+                    ACInfo.cutoff[center_idx]) {
+                    app_error()
+                        << "Hybrid orbital representation 'inner_cutoff' must "
+                           "not exceed 'cutoff_radius' for atom "
+                        << center_idx << std::endl;
+                    success = false;
+                }
+
+                if (ACInfo.cutoff[center_idx] > 0) {
+                    if (ACInfo.lmax[center_idx] < 0) {
+                        app_error() << "Hybrid orbital representation needs "
+                                       "parameter 'lmax' for atom "
+                                    << center_idx << std::endl;
+                        success = false;
+                    }
+
+                    if (ACInfo.spline_radius[center_idx] < 0 &&
+                        ACInfo.spline_npoints[center_idx] < 0) {
+                        app_log() << "Parameters 'spline_radius' and "
+                                     "'spline_npoints' for group "
+                                  << my_GroupID << " as atom " << center_idx
+                                  << " are not specified." << std::endl;
+                        const double delta =
+                            std::min(0.02, ACInfo.cutoff[center_idx] / 4.0);
+                        const int n_grid_point =
+                            std::ceil(
+                                (ACInfo.cutoff[center_idx] + 1e-4) / delta) +
+                            3;
+                        for (int id = 0; id < ACInfo.Ncenters; id++)
+                            if (my_GroupID == ACInfo.GroupID[id]) {
+                                ACInfo.spline_npoints[id] = n_grid_point;
+                                ACInfo.spline_radius[id] =
+                                    (n_grid_point - 1) * delta;
+                            }
+                        app_log() << "  Based on default grid point distance "
+                                  << delta << std::endl;
+                        app_log()
+                            << "  Setting 'spline_npoints' to "
+                            << ACInfo.spline_npoints[center_idx] << std::endl;
+                        app_log()
+                            << "  Setting 'spline_radius' to "
+                            << ACInfo.spline_radius[center_idx] << std::endl;
+                    }
+                    else {
+                        if (ACInfo.spline_radius[center_idx] < 0) {
+                            app_error()
+                                << "Hybrid orbital representation needs "
+                                   "parameter 'spline_radius' for atom "
+                                << center_idx << std::endl;
+                            success = false;
+                        }
+
+                        if (ACInfo.spline_npoints[center_idx] < 0) {
+                            app_error()
+                                << "Hybrid orbital representation needs "
+                                   "parameter 'spline_npoints' for atom "
+                                << center_idx << std::endl;
+                            success = false;
+                        }
+                    }
+
+                    // check maximally allowed cutoff_radius
+                    double max_allowed_cutoff =
+                        ACInfo.spline_radius[center_idx] -
+                        2.0 * ACInfo.spline_radius[center_idx] /
+                            (ACInfo.spline_npoints[center_idx] - 1);
+                    if (success &&
+                        ACInfo.cutoff[center_idx] > max_allowed_cutoff) {
+                        app_error() << "Hybrid orbital representation requires "
+                                       "cutoff_radius<="
+                                    << max_allowed_cutoff
+                                    << " calculated by "
+                                       "spline_radius-2*spline_radius/"
+                                       "(spline_npoints-1) for atom "
+                                    << center_idx << std::endl;
+                        success = false;
+                    }
+                }
+                else {
+                    // no atomic regions for this atom type
+                    ACInfo.spline_radius[center_idx] = 0.0;
+                    ACInfo.spline_npoints[center_idx] = 0;
+                    ACInfo.lmax[center_idx] = 0;
+                }
+            }
+            if (!success)
+                BaseReader::myComm->barrier_and_abort(
+                    "initialize_hybridrep_atomic_centers Failed to initialize "
+                    "atomic centers "
+                    "in hybrid orbital representation!");
+
+            for (int center_idx = 0; center_idx < ACInfo.Ncenters;
+                 center_idx++) {
+                AtomicOrbitalsT<DataType> oneCenter(ACInfo.lmax[center_idx]);
+                oneCenter.set_info(ACInfo.ion_pos[center_idx],
+                    ACInfo.cutoff[center_idx], ACInfo.inner_cutoff[center_idx],
+                    ACInfo.spline_radius[center_idx],
+                    ACInfo.non_overlapping_radius[center_idx],
+                    ACInfo.spline_npoints[center_idx]);
+                centers.push_back(oneCenter);
+            }
+        }
+    }
+
+    /** construct the radial splines of band iorb at every atomic center from its plane-wave coefficients cG */
+    inline void
+    create_atomic_centers_Gspace(Vector<std::complex<double>>& cG,
+        Communicate& band_group_comm, int iorb) override
+    {
+        band_group_comm.bcast(rotate_phase_r);
+        band_group_comm.bcast(rotate_phase_i);
+        band_group_comm.bcast(cG);
+        // split the G-vectors into min(Ngvecs, Nprocs) groups of ranks
+        const int Ngvecs = mybuilder->Gvecs[0].size();
+        const int Nprocs = band_group_comm.size();
+        const int Ngvecgroups = std::min(Ngvecs, Nprocs);
+        Communicate gvec_group_comm(band_group_comm, Ngvecgroups);
+        std::vector<int> gvec_groups(Ngvecgroups + 1, 0);
+        FairDivideLow(Ngvecs, Ngvecgroups, gvec_groups);
+        const int gvec_first = gvec_groups[gvec_group_comm.getGroupID()];
+        const int gvec_last = gvec_groups[gvec_group_comm.getGroupID() + 1];
+
+        // wrap the G-vector range [gvec_first, gvec_last) of this group
+        using UnitCellType =
+            typename EinsplineSetBuilderT<ValueType>::UnitCellType;
+        Gvectors<double, UnitCellType> Gvecs(mybuilder->Gvecs[0],
+            mybuilder->PrimCell, bspline->HalfG, gvec_first, gvec_last);
+        // if(band_group_comm.isGroupLeader()) std::cout << "print band=" <<
+        // iorb << " KE=" << Gvecs.evaluate_KE(cG) << std::endl;
+
+        std::vector<AtomicOrbitalsT<DataType>>& centers = bspline->AtomicCenters;
+        app_log() << "Transforming band " << iorb << " on Rank 0" << std::endl;
+        // collect the distinct GroupID values of the atomic centers
+        std::vector<int> uniq_species;
+        for (int center_idx = 0; center_idx < centers.size(); center_idx++) {
+            auto& ACInfo = mybuilder->AtomicCentersInfo;
+            const int my_GroupID = ACInfo.GroupID[center_idx];
+            int found_idx = -1;
+            for (size_t idx = 0; idx < uniq_species.size(); idx++)
+                if (my_GroupID == uniq_species[idx]) {
+                    found_idx = idx;
+                    break;
+                }
+            if (found_idx < 0)
+                uniq_species.push_back(my_GroupID);
+        }
+        // group_list[i] holds the center indices whose GroupID is uniq_species[i]
+        std::vector<std::vector<int>> group_list(uniq_species.size());
+        for (int center_idx = 0; center_idx < centers.size(); center_idx++) {
+            auto& ACInfo = mybuilder->AtomicCentersInfo;
+            const int my_GroupID = ACInfo.GroupID[center_idx];
+            for (size_t idx = 0; idx < uniq_species.size(); idx++)
+                if (my_GroupID == uniq_species[idx]) {
+                    group_list[idx].push_back(center_idx);
+                    break;
+                }
+        }
+
+        for (int group_idx = 0; group_idx < group_list.size(); group_idx++) {
+            const auto& mygroup = group_list[group_idx];
+            const double spline_radius = centers[mygroup[0]].getSplineRadius();
+            const int spline_npoints = centers[mygroup[0]].getSplineNpoints();
+            const int lmax = centers[mygroup[0]].getLmax();
+            const double delta =
+                spline_radius / static_cast<double>(spline_npoints - 1);
+            const int lm_tot = (lmax + 1) * (lmax + 1);
+            const size_t natoms = mygroup.size();
+            const int policy = lm_tot > natoms ? 0 : 1; // 1: vals_local rows run over atoms; 0: rows run over lm
+
+            std::vector<std::complex<double>> i_power(lm_tot);
+            // i_power[lm] = (rotate_phase_r + i*rotate_phase_i) * i^l for each lm of angular momentum l
+            std::complex<double> i_temp(rotate_phase_r, rotate_phase_i);
+            for (size_t l = 0; l <= lmax; l++) {
+                for (size_t lm = l * l; lm < (l + 1) * (l + 1); lm++)
+                    i_power[lm] = i_temp;
+                i_temp *= std::complex<double>(0.0, 1.0);
+            }
+
+            std::vector<Matrix<double>> all_vals(natoms);
+            std::vector<std::vector<aligned_vector<double>>> vals_local(
+                spline_npoints * omp_get_max_threads()); // one entry per (thread, radial point)
+            VectorSoaContainer<double, 3> myRSoA(natoms);
+            for (size_t idx = 0; idx < natoms; idx++) {
+                all_vals[idx].resize(spline_npoints, lm_tot * 2); // per row: lm_tot real parts, then lm_tot imaginary parts
+                all_vals[idx] = 0.0;
+                myRSoA(idx) = centers[mygroup[idx]].getCenterPos();
+            }
+
+#pragma omp parallel
+            {
+                const size_t tid = omp_get_thread_num();
+                const size_t nt = omp_get_num_threads();
+
+                for (int ip = 0; ip < spline_npoints; ip++) {
+                    const size_t ip_idx = tid * spline_npoints + ip;
+                    if (policy == 1) {
+                        vals_local[ip_idx].resize(lm_tot * 2);
+                        for (size_t lm = 0; lm < lm_tot * 2; lm++) {
+                            auto& vals = vals_local[ip_idx][lm];
+                            vals.resize(natoms);
+                            std::fill(vals.begin(), vals.end(), 0.0);
+                        }
+                    }
+                    else {
+                        vals_local[ip_idx].resize(natoms * 2);
+                        for (size_t iat = 0; iat < natoms * 2; iat++) {
+                            auto& vals = vals_local[ip_idx][iat];
+                            vals.resize(lm_tot);
+                            std::fill(vals.begin(), vals.end(), 0.0);
+                        }
+                    }
+                }
+
+                const size_t size_pw_tile = 32;
+                const size_t num_pw_tiles =
+                    (Gvecs.NumGvecs + size_pw_tile - 1) / size_pw_tile;
+                aligned_vector<double> j_lm_G(lm_tot, 0.0);
+                std::vector<aligned_vector<double>> phase_shift_r(size_pw_tile);
+                std::vector<aligned_vector<double>> phase_shift_i(size_pw_tile);
+                std::vector<aligned_vector<double>> YlmG(size_pw_tile);
+                for (size_t ig = 0; ig < size_pw_tile; ig++) {
+                    phase_shift_r[ig].resize(natoms);
+                    phase_shift_i[ig].resize(natoms);
+                    YlmG[ig].resize(lm_tot);
+                }
+                SoaSphericalTensor<double> Ylm(lmax);
+
+#pragma omp for
+                for (size_t tile_id = 0; tile_id < num_pw_tiles; tile_id++) {
+                    const size_t ig_first = tile_id * size_pw_tile;
+                    const size_t ig_last =
+                        std::min((tile_id + 1) * size_pw_tile, Gvecs.NumGvecs);
+                    for (size_t ig = ig_first; ig < ig_last; ig++) {
+                        const size_t ig_local = ig - ig_first;
+                        // calculate phase shift for all the centers of this
+                        // group
+                        Gvecs.calc_phase_shift(myRSoA, ig,
+                            phase_shift_r[ig_local], phase_shift_i[ig_local]);
+                        Gvecs.calc_Ylm_G(ig, Ylm, YlmG[ig_local]);
+                    }
+
+                    for (int ip = 0; ip < spline_npoints; ip++) {
+                        double r = delta * static_cast<double>(ip);
+                        const size_t ip_idx = tid * spline_npoints + ip;
+
+                        for (size_t ig = ig_first; ig < ig_last; ig++) {
+                            const size_t ig_local = ig - ig_first;
+                            // fill j_lm_G for this (r, ig), then scale it by YlmG
+                            Gvecs.calc_jlm_G(lmax, r, ig, j_lm_G);
+                            for (size_t lm = 0; lm < lm_tot; lm++)
+                                j_lm_G[lm] *= YlmG[ig_local][lm];
+
+                            const double cG_r = cG[ig + gvec_first].real();
+                            const double cG_i = cG[ig + gvec_first].imag();
+                            if (policy == 1) {
+                                for (size_t lm = 0; lm < lm_tot; lm++) {
+                                    double* restrict vals_r =
+                                        vals_local[ip_idx][lm * 2].data();
+                                    double* restrict vals_i =
+                                        vals_local[ip_idx][lm * 2 + 1].data();
+                                    const double* restrict ps_r_ptr =
+                                        phase_shift_r[ig_local].data();
+                                    const double* restrict ps_i_ptr =
+                                        phase_shift_i[ig_local].data();
+                                    double cG_j_r = cG_r * j_lm_G[lm];
+                                    double cG_j_i = cG_i * j_lm_G[lm];
+#pragma omp simd aligned(vals_r, vals_i, ps_r_ptr, ps_i_ptr \
+                         : QMC_SIMD_ALIGNMENT)
+                                    for (size_t idx = 0; idx < natoms; idx++) {
+                                        const double ps_r = ps_r_ptr[idx];
+                                        const double ps_i = ps_i_ptr[idx];
+                                        vals_r[idx] +=
+                                            cG_j_r * ps_r - cG_j_i * ps_i;
+                                        vals_i[idx] +=
+                                            cG_j_i * ps_r + cG_j_r * ps_i;
+                                    }
+                                }
+                            }
+                            else {
+                                for (size_t idx = 0; idx < natoms; idx++) {
+                                    double* restrict vals_r =
+                                        vals_local[ip_idx][idx * 2].data();
+                                    double* restrict vals_i =
+                                        vals_local[ip_idx][idx * 2 + 1].data();
+                                    const double* restrict j_lm_G_ptr =
+                                        j_lm_G.data();
+                                    double cG_ps_r =
+                                        cG_r * phase_shift_r[ig_local][idx] -
+                                        cG_i * phase_shift_i[ig_local][idx];
+                                    double cG_ps_i =
+                                        cG_i * phase_shift_r[ig_local][idx] +
+                                        cG_r * phase_shift_i[ig_local][idx];
+#pragma omp simd aligned(vals_r, vals_i, j_lm_G_ptr : QMC_SIMD_ALIGNMENT)
+                                    for (size_t lm = 0; lm < lm_tot; lm++) {
+                                        const double jlm = j_lm_G_ptr[lm];
+                                        vals_r[lm] += cG_ps_r * jlm;
+                                        vals_i[lm] += cG_ps_i * jlm;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
+#pragma omp for collapse(2)
+                for (int ip = 0; ip < spline_npoints; ip++)
+                    for (size_t idx = 0; idx < natoms; idx++) {
+                        double* vals = all_vals[idx][ip];
+                        for (size_t tid = 0; tid < nt; tid++)
+                            for (size_t lm = 0; lm < lm_tot; lm++) {
+                                double vals_th_r, vals_th_i;
+                                const size_t ip_idx = tid * spline_npoints + ip;
+                                if (policy == 1) {
+                                    vals_th_r = vals_local[ip_idx][lm * 2][idx];
+                                    vals_th_i =
+                                        vals_local[ip_idx][lm * 2 + 1][idx];
+                                }
+                                else {
+                                    vals_th_r = vals_local[ip_idx][idx * 2][lm];
+                                    vals_th_i =
+                                        vals_local[ip_idx][idx * 2 + 1][lm];
+                                }
+                                const double real_tmp =
+                                    4.0 * M_PI * i_power[lm].real();
+                                const double imag_tmp =
+                                    4.0 * M_PI * i_power[lm].imag();
+                                vals[lm] +=
+                                    vals_th_r * real_tmp - vals_th_i * imag_tmp;
+                                vals[lm + lm_tot] +=
+                                    vals_th_i * real_tmp + vals_th_r * imag_tmp;
+                            }
+                    }
+            }
+            // app_log() << "Building band " << iorb << " at center " <<
+            // center_idx << std::endl;
+
+            for (size_t idx = 0; idx < natoms; idx++) {
+                // reduce this center's all_vals over band_group_comm; only the group leader builds splines
+                band_group_comm.reduce_in_place(
+                    all_vals[idx].data(), all_vals[idx].size());
+                if (!band_group_comm.isGroupLeader())
+                    continue;
+#pragma omp parallel for
+                for (int lm = 0; lm < lm_tot; lm++) {
+                    auto& mycenter = centers[mygroup[idx]];
+                    aligned_vector<double> splineData_r(spline_npoints);
+                    UBspline_1d_d* atomic_spline_r = nullptr;
+                    for (size_t ip = 0; ip < spline_npoints; ip++)
+                        splineData_r[ip] = all_vals[idx][ip][lm];
+                    atomic_spline_r = einspline::create(atomic_spline_r, 0.0,
+                        spline_radius, spline_npoints, splineData_r.data(),
+                        ((lm == 0) || (lm > 3)));
+                    if (!bspline->isComplex()) {
+                        mycenter.set_spline(atomic_spline_r, lm, iorb);
+                        einspline::destroy(atomic_spline_r);
+                    }
+                    else {
+                        aligned_vector<double> splineData_i(spline_npoints);
+                        UBspline_1d_d* atomic_spline_i = nullptr;
+                        for (size_t ip = 0; ip < spline_npoints; ip++)
+                            splineData_i[ip] = all_vals[idx][ip][lm + lm_tot];
+                        atomic_spline_i = einspline::create(atomic_spline_i,
+                            0.0, spline_radius, spline_npoints,
+                            splineData_i.data(), ((lm == 0) || (lm > 3)));
+                        mycenter.set_spline(atomic_spline_r, lm, iorb * 2);
+                        mycenter.set_spline(atomic_spline_i, lm, iorb * 2 + 1);
+                        einspline::destroy(atomic_spline_r);
+                        einspline::destroy(atomic_spline_i);
+                    }
+                }
+            }
+        }
+    }
+};
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.cpp
index 69cf51d09e..9c02ad06d2 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.cpp
@@ -1,6 +1,6 @@
 //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2020 QMCPACK developers.
 //
@@ -9,1215 +9,1409 @@
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////
 
-
 #include "SplineC2COMPTargetT.h"
-#include "spline2/MultiBsplineEval.hpp"
-#include "spline2/MultiBsplineEval_OMPoffload.hpp"
-#include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
-#include "Platforms/OMPTarget/ompReductionComplex.hpp"
+
 #include "ApplyPhaseC2C.hpp"
 #include "Concurrency/OpenMP.h"
+#include "Platforms/OMPTarget/ompReductionComplex.hpp"
+#include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
+#include "spline2/MultiBsplineEval.hpp"
+#include "spline2/MultiBsplineEval_OMPoffload.hpp"
 
 namespace qmcplusplus
 {
-template<typename ST>
-SplineC2COMPTargetT<ST>::SplineC2COMPTargetT(const SplineC2COMPTargetT& in) = default;
-
-template<typename ST>
-inline void SplineC2COMPTargetT<ST>::set_spline(SingleSplineType* spline_r,
-                                               SingleSplineType* spline_i,
-                                               int twist,
-                                               int ispline,
-                                               int level)
+template <typename ST, typename VT>
+SplineC2COMPTargetT<ST, VT>::SplineC2COMPTargetT(
+    const SplineC2COMPTargetT& in) = default;
+
+template <typename ST, typename VT>
+inline void
+SplineC2COMPTargetT<ST, VT>::set_spline(SingleSplineType* spline_r,
+    SingleSplineType* spline_i, int twist, int ispline, int level)
 {
-  SplineInst->copy_spline(spline_r, 2 * ispline);
-  SplineInst->copy_spline(spline_i, 2 * ispline + 1);
+    SplineInst->copy_spline(spline_r, 2 * ispline); // slot 2*ispline; twist and level are not used here
+    SplineInst->copy_spline(spline_i, 2 * ispline + 1); // slot 2*ispline+1, directly after spline_r
 }
 
-template<typename ST>
-bool SplineC2COMPTargetT<ST>::read_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2COMPTargetT<ST, VT>::read_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex; // entry name is "spline_<MyIndex>"
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.readEntry(bigtable, o.str().c_str()); // e.g. "spline_0"; returns readEntry's result
 }
 
-template<typename ST>
-bool SplineC2COMPTargetT<ST>::write_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2COMPTargetT<ST, VT>::write_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex; // entry name is "spline_<MyIndex>"
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.writeEntry(bigtable, o.str().c_str()); // e.g. "spline_0"; returns writeEntry's result
 }
 
-template<typename ST>
-inline void SplineC2COMPTargetT<ST>::assign_v(const PointType& r,
-                                             const vContainer_type& myV,
-                                             ValueVector& psi,
-                                             int first,
-                                             int last) const
+template <typename ST, typename VT>
+inline void
+SplineC2COMPTargetT<ST, VT>::assign_v(const PointType& r,
+    const vContainer_type& myV, ValueVector& psi, int first, int last) const
 {
-  // protect last
-  last = last > this->kPoints.size() ? this->kPoints.size() : last;
+    // clamp last so it never exceeds the number of k-points
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
 
-  const ST x = r[0], y = r[1], z = r[2];
-  const ST* restrict kx = myKcart->data(0);
-  const ST* restrict ky = myKcart->data(1);
-  const ST* restrict kz = myKcart->data(2);
+    const ST x = r[0], y = r[1], z = r[2];
+    const ST* restrict kx = myKcart->data(0);
+    const ST* restrict ky = myKcart->data(1);
+    const ST* restrict kz = myKcart->data(2);
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    ST s, c;
-    const ST val_r = myV[2 * j];
-    const ST val_i = myV[2 * j + 1];
-    omptarget::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
-    psi[j + this->first_spo] = ComplexT(val_r * c - val_i * s, val_i * c + val_r * s);
-  }
+    for (size_t j = first; j < last; ++j) {
+        ST s, c;
+        const ST val_r = myV[2 * j]; // myV interleaves the two parts: even index here, odd index below
+        const ST val_i = myV[2 * j + 1];
+        omptarget::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
+        psi[j + this->first_spo] =
+            ComplexT(val_r * c - val_i * s, val_i * c + val_r * s); // equals (val_r + i*val_i) * (c + i*s)
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateValue(const ParticleSetT<ComplexT>& P, const int iat, ValueVector& psi)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateValue(
+    const ParticleSetT<VT>& P, const int iat, ValueVector& psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r)); // ru feeds the spline evaluation; r feeds the phase in assign_v
 
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
-    assign_v(r, myV, psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Factor of 2 because psi is complex while the spline storage and
+        // evaluation use a real type; each thread gets its own [first, last)
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
+        assign_v(r, myV, psi, first / 2, last / 2); // halve the real-valued range to index complex orbitals
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateDetRatios(const VirtualParticleSetT<ComplexT>& VP,
-                                               ValueVector& psi,
-                                               const ValueVector& psiinv,
-                                               std::vector<ValueType>& ratios)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateDetRatios(
+    const VirtualParticleSetT<VT>& VP, ValueVector& psi,
+    const ValueVector& psiinv, std::vector<ValueType>& ratios)
 {
-  const int nVP = VP.getTotalNum();
-  psiinv_pos_copy.resize(psiinv.size() + nVP * 3);
+    const int nVP = VP.getTotalNum();
+    psiinv_pos_copy.resize(psiinv.size() + nVP * 3);
+
+    // copy psiinv into the front of psiinv_pos_copy
+    std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
+
+    // pack 6 reals per virtual particle after psiinv: r[0..2], then ru[0..2]
+    auto* restrict pos_scratch =
+        reinterpret_cast<RealType*>(psiinv_pos_copy.data() + psiinv.size());
+    for (int iat = 0; iat < nVP; ++iat) {
+        const PointType& r = VP.activeR(iat);
+        PointType ru(PrimLattice.toUnit_floor(r));
+        pos_scratch[iat * 6] = r[0];
+        pos_scratch[iat * 6 + 1] = r[1];
+        pos_scratch[iat * 6 + 2] = r[2];
+        pos_scratch[iat * 6 + 3] = ru[0];
+        pos_scratch[iat * 6 + 4] = ru[1];
+        pos_scratch[iat * 6 + 5] = ru[2];
+    }
 
-  // stage psiinv to psiinv_pos_copy
-  std::copy_n(psiinv.data(), psiinv.size(), psiinv_pos_copy.data());
+    const size_t ChunkSizePerTeam = 512;
+    const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
+    ratios_private.resize(nVP, NumTeams);
+    const auto padded_size = myV.size();
+    offload_scratch.resize(padded_size * nVP);
+    const auto orb_size = psiinv.size();
+    results_scratch.resize(padded_size * nVP);
+
+    // extract sizes and raw pointers before entering the target region
+    const auto* spline_ptr = SplineInst->getSplinePtr();
+    auto* offload_scratch_ptr = offload_scratch.data();
+    auto* results_scratch_ptr = results_scratch.data();
+    const auto myKcart_padded_size = myKcart->capacity();
+    auto* myKcart_ptr = myKcart->data();
+    auto* psiinv_ptr = psiinv_pos_copy.data();
+    auto* ratios_private_ptr = ratios_private.data();
+    const size_t first_spo_local = this->first_spo;
 
-  // pack particle positions
-  auto* restrict pos_scratch = reinterpret_cast<RealType*>(psiinv_pos_copy.data() + psiinv.size());
-  for (int iat = 0; iat < nVP; ++iat)
-  {
-    const PointType& r = VP.activeR(iat);
-    PointType ru(PrimLattice.toUnit_floor(r));
-    pos_scratch[iat * 6]     = r[0];
-    pos_scratch[iat * 6 + 1] = r[1];
-    pos_scratch[iat * 6 + 2] = r[2];
-    pos_scratch[iat * 6 + 3] = ru[0];
-    pos_scratch[iat * 6 + 4] = ru[1];
-    pos_scratch[iat * 6 + 5] = ru[2];
-  }
-
-  const size_t ChunkSizePerTeam = 512;
-  const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  ratios_private.resize(nVP, NumTeams);
-  const auto padded_size = myV.size();
-  offload_scratch.resize(padded_size * nVP);
-  const auto orb_size = psiinv.size();
-  results_scratch.resize(padded_size * nVP);
-
-  // Ye: need to extract sizes and pointers before entering target region
-  const auto* spline_ptr         = SplineInst->getSplinePtr();
-  auto* offload_scratch_ptr      = offload_scratch.data();
-  auto* results_scratch_ptr      = results_scratch.data();
-  const auto myKcart_padded_size = myKcart->capacity();
-  auto* myKcart_ptr              = myKcart->data();
-  auto* psiinv_ptr               = psiinv_pos_copy.data();
-  auto* ratios_private_ptr       = ratios_private.data();
-  const size_t first_spo_local   = this->first_spo;
-
-  {
-    ScopedTimer offload(offload_timer_);
-    PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
+    {
+        ScopedTimer offload(offload_timer_);
+        PRAGMA_OFFLOAD(
+            "omp target teams distribute collapse(2) num_teams(NumTeams*nVP) \
                 map(always, to: psiinv_ptr[0:psiinv_pos_copy.size()]) \
                 map(always, from: ratios_private_ptr[0:NumTeams*nVP])")
-    for (int iat = 0; iat < nVP; iat++)
-      for (int team_id = 0; team_id < NumTeams; team_id++)
-      {
-        const size_t first = ChunkSizePerTeam * team_id;
-        const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
-
-        auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
-        auto* restrict pos_scratch             = reinterpret_cast<RealType*>(psiinv_ptr + orb_size);
-
-        int ix, iy, iz;
-        ST a[4], b[4], c[4];
-        spline2::computeLocationAndFractional(spline_ptr, ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
-                                              ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
-
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = 0; index < last - first; index++)
-          spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
-                                             offload_scratch_iat_ptr + first + index);
-        const size_t first_cplx = first / 2;
-        const size_t last_cplx  = omptarget::min(last / 2, orb_size);
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_v(ST(pos_scratch[iat * 6]), ST(pos_scratch[iat * 6 + 1]), ST(pos_scratch[iat * 6 + 2]),
-                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
-
-        ComplexT sum(0);
-        PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
-        for (int i = first_cplx; i < last_cplx; i++)
-          sum += psi_iat_ptr[i] * psiinv_ptr[i];
-        ratios_private_ptr[iat * NumTeams + team_id] = sum;
-      }
-  }
-
-  // do the reduction manually
-  for (int iat = 0; iat < nVP; ++iat)
-  {
-    ratios[iat] = ComplexT(0);
-    for (int tid = 0; tid < NumTeams; tid++)
-      ratios[iat] += ratios_private[iat][tid];
-  }
+        for (int iat = 0; iat < nVP; iat++)
+            for (int team_id = 0; team_id < NumTeams; team_id++) {
+                const size_t first = ChunkSizePerTeam * team_id;
+                const size_t last =
+                    omptarget::min(first + ChunkSizePerTeam, padded_size);
+
+                auto* restrict offload_scratch_iat_ptr =
+                    offload_scratch_ptr + padded_size * iat;
+                auto* restrict psi_iat_ptr =
+                    results_scratch_ptr + padded_size * iat;
+                auto* restrict pos_scratch =
+                    reinterpret_cast<RealType*>(psiinv_ptr + orb_size);
+
+                int ix, iy, iz;
+                ST a[4], b[4], c[4];
+                spline2::computeLocationAndFractional(spline_ptr,
+                    ST(pos_scratch[iat * 6 + 3]), ST(pos_scratch[iat * 6 + 4]),
+                    ST(pos_scratch[iat * 6 + 5]), ix, iy, iz, a, b, c);
+
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = 0; index < last - first; index++)
+                    spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz,
+                        first + index, a, b, c,
+                        offload_scratch_iat_ptr + first + index);
+                const size_t first_cplx = first / 2;
+                const size_t last_cplx = omptarget::min(last / 2, orb_size);
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = first_cplx; index < last_cplx; index++)
+                    C2C::assign_v(ST(pos_scratch[iat * 6]),
+                        ST(pos_scratch[iat * 6 + 1]),
+                        ST(pos_scratch[iat * 6 + 2]), psi_iat_ptr,
+                        offload_scratch_iat_ptr, myKcart_ptr,
+                        myKcart_padded_size, first_spo_local, index);
+
+                ComplexT sum(0);
+                PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
+                for (int i = first_cplx; i < last_cplx; i++)
+                    sum += psi_iat_ptr[i] * psiinv_ptr[i];
+                ratios_private_ptr[iat * NumTeams + team_id] = sum;
+            }
+    }
+
+    // sum the per-team partial sums of each virtual particle into ratios[iat]
+    for (int iat = 0; iat < nVP; ++iat) {
+        ratios[iat] = ComplexT(0);
+        for (int tid = 0; tid < NumTeams; tid++)
+            ratios[iat] += ratios_private[iat][tid];
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
-                                                  const RefVectorWithLeader<const VirtualParticleSetT<ComplexT>>& vp_list,
-                                                  const RefVector<ValueVector>& psi_list,
-                                                  const std::vector<const ValueType*>& invRow_ptr_list,
-                                                  std::vector<std::vector<ValueType>>& ratios_list) const
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::mw_evaluateDetRatios(
+    const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+    const RefVectorWithLeader<const VirtualParticleSetT<VT>>& vp_list,
+    const RefVector<ValueVector>& psi_list,
+    const std::vector<const ValueType*>& invRow_ptr_list,
+    std::vector<std::vector<ValueType>>& ratios_list) const
 {
-  assert(this == &spo_list.getLeader());
-  auto& phi_leader            = spo_list.template getCastedLeader<SplineC2COMPTargetT<ST>>();
-  auto& mw_mem                = phi_leader.mw_mem_handle_.getResource();
-  auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
-  auto& mw_ratios_private     = mw_mem.mw_ratios_private;
-  auto& mw_offload_scratch    = mw_mem.mw_offload_scratch;
-  auto& mw_results_scratch    = mw_mem.mw_results_scratch;
-  const size_t nw             = spo_list.size();
-  const size_t orb_size       = phi_leader.size();
-
-  size_t mw_nVP = 0;
-  for (const VirtualParticleSetT<ComplexT>& VP : vp_list)
-    mw_nVP += VP.getTotalNum();
-
-  const size_t packed_size = nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(ST) + sizeof(int));
-  det_ratios_buffer_H2D.resize(packed_size);
-
-  // pack invRow_ptr_list to det_ratios_buffer_H2D
-  Vector<const ValueType*> ptr_buffer(reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
-  for (size_t iw = 0; iw < nw; iw++)
-    ptr_buffer[iw] = invRow_ptr_list[iw];
-
-  // pack particle positions
-  auto* pos_ptr = reinterpret_cast<ST*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
-  auto* ref_id_ptr =
-      reinterpret_cast<int*>(det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
-  size_t iVP = 0;
-  for (size_t iw = 0; iw < nw; iw++)
-  {
-    const VirtualParticleSetT<ComplexT>& VP = vp_list[iw];
-    assert(ratios_list[iw].size() == VP.getTotalNum());
-    for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP)
-    {
-      ref_id_ptr[iVP]    = iw;
-      const PointType& r = VP.activeR(iat);
-      PointType ru(PrimLattice.toUnit_floor(r));
-      pos_ptr[0] = r[0];
-      pos_ptr[1] = r[1];
-      pos_ptr[2] = r[2];
-      pos_ptr[3] = ru[0];
-      pos_ptr[4] = ru[1];
-      pos_ptr[5] = ru[2];
-      pos_ptr += 6;
+    assert(this == &spo_list.getLeader()); // must be the leader of spo_list
+    auto& phi_leader = spo_list.template getCastedLeader<SplineC2COMPTargetT>();
+    auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
+    auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
+    auto& mw_ratios_private = mw_mem.mw_ratios_private;
+    auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
+    auto& mw_results_scratch = mw_mem.mw_results_scratch;
+    const size_t nw = spo_list.size();
+    const size_t orb_size = phi_leader.size();
+
+    size_t mw_nVP = 0; // total VP count summed over vp_list
+    for (const VirtualParticleSetT<VT>& VP : vp_list)
+        mw_nVP += VP.getTotalNum();
+
+    const size_t packed_size =
+        nw * sizeof(ValueType*) + mw_nVP * (6 * sizeof(ST) + sizeof(int));
+    det_ratios_buffer_H2D.resize(packed_size);
+
+    // pack the nw invRow pointers at the front of det_ratios_buffer_H2D
+    Vector<const ValueType*> ptr_buffer(
+        reinterpret_cast<const ValueType**>(det_ratios_buffer_H2D.data()), nw);
+    for (size_t iw = 0; iw < nw; iw++)
+        ptr_buffer[iw] = invRow_ptr_list[iw];
+
+    // pack 6 ST per VP (r then ru) after the pointers; int ids follow them
+    auto* pos_ptr = reinterpret_cast<ST*>(
+        det_ratios_buffer_H2D.data() + nw * sizeof(ValueType*));
+    auto* ref_id_ptr = reinterpret_cast<int*>(det_ratios_buffer_H2D.data() +
+        nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
+    size_t iVP = 0;
+    for (size_t iw = 0; iw < nw; iw++) {
+        const VirtualParticleSetT<VT>& VP = vp_list[iw];
+        assert(ratios_list[iw].size() == VP.getTotalNum());
+        for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP) {
+            ref_id_ptr[iVP] = iw; // vp_list index that owns this VP
+            const PointType& r = VP.activeR(iat);
+            PointType ru(PrimLattice.toUnit_floor(r));
+            pos_ptr[0] = r[0];
+            pos_ptr[1] = r[1];
+            pos_ptr[2] = r[2];
+            pos_ptr[3] = ru[0];
+            pos_ptr[4] = ru[1];
+            pos_ptr[5] = ru[2];
+            pos_ptr += 6;
+        }
     }
-  }
-
-  const size_t ChunkSizePerTeam = 512;
-  const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  mw_ratios_private.resize(mw_nVP, NumTeams);
-  const auto padded_size = myV.size();
-  mw_offload_scratch.resize(padded_size * mw_nVP);
-  mw_results_scratch.resize(padded_size * mw_nVP);
-
-  // Ye: need to extract sizes and pointers before entering target region
-  const auto* spline_ptr         = SplineInst->getSplinePtr();
-  auto* offload_scratch_ptr      = mw_offload_scratch.data();
-  auto* results_scratch_ptr      = mw_results_scratch.data();
-  const auto myKcart_padded_size = myKcart->capacity();
-  auto* myKcart_ptr              = myKcart->data();
-  auto* buffer_H2D_ptr           = det_ratios_buffer_H2D.data();
-  auto* ratios_private_ptr       = mw_ratios_private.data();
-  const size_t first_spo_local   = this->first_spo;
-
-  {
-    ScopedTimer offload(offload_timer_);
-    PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
+
+    const size_t ChunkSizePerTeam = 512; // myV entries handled per team
+    const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
+    mw_ratios_private.resize(mw_nVP, NumTeams);
+    const auto padded_size = myV.size();
+    mw_offload_scratch.resize(padded_size * mw_nVP);
+    mw_results_scratch.resize(padded_size * mw_nVP);
+
+    // Ye: need to extract sizes and pointers before entering target region
+    const auto* spline_ptr = SplineInst->getSplinePtr();
+    auto* offload_scratch_ptr = mw_offload_scratch.data();
+    auto* results_scratch_ptr = mw_results_scratch.data();
+    const auto myKcart_padded_size = myKcart->capacity();
+    auto* myKcart_ptr = myKcart->data();
+    auto* buffer_H2D_ptr = det_ratios_buffer_H2D.data();
+    auto* ratios_private_ptr = mw_ratios_private.data();
+    const size_t first_spo_local = this->first_spo;
+
+    {
+        ScopedTimer offload(offload_timer_);
+        PRAGMA_OFFLOAD(
+            "omp target teams distribute collapse(2) num_teams(NumTeams*mw_nVP) \
                 map(always, to: buffer_H2D_ptr[0:det_ratios_buffer_H2D.size()]) \
                 map(always, from: ratios_private_ptr[0:NumTeams*mw_nVP])")
-    for (int iat = 0; iat < mw_nVP; iat++)
-      for (int team_id = 0; team_id < NumTeams; team_id++)
-      {
-        const size_t first = ChunkSizePerTeam * team_id;
-        const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
-
-        auto* restrict offload_scratch_iat_ptr = offload_scratch_ptr + padded_size * iat;
-        auto* restrict psi_iat_ptr             = results_scratch_ptr + padded_size * iat;
-        auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr + nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
-        auto* restrict psiinv_ptr  = reinterpret_cast<const ValueType**>(buffer_H2D_ptr)[ref_id_ptr[iat]];
-        auto* restrict pos_scratch = reinterpret_cast<ST*>(buffer_H2D_ptr + nw * sizeof(ValueType*));
-
-        int ix, iy, iz;
-        ST a[4], b[4], c[4];
-        spline2::computeLocationAndFractional(spline_ptr, pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
-                                              pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);
-
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = 0; index < last - first; index++)
-          spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c,
-                                             offload_scratch_iat_ptr + first + index);
-        const size_t first_cplx = first / 2;
-        const size_t last_cplx  = omptarget::min(last / 2, orb_size);
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_v(pos_scratch[iat * 6], pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2], psi_iat_ptr,
-                        offload_scratch_iat_ptr, myKcart_ptr, myKcart_padded_size, first_spo_local, index);
-
-        ComplexT sum(0);
-        PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
-        for (int i = first_cplx; i < last_cplx; i++)
-          sum += psi_iat_ptr[i] * psiinv_ptr[i];
-        ratios_private_ptr[iat * NumTeams + team_id] = sum;
-      }
-  }
-
-  // do the reduction manually
-  iVP = 0;
-  for (size_t iw = 0; iw < nw; iw++)
-  {
-    auto& ratios = ratios_list[iw];
-    for (size_t iat = 0; iat < ratios.size(); iat++, iVP++)
-    {
-      ratios[iat] = ComplexT(0);
-      for (int tid = 0; tid < NumTeams; ++tid)
-        ratios[iat] += mw_ratios_private[iVP][tid];
+        for (int iat = 0; iat < mw_nVP; iat++)
+            for (int team_id = 0; team_id < NumTeams; team_id++) {
+                const size_t first = ChunkSizePerTeam * team_id;
+                const size_t last =
+                    omptarget::min(first + ChunkSizePerTeam, padded_size);
+
+                auto* restrict offload_scratch_iat_ptr =
+                    offload_scratch_ptr + padded_size * iat;
+                auto* restrict psi_iat_ptr =
+                    results_scratch_ptr + padded_size * iat;
+                auto* ref_id_ptr = reinterpret_cast<int*>(buffer_H2D_ptr +
+                    nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(ST));
+                auto* restrict psiinv_ptr = reinterpret_cast<const ValueType**>(
+                    buffer_H2D_ptr)[ref_id_ptr[iat]];
+                auto* restrict pos_scratch = reinterpret_cast<ST*>(
+                    buffer_H2D_ptr + nw * sizeof(ValueType*));
+
+                int ix, iy, iz;
+                ST a[4], b[4], c[4];
+                spline2::computeLocationAndFractional(spline_ptr,
+                    pos_scratch[iat * 6 + 3], pos_scratch[iat * 6 + 4],
+                    pos_scratch[iat * 6 + 5], ix, iy, iz, a, b, c);
+
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = 0; index < last - first; index++)
+                    spline2offload::evaluate_v_impl_v2(spline_ptr, ix, iy, iz,
+                        first + index, a, b, c,
+                        offload_scratch_iat_ptr + first + index);
+                const size_t first_cplx = first / 2; // bounds into psi_iat_ptr
+                const size_t last_cplx = omptarget::min(last / 2, orb_size);
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = first_cplx; index < last_cplx; index++)
+                    C2C::assign_v(pos_scratch[iat * 6],
+                        pos_scratch[iat * 6 + 1], pos_scratch[iat * 6 + 2],
+                        psi_iat_ptr, offload_scratch_iat_ptr, myKcart_ptr,
+                        myKcart_padded_size, first_spo_local, index);
+
+                ComplexT sum(0);
+                PRAGMA_OFFLOAD("omp parallel for simd reduction(+:sum)")
+                for (int i = first_cplx; i < last_cplx; i++)
+                    sum += psi_iat_ptr[i] * psiinv_ptr[i];
+                ratios_private_ptr[iat * NumTeams + team_id] = sum;
+            }
+    }
+
+    // do the reduction manually: add the NumTeams partial sums of each VP
+    iVP = 0;
+    for (size_t iw = 0; iw < nw; iw++) {
+        auto& ratios = ratios_list[iw];
+        for (size_t iat = 0; iat < ratios.size(); iat++, iVP++) {
+            ratios[iat] = ComplexT(0);
+            for (int tid = 0; tid < NumTeams; ++tid)
+                ratios[iat] += mw_ratios_private[iVP][tid];
+        }
     }
-  }
 }
 
-/** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-template<typename ST>
-inline void SplineC2COMPTargetT<ST>::assign_vgl_from_l(const PointType& r,
-                                                      ValueVector& psi,
-                                                      GradVector& dpsi,
-                                                      ValueVector& d2psi)
+/** Fill psi, dpsi and d2psi at r from myV, myG and a precomputed myL, all
+ * given in cartesian coordinates; every entry gets a phase from myKcart.
+ */
+template <typename ST, typename VT>
+inline void
+SplineC2COMPTargetT<ST, VT>::assign_vgl_from_l(
+    const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  constexpr ST two(2);
-  const ST x = r[0], y = r[1], z = r[2];
+    constexpr ST two(2);
+    const ST x = r[0], y = r[1], z = r[2];
 
-  const ST* restrict k0 = myKcart->data(0);
-  const ST* restrict k1 = myKcart->data(1);
-  const ST* restrict k2 = myKcart->data(2);
+    const ST* restrict k0 = myKcart->data(0);
+    const ST* restrict k1 = myKcart->data(1);
+    const ST* restrict k2 = myKcart->data(2);
 
-  const ST* restrict g0 = myG.data(0);
-  const ST* restrict g1 = myG.data(1);
-  const ST* restrict g2 = myG.data(2);
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
 
-  const size_t N = this->last_spo - this->first_spo;
+    const size_t N = this->last_spo - this->first_spo; // entries to fill
 #pragma omp simd
-  for (size_t j = 0; j < N; ++j)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g0[jr];
-    const ST dY_r = g1[jr];
-    const ST dZ_r = g2[jr];
-
-    const ST dX_i = g0[ji];
-    const ST dY_i = g1[ji];
-    const ST dZ_i = g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const ST lap_r = myL[jr] + (*mKK)[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const ST lap_i = myL[ji] + (*mKK)[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-    d2psi[psiIndex]       = ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
-  }
+    for (size_t j = 0; j < N; ++j) {
+        const size_t jr = j << 1; // index of the real part
+        const size_t ji = jr + 1; // index of the imaginary part
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase angle is -(k . r); s and c are filled in by sincos
+        ST s, c;
+        omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g0[jr];
+        const ST dY_r = g1[jr];
+        const ST dZ_r = g2[jr];
+
+        const ST dX_i = g0[ji];
+        const ST dY_i = g1[ji];
+        const ST dZ_i = g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const ST lap_r = myL[jr] + (*mKK)[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = myL[ji] + (*mKK)[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+
+        const size_t psiIndex = j + this->first_spo; // slot in the outputs
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+        d2psi[psiIndex] =
+            ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateVGL(const ParticleSetT<ComplexT>& P,
-                                         const int iat,
-                                         ValueVector& psi,
-                                         GradVector& dpsi,
-                                         ValueVector& d2psi)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateVGL(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
-
-  const size_t ChunkSizePerTeam = 512;
-  const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-
-  const auto padded_size = myV.size();
-  offload_scratch.resize(padded_size * SoAFields3D::NUM_FIELDS);
-  const auto orb_size = psi.size();
-  // for V(1)G(3)L(1) final result
-  results_scratch.resize(padded_size * 5);
-
-  // Ye: need to extract sizes and pointers before entering target region
-  const auto* spline_ptr    = SplineInst->getSplinePtr();
-  auto* offload_scratch_ptr = offload_scratch.data();
-  auto* results_scratch_ptr = results_scratch.data();
-  const auto x = r[0], y = r[1], z = r[2];
-  const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
-  const auto myKcart_padded_size = myKcart->capacity();
-  auto* mKK_ptr                  = mKK->data();
-  auto* GGt_ptr                  = GGt_offload->data();
-  auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
-  auto* myKcart_ptr              = myKcart->data();
-  const size_t first_spo_local   = this->first_spo;
-
-  {
-    ScopedTimer offload(offload_timer_);
-    PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
-                map(always, from: results_scratch_ptr[0:padded_size*5])")
-    for (int team_id = 0; team_id < NumTeams; team_id++)
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
+
+    const size_t ChunkSizePerTeam = 512; // myV entries handled per team
+    const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
+
+    const auto padded_size = myV.size();
+    offload_scratch.resize(padded_size * SoAFields3D::NUM_FIELDS);
+    const auto orb_size = psi.size();
+    // 5 blocks of padded_size each: V(1) G(3) L(1) final result
+    results_scratch.resize(padded_size * 5);
+
+    // Ye: need to extract sizes and pointers before entering target region
+    const auto* spline_ptr = SplineInst->getSplinePtr();
+    auto* offload_scratch_ptr = offload_scratch.data();
+    auto* results_scratch_ptr = results_scratch.data();
+    const auto x = r[0], y = r[1], z = r[2];
+    const auto rux = ru[0], ruy = ru[1], ruz = ru[2];
+    const auto myKcart_padded_size = myKcart->capacity();
+    auto* mKK_ptr = mKK->data();
+    auto* GGt_ptr = GGt_offload->data();
+    auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
+    auto* myKcart_ptr = myKcart->data();
+    const size_t first_spo_local = this->first_spo;
+
     {
-      const size_t first = ChunkSizePerTeam * team_id;
-      const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
-
-      int ix, iy, iz;
-      ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
-      spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
-
-      const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
-                            PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
-                            PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
-      const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
-                            GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
-
-      PRAGMA_OFFLOAD("omp parallel for")
-      for (int index = 0; index < last - first; index++)
-      {
-        spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
-                                             offload_scratch_ptr + first + index, padded_size);
-        const int output_index = first + index;
-        offload_scratch_ptr[padded_size * SoAFields3D::LAPL + output_index] =
-            SymTrace(offload_scratch_ptr[padded_size * SoAFields3D::HESS00 + output_index],
-                     offload_scratch_ptr[padded_size * SoAFields3D::HESS01 + output_index],
-                     offload_scratch_ptr[padded_size * SoAFields3D::HESS02 + output_index],
-                     offload_scratch_ptr[padded_size * SoAFields3D::HESS11 + output_index],
-                     offload_scratch_ptr[padded_size * SoAFields3D::HESS12 + output_index],
-                     offload_scratch_ptr[padded_size * SoAFields3D::HESS22 + output_index], symGGt);
-      }
-
-      const size_t first_cplx = first / 2;
-      const size_t last_cplx  = omptarget::min(last / 2, orb_size);
-      PRAGMA_OFFLOAD("omp parallel for")
-      for (int index = first_cplx; index < last_cplx; index++)
-        C2C::assign_vgl(x, y, z, results_scratch_ptr, padded_size, mKK_ptr, offload_scratch_ptr, padded_size, G,
-                        myKcart_ptr, myKcart_padded_size, first_spo_local, index);
+        ScopedTimer offload(offload_timer_);
+        PRAGMA_OFFLOAD("omp target teams distribute num_teams(NumTeams) \
+                map(always, from: results_scratch_ptr[0:padded_size*5])")
+        for (int team_id = 0; team_id < NumTeams; team_id++) {
+            const size_t first = ChunkSizePerTeam * team_id;
+            const size_t last =
+                omptarget::min(first + ChunkSizePerTeam, padded_size);
+
+            int ix, iy, iz;
+            ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
+            spline2::computeLocationAndFractional(spline_ptr, rux, ruy, ruz, ix,
+                iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
+
+            const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1],
+                PrimLattice_G_ptr[2], PrimLattice_G_ptr[3],
+                PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
+                PrimLattice_G_ptr[6], PrimLattice_G_ptr[7],
+                PrimLattice_G_ptr[8]};
+            const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3],
+                GGt_ptr[2] + GGt_ptr[6], GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7],
+                GGt_ptr[8]};
+
+            PRAGMA_OFFLOAD("omp parallel for")
+            for (int index = 0; index < last - first; index++) {
+                spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz,
+                    first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
+                    offload_scratch_ptr + first + index, padded_size);
+                const int output_index = first + index;
+                offload_scratch_ptr[padded_size * SoAFields3D::LAPL +
+                    output_index] =
+                    SymTrace(
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS00 +
+                            output_index],
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS01 +
+                            output_index],
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS02 +
+                            output_index],
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS11 +
+                            output_index],
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS12 +
+                            output_index],
+                        offload_scratch_ptr[padded_size * SoAFields3D::HESS22 +
+                            output_index],
+                        symGGt);
+            }
+
+            const size_t first_cplx = first / 2; // bounds for assign_vgl
+            const size_t last_cplx = omptarget::min(last / 2, orb_size);
+            PRAGMA_OFFLOAD("omp parallel for")
+            for (int index = first_cplx; index < last_cplx; index++)
+                C2C::assign_vgl(x, y, z, results_scratch_ptr, padded_size,
+                    mKK_ptr, offload_scratch_ptr, padded_size, G, myKcart_ptr,
+                    myKcart_padded_size, first_spo_local, index);
+        }
+    }
+
+    for (size_t i = 0; i < orb_size; i++) { // copy V, G, L blocks to outputs
+        psi[i] = results_scratch[i];
+        dpsi[i][0] = results_scratch[i + padded_size];
+        dpsi[i][1] = results_scratch[i + padded_size * 2];
+        dpsi[i][2] = results_scratch[i + padded_size * 3];
+        d2psi[i] = results_scratch[i + padded_size * 4];
     }
-  }
-
-  for (size_t i = 0; i < orb_size; i++)
-  {
-    psi[i]     = results_scratch[i];
-    dpsi[i][0] = results_scratch[i + padded_size];
-    dpsi[i][1] = results_scratch[i + padded_size * 2];
-    dpsi[i][2] = results_scratch[i + padded_size * 3];
-    d2psi[i]   = results_scratch[i + padded_size * 4];
-  }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateVGLMultiPos(const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos,
-                                                 Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
-                                                 Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
-                                                 const RefVector<ValueVector>& psi_v_list,
-                                                 const RefVector<GradVector>& dpsi_v_list,
-                                                 const RefVector<ValueVector>& d2psi_v_list) const
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateVGLMultiPos(
+    const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos,
+    Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
+    Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
+    const RefVector<ValueVector>& psi_v_list,
+    const RefVector<GradVector>& dpsi_v_list,
+    const RefVector<ValueVector>& d2psi_v_list) const
 {
-  const size_t num_pos          = psi_v_list.size();
-  const size_t ChunkSizePerTeam = 512;
-  const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  const auto padded_size        = myV.size();
-  offload_scratch.resize(padded_size * num_pos * SoAFields3D::NUM_FIELDS);
-  const auto orb_size = psi_v_list[0].get().size();
-  // for V(1)G(3)L(1) final result
-  results_scratch.resize(padded_size * num_pos * 5);
-
-  // Ye: need to extract sizes and pointers before entering target region
-  const auto* spline_ptr         = SplineInst->getSplinePtr();
-  auto* pos_copy_ptr             = multi_pos.data();
-  auto* offload_scratch_ptr      = offload_scratch.data();
-  auto* results_scratch_ptr      = results_scratch.data();
-  const auto myKcart_padded_size = myKcart->capacity();
-  auto* mKK_ptr                  = mKK->data();
-  auto* GGt_ptr                  = GGt_offload->data();
-  auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
-  auto* myKcart_ptr              = myKcart->data();
-  const size_t first_spo_local   = this->first_spo;
-
-  {
-    ScopedTimer offload(offload_timer_);
-    PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
+    const size_t num_pos = psi_v_list.size();
+    const size_t ChunkSizePerTeam = 512;
+    const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
+    const auto padded_size = myV.size();
+    offload_scratch.resize(padded_size * num_pos * SoAFields3D::NUM_FIELDS);
+    const auto orb_size = psi_v_list[0].get().size();
+    // for V(1)G(3)L(1) final result
+    results_scratch.resize(padded_size * num_pos * 5);
+
+    // Ye: need to extract sizes and pointers before entering target region
+    const auto* spline_ptr = SplineInst->getSplinePtr();
+    auto* pos_copy_ptr = multi_pos.data();
+    auto* offload_scratch_ptr = offload_scratch.data();
+    auto* results_scratch_ptr = results_scratch.data();
+    const auto myKcart_padded_size = myKcart->capacity();
+    auto* mKK_ptr = mKK->data();
+    auto* GGt_ptr = GGt_offload->data();
+    auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
+    auto* myKcart_ptr = myKcart->data();
+    const size_t first_spo_local = this->first_spo;
+
+    {
+        ScopedTimer offload(offload_timer_);
+        PRAGMA_OFFLOAD(
+            "omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: pos_copy_ptr[0:num_pos*6]) \
                     map(always, from: results_scratch_ptr[0:padded_size*num_pos*5])")
-    for (int iw = 0; iw < num_pos; iw++)
-      for (int team_id = 0; team_id < NumTeams; team_id++)
-      {
-        const size_t first = ChunkSizePerTeam * team_id;
-        const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
-
-        auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * SoAFields3D::NUM_FIELDS;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
-
-        int ix, iy, iz;
-        ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
-        spline2::computeLocationAndFractional(spline_ptr, pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
-                                              pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc, d2a, d2b, d2c);
-
-        const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
-                              PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
-                              PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
-        const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
-                              GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
-
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = 0; index < last - first; index++)
-        {
-          spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
-                                               d2c, offload_scratch_iw_ptr + first + index, padded_size);
-          const int output_index = first + index;
-          offload_scratch_iw_ptr[padded_size * SoAFields3D::LAPL + output_index] =
-              SymTrace(offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS00 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS01 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS02 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS11 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS12 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS22 + output_index], symGGt);
-        }
+        for (int iw = 0; iw < num_pos; iw++)
+            for (int team_id = 0; team_id < NumTeams; team_id++) {
+                const size_t first = ChunkSizePerTeam * team_id;
+                const size_t last =
+                    omptarget::min(first + ChunkSizePerTeam, padded_size);
+
+                auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr +
+                    padded_size * iw * SoAFields3D::NUM_FIELDS;
+                auto* restrict psi_iw_ptr =
+                    results_scratch_ptr + padded_size * iw * 5;
+
+                int ix, iy, iz;
+                ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4],
+                    d2c[4];
+                spline2::computeLocationAndFractional(spline_ptr,
+                    pos_copy_ptr[iw * 6 + 3], pos_copy_ptr[iw * 6 + 4],
+                    pos_copy_ptr[iw * 6 + 5], ix, iy, iz, a, b, c, da, db, dc,
+                    d2a, d2b, d2c);
+
+                const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1],
+                    PrimLattice_G_ptr[2], PrimLattice_G_ptr[3],
+                    PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
+                    PrimLattice_G_ptr[6], PrimLattice_G_ptr[7],
+                    PrimLattice_G_ptr[8]};
+                const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3],
+                    GGt_ptr[2] + GGt_ptr[6], GGt_ptr[4],
+                    GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
+
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = 0; index < last - first; index++) {
+                    spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz,
+                        first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
+                        offload_scratch_iw_ptr + first + index, padded_size);
+                    const int output_index = first + index;
+                    offload_scratch_iw_ptr[padded_size * SoAFields3D::LAPL +
+                        output_index] =
+                        SymTrace(offload_scratch_iw_ptr[padded_size *
+                                         SoAFields3D::HESS00 +
+                                     output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS01 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS02 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS11 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS12 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS22 +
+                                output_index],
+                            symGGt);
+                }
+
+                const size_t first_cplx = first / 2;
+                const size_t last_cplx = omptarget::min(last / 2, orb_size);
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = first_cplx; index < last_cplx; index++)
+                    C2C::assign_vgl(pos_copy_ptr[iw * 6],
+                        pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2],
+                        psi_iw_ptr, padded_size, mKK_ptr,
+                        offload_scratch_iw_ptr, padded_size, G, myKcart_ptr,
+                        myKcart_padded_size, first_spo_local, index);
+            }
+    }
 
-        const size_t first_cplx = first / 2;
-        const size_t last_cplx  = omptarget::min(last / 2, orb_size);
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_vgl(pos_copy_ptr[iw * 6], pos_copy_ptr[iw * 6 + 1], pos_copy_ptr[iw * 6 + 2], psi_iw_ptr,
-                          padded_size, mKK_ptr, offload_scratch_iw_ptr, padded_size, G, myKcart_ptr,
-                          myKcart_padded_size, first_spo_local, index);
-      }
-  }
-
-  for (int iw = 0; iw < num_pos; ++iw)
-  {
-    auto* restrict results_iw_ptr = results_scratch_ptr + padded_size * iw * 5;
-    ValueVector& psi_v(psi_v_list[iw]);
-    GradVector& dpsi_v(dpsi_v_list[iw]);
-    ValueVector& d2psi_v(d2psi_v_list[iw]);
-    for (size_t i = 0; i < orb_size; i++)
-    {
-      psi_v[i]     = results_iw_ptr[i];
-      dpsi_v[i][0] = results_iw_ptr[i + padded_size];
-      dpsi_v[i][1] = results_iw_ptr[i + padded_size * 2];
-      dpsi_v[i][2] = results_iw_ptr[i + padded_size * 3];
-      d2psi_v[i]   = results_iw_ptr[i + padded_size * 4];
+    for (int iw = 0; iw < num_pos; ++iw) {
+        auto* restrict results_iw_ptr =
+            results_scratch_ptr + padded_size * iw * 5;
+        ValueVector& psi_v(psi_v_list[iw]);
+        GradVector& dpsi_v(dpsi_v_list[iw]);
+        ValueVector& d2psi_v(d2psi_v_list[iw]);
+        for (size_t i = 0; i < orb_size; i++) {
+            psi_v[i] = results_iw_ptr[i];
+            dpsi_v[i][0] = results_iw_ptr[i + padded_size];
+            dpsi_v[i][1] = results_iw_ptr[i + padded_size * 2];
+            dpsi_v[i][2] = results_iw_ptr[i + padded_size * 3];
+            d2psi_v[i] = results_iw_ptr[i + padded_size * 4];
+        }
     }
-  }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& sa_list,
-                                            const RefVectorWithLeader<ParticleSetT<ComplexT>>& P_list,
-                                            int iat,
-                                            const RefVector<ValueVector>& psi_v_list,
-                                            const RefVector<GradVector>& dpsi_v_list,
-                                            const RefVector<ValueVector>& d2psi_v_list) const
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::mw_evaluateVGL( // psi/dpsi/d2psi per walker
+    const RefVectorWithLeader<SPOSetT<VT>>& sa_list,
+    const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
+    const RefVector<ValueVector>& psi_v_list,
+    const RefVector<GradVector>& dpsi_v_list,
+    const RefVector<ValueVector>& d2psi_v_list) const
 {
-  assert(this == &sa_list.getLeader());
-  auto& phi_leader = sa_list.template getCastedLeader<SplineC2COMPTargetT<ST>>();
-  auto& mw_mem             = phi_leader.mw_mem_handle_.getResource();
-  auto& mw_pos_copy        = mw_mem.mw_pos_copy;
-  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
-  auto& mw_results_scratch = mw_mem.mw_results_scratch;
-  const int nwalkers       = sa_list.size();
-  mw_pos_copy.resize(nwalkers * 6);
-
-  // pack particle positions
-  for (int iw = 0; iw < nwalkers; ++iw)
-  {
-    const PointType& r = P_list[iw].activeR(iat);
-    PointType ru(PrimLattice.toUnit_floor(r));
-    mw_pos_copy[iw * 6]     = r[0];
-    mw_pos_copy[iw * 6 + 1] = r[1];
-    mw_pos_copy[iw * 6 + 2] = r[2];
-    mw_pos_copy[iw * 6 + 3] = ru[0];
-    mw_pos_copy[iw * 6 + 4] = ru[1];
-    mw_pos_copy[iw * 6 + 5] = ru[2];
-  }
-
-  phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch, mw_results_scratch, psi_v_list, dpsi_v_list,
-                                 d2psi_v_list);
+    assert(this == &sa_list.getLeader()); // caller must be the list leader
+    auto& phi_leader = sa_list.template getCastedLeader<SplineC2COMPTargetT>();
+    auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
+    auto& mw_pos_copy = mw_mem.mw_pos_copy;
+    auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
+    auto& mw_results_scratch = mw_mem.mw_results_scratch;
+    const int nwalkers = sa_list.size();
+    mw_pos_copy.resize(nwalkers * 6); // 6 values per walker: r then ru
+
+    // pack particle positions
+    for (int iw = 0; iw < nwalkers; ++iw) {
+        const PointType& r = P_list[iw].activeR(iat);
+        PointType ru(PrimLattice.toUnit_floor(r));
+        mw_pos_copy[iw * 6] = r[0]; // slots 0-2 hold r
+        mw_pos_copy[iw * 6 + 1] = r[1];
+        mw_pos_copy[iw * 6 + 2] = r[2];
+        mw_pos_copy[iw * 6 + 3] = ru[0]; // slots 3-5 hold ru
+        mw_pos_copy[iw * 6 + 4] = ru[1];
+        mw_pos_copy[iw * 6 + 5] = ru[2];
+    }
+
+    phi_leader.evaluateVGLMultiPos(mw_pos_copy, mw_offload_scratch,
+        mw_results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& spo_list,
-                                                            const RefVectorWithLeader<ParticleSetT<ComplexT>>& P_list,
-                                                            int iat,
-                                                            const std::vector<const ValueType*>& invRow_ptr_list,
-                                                            OffloadMWVGLArray& phi_vgl_v,
-                                                            std::vector<ValueType>& ratios,
-                                                            std::vector<GradType>& grads) const
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::mw_evaluateVGLandDetRatioGrads(
+    const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
+    const std::vector<const ValueType*>& invRow_ptr_list,
+    OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+    std::vector<GradType>& grads) const
 {
-  assert(this == &spo_list.getLeader());
-  auto& phi_leader         = spo_list.template getCastedLeader<SplineC2COMPTargetT<ST>>();
-  auto& mw_mem             = phi_leader.mw_mem_handle_.getResource();
-  auto& buffer_H2D         = mw_mem.buffer_H2D;
-  auto& rg_private         = mw_mem.rg_private;
-  auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
-  auto& mw_results_scratch = mw_mem.mw_results_scratch;
-  const int nwalkers       = spo_list.size();
-  buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));
-
-  // pack particle positions and invRow pointers.
-  for (int iw = 0; iw < nwalkers; ++iw)
-  {
-    const PointType& r = P_list[iw].activeR(iat);
-    PointType ru(PrimLattice.toUnit_floor(r));
-    Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
-
-    pos_copy[0] = r[0];
-    pos_copy[1] = r[1];
-    pos_copy[2] = r[2];
-    pos_copy[3] = ru[0];
-    pos_copy[4] = ru[1];
-    pos_copy[5] = ru[2];
-
-    auto& invRow_ptr = *reinterpret_cast<const ValueType**>(buffer_H2D[iw] + sizeof(ST) * 6);
-    invRow_ptr       = invRow_ptr_list[iw];
-  }
-
-  const size_t num_pos          = nwalkers;
-  const auto orb_size           = phi_vgl_v.size(2);
-  const auto padded_size        = myV.size();
-  const size_t ChunkSizePerTeam = 512;
-  const int NumTeams            = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
-  mw_offload_scratch.resize(padded_size * num_pos * SoAFields3D::NUM_FIELDS);
-  // for V(1)G(3)L(1) final result
-  mw_results_scratch.resize(padded_size * num_pos * 5);
-  // per team ratio and grads
-  rg_private.resize(num_pos, NumTeams * 4);
-
-  // Ye: need to extract sizes and pointers before entering target region
-  const auto* spline_ptr         = SplineInst->getSplinePtr();
-  auto* buffer_H2D_ptr           = buffer_H2D.data();
-  auto* offload_scratch_ptr      = mw_offload_scratch.data();
-  auto* results_scratch_ptr      = mw_results_scratch.data();
-  const auto myKcart_padded_size = myKcart->capacity();
-  auto* mKK_ptr                  = mKK->data();
-  auto* GGt_ptr                  = GGt_offload->data();
-  auto* PrimLattice_G_ptr        = PrimLattice_G_offload->data();
-  auto* myKcart_ptr              = myKcart->data();
-  auto* phi_vgl_ptr              = phi_vgl_v.data();
-  auto* rg_private_ptr           = rg_private.data();
-  const size_t buffer_H2D_stride = buffer_H2D.cols();
-  const size_t first_spo_local   = this->first_spo;
-  const size_t phi_vgl_stride    = num_pos * orb_size;
-
-  {
-    ScopedTimer offload(offload_timer_);
-    PRAGMA_OFFLOAD("omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
+    assert(this == &spo_list.getLeader()); // caller must be the list leader
+    auto& phi_leader = spo_list.template getCastedLeader<SplineC2COMPTargetT>();
+    auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
+    auto& buffer_H2D = mw_mem.buffer_H2D;
+    auto& rg_private = mw_mem.rg_private;
+    auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
+    auto& mw_results_scratch = mw_mem.mw_results_scratch;
+    const int nwalkers = spo_list.size();
+    buffer_H2D.resize(nwalkers, sizeof(ST) * 6 + sizeof(ValueType*));
+
+    // pack particle positions and invRow pointers.
+    for (int iw = 0; iw < nwalkers; ++iw) {
+        const PointType& r = P_list[iw].activeR(iat);
+        PointType ru(PrimLattice.toUnit_floor(r));
+        Vector<ST> pos_copy(reinterpret_cast<ST*>(buffer_H2D[iw]), 6);
+
+        pos_copy[0] = r[0];
+        pos_copy[1] = r[1];
+        pos_copy[2] = r[2];
+        pos_copy[3] = ru[0];
+        pos_copy[4] = ru[1];
+        pos_copy[5] = ru[2];
+
+        auto& invRow_ptr = *reinterpret_cast<const ValueType**>(
+            buffer_H2D[iw] + sizeof(ST) * 6);
+        invRow_ptr = invRow_ptr_list[iw]; // stored after the 6 ST values
+    }
+
+    const size_t num_pos = nwalkers;
+    const auto orb_size = phi_vgl_v.size(2);
+    const auto padded_size = myV.size();
+    const size_t ChunkSizePerTeam = 512; // padded entries per team_id
+    const int NumTeams = (myV.size() + ChunkSizePerTeam - 1) / ChunkSizePerTeam;
+    mw_offload_scratch.resize(padded_size * num_pos * SoAFields3D::NUM_FIELDS);
+    // for V(1)G(3)L(1) final result
+    mw_results_scratch.resize(padded_size * num_pos * 5);
+    // per team ratio and grads
+    rg_private.resize(num_pos, NumTeams * 4); // 4 = ratio + 3 grad parts
+
+    // Ye: need to extract sizes and pointers before entering target region
+    const auto* spline_ptr = SplineInst->getSplinePtr();
+    auto* buffer_H2D_ptr = buffer_H2D.data();
+    auto* offload_scratch_ptr = mw_offload_scratch.data();
+    auto* results_scratch_ptr = mw_results_scratch.data();
+    const auto myKcart_padded_size = myKcart->capacity();
+    auto* mKK_ptr = mKK->data();
+    auto* GGt_ptr = GGt_offload->data();
+    auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
+    auto* myKcart_ptr = myKcart->data();
+    auto* phi_vgl_ptr = phi_vgl_v.data();
+    auto* rg_private_ptr = rg_private.data();
+    const size_t buffer_H2D_stride = buffer_H2D.cols();
+    const size_t first_spo_local = this->first_spo;
+    const size_t phi_vgl_stride = num_pos * orb_size; // gap between fields
+
+    {
+        ScopedTimer offload(offload_timer_);
+        PRAGMA_OFFLOAD(
+            "omp target teams distribute collapse(2) num_teams(NumTeams*num_pos) \
                     map(always, to: buffer_H2D_ptr[:buffer_H2D.size()]) \
                     map(always, from: rg_private_ptr[0:rg_private.size()])")
-    for (int iw = 0; iw < num_pos; iw++)
-      for (int team_id = 0; team_id < NumTeams; team_id++)
-      {
-        const size_t first = ChunkSizePerTeam * team_id;
-        const size_t last  = omptarget::min(first + ChunkSizePerTeam, padded_size);
-
-        auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr + padded_size * iw * SoAFields3D::NUM_FIELDS;
-        auto* restrict psi_iw_ptr             = results_scratch_ptr + padded_size * iw * 5;
-        const auto* restrict pos_iw_ptr       = reinterpret_cast<ST*>(buffer_H2D_ptr + buffer_H2D_stride * iw);
-        const auto* restrict invRow_iw_ptr =
-            *reinterpret_cast<ValueType**>(buffer_H2D_ptr + buffer_H2D_stride * iw + sizeof(ST) * 6);
-
-        int ix, iy, iz;
-        ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4], d2c[4];
-        spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3], pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b,
-                                              c, da, db, dc, d2a, d2b, d2c);
-
-        const ST G[9]      = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1], PrimLattice_G_ptr[2],
-                              PrimLattice_G_ptr[3], PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
-                              PrimLattice_G_ptr[6], PrimLattice_G_ptr[7], PrimLattice_G_ptr[8]};
-        const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3], GGt_ptr[2] + GGt_ptr[6],
-                              GGt_ptr[4], GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
-
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = 0; index < last - first; index++)
-        {
-          spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz, first + index, a, b, c, da, db, dc, d2a, d2b,
-                                               d2c, offload_scratch_iw_ptr + first + index, padded_size);
-          const int output_index = first + index;
-          offload_scratch_iw_ptr[padded_size * SoAFields3D::LAPL + output_index] =
-              SymTrace(offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS00 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS01 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS02 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS11 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS12 + output_index],
-                       offload_scratch_iw_ptr[padded_size * SoAFields3D::HESS22 + output_index], symGGt);
-        }
+        for (int iw = 0; iw < num_pos; iw++)
+            for (int team_id = 0; team_id < NumTeams; team_id++) {
+                const size_t first = ChunkSizePerTeam * team_id;
+                const size_t last =
+                    omptarget::min(first + ChunkSizePerTeam, padded_size);
+
+                auto* restrict offload_scratch_iw_ptr = offload_scratch_ptr +
+                    padded_size * iw * SoAFields3D::NUM_FIELDS;
+                auto* restrict psi_iw_ptr =
+                    results_scratch_ptr + padded_size * iw * 5;
+                const auto* restrict pos_iw_ptr = reinterpret_cast<ST*>(
+                    buffer_H2D_ptr + buffer_H2D_stride * iw);
+                const auto* restrict invRow_iw_ptr =
+                    *reinterpret_cast<ValueType**>(buffer_H2D_ptr +
+                        buffer_H2D_stride * iw + sizeof(ST) * 6);
+
+                int ix, iy, iz;
+                ST a[4], b[4], c[4], da[4], db[4], dc[4], d2a[4], d2b[4],
+                    d2c[4];
+                spline2::computeLocationAndFractional(spline_ptr, pos_iw_ptr[3],
+                    pos_iw_ptr[4], pos_iw_ptr[5], ix, iy, iz, a, b, c, da, db,
+                    dc, d2a, d2b, d2c);
+
+                const ST G[9] = {PrimLattice_G_ptr[0], PrimLattice_G_ptr[1],
+                    PrimLattice_G_ptr[2], PrimLattice_G_ptr[3],
+                    PrimLattice_G_ptr[4], PrimLattice_G_ptr[5],
+                    PrimLattice_G_ptr[6], PrimLattice_G_ptr[7],
+                    PrimLattice_G_ptr[8]};
+                const ST symGGt[6] = {GGt_ptr[0], GGt_ptr[1] + GGt_ptr[3],
+                    GGt_ptr[2] + GGt_ptr[6], GGt_ptr[4],
+                    GGt_ptr[5] + GGt_ptr[7], GGt_ptr[8]};
+
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = 0; index < last - first; index++) {
+                    spline2offload::evaluate_vgh_impl_v2(spline_ptr, ix, iy, iz,
+                        first + index, a, b, c, da, db, dc, d2a, d2b, d2c,
+                        offload_scratch_iw_ptr + first + index, padded_size);
+                    const int output_index = first + index;
+                    offload_scratch_iw_ptr[padded_size * SoAFields3D::LAPL +
+                        output_index] =
+                        SymTrace(offload_scratch_iw_ptr[padded_size *
+                                         SoAFields3D::HESS00 +
+                                     output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS01 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS02 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS11 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS12 +
+                                output_index],
+                            offload_scratch_iw_ptr[padded_size *
+                                    SoAFields3D::HESS22 +
+                                output_index],
+                            symGGt);
+                }
+
+                const size_t first_cplx = first / 2; // presumably re/im pairs
+                const size_t last_cplx = omptarget::min(last / 2, orb_size);
+                PRAGMA_OFFLOAD("omp parallel for")
+                for (int index = first_cplx; index < last_cplx; index++)
+                    C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2],
+                        psi_iw_ptr, padded_size, mKK_ptr,
+                        offload_scratch_iw_ptr, padded_size, G, myKcart_ptr,
+                        myKcart_padded_size, first_spo_local, index);
+
+                ValueType* restrict psi = psi_iw_ptr; // fields: V,Gx,Gy,Gz,L
+                ValueType* restrict dpsi_x = psi_iw_ptr + padded_size;
+                ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2;
+                ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3;
+                ValueType* restrict d2psi = psi_iw_ptr + padded_size * 4;
+
+                ValueType* restrict out_phi = phi_vgl_ptr + iw * orb_size;
+                ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
+                ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
+                ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
+                ValueType* restrict out_d2phi = out_dphi_z + phi_vgl_stride;
+
+                ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
+                PRAGMA_OFFLOAD("omp parallel for \
+                        reduction(+: ratio, grad_x, grad_y, grad_z)")
+                for (size_t j = first_cplx; j < last_cplx; j++) {
+                    const size_t psiIndex = first_spo_local + j;
+
+                    out_phi[psiIndex] = psi[psiIndex];
+                    out_dphi_x[psiIndex] = dpsi_x[psiIndex];
+                    out_dphi_y[psiIndex] = dpsi_y[psiIndex];
+                    out_dphi_z[psiIndex] = dpsi_z[psiIndex];
+                    out_d2phi[psiIndex] = d2psi[psiIndex];
+
+                    ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
+                    grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
+                    grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
+                    grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
+                }
+
+                rg_private_ptr[(iw * NumTeams + team_id) * 4] = ratio;
+                rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
+                rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
+                rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
+            }
+    }
 
-        const size_t first_cplx = first / 2;
-        const size_t last_cplx  = omptarget::min(last / 2, orb_size);
-        PRAGMA_OFFLOAD("omp parallel for")
-        for (int index = first_cplx; index < last_cplx; index++)
-          C2C::assign_vgl(pos_iw_ptr[0], pos_iw_ptr[1], pos_iw_ptr[2], psi_iw_ptr, padded_size, mKK_ptr,
-                          offload_scratch_iw_ptr, padded_size, G, myKcart_ptr, myKcart_padded_size, first_spo_local,
-                          index);
-
-        ValueType* restrict psi    = psi_iw_ptr;
-        ValueType* restrict dpsi_x = psi_iw_ptr + padded_size;
-        ValueType* restrict dpsi_y = psi_iw_ptr + padded_size * 2;
-        ValueType* restrict dpsi_z = psi_iw_ptr + padded_size * 3;
-        ValueType* restrict d2psi  = psi_iw_ptr + padded_size * 4;
-
-        ValueType* restrict out_phi    = phi_vgl_ptr + iw * orb_size;
-        ValueType* restrict out_dphi_x = out_phi + phi_vgl_stride;
-        ValueType* restrict out_dphi_y = out_dphi_x + phi_vgl_stride;
-        ValueType* restrict out_dphi_z = out_dphi_y + phi_vgl_stride;
-        ValueType* restrict out_d2phi  = out_dphi_z + phi_vgl_stride;
-
-        ValueType ratio(0), grad_x(0), grad_y(0), grad_z(0);
-        PRAGMA_OFFLOAD("omp parallel for reduction(+: ratio, grad_x, grad_y, grad_z)")
-        for (size_t j = first_cplx; j < last_cplx; j++)
-        {
-          const size_t psiIndex = first_spo_local + j;
-
-          out_phi[psiIndex]    = psi[psiIndex];
-          out_dphi_x[psiIndex] = dpsi_x[psiIndex];
-          out_dphi_y[psiIndex] = dpsi_y[psiIndex];
-          out_dphi_z[psiIndex] = dpsi_z[psiIndex];
-          out_d2phi[psiIndex]  = d2psi[psiIndex];
-
-          ratio += psi[psiIndex] * invRow_iw_ptr[psiIndex];
-          grad_x += dpsi_x[psiIndex] * invRow_iw_ptr[psiIndex];
-          grad_y += dpsi_y[psiIndex] * invRow_iw_ptr[psiIndex];
-          grad_z += dpsi_z[psiIndex] * invRow_iw_ptr[psiIndex];
+    for (int iw = 0; iw < num_pos; iw++) { // sum the per-team partials
+        ValueType ratio(0);
+        for (int team_id = 0; team_id < NumTeams; team_id++)
+            ratio += rg_private[iw][team_id * 4];
+        ratios[iw] = ratio; // NOTE(review): grads below divide by it unchecked
+
+        ValueType grad_x(0), grad_y(0), grad_z(0);
+        for (int team_id = 0; team_id < NumTeams; team_id++) {
+            grad_x += rg_private[iw][team_id * 4 + 1];
+            grad_y += rg_private[iw][team_id * 4 + 2];
+            grad_z += rg_private[iw][team_id * 4 + 3];
         }
-
-        rg_private_ptr[(iw * NumTeams + team_id) * 4]     = ratio;
-        rg_private_ptr[(iw * NumTeams + team_id) * 4 + 1] = grad_x;
-        rg_private_ptr[(iw * NumTeams + team_id) * 4 + 2] = grad_y;
-        rg_private_ptr[(iw * NumTeams + team_id) * 4 + 3] = grad_z;
-      }
-  }
-
-  for (int iw = 0; iw < num_pos; iw++)
-  {
-    ValueType ratio(0);
-    for (int team_id = 0; team_id < NumTeams; team_id++)
-      ratio += rg_private[iw][team_id * 4];
-    ratios[iw] = ratio;
-
-    ValueType grad_x(0), grad_y(0), grad_z(0);
-    for (int team_id = 0; team_id < NumTeams; team_id++)
-    {
-      grad_x += rg_private[iw][team_id * 4 + 1];
-      grad_y += rg_private[iw][team_id * 4 + 2];
-      grad_z += rg_private[iw][team_id * 4 + 3];
+        grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
     }
-    grads[iw] = GradType{grad_x / ratio, grad_y / ratio, grad_z / ratio};
-  }
 }
-template<typename ST>
-void SplineC2COMPTargetT<ST>::assign_vgh(const PointType& r,
-                                        ValueVector& psi,
-                                        GradVector& dpsi,
-                                        HessVector& grad_grad_psi,
-                                        int first,
-                                        int last) const
+
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::assign_vgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, int first, int last) const
 {
-  // protect last
-  last = last > this->kPoints.size() ? this->kPoints.size() : last;
-
-  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-           g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-           g22 = PrimLattice.G(8);
-  const ST x = r[0], y = r[1], z = r[2];
-
-  const ST* restrict k0 = myKcart->data(0);
-  const ST* restrict k1 = myKcart->data(1);
-  const ST* restrict k2 = myKcart->data(2);
-
-  const ST* restrict g0  = myG.data(0);
-  const ST* restrict g1  = myG.data(1);
-  const ST* restrict g2  = myG.data(2);
-  const ST* restrict h00 = myH.data(0);
-  const ST* restrict h01 = myH.data(1);
-  const ST* restrict h02 = myH.data(2);
-  const ST* restrict h11 = myH.data(3);
-  const ST* restrict h12 = myH.data(4);
-  const ST* restrict h22 = myH.data(5);
+    // protect last
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart->data(0);
+    const ST* restrict k1 = myKcart->data(1);
+    const ST* restrict k2 = myKcart->data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
 
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-
-    const ST h_xx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
-    const ST h_xy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
-    const ST h_xz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
-    const ST h_yx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
-    const ST h_yy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
-    const ST h_yz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
-    const ST h_zx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
-    const ST h_zy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
-    const ST h_zz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
-
-    const ST h_xx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
-    const ST h_xy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
-    const ST h_xz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
-    const ST h_yx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
-    const ST h_yy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
-    const ST h_yz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
-    const ST h_zx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
-    const ST h_zy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
-    const ST h_zz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
-
-    grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
-    grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][3] = ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
-    grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
-    grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][6] = ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
-    grad_grad_psi[psiIndex][7] = ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
-    grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
-  }
+    for (size_t j = first; j < last; ++j) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+
+        const ST h_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g00, g01, g02) +
+            kX * (gX_i + dX_i);
+        const ST h_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g10, g11, g12) +
+            kX * (gY_i + dY_i);
+        const ST h_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g20, g21, g22) +
+            kX * (gZ_i + dZ_i);
+        const ST h_yx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g00, g01, g02) +
+            kY * (gX_i + dX_i);
+        const ST h_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g10, g11, g12) +
+            kY * (gY_i + dY_i);
+        const ST h_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g20, g21, g22) +
+            kY * (gZ_i + dZ_i);
+        const ST h_zx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g00, g01, g02) +
+            kZ * (gX_i + dX_i);
+        const ST h_zy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g10, g11, g12) +
+            kZ * (gY_i + dY_i);
+        const ST h_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g20, g21, g22) +
+            kZ * (gZ_i + dZ_i);
+
+        const ST h_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g00, g01, g02) -
+            kX * (gX_r + dX_r);
+        const ST h_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g10, g11, g12) -
+            kX * (gY_r + dY_r);
+        const ST h_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g20, g21, g22) -
+            kX * (gZ_r + dZ_r);
+        const ST h_yx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g00, g01, g02) -
+            kY * (gX_r + dX_r);
+        const ST h_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g10, g11, g12) -
+            kY * (gY_r + dY_r);
+        const ST h_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g20, g21, g22) -
+            kY * (gZ_r + dZ_r);
+        const ST h_zx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g00, g01, g02) -
+            kZ * (gX_r + dX_r);
+        const ST h_zy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g10, g11, g12) -
+            kZ * (gY_r + dY_r);
+        const ST h_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g20, g21, g22) -
+            kZ * (gZ_r + dZ_r);
+
+        grad_grad_psi[psiIndex][0] =
+            ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
+        grad_grad_psi[psiIndex][1] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][2] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][3] =
+            ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
+        grad_grad_psi[psiIndex][4] =
+            ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
+        grad_grad_psi[psiIndex][5] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][6] =
+            ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
+        grad_grad_psi[psiIndex][7] =
+            ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
+        grad_grad_psi[psiIndex][8] =
+            ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateVGH(const ParticleSetT<ComplexT>& P,
-                                         const int iat,
-                                         ValueVector& psi,
-                                         GradVector& dpsi,
-                                         HessVector& grad_grad_psi)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateVGH(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi,
+    HessVector& grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
-    assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Factor of 2 because psi is complex while the spline storage and
+        // evaluation use a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d_vgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
+        assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::assign_vghgh(const PointType& r,
-                                          ValueVector& psi,
-                                          GradVector& dpsi,
-                                          HessVector& grad_grad_psi,
-                                          GGGVector& grad_grad_grad_psi,
-                                          int first,
-                                          int last) const
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::assign_vghgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi,
+    int first, int last) const
 {
-  // protect last
-  last = last < 0 ? this->kPoints.size() : (last > this->kPoints.size() ? this->kPoints.size() : last);
-
-  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-           g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-           g22 = PrimLattice.G(8);
-  const ST x = r[0], y = r[1], z = r[2];
-
-  const ST* restrict k0 = myKcart->data(0);
-  const ST* restrict k1 = myKcart->data(1);
-  const ST* restrict k2 = myKcart->data(2);
-
-  const ST* restrict g0  = myG.data(0);
-  const ST* restrict g1  = myG.data(1);
-  const ST* restrict g2  = myG.data(2);
-  const ST* restrict h00 = myH.data(0);
-  const ST* restrict h01 = myH.data(1);
-  const ST* restrict h02 = myH.data(2);
-  const ST* restrict h11 = myH.data(3);
-  const ST* restrict h12 = myH.data(4);
-  const ST* restrict h22 = myH.data(5);
-
-  const ST* restrict gh000 = mygH.data(0);
-  const ST* restrict gh001 = mygH.data(1);
-  const ST* restrict gh002 = mygH.data(2);
-  const ST* restrict gh011 = mygH.data(3);
-  const ST* restrict gh012 = mygH.data(4);
-  const ST* restrict gh022 = mygH.data(5);
-  const ST* restrict gh111 = mygH.data(6);
-  const ST* restrict gh112 = mygH.data(7);
-  const ST* restrict gh122 = mygH.data(8);
-  const ST* restrict gh222 = mygH.data(9);
-
-//SIMD doesn't work quite right yet.  Comment out until further debugging.
+    // protect last
+    last = last < 0 ?
+        this->kPoints.size() :
+        (last > this->kPoints.size() ? this->kPoints.size() : last);
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart->data(0);
+    const ST* restrict k1 = myKcart->data(1);
+    const ST* restrict k2 = myKcart->data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
+
+    const ST* restrict gh000 = mygH.data(0);
+    const ST* restrict gh001 = mygH.data(1);
+    const ST* restrict gh002 = mygH.data(2);
+    const ST* restrict gh011 = mygH.data(3);
+    const ST* restrict gh012 = mygH.data(4);
+    const ST* restrict gh022 = mygH.data(5);
+    const ST* restrict gh111 = mygH.data(6);
+    const ST* restrict gh112 = mygH.data(7);
+    const ST* restrict gh122 = mygH.data(8);
+    const ST* restrict gh222 = mygH.data(9);
+
+// SIMD doesn't work quite right yet. NOTE(review): pragma is still active.
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-
-    //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
-    const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
-
-    const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
-
-    const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
-    const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
-    const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
-    const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
-    const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
-    const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
-
-    const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
-    const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
-    const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
-    const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
-    const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
-    const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
-
-    grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
-    grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][3] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
-    grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][6] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][7] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
-
-    //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
-    // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
-
-    const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
-    const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
-    const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
-    const ST gh_xxy_r =
-        f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
-    const ST gh_xxy_i =
-        f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
-    const ST gh_xxz_r =
-        f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
-    const ST gh_xxz_i =
-        f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
-    const ST gh_xyy_r =
-        f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
-    const ST gh_xyy_i =
-        f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
-    const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
-        (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
-    const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
-        (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
-    const ST gh_xzz_r =
-        f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
-    const ST gh_xzz_i =
-        f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
-    const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
-    const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
-    const ST gh_yyz_r =
-        f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
-    const ST gh_yyz_i =
-        f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
-    const ST gh_yzz_r =
-        f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
-    const ST gh_yzz_i =
-        f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
-    const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
-    const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
-
-    grad_grad_grad_psi[psiIndex][0][0] = ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
-    grad_grad_grad_psi[psiIndex][0][1] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][0][2] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][0][3] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][0][4] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][0][5] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][0][6] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][0][7] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][0][8] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-
-    grad_grad_grad_psi[psiIndex][1][0] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][1][1] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][1][2] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][1][3] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][1][4] = ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
-    grad_grad_grad_psi[psiIndex][1][5] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][1][6] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][1][7] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][1][8] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-
-
-    grad_grad_grad_psi[psiIndex][2][0] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][2][1] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][2][2] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-    grad_grad_grad_psi[psiIndex][2][3] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][2][4] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][2][5] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-    grad_grad_grad_psi[psiIndex][2][6] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-    grad_grad_grad_psi[psiIndex][2][7] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-    grad_grad_grad_psi[psiIndex][2][8] = ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
-  }
+    for (size_t j = first; j < last; ++j) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        omptarget::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+
+        // intermediates for computation of hessian. \partial_i \partial_j phi
+        // in cartesian coordinates.
+        const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g20, g21, g22, g20, g21, g22);
+
+        const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g20, g21, g22, g20, g21, g22);
+
+        const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
+        const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
+        const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
+        const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
+        const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
+        const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
+
+        const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
+        const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
+        const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
+        const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
+        const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
+        const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
+
+        grad_grad_psi[psiIndex][0] =
+            ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
+        grad_grad_psi[psiIndex][1] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][2] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][3] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][4] =
+            ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
+        grad_grad_psi[psiIndex][5] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][6] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][7] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][8] =
+            ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
+
+        // These are the real and imaginary components of the third SPO
+        // derivative.  _xxx denotes the
+        // third derivative w.r.t. x, _xyz a derivative with respect to x, y,
+        // and z, and so on.
+
+        const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        // Here is where we build up the components of the physical hessian
+        // gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r))
+        const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r -
+            kX * kX * kX * val_i;
+        const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i +
+            kX * kX * kX * val_r;
+        const ST gh_xxy_r = f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) -
+            (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
+        const ST gh_xxy_i = f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) -
+            (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
+        const ST gh_xxz_r = f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) -
+            (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
+        const ST gh_xxz_i = f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) -
+            (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
+        const ST gh_xyy_r = f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) -
+            (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
+        const ST gh_xyy_i = f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) -
+            (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
+        const ST gh_xyz_r = f3_xyz_r +
+            (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
+            (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) -
+            kX * kY * kZ * val_i;
+        const ST gh_xyz_i = f3_xyz_i -
+            (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
+            (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) +
+            kX * kY * kZ * val_r;
+        const ST gh_xzz_r = f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) -
+            (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
+        const ST gh_xzz_i = f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) -
+            (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
+        const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r -
+            kY * kY * kY * val_i;
+        const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i +
+            kY * kY * kY * val_r;
+        const ST gh_yyz_r = f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) -
+            (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
+        const ST gh_yyz_i = f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) -
+            (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
+        const ST gh_yzz_r = f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) -
+            (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
+        const ST gh_yzz_i = f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) -
+            (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
+        const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r -
+            kZ * kZ * kZ * val_i;
+        const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i +
+            kZ * kZ * kZ * val_r;
+
+        grad_grad_grad_psi[psiIndex][0][0] =
+            ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
+        grad_grad_grad_psi[psiIndex][0][1] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][0][2] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][0][3] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][0][4] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][0][5] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][0][6] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][0][7] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][0][8] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+
+        grad_grad_grad_psi[psiIndex][1][0] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][1][1] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][1][2] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][1][3] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][1][4] =
+            ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
+        grad_grad_grad_psi[psiIndex][1][5] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][1][6] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][1][7] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][1][8] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+
+        grad_grad_grad_psi[psiIndex][2][0] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][2][1] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][2][2] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+        grad_grad_grad_psi[psiIndex][2][3] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][2][4] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][2][5] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+        grad_grad_grad_psi[psiIndex][2][6] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+        grad_grad_grad_psi[psiIndex][2][7] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+        grad_grad_grad_psi[psiIndex][2][8] =
+            ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluateVGHGH(const ParticleSetT<ComplexT>& P,
-                                           const int iat,
-                                           ValueVector& psi,
-                                           GradVector& dpsi,
-                                           HessVector& grad_grad_psi,
-                                           GGGVector& grad_grad_grad_psi)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluateVGHGH(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi,
+    HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 #pragma omp parallel
-  {
-    int first, last;
-    FairDivideAligned(2 * psi.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
-    assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d_vghgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
+        assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2,
+            last / 2);
+    }
 }
 
-template<typename ST>
-void SplineC2COMPTargetT<ST>::evaluate_notranspose(const ParticleSetT<ComplexT>& P,
-                                                  int first,
-                                                  int last,
-                                                  ValueMatrix& logdet,
-                                                  GradMatrix& dlogdet,
-                                                  ValueMatrix& d2logdet)
+template <typename ST, typename VT>
+void
+SplineC2COMPTargetT<ST, VT>::evaluate_notranspose(const ParticleSetT<VT>& P,
+    int first, int last, ValueMatrix& logdet, GradMatrix& dlogdet,
+    ValueMatrix& d2logdet)
 {
-  // chunk the [first, last) loop into blocks to save temporary memory usage
-  const int block_size = 16;
-
-  // reference vectors refer to the rows of matrices
-  std::vector<ValueVector> multi_psi_v;
-  std::vector<GradVector> multi_dpsi_v;
-  std::vector<ValueVector> multi_d2psi_v;
-  RefVector<ValueVector> psi_v_list;
-  RefVector<GradVector> dpsi_v_list;
-  RefVector<ValueVector> d2psi_v_list;
-
-  multi_psi_v.reserve(block_size);
-  multi_dpsi_v.reserve(block_size);
-  multi_d2psi_v.reserve(block_size);
-  psi_v_list.reserve(block_size);
-  dpsi_v_list.reserve(block_size);
-  d2psi_v_list.reserve(block_size);
-
-  for (int iat = first, i = 0; iat < last; iat += block_size, i += block_size)
-  {
-    const int actual_block_size = std::min(last - iat, block_size);
-    multi_pos_copy.resize(actual_block_size * 6);
-    multi_psi_v.clear();
-    multi_dpsi_v.clear();
-    multi_d2psi_v.clear();
-    psi_v_list.clear();
-    dpsi_v_list.clear();
-    d2psi_v_list.clear();
-
-    for (int ipos = 0; ipos < actual_block_size; ++ipos)
-    {
-      // pack particle positions
-      const PointType& r = P.activeR(iat + ipos);
-      PointType ru(PrimLattice.toUnit_floor(r));
-      multi_pos_copy[ipos * 6]     = r[0];
-      multi_pos_copy[ipos * 6 + 1] = r[1];
-      multi_pos_copy[ipos * 6 + 2] = r[2];
-      multi_pos_copy[ipos * 6 + 3] = ru[0];
-      multi_pos_copy[ipos * 6 + 4] = ru[1];
-      multi_pos_copy[ipos * 6 + 5] = ru[2];
-
-      multi_psi_v.emplace_back(logdet[i + ipos], logdet.cols());
-      multi_dpsi_v.emplace_back(dlogdet[i + ipos], dlogdet.cols());
-      multi_d2psi_v.emplace_back(d2logdet[i + ipos], d2logdet.cols());
-
-      psi_v_list.push_back(multi_psi_v[ipos]);
-      dpsi_v_list.push_back(multi_dpsi_v[ipos]);
-      d2psi_v_list.push_back(multi_d2psi_v[ipos]);
-    }
+    // chunk the [first, last) loop into blocks to save temporary memory usage
+    const int block_size = 16;
+
+    // reference vectors refer to the rows of matrices
+    std::vector<ValueVector> multi_psi_v;
+    std::vector<GradVector> multi_dpsi_v;
+    std::vector<ValueVector> multi_d2psi_v;
+    RefVector<ValueVector> psi_v_list;
+    RefVector<GradVector> dpsi_v_list;
+    RefVector<ValueVector> d2psi_v_list;
+
+    multi_psi_v.reserve(block_size);
+    multi_dpsi_v.reserve(block_size);
+    multi_d2psi_v.reserve(block_size);
+    psi_v_list.reserve(block_size);
+    dpsi_v_list.reserve(block_size);
+    d2psi_v_list.reserve(block_size);
+
+    for (int iat = first, i = 0; iat < last;
+         iat += block_size, i += block_size) {
+        const int actual_block_size = std::min(last - iat, block_size);
+        multi_pos_copy.resize(actual_block_size * 6);
+        multi_psi_v.clear();
+        multi_dpsi_v.clear();
+        multi_d2psi_v.clear();
+        psi_v_list.clear();
+        dpsi_v_list.clear();
+        d2psi_v_list.clear();
+
+        for (int ipos = 0; ipos < actual_block_size; ++ipos) {
+            // pack particle positions
+            const PointType& r = P.activeR(iat + ipos);
+            PointType ru(PrimLattice.toUnit_floor(r));
+            multi_pos_copy[ipos * 6] = r[0];
+            multi_pos_copy[ipos * 6 + 1] = r[1];
+            multi_pos_copy[ipos * 6 + 2] = r[2];
+            multi_pos_copy[ipos * 6 + 3] = ru[0];
+            multi_pos_copy[ipos * 6 + 4] = ru[1];
+            multi_pos_copy[ipos * 6 + 5] = ru[2];
+
+            multi_psi_v.emplace_back(logdet[i + ipos], logdet.cols());
+            multi_dpsi_v.emplace_back(dlogdet[i + ipos], dlogdet.cols());
+            multi_d2psi_v.emplace_back(d2logdet[i + ipos], d2logdet.cols());
+
+            psi_v_list.push_back(multi_psi_v[ipos]);
+            dpsi_v_list.push_back(multi_dpsi_v[ipos]);
+            d2psi_v_list.push_back(multi_d2psi_v[ipos]);
+        }
 
-    evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
-  }
+        evaluateVGLMultiPos(multi_pos_copy, offload_scratch, results_scratch,
+            psi_v_list, dpsi_v_list, d2psi_v_list);
+    }
 }
 
-template class SplineC2COMPTargetT<float>;
-template class SplineC2COMPTargetT<double>;
+template class SplineC2COMPTargetT<float, std::complex<float>>;
+template class SplineC2COMPTargetT<float, std::complex<double>>;
+template class SplineC2COMPTargetT<double, std::complex<float>>;
+template class SplineC2COMPTargetT<double, std::complex<double>>;
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.h
index a1c7a2cd2f..86c20dfd5d 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.h
@@ -1,6 +1,6 @@
-  //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2020 QMCPACK developers.
 //
@@ -9,316 +9,369 @@
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////
 
-
-/** @file SplineC2COMPTargetT.h
+/** @file SplineC2COMPTargetT.h
  *
- * class to handle complex splines to complex orbitals with splines of arbitrary precision
- * splines storage and computation is offloaded to accelerators using OpenMP target
+ * class to handle complex splines to complex orbitals with splines of arbitrary
+ * precision. Spline storage and computation are offloaded to accelerators
+ * using OpenMP target.
  */
-#ifndef QMCPLUSPLUS_SPLINE_C2C_OMPTARGET_H
-#define QMCPLUSPLUS_SPLINE_C2C_OMPTARGET_H
+#ifndef QMCPLUSPLUS_SPLINE_C2C_OMPTARGETT_H
+#define QMCPLUSPLUS_SPLINE_C2C_OMPTARGETT_H
 
-#include <memory>
-#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
-#include "OhmmsSoA/VectorSoaContainer.h"
-#include "spline2/MultiBspline.hpp"
 #include "OMPTarget/OffloadAlignedAllocators.hpp"
+#include "OhmmsSoA/VectorSoaContainer.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
+#include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
+#include "SplineOMPTargetMultiWalkerMem.h"
 #include "Utilities/FairDivide.h"
 #include "Utilities/TimerManager.h"
+#include "spline2/MultiBspline.hpp"
 #include <ResourceHandle.h>
-#include "SplineOMPTargetMultiWalkerMem.h"
+
+#include <memory>
 
 namespace qmcplusplus
 {
-/** class to match std::complex<ST> spline with BsplineSetT<ComplexT>::ValueType (complex) SPOs with OpenMP offload
+/** class to match std::complex<ST> spline with BsplineSetT<VT>::ValueType
+ * (complex) SPOs with OpenMP offload
  * @tparam ST precision of spline
  *
  * Requires temporage storage and multiplication of phase vectors
- * The internal storage of complex spline coefficients uses double sized real arrays of ST type, aligned and padded.
- * All the output orbitals are complex.
+ * The internal storage of complex spline coefficients uses double sized real
+ * arrays of ST type, aligned and padded. All the output orbitals are complex.
  */
-template<typename ST>
-class SplineC2COMPTargetT : public BsplineSetT<std::complex<ST>>
+template <typename ST, typename VT>
+class SplineC2COMPTargetT : public BsplineSetT<VT>
 {
 public:
-  using SplineType       = typename bspline_traits<ST, 3>::SplineType;
-  using BCType           = typename bspline_traits<ST, 3>::BCType;
-  using DataType         = ST;
-  using PointType        = TinyVector<ST, 3>;
-  using SingleSplineType = UBspline_3d_d;
-  // types for evaluation results
-  using ComplexT = std::complex<ST>;
-  using GGGVector = typename BsplineSetT<ComplexT>::GGGVector;
-  using GradType = typename BsplineSetT<ComplexT>::GradType;
-  using GradVector = typename BsplineSetT<ComplexT>::GradVector;
-  using GradMatrix = typename BsplineSetT<ComplexT>::GradMatrix;
-  using HessVector = typename BsplineSetT<ComplexT>::HessVector;
-  using ValueVector = typename BsplineSetT<ComplexT>::ValueVector;
-  using ValueMatrix = typename BsplineSetT<ComplexT>::ValueMatrix;
-  using OffloadMWVGLArray = Array<ComplexT, 3, OffloadPinnedAllocator<ComplexT>>; 
-
-  using vContainer_type  = Vector<ST, aligned_allocator<ST>>;
-  using gContainer_type  = VectorSoaContainer<ST, 3>;
-  using hContainer_type  = VectorSoaContainer<ST, 6>;
-  using ghContainer_type = VectorSoaContainer<ST, 10>;
-
-  using RealType  = typename SPOSetT<ComplexT>::RealType;
-  using ValueType = typename SPOSetT<ComplexT>::ValueType;
-
-  using SPOSet = SPOSetT<ComplexT>;
-  using SplineC2COMPTarget = SplineC2COMPTargetT<ST>;
-
-  template<typename DT>
-  using OffloadVector = Vector<DT, OffloadAllocator<DT>>;
-  template<typename DT>
-  using OffloadPosVector = VectorSoaContainer<DT, 3, OffloadAllocator<DT>>;
+    using SplineType = typename bspline_traits<ST, 3>::SplineType;
+    using BCType = typename bspline_traits<ST, 3>::BCType;
+    using DataType = ST;
+    using PointType = TinyVector<ST, 3>;
+    using SingleSplineType = UBspline_3d_d;
+    // types for evaluation results
+    using ComplexT = typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::RealType;
+    using typename BsplineSetT<VT>::GradType;
+    using typename BsplineSetT<VT>::GGGVector;
+    using typename BsplineSetT<VT>::GradVector;
+    using typename BsplineSetT<VT>::GradMatrix;
+    using typename BsplineSetT<VT>::HessVector;
+    using typename BsplineSetT<VT>::ValueVector;
+    using typename BsplineSetT<VT>::ValueMatrix;
+    using typename BsplineSetT<VT>::OffloadMWVGLArray;
+
+    using vContainer_type = Vector<ST, aligned_allocator<ST>>;
+    using gContainer_type = VectorSoaContainer<ST, 3>;
+    using hContainer_type = VectorSoaContainer<ST, 6>;
+    using ghContainer_type = VectorSoaContainer<ST, 10>;
+
+    template <typename DT>
+    using OffloadVector = Vector<DT, OffloadAllocator<DT>>;
+    template <typename DT>
+    using OffloadPosVector = VectorSoaContainer<DT, 3, OffloadAllocator<DT>>;
 
 private:
-  /// timer for offload portion
-  NewTimer& offload_timer_;
-  ///primitive cell
-  CrystalLattice<ST, 3> PrimLattice;
-  ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to CartesianUnit, e.g. Hessian
-  Tensor<ST, 3> GGt;
-  ///multi bspline set
-  std::shared_ptr<MultiBspline<ST, OffloadAllocator<ST>, OffloadAllocator<SplineType>>> SplineInst;
-
-  std::shared_ptr<OffloadVector<ST>> mKK;
-  std::shared_ptr<OffloadPosVector<ST>> myKcart;
-  std::shared_ptr<OffloadVector<ST>> GGt_offload;
-  std::shared_ptr<OffloadVector<ST>> PrimLattice_G_offload;
-
-  ResourceHandle<SplineOMPTargetMultiWalkerMem<ST, ComplexT>> mw_mem_handle_;
-
-  ///team private ratios for reduction, numVP x numTeams
-  Matrix<ComplexT, OffloadPinnedAllocator<ComplexT>> ratios_private;
-  ///offload scratch space, dynamically resized to the maximal need
-  Vector<ST, OffloadPinnedAllocator<ST>> offload_scratch;
-  ///result scratch space, dynamically resized to the maximal need
-  Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> results_scratch;
-  ///psiinv and position scratch space, used to avoid allocation on the fly and faster transfer
-  Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> psiinv_pos_copy;
-  ///position scratch space, used to avoid allocation on the fly and faster transfer
-  Vector<ST, OffloadPinnedAllocator<ST>> multi_pos_copy;
-
-  void evaluateVGLMultiPos(const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos_copy,
-                           Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
-                           Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
-                           const RefVector<ValueVector>& psi_v_list,
-                           const RefVector<GradVector>& dpsi_v_list,
-                           const RefVector<ValueVector>& d2psi_v_list) const;
+    /// timer for offload portion
+    NewTimer& offload_timer_;
+    /// primitive cell
+    CrystalLattice<ST, 3> PrimLattice;
+    ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to
+    /// CartesianUnit, e.g. Hessian
+    Tensor<ST, 3> GGt;
+    /// multi bspline set
+    std::shared_ptr<
+        MultiBspline<ST, OffloadAllocator<ST>, OffloadAllocator<SplineType>>>
+        SplineInst;
+
+    std::shared_ptr<OffloadVector<ST>> mKK;
+    std::shared_ptr<OffloadPosVector<ST>> myKcart;
+    std::shared_ptr<OffloadVector<ST>> GGt_offload;
+    std::shared_ptr<OffloadVector<ST>> PrimLattice_G_offload;
+
+    ResourceHandle<SplineOMPTargetMultiWalkerMem<ST, ComplexT>> mw_mem_handle_;
+
+    /// team private ratios for reduction, numVP x numTeams
+    Matrix<ComplexT, OffloadPinnedAllocator<ComplexT>> ratios_private;
+    /// offload scratch space, dynamically resized to the maximal need
+    Vector<ST, OffloadPinnedAllocator<ST>> offload_scratch;
+    /// result scratch space, dynamically resized to the maximal need
+    Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> results_scratch;
+    /// psiinv and position scratch space, used to avoid allocation on the fly
+    /// and faster transfer
+    Vector<ComplexT, OffloadPinnedAllocator<ComplexT>> psiinv_pos_copy;
+    /// position scratch space, used to avoid allocation on the fly and faster
+    /// transfer
+    Vector<ST, OffloadPinnedAllocator<ST>> multi_pos_copy;
+
+    void
+    evaluateVGLMultiPos(
+        const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos_copy,
+        Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
+        Vector<ComplexT, OffloadPinnedAllocator<ComplexT>>& results_scratch,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list) const;
 
 protected:
-  /// intermediate result vectors
-  vContainer_type myV;
-  vContainer_type myL;
-  gContainer_type myG;
-  hContainer_type myH;
-  ghContainer_type mygH;
+    /// intermediate result vectors
+    vContainer_type myV;
+    vContainer_type myL;
+    gContainer_type myG;
+    hContainer_type myH;
+    ghContainer_type mygH;
 
 public:
-  SplineC2COMPTargetT(const std::string& my_name)
-      : BsplineSetT<ComplexT>(my_name),
-        offload_timer_(createGlobalTimer("SplineC2COMPTargetT::offload", timer_level_fine)),
+    SplineC2COMPTargetT(const std::string& my_name) :
+        BsplineSetT<VT>(my_name),
+        offload_timer_(
+            createGlobalTimer("SplineC2COMPTarget::offload", timer_level_fine)),
         GGt_offload(std::make_shared<OffloadVector<ST>>(9)),
         PrimLattice_G_offload(std::make_shared<OffloadVector<ST>>(9))
-  {}
-
-  SplineC2COMPTargetT(const SplineC2COMPTargetT& in);
-
-  virtual std::string getClassName() const override { return "SplineC2COMPTargetT"; }
-  virtual std::string getKeyword() const override { return "SplineC2C"; }
-  bool isComplex() const override { return true; };
-  virtual bool isOMPoffload() const override { return true; }
-
-  void createResource(ResourceCollection& collection) const override
-  {
-    auto resource_index = collection.addResource(std::make_unique<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>());
-  }
-
-  void acquireResource(ResourceCollection& collection, const RefVectorWithLeader<SPOSet>& spo_list) const override
-  {
-    assert(this == &spo_list.getLeader());
-    auto& phi_leader          = spo_list.template getCastedLeader<SplineC2COMPTarget>();
-    phi_leader.mw_mem_handle_ = collection.lendResource<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>();
-  }
-
-  void releaseResource(ResourceCollection& collection, const RefVectorWithLeader<SPOSet>& spo_list) const override
-  {
-    assert(this == &spo_list.getLeader());
-    auto& phi_leader = spo_list.template getCastedLeader<SplineC2COMPTarget>();
-    collection.takebackResource(phi_leader.mw_mem_handle_);
-  }
-
-  std::unique_ptr<SPOSet> makeClone() const override { return std::make_unique<SplineC2COMPTargetT>(*this); }
-
-  inline void resizeStorage(size_t n, size_t nvals)
-  {
-    BsplineSetT<ComplexT>::init_base(n);
-    size_t npad = getAlignedSize<ST>(2 * n);
-    myV.resize(npad);
-    myG.resize(npad);
-    myL.resize(npad);
-    myH.resize(npad);
-    mygH.resize(npad);
-  }
-
-  void bcast_tables(Communicate* comm) { chunked_bcast(comm, SplineInst->getSplinePtr()); }
-
-  void gather_tables(Communicate* comm)
-  {
-    if (comm->size() == 1)
-      return;
-    const int Nbands      = this->kPoints.size();
-    const int Nbandgroups = comm->size();
-    this->offset.resize(Nbandgroups + 1, 0);
-    FairDivideLow(Nbands, Nbandgroups, this->offset);
-
-    for (size_t ib = 0; ib < this->offset.size(); ib++)
-      this->offset[ib] *= 2;
-    gatherv(comm, SplineInst->getSplinePtr(), SplineInst->getSplinePtr()->z_stride, this->offset);
-  }
-
-  template<typename GT, typename BCT>
-  void create_spline(GT& xyz_g, BCT& xyz_bc)
-  {
-    resize_kpoints();
-    SplineInst = std::make_shared<MultiBspline<ST, OffloadAllocator<ST>, OffloadAllocator<SplineType>>>();
-    SplineInst->create(xyz_g, xyz_bc, myV.size());
-
-    app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20) << " MB allocated "
-              << "for the coefficients in 3D spline orbital representation" << std::endl;
-  }
-
-  /// this routine can not be called from threaded region
-  void finalizeConstruction() override
-  {
-    // map the SplineInst->getSplinePtr() structure to GPU
-    auto* MultiSpline    = SplineInst->getSplinePtr();
-    auto* restrict coefs = MultiSpline->coefs;
-    // attach pointers on the device to achieve deep copy
-    PRAGMA_OFFLOAD("omp target map(always, to: MultiSpline[0:1], coefs[0:MultiSpline->coefs_size])")
     {
-      MultiSpline->coefs = coefs;
     }
 
-    // transfer static data to GPU
-    auto* mKK_ptr = mKK->data();
-    PRAGMA_OFFLOAD("omp target update to(mKK_ptr[0:mKK->size()])")
-    auto* myKcart_ptr = myKcart->data();
-    PRAGMA_OFFLOAD("omp target update to(myKcart_ptr[0:myKcart->capacity()*3])")
-    for (size_t i = 0; i < 9; i++)
+    SplineC2COMPTargetT(const SplineC2COMPTargetT& in);
+
+    virtual std::string
+    getClassName() const override
     {
-      (*GGt_offload)[i]           = GGt[i];
-      (*PrimLattice_G_offload)[i] = PrimLattice.G[i];
+        return "SplineC2COMPTarget";
     }
-    auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
-    PRAGMA_OFFLOAD("omp target update to(PrimLattice_G_ptr[0:9])")
-    auto* GGt_ptr = GGt_offload->data();
-    PRAGMA_OFFLOAD("omp target update to(GGt_ptr[0:9])")
-  }
-
-  inline void flush_zero() { SplineInst->flush_zero(); }
-
-  /** remap kPoints to pack the double copy */
-  inline void resize_kpoints()
-  {
-    const size_t nk = this->kPoints.size();
-    mKK             = std::make_shared<OffloadVector<ST>>(nk);
-    myKcart         = std::make_shared<OffloadPosVector<ST>>(nk);
-    for (size_t i = 0; i < nk; ++i)
+    virtual std::string
+    getKeyword() const override
     {
-      (*mKK)[i]     = -dot(this->kPoints[i], this->kPoints[i]);
-      (*myKcart)(i) = this->kPoints[i];
+        return "SplineC2C";
+    }
+    bool
+    isComplex() const override
+    {
+        return true;
+    };
+    virtual bool
+    isOMPoffload() const override
+    {
+        return true;
+    }
+
+    void
+    createResource(ResourceCollection& collection) const override
+    {
+        auto resource_index = collection.addResource(
+            std::make_unique<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>());
+    }
+
+    void
+    acquireResource(ResourceCollection& collection,
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list) const override
+    {
+        assert(this == &spo_list.getLeader());
+        auto& phi_leader =
+            spo_list.template getCastedLeader<SplineC2COMPTargetT>();
+        phi_leader.mw_mem_handle_ =
+            collection
+                .lendResource<SplineOMPTargetMultiWalkerMem<ST, ComplexT>>();
+    }
+
+    void
+    releaseResource(ResourceCollection& collection,
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list) const override
+    {
+        assert(this == &spo_list.getLeader());
+        auto& phi_leader =
+            spo_list.template getCastedLeader<SplineC2COMPTargetT>();
+        collection.takebackResource(phi_leader.mw_mem_handle_);
+    }
+
+    std::unique_ptr<SPOSetT<VT>>
+    makeClone() const override
+    {
+        return std::make_unique<SplineC2COMPTargetT>(*this);
+    }
+
+    inline void
+    resizeStorage(size_t n, size_t nvals)
+    {
+        this->init_base(n);
+        size_t npad = getAlignedSize<ST>(2 * n);
+        myV.resize(npad);
+        myG.resize(npad);
+        myL.resize(npad);
+        myH.resize(npad);
+        mygH.resize(npad);
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    {
+        chunked_bcast(comm, SplineInst->getSplinePtr());
+    }
+
+    void
+    gather_tables(Communicate* comm)
+    {
+        if (comm->size() == 1)
+            return;
+        const int Nbands = this->kPoints.size();
+        const int Nbandgroups = comm->size();
+        this->offset.resize(Nbandgroups + 1, 0);
+        FairDivideLow(Nbands, Nbandgroups, this->offset);
+
+        for (size_t ib = 0; ib < this->offset.size(); ib++)
+            this->offset[ib] *= 2;
+        gatherv(comm, SplineInst->getSplinePtr(),
+            SplineInst->getSplinePtr()->z_stride, this->offset);
     }
-  }
-
-  void set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i, int twist, int ispline, int level);
-
-  bool read_splines(hdf_archive& h5f);
-
-  bool write_splines(hdf_archive& h5f);
-
-  void assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi, int first, int last) const;
-
-  virtual void evaluateValue(const ParticleSetT<ComplexT>& P, const int iat, ValueVector& psi) override;
-
-  virtual void evaluateDetRatios(const VirtualParticleSetT<ComplexT>& VP,
-                                 ValueVector& psi,
-                                 const ValueVector& psiinv,
-                                 std::vector<ValueType>& ratios) override;
-
-  virtual void mw_evaluateDetRatios(const RefVectorWithLeader<SPOSet>& spo_list,
-                                    const RefVectorWithLeader<const VirtualParticleSetT<ComplexT>>& vp_list,
-                                    const RefVector<ValueVector>& psi_list,
-                                    const std::vector<const ValueType*>& invRow_ptr_list,
-                                    std::vector<std::vector<ValueType>>& ratios_list) const override;
-
-  /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-  void assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi);
-
-  virtual void evaluateVGL(const ParticleSetT<ComplexT>& P,
-                           const int iat,
-                           ValueVector& psi,
-                           GradVector& dpsi,
-                           ValueVector& d2psi) override;
-
-  virtual void mw_evaluateVGL(const RefVectorWithLeader<SPOSet>& sa_list,
-                              const RefVectorWithLeader<ParticleSetT<ComplexT>>& P_list,
-                              int iat,
-                              const RefVector<ValueVector>& psi_v_list,
-                              const RefVector<GradVector>& dpsi_v_list,
-                              const RefVector<ValueVector>& d2psi_v_list) const override;
-
-  virtual void mw_evaluateVGLandDetRatioGrads(const RefVectorWithLeader<SPOSet>& spo_list,
-                                              const RefVectorWithLeader<ParticleSetT<ComplexT>>& P_list,
-                                              int iat,
-                                              const std::vector<const ValueType*>& invRow_ptr_list,
-                                              OffloadMWVGLArray& phi_vgl_v,
-                                              std::vector<ValueType>& ratios,
-                                              std::vector<GradType>& grads) const override;
-
-  void assign_vgh(const PointType& r,
-                  ValueVector& psi,
-                  GradVector& dpsi,
-                  HessVector& grad_grad_psi,
-                  int first,
-                  int last) const;
-
-  virtual void evaluateVGH(const ParticleSetT<ComplexT>& P,
-                           const int iat,
-                           ValueVector& psi,
-                           GradVector& dpsi,
-                           HessVector& grad_grad_psi) override;
-
-  void assign_vghgh(const PointType& r,
-                    ValueVector& psi,
-                    GradVector& dpsi,
-                    HessVector& grad_grad_psi,
-                    GGGVector& grad_grad_grad_psi,
-                    int first = 0,
-                    int last  = -1) const;
-
-  virtual void evaluateVGHGH(const ParticleSetT<ComplexT>& P,
-                             const int iat,
-                             ValueVector& psi,
-                             GradVector& dpsi,
-                             HessVector& grad_grad_psi,
-                             GGGVector& grad_grad_grad_psi) override;
-
-  virtual void evaluate_notranspose(const ParticleSetT<ComplexT>& P,
-                                    int first,
-                                    int last,
-                                    ValueMatrix& logdet,
-                                    GradMatrix& dlogdet,
-                                    ValueMatrix& d2logdet) override;
-
-  template<class BSPLINESPO>
-  friend struct SplineSetReader;
-  friend struct BsplineReaderBase;
-};
 
+    template <typename GT, typename BCT>
+    void
+    create_spline(GT& xyz_g, BCT& xyz_bc)
+    {
+        resize_kpoints();
+        SplineInst = std::make_shared<MultiBspline<ST, OffloadAllocator<ST>,
+            OffloadAllocator<SplineType>>>();
+        SplineInst->create(xyz_g, xyz_bc, myV.size());
+
+        app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20)
+                  << " MB allocated "
+                  << "for the coefficients in 3D spline orbital representation"
+                  << std::endl;
+    }
+
+    /// this routine cannot be called from a threaded region
+    void
+    finalizeConstruction() override
+    {
+        // map the SplineInst->getSplinePtr() structure to GPU
+        auto* MultiSpline = SplineInst->getSplinePtr();
+        auto* restrict coefs = MultiSpline->coefs;
+        // attach pointers on the device to achieve deep copy
+        PRAGMA_OFFLOAD("omp target \
+                map(always, to: MultiSpline[0:1], \
+                    coefs[0:MultiSpline->coefs_size])")
+        {
+            MultiSpline->coefs = coefs;
+        }
+
+        // transfer static data to GPU
+        auto* mKK_ptr = mKK->data();
+        PRAGMA_OFFLOAD("omp target update to(mKK_ptr[0:mKK->size()])")
+        auto* myKcart_ptr = myKcart->data();
+        PRAGMA_OFFLOAD(
+            "omp target update to(myKcart_ptr[0:myKcart->capacity()*3])")
+        for (size_t i = 0; i < 9; i++) {
+            (*GGt_offload)[i] = GGt[i];
+            (*PrimLattice_G_offload)[i] = PrimLattice.G[i];
+        }
+        auto* PrimLattice_G_ptr = PrimLattice_G_offload->data();
+        PRAGMA_OFFLOAD("omp target update to(PrimLattice_G_ptr[0:9])")
+        auto* GGt_ptr = GGt_offload->data();
+        PRAGMA_OFFLOAD("omp target update to(GGt_ptr[0:9])")
+    }
+
+    inline void
+    flush_zero()
+    {
+        SplineInst->flush_zero();
+    }
+
+    /** remap kPoints to pack the double copy */
+    inline void
+    resize_kpoints()
+    {
+        const size_t nk = this->kPoints.size();
+        mKK = std::make_shared<OffloadVector<ST>>(nk);
+        myKcart = std::make_shared<OffloadPosVector<ST>>(nk);
+        for (size_t i = 0; i < nk; ++i) {
+            (*mKK)[i] = -dot(this->kPoints[i], this->kPoints[i]);
+            (*myKcart)(i) = this->kPoints[i];
+        }
+    }
+
+    void
+    set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i,
+        int twist, int ispline, int level);
+
+    bool
+    read_splines(hdf_archive& h5f);
+
+    bool
+    write_splines(hdf_archive& h5f);
+
+    void
+    assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi,
+        int first, int last) const;
+
+    virtual void
+    evaluateValue(
+        const ParticleSetT<VT>& P, const int iat, ValueVector& psi) override;
+
+    virtual void
+    evaluateDetRatios(const VirtualParticleSetT<VT>& VP, ValueVector& psi,
+        const ValueVector& psiinv, std::vector<ValueType>& ratios) override;
+
+    virtual void
+    mw_evaluateDetRatios(const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+        const RefVectorWithLeader<const VirtualParticleSetT<VT>>& vp_list,
+        const RefVector<ValueVector>& psi_list,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        std::vector<std::vector<ValueType>>& ratios_list) const override;
+
+    /** assign_vgl_from_l can be used when myL is precomputed and myV, myG and
+     * myL are in Cartesian coordinates
+     */
+    void
+    assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        ValueVector& d2psi);
+
+    virtual void
+    evaluateVGL(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, ValueVector& d2psi) override;
+
+    virtual void
+    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<VT>>& sa_list,
+        const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list) const override;
+
+    virtual void
+    mw_evaluateVGLandDetRatioGrads(
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+        std::vector<GradType>& grads) const override;
+
+    void
+    assign_vgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, int first, int last) const;
+
+    virtual void
+    evaluateVGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi) override;
+
+    void
+    assign_vghgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi, int first = 0,
+        int last = -1) const;
+
+    virtual void
+    evaluateVGHGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi,
+        GGGVector& grad_grad_grad_psi) override;
+
+    virtual void
+    evaluate_notranspose(const ParticleSetT<VT>& P, int first, int last,
+        ValueMatrix& logdet, GradMatrix& dlogdet,
+        ValueMatrix& d2logdet) override;
+
+    template <class BSPLINESPO>
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
+};
 
 } // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.cpp
index e6b05e4cd3..90edda7a96 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.cpp
@@ -1,6 +1,6 @@
 //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
@@ -10,56 +10,59 @@
 // File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
 //////////////////////////////////////////////////////////////////////////////////////
 
+#include "SplineC2CT.h"
 
-#include <complex>
+#include "CPU/BLAS.hpp"
+#include "CPU/math.hpp"
 #include "Concurrency/OpenMP.h"
-#include "SplineC2CT.h"
-#include "spline2/MultiBsplineEval.hpp"
 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
-#include "CPU/math.hpp"
+#include "spline2/MultiBsplineEval.hpp"
+
+#include <complex>
 
 namespace qmcplusplus
 {
-template<class T>
-SplineC2CT<T>::SplineC2CT(const SplineC2CT& in) = default;
-
-template<class T>
-inline void SplineC2CT<T>::set_spline(SingleSplineType* spline_r,
-                                      SingleSplineType* spline_i,
-                                      int twist,
-                                      int ispline,
-                                      int level)
+template <typename ST, typename VT>
+SplineC2CT<ST, VT>::SplineC2CT(const SplineC2CT& in) = default;
+
+template <typename ST, typename VT>
+inline void
+SplineC2CT<ST, VT>::set_spline(SingleSplineType* spline_r,
+    SingleSplineType* spline_i, int twist, int ispline, int level)
 {
-  SplineInst->copy_spline(spline_r, 2 * ispline);
-  SplineInst->copy_spline(spline_i, 2 * ispline + 1);
+    SplineInst->copy_spline(spline_r, 2 * ispline);
+    SplineInst->copy_spline(spline_i, 2 * ispline + 1);
 }
 
-template<class T>
-bool SplineC2CT<T>::read_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2CT<ST, VT>::read_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex;
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template<class T>
-bool SplineC2CT<T>::write_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2CT<ST, VT>::write_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex;
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template<class T>
-void SplineC2CT<T>::storeParamsBeforeRotation()
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::storeParamsBeforeRotation()
 {
-  const auto spline_ptr     = SplineInst->getSplinePtr();
-  const auto coefs_tot_size = spline_ptr->coefs_size;
-  coef_copy_                = std::make_shared<std::vector<RealType>>(coefs_tot_size);
+    const auto spline_ptr = SplineInst->getSplinePtr();
+    const auto coefs_tot_size = spline_ptr->coefs_size;
+    coef_copy_ = std::make_shared<std::vector<ST>>(coefs_tot_size);
 
-  std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin());
+    std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin());
 }
 
 /*
@@ -101,700 +104,834 @@ void SplineC2CT<T>::storeParamsBeforeRotation()
   NB: For splines (typically) BasisSetSize >> OrbitalSetSize, so the spl_coefs
   "matrix" is very tall and skinny.
 */
-template<class T>
-void SplineC2CT<T>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::applyRotation(
+    const ValueMatrix& rot_mat, bool use_stored_copy)
 {
-  // SplineInst is a MultiBspline. See src/spline2/MultiBspline.hpp
-  const auto spline_ptr = SplineInst->getSplinePtr();
-  assert(spline_ptr != nullptr);
-  const auto spl_coefs      = spline_ptr->coefs;
-  const auto Nsplines       = spline_ptr->num_splines; // May include padding
-  const auto coefs_tot_size = spline_ptr->coefs_size;
-  const auto basis_set_size = coefs_tot_size / Nsplines;
-  assert(this->OrbitalSetSize == rot_mat.rows());
-  assert(this->OrbitalSetSize == rot_mat.cols());
-
-  if (!use_stored_copy)
-  {
-    assert(coef_copy_ != nullptr);
-    std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin());
-  }
-
-  for (int i = 0; i < basis_set_size; i++)
-    for (int j = 0; j < this->OrbitalSetSize; j++)
-    {
-      // cur_elem points to the real componend of the coefficient.
-      // Imag component is adjacent in memory.
-      const auto cur_elem = Nsplines * i + 2 * j;
-      RealType newval_r{0.};
-      RealType newval_i{0.};
-      for (auto k = 0; k < this->OrbitalSetSize; k++)
-      {
-        const auto index = Nsplines * i + 2 * k;
-        RealType zr      = (*coef_copy_)[index];
-        RealType zi      = (*coef_copy_)[index + 1];
-        RealType wr      = rot_mat[k][j].real();
-        RealType wi      = rot_mat[k][j].imag();
-        newval_r += zr * wr - zi * wi;
-        newval_i += zr * wi + zi * wr;
-      }
-      spl_coefs[cur_elem]     = newval_r;
-      spl_coefs[cur_elem + 1] = newval_i;
+    // SplineInst is a MultiBspline. See src/spline2/MultiBspline.hpp
+    const auto spline_ptr = SplineInst->getSplinePtr();
+    assert(spline_ptr != nullptr);
+    const auto spl_coefs = spline_ptr->coefs;
+    const auto Nsplines = spline_ptr->num_splines; // May include padding
+    const auto coefs_tot_size = spline_ptr->coefs_size;
+    const auto basis_set_size = coefs_tot_size / Nsplines;
+    assert(this->OrbitalSetSize == rot_mat.rows());
+    assert(this->OrbitalSetSize == rot_mat.cols());
+
+    if (!use_stored_copy) {
+        assert(coef_copy_ != nullptr);
+        std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin());
+    }
+
+    if constexpr (std::is_same_v<ST, RealType>) {
+        // If ST is double, go ahead and use BLAS to make things faster.
+        // Note that Nsplines needs to be divided by 2 since spl_coefs and
+        // coef_copy_ are stored as reals. They are also cast to ValueType so
+        // that gemm operates on them as complex numbers.
+        BLAS::gemm('N', 'N', this->OrbitalSetSize, basis_set_size,
+            this->OrbitalSetSize, ValueType(1.0, 0.0), rot_mat.data(),
+            this->OrbitalSetSize, (ValueType*)coef_copy_->data(), Nsplines / 2,
+            ValueType(0.0, 0.0), (ValueType*)spl_coefs, Nsplines / 2);
+    }
+    else {
+        // if ST is float, RealType is double and ValueType is
+        // std::complex<double> for C2C. Just use naive matrix multiplication in
+        // order to avoid losing precision on rotation matrix
+        for (IndexType i = 0; i < basis_set_size; i++)
+            for (IndexType j = 0; j < this->OrbitalSetSize; j++) {
+                // cur_elem points to the real component of the coefficient.
+                // Imag component is adjacent in memory.
+                const auto cur_elem = Nsplines * i + 2 * j;
+                ST newval_r{0.};
+                ST newval_i{0.};
+                for (IndexType k = 0; k < this->OrbitalSetSize; k++) {
+                    const auto index = Nsplines * i + 2 * k;
+                    ST zr = (*coef_copy_)[index];
+                    ST zi = (*coef_copy_)[index + 1];
+                    ST wr = rot_mat[k][j].real();
+                    ST wi = rot_mat[k][j].imag();
+                    newval_r += zr * wr - zi * wi;
+                    newval_i += zr * wi + zi * wr;
+                }
+                spl_coefs[cur_elem] = newval_r;
+                spl_coefs[cur_elem + 1] = newval_i;
+            }
     }
 }
 
-template<class T>
-inline void SplineC2CT<T>::assign_v(const PointType& r,
-                                    const vContainer_type& myV,
-                                    ValueVector& psi,
-                                    int first,
-                                    int last) const
+template <typename ST, typename VT>
+inline void
+SplineC2CT<ST, VT>::assign_v(const PointType& r, const vContainer_type& myV,
+    ValueVector& psi, int first, int last) const
 {
-  const auto kPointsSize = this->kPoints.size();
-  // protect last
-  last = last > kPointsSize ? kPointsSize : last;
-
-  const RealType x = r[0], y = r[1], z = r[2];
-  const RealType* restrict kx = myKcart.data(0);
-  const RealType* restrict ky = myKcart.data(1);
-  const RealType* restrict kz = myKcart.data(2);
+    // protect last
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    const ST x = r[0], y = r[1], z = r[2];
+    const ST* restrict kx = myKcart.data(0);
+    const ST* restrict ky = myKcart.data(1);
+    const ST* restrict kz = myKcart.data(2);
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    RealType s, c;
-    const RealType val_r = myV[2 * j];
-    const RealType val_i = myV[2 * j + 1];
-    qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
-    psi[j + this->first_spo] = ComplexT(val_r * c - val_i * s, val_i * c + val_r * s);
-  }
+    for (size_t j = first; j < last; ++j) {
+        ST s, c;
+        const ST val_r = myV[2 * j];
+        const ST val_i = myV[2 * j + 1];
+        qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
+        psi[j + this->first_spo] =
+            ComplexT(val_r * c - val_i * s, val_i * c + val_r * s);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::evaluateValue(const ParticleSetT<T>& P, const int iat, ValueVector& psi)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::evaluateValue(
+    const ParticleSetT<VT>& P, const int iat, ValueVector& psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<T>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
-    assign_v(r, myV, psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Factor of 2 because psi is complex and the spline storage and
+        // evaluation uses a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
+        assign_v(r, myV, psi, first / 2, last / 2);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::evaluateDetRatios(const VirtualParticleSetT<T>& VP,
-                                      ValueVector& psi,
-                                      const ValueVector& psiinv,
-                                      std::vector<ValueType>& ratios)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::evaluateDetRatios(const VirtualParticleSetT<VT>& VP,
+    ValueVector& psi, const ValueVector& psiinv, std::vector<ValueType>& ratios)
 {
-  const bool need_resize = ratios_private.rows() < VP.getTotalNum();
+    const bool need_resize = ratios_private.rows() < VP.getTotalNum();
 
 #pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    // initialize thread private ratios
-    if (need_resize)
     {
-      if (tid == 0) // just like #pragma omp master, but one fewer call to the runtime
-        ratios_private.resize(VP.getTotalNum(), omp_get_num_threads());
+        int tid = omp_get_thread_num();
+        // initialize thread private ratios
+        if (need_resize) {
+            if (tid == 0) // just like #pragma omp master, but one fewer call to
+                          // the runtime
+                ratios_private.resize(VP.getTotalNum(), omp_get_num_threads());
 #pragma omp barrier
+        }
+        int first, last;
+        // Factor of 2 because psi is complex and the spline storage and
+        // evaluation uses a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), tid, first, last);
+        const int first_cplx = first / 2;
+        const int last_cplx =
+            this->kPoints.size() < last / 2 ? this->kPoints.size() : last / 2;
+
+        for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+            const PointType& r = VP.activeR(iat);
+            PointType ru(PrimLattice.toUnit_floor(r));
+
+            spline2::evaluate3d(
+                SplineInst->getSplinePtr(), ru, myV, first, last);
+            assign_v(r, myV, psi, first_cplx, last_cplx);
+            ratios_private[iat][tid] = simd::dot(psi.data() + first_cplx,
+                psiinv.data() + first_cplx, last_cplx - first_cplx);
+        }
     }
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<T>(), omp_get_num_threads(), tid, first, last);
-    const int first_cplx   = first / 2;
-    const auto kPointsSize = this->kPoints.size();
-    const int last_cplx    = kPointsSize < last / 2 ? kPointsSize : last / 2;
-
-    for (int iat = 0; iat < VP.getTotalNum(); ++iat)
-    {
-      const PointType& r = VP.activeR(iat);
-      PointType ru(PrimLattice.toUnit_floor(r));
 
-      spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
-      assign_v(r, myV, psi, first_cplx, last_cplx);
-      ratios_private[iat][tid] = simd::dot(psi.data() + first_cplx, psiinv.data() + first_cplx, last_cplx - first_cplx);
+    // do the reduction manually
+    for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+        ratios[iat] = ComplexT(0);
+        for (int tid = 0; tid < ratios_private.cols(); tid++)
+            ratios[iat] += ratios_private[iat][tid];
     }
-  }
-
-  // do the reduction manually
-  for (int iat = 0; iat < VP.getTotalNum(); ++iat)
-  {
-    ratios[iat] = ComplexT(0);
-    for (int tid = 0; tid < ratios_private.cols(); tid++)
-      ratios[iat] += ratios_private[iat][tid];
-  }
 }
 
 /** assign_vgl
-   */
-template<class T>
-inline void SplineC2CT<T>::assign_vgl(const PointType& r,
-                                      ValueVector& psi,
-                                      GradVector& dpsi,
-                                      ValueVector& d2psi,
-                                      int first,
-                                      int last) const
+ */
+template <typename ST, typename VT>
+inline void
+SplineC2CT<ST, VT>::assign_vgl(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, ValueVector& d2psi, int first, int last) const
 {
-  // protect last
-  const auto kPointsSize = this->kPoints.size();
-  last                   = last > kPointsSize ? kPointsSize : last;
-
-  constexpr RealType zero(0);
-  constexpr RealType two(2);
-  const RealType g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-                 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-                 g22 = PrimLattice.G(8);
-  const RealType x = r[0], y = r[1], z = r[2];
-  const RealType symGG[6] = {GGt[0], GGt[1] + GGt[3], GGt[2] + GGt[6], GGt[4], GGt[5] + GGt[7], GGt[8]};
-
-  const RealType* restrict k0 = myKcart.data(0);
-  const RealType* restrict k1 = myKcart.data(1);
-  const RealType* restrict k2 = myKcart.data(2);
-
-  const RealType* restrict g0  = myG.data(0);
-  const RealType* restrict g1  = myG.data(1);
-  const RealType* restrict g2  = myG.data(2);
-  const RealType* restrict h00 = myH.data(0);
-  const RealType* restrict h01 = myH.data(1);
-  const RealType* restrict h02 = myH.data(2);
-  const RealType* restrict h11 = myH.data(3);
-  const RealType* restrict h12 = myH.data(4);
-  const RealType* restrict h22 = myH.data(5);
+    // protect last
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    constexpr ST zero(0);
+    constexpr ST two(2);
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+    const ST symGG[6] = {GGt[0], GGt[1] + GGt[3], GGt[2] + GGt[6], GGt[4],
+        GGt[5] + GGt[7], GGt[8]};
+
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
 
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const RealType kX    = k0[j];
-    const RealType kY    = k1[j];
-    const RealType kZ    = k2[j];
-    const RealType val_r = myV[jr];
-    const RealType val_i = myV[ji];
-
-    //phase
-    RealType s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const RealType dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const RealType dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const RealType dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const RealType dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const RealType dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const RealType dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const RealType gX_r = dX_r + val_i * kX;
-    const RealType gY_r = dY_r + val_i * kY;
-    const RealType gZ_r = dZ_r + val_i * kZ;
-    const RealType gX_i = dX_i - val_r * kX;
-    const RealType gY_i = dY_i - val_r * kY;
-    const RealType gZ_i = dZ_i - val_r * kZ;
-
-    const RealType lcart_r = SymTrace(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
-    const RealType lcart_i = SymTrace(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
-    const RealType lap_r   = lcart_r + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const RealType lap_i   = lcart_i + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-    const size_t psiIndex  = j + this->first_spo;
-    psi[psiIndex]          = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]      = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]      = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]      = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-    d2psi[psiIndex]        = ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
-  }
+    for (size_t j = first; j < last; ++j) {
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const ST lcart_r = SymTrace(
+            h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
+        const ST lcart_i = SymTrace(
+            h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
+        const ST lap_r = lcart_r + mKK[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = lcart_i + mKK[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+        d2psi[psiIndex] =
+            ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
+    }
 }
 
-/** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-template<class T>
-inline void SplineC2CT<T>::assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
+/** Fill psi, dpsi and d2psi for orbitals [first_spo, last_spo) from myV, myG
+ * and a precomputed myL; usable only when myV, myG, myL are already cartesian
+ */
+template <typename ST, typename VT>
+inline void
+SplineC2CT<ST, VT>::assign_vgl_from_l(
+    const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  constexpr RealType two(2);
-  const RealType x = r[0], y = r[1], z = r[2];
+    constexpr ST two(2);
+    const ST x = r[0], y = r[1], z = r[2];
 
-  const RealType* restrict k0 = myKcart.data(0);
-  const RealType* restrict k1 = myKcart.data(1);
-  const RealType* restrict k2 = myKcart.data(2);
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
 
-  const RealType* restrict g0 = myG.data(0);
-  const RealType* restrict g1 = myG.data(1);
-  const RealType* restrict g2 = myG.data(2);
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
 
-  const size_t N = this->last_spo - this->first_spo;
+    const size_t N = this->last_spo - this->first_spo;
 #pragma omp simd
-  for (size_t j = 0; j < N; ++j)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const RealType kX    = k0[j];
-    const RealType kY    = k1[j];
-    const RealType kZ    = k2[j];
-    const RealType val_r = myV[jr];
-    const RealType val_i = myV[ji];
-
-    //phase
-    RealType s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const RealType dX_r = g0[jr];
-    const RealType dY_r = g1[jr];
-    const RealType dZ_r = g2[jr];
-
-    const RealType dX_i = g0[ji];
-    const RealType dY_i = g1[ji];
-    const RealType dZ_i = g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const RealType gX_r = dX_r + val_i * kX;
-    const RealType gY_r = dY_r + val_i * kY;
-    const RealType gZ_r = dZ_r + val_i * kZ;
-    const RealType gX_i = dX_i - val_r * kX;
-    const RealType gY_i = dY_i - val_r * kY;
-    const RealType gZ_i = dZ_i - val_r * kZ;
-
-    const RealType lap_r = myL[jr] + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const RealType lap_i = myL[ji] + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-    d2psi[psiIndex]       = ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
-  }
+    for (size_t j = 0; j < N; ++j) {
+        const size_t jr = j << 1; // entry holding the real component
+        const size_t ji = jr + 1; // entry holding the imaginary component
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase angle -(k . r); sincos presumably yields s = sin, c = cos
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // myG is read as-is: no PrimLattice.G contraction in this variant
+        const ST dX_r = g0[jr];
+        const ST dY_r = g1[jr];
+        const ST dZ_r = g2[jr];
+
+        const ST dX_i = g0[ji];
+        const ST dY_i = g1[ji];
+        const ST dZ_i = g2[ji];
+
+        // \f$\nabla\psi_r + {\bf k}\psi_i\f$, \f$\nabla\psi_i - {\bf k}\psi_r\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const ST lap_r = myL[jr] + mKK[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = myL[ji] + mKK[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+        d2psi[psiIndex] =
+            ComplexT(c * lap_r - s * lap_i, c * lap_i + s * lap_r);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::evaluateVGL(const ParticleSetT<T>& P,
-                                const int iat,
-                                ValueVector& psi,
-                                GradVector& dpsi,
-                                ValueVector& d2psi)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::evaluateVGL(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<T>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
-    assign_vgl(r, psi, dpsi, d2psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Each thread gets a [first, last) share of 2 * psi.size() entries:
+        // psi is complex, while spline storage and evaluation use a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d_vgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
+        assign_vgl(r, psi, dpsi, d2psi, first / 2, last / 2); // undo factor 2
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::assign_vgh(const PointType& r,
-                               ValueVector& psi,
-                               GradVector& dpsi,
-                               HessVector& grad_grad_psi,
-                               int first,
-                               int last) const
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::assign_vgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, int first, int last) const
 {
-  // protect last
-  const auto kPointsSize = this->kPoints.size();
-  last                   = last > kPointsSize ? kPointsSize : last;
-
-  const RealType g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-                 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-                 g22 = PrimLattice.G(8);
-  const RealType x = r[0], y = r[1], z = r[2];
-
-  const RealType* restrict k0 = myKcart.data(0);
-  const RealType* restrict k1 = myKcart.data(1);
-  const RealType* restrict k2 = myKcart.data(2);
-
-  const RealType* restrict g0  = myG.data(0);
-  const RealType* restrict g1  = myG.data(1);
-  const RealType* restrict g2  = myG.data(2);
-  const RealType* restrict h00 = myH.data(0);
-  const RealType* restrict h01 = myH.data(1);
-  const RealType* restrict h02 = myH.data(2);
-  const RealType* restrict h11 = myH.data(3);
-  const RealType* restrict h12 = myH.data(4);
-  const RealType* restrict h22 = myH.data(5);
+    // protect last (clamp to kPoints.size()); outputs filled for [first, last)
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
 
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const RealType kX    = k0[j];
-    const RealType kY    = k1[j];
-    const RealType kZ    = k2[j];
-    const RealType val_r = myV[jr];
-    const RealType val_i = myV[ji];
-
-    //phase
-    RealType s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const RealType dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const RealType dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const RealType dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const RealType dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const RealType dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const RealType dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const RealType gX_r = dX_r + val_i * kX;
-    const RealType gY_r = dY_r + val_i * kY;
-    const RealType gZ_r = dZ_r + val_i * kZ;
-    const RealType gX_i = dX_i - val_r * kX;
-    const RealType gY_i = dY_i - val_r * kY;
-    const RealType gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-
-    const RealType h_xx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
-    const RealType h_xy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
-    const RealType h_xz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
-    const RealType h_yx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
-    const RealType h_yy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
-    const RealType h_yz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
-    const RealType h_zx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
-    const RealType h_zy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
-    const RealType h_zz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
-
-    const RealType h_xx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
-    const RealType h_xy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
-    const RealType h_xz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
-    const RealType h_yx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
-    const RealType h_yy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
-    const RealType h_yz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
-    const RealType h_zx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
-    const RealType h_zy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
-    const RealType h_zz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
-
-    grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
-    grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][3] = ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
-    grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
-    grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][6] = ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
-    grad_grad_psi[psiIndex][7] = ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
-    grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
-  }
+    for (size_t j = first; j < last; ++j) {
+        int jr = j << 1; // entry holding the real component
+        int ji = jr + 1; // entry holding the imaginary component
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase angle -(k . r); sincos presumably yields s = sin, c = cos
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla\psi_r + {\bf k}\psi_i\f$, \f$\nabla\psi_i - {\bf k}\psi_r\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+
+        const ST h_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g00, g01, g02) +
+            kX * (gX_i + dX_i);
+        const ST h_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g10, g11, g12) +
+            kX * (gY_i + dY_i);
+        const ST h_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g20, g21, g22) +
+            kX * (gZ_i + dZ_i);
+        const ST h_yx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g00, g01, g02) +
+            kY * (gX_i + dX_i);
+        const ST h_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g10, g11, g12) +
+            kY * (gY_i + dY_i);
+        const ST h_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g20, g21, g22) +
+            kY * (gZ_i + dZ_i);
+        const ST h_zx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g00, g01, g02) +
+            kZ * (gX_i + dX_i);
+        const ST h_zy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g10, g11, g12) +
+            kZ * (gY_i + dY_i);
+        const ST h_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g20, g21, g22) +
+            kZ * (gZ_i + dZ_i);
+
+        const ST h_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g00, g01, g02) -
+            kX * (gX_r + dX_r);
+        const ST h_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g10, g11, g12) -
+            kX * (gY_r + dY_r);
+        const ST h_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g20, g21, g22) -
+            kX * (gZ_r + dZ_r);
+        const ST h_yx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g00, g01, g02) -
+            kY * (gX_r + dX_r);
+        const ST h_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g10, g11, g12) -
+            kY * (gY_r + dY_r);
+        const ST h_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g20, g21, g22) -
+            kY * (gZ_r + dZ_r);
+        const ST h_zx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g00, g01, g02) -
+            kZ * (gX_r + dX_r);
+        const ST h_zy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g10, g11, g12) -
+            kZ * (gY_r + dY_r);
+        const ST h_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g20, g21, g22) -
+            kZ * (gZ_r + dZ_r);
+
+        grad_grad_psi[psiIndex][0] =
+            ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
+        grad_grad_psi[psiIndex][1] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][2] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][3] =
+            ComplexT(c * h_yx_r - s * h_yx_i, c * h_yx_i + s * h_yx_r);
+        grad_grad_psi[psiIndex][4] =
+            ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
+        grad_grad_psi[psiIndex][5] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][6] =
+            ComplexT(c * h_zx_r - s * h_zx_i, c * h_zx_i + s * h_zx_r);
+        grad_grad_psi[psiIndex][7] =
+            ComplexT(c * h_zy_r - s * h_zy_i, c * h_zy_i + s * h_zy_r);
+        grad_grad_psi[psiIndex][8] =
+            ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::evaluateVGH(const ParticleSetT<T>& P,
-                                const int iat,
-                                ValueVector& psi,
-                                GradVector& dpsi,
-                                HessVector& grad_grad_psi)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::evaluateVGH(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<T>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
-    assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Each thread gets a [first, last) share of 2 * psi.size() entries:
+        // psi is complex, while spline storage and evaluation use a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d_vgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
+        // first / 2, last / 2: undo the factor of 2 applied above
+        assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::assign_vghgh(const PointType& r,
-                                 ValueVector& psi,
-                                 GradVector& dpsi,
-                                 HessVector& grad_grad_psi,
-                                 GGGVector& grad_grad_grad_psi,
-                                 int first,
-                                 int last) const
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::assign_vghgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi,
+    int first, int last) const
 {
-  // protect last
-  const auto kPointsSize = this->kPoints.size();
-  last                   = last < 0 ? kPointsSize : (last > kPointsSize ? kPointsSize : last);
-
-  const RealType g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-                 g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-                 g22 = PrimLattice.G(8);
-  const RealType x = r[0], y = r[1], z = r[2];
-
-  const RealType* restrict k0 = myKcart.data(0);
-  const RealType* restrict k1 = myKcart.data(1);
-  const RealType* restrict k2 = myKcart.data(2);
-
-  const RealType* restrict g0  = myG.data(0);
-  const RealType* restrict g1  = myG.data(1);
-  const RealType* restrict g2  = myG.data(2);
-  const RealType* restrict h00 = myH.data(0);
-  const RealType* restrict h01 = myH.data(1);
-  const RealType* restrict h02 = myH.data(2);
-  const RealType* restrict h11 = myH.data(3);
-  const RealType* restrict h12 = myH.data(4);
-  const RealType* restrict h22 = myH.data(5);
-
-  const RealType* restrict gh000 = mygH.data(0);
-  const RealType* restrict gh001 = mygH.data(1);
-  const RealType* restrict gh002 = mygH.data(2);
-  const RealType* restrict gh011 = mygH.data(3);
-  const RealType* restrict gh012 = mygH.data(4);
-  const RealType* restrict gh022 = mygH.data(5);
-  const RealType* restrict gh111 = mygH.data(6);
-  const RealType* restrict gh112 = mygH.data(7);
-  const RealType* restrict gh122 = mygH.data(8);
-  const RealType* restrict gh222 = mygH.data(9);
-
-//SIMD doesn't work quite right yet.  Comment out until further debugging.
+    // protect last
+    last = last < 0 ?
+        this->kPoints.size() :
+        (last > this->kPoints.size() ? this->kPoints.size() : last);
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
+
+    const ST* restrict gh000 = mygH.data(0);
+    const ST* restrict gh001 = mygH.data(1);
+    const ST* restrict gh002 = mygH.data(2);
+    const ST* restrict gh011 = mygH.data(3);
+    const ST* restrict gh012 = mygH.data(4);
+    const ST* restrict gh022 = mygH.data(5);
+    const ST* restrict gh111 = mygH.data(6);
+    const ST* restrict gh112 = mygH.data(7);
+    const ST* restrict gh122 = mygH.data(8);
+    const ST* restrict gh222 = mygH.data(9);
+
+// SIMD doesn't work quite right yet.  Comment out until further debugging.
 #pragma omp simd
-  for (size_t j = first; j < last; ++j)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const RealType kX    = k0[j];
-    const RealType kY    = k1[j];
-    const RealType kZ    = k2[j];
-    const RealType val_r = myV[jr];
-    const RealType val_i = myV[ji];
-
-    //phase
-    RealType s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const RealType dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const RealType dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const RealType dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const RealType dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const RealType dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const RealType dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const RealType gX_r = dX_r + val_i * kX;
-    const RealType gY_r = dY_r + val_i * kY;
-    const RealType gZ_r = dZ_r + val_i * kZ;
-    const RealType gX_i = dX_i - val_r * kX;
-    const RealType gY_i = dY_i - val_r * kY;
-    const RealType gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = j + this->first_spo;
-    psi[psiIndex]         = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
-    dpsi[psiIndex][0]     = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
-    dpsi[psiIndex][1]     = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
-    dpsi[psiIndex][2]     = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
-
-    //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
-    const RealType f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
-    const RealType f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
-    const RealType f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
-    const RealType f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
-    const RealType f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
-    const RealType f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
-
-    const RealType f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
-    const RealType f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
-    const RealType f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
-    const RealType f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
-    const RealType f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
-    const RealType f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
-
-    const RealType h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
-    const RealType h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
-    const RealType h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
-    const RealType h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
-    const RealType h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
-    const RealType h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
-
-    const RealType h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
-    const RealType h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
-    const RealType h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
-    const RealType h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
-    const RealType h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
-    const RealType h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
-
-    grad_grad_psi[psiIndex][0] = ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
-    grad_grad_psi[psiIndex][1] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][2] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][3] = ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
-    grad_grad_psi[psiIndex][4] = ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
-    grad_grad_psi[psiIndex][5] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][6] = ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
-    grad_grad_psi[psiIndex][7] = ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
-    grad_grad_psi[psiIndex][8] = ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
-
-    //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
-    // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
-
-    const RealType f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const RealType f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const RealType f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const RealType f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const RealType f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const RealType f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const RealType f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const RealType f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const RealType f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const RealType f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                          gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    const RealType f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const RealType f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const RealType f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const RealType f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const RealType f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const RealType f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const RealType f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const RealType f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const RealType f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const RealType f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                          gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
-    const RealType gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
-    const RealType gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
-    const RealType gh_xxy_r =
-        f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
-    const RealType gh_xxy_i =
-        f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
-    const RealType gh_xxz_r =
-        f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
-    const RealType gh_xxz_i =
-        f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
-    const RealType gh_xyy_r =
-        f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
-    const RealType gh_xyy_i =
-        f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
-    const RealType gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
-        (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
-    const RealType gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
-        (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
-    const RealType gh_xzz_r =
-        f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
-    const RealType gh_xzz_i =
-        f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
-    const RealType gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
-    const RealType gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
-    const RealType gh_yyz_r =
-        f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
-    const RealType gh_yyz_i =
-        f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
-    const RealType gh_yzz_r =
-        f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
-    const RealType gh_yzz_i =
-        f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
-    const RealType gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
-    const RealType gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
-
-    grad_grad_grad_psi[psiIndex][0][0] = ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
-    grad_grad_grad_psi[psiIndex][0][1] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][0][2] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][0][3] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][0][4] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][0][5] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][0][6] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][0][7] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][0][8] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-
-    grad_grad_grad_psi[psiIndex][1][0] = ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
-    grad_grad_grad_psi[psiIndex][1][1] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][1][2] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][1][3] = ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
-    grad_grad_grad_psi[psiIndex][1][4] = ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
-    grad_grad_grad_psi[psiIndex][1][5] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][1][6] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][1][7] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][1][8] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-
-
-    grad_grad_grad_psi[psiIndex][2][0] = ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
-    grad_grad_grad_psi[psiIndex][2][1] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][2][2] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-    grad_grad_grad_psi[psiIndex][2][3] = ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
-    grad_grad_grad_psi[psiIndex][2][4] = ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
-    grad_grad_grad_psi[psiIndex][2][5] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-    grad_grad_grad_psi[psiIndex][2][6] = ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
-    grad_grad_grad_psi[psiIndex][2][7] = ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
-    grad_grad_grad_psi[psiIndex][2][8] = ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
-  }
+    for (size_t j = first; j < last; ++j) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = j + this->first_spo;
+        psi[psiIndex] = ComplexT(c * val_r - s * val_i, c * val_i + s * val_r);
+        dpsi[psiIndex][0] = ComplexT(c * gX_r - s * gX_i, c * gX_i + s * gX_r);
+        dpsi[psiIndex][1] = ComplexT(c * gY_r - s * gY_i, c * gY_i + s * gY_r);
+        dpsi[psiIndex][2] = ComplexT(c * gZ_r - s * gZ_i, c * gZ_i + s * gZ_r);
+
+        // intermediates for computation of hessian. \partial_i \partial_j phi
+        // in cartesian coordinates.
+        const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g20, g21, g22, g20, g21, g22);
+
+        const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g20, g21, g22, g20, g21, g22);
+
+        const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
+        const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
+        const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
+        const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
+        const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
+        const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
+
+        const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
+        const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
+        const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
+        const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
+        const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
+        const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
+
+        grad_grad_psi[psiIndex][0] =
+            ComplexT(c * h_xx_r - s * h_xx_i, c * h_xx_i + s * h_xx_r);
+        grad_grad_psi[psiIndex][1] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][2] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][3] =
+            ComplexT(c * h_xy_r - s * h_xy_i, c * h_xy_i + s * h_xy_r);
+        grad_grad_psi[psiIndex][4] =
+            ComplexT(c * h_yy_r - s * h_yy_i, c * h_yy_i + s * h_yy_r);
+        grad_grad_psi[psiIndex][5] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][6] =
+            ComplexT(c * h_xz_r - s * h_xz_i, c * h_xz_i + s * h_xz_r);
+        grad_grad_psi[psiIndex][7] =
+            ComplexT(c * h_yz_r - s * h_yz_i, c * h_yz_i + s * h_yz_r);
+        grad_grad_psi[psiIndex][8] =
+            ComplexT(c * h_zz_r - s * h_zz_i, c * h_zz_i + s * h_zz_r);
+
+        // These are the real and imaginary components of the third SPO
+        // derivative. _xxx denotes the third derivative w.r.t. x; _xyz
+        // denotes a mixed derivative with respect to x, y, and z; and so
+        // on.
+
+        const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        // Here is where we build up the components of the physical hessian
+        // gradient, namely, d^3/dx^3 (e^{-ik*r} \phi(r)).
+        const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r -
+            kX * kX * kX * val_i;
+        const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i +
+            kX * kX * kX * val_r;
+        const ST gh_xxy_r = f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) -
+            (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
+        const ST gh_xxy_i = f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) -
+            (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
+        const ST gh_xxz_r = f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) -
+            (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
+        const ST gh_xxz_i = f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) -
+            (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
+        const ST gh_xyy_r = f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) -
+            (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
+        const ST gh_xyy_i = f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) -
+            (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
+        const ST gh_xyz_r = f3_xyz_r +
+            (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
+            (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) -
+            kX * kY * kZ * val_i;
+        const ST gh_xyz_i = f3_xyz_i -
+            (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
+            (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) +
+            kX * kY * kZ * val_r;
+        const ST gh_xzz_r = f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) -
+            (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
+        const ST gh_xzz_i = f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) -
+            (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
+        const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r -
+            kY * kY * kY * val_i;
+        const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i +
+            kY * kY * kY * val_r;
+        const ST gh_yyz_r = f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) -
+            (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
+        const ST gh_yyz_i = f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) -
+            (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
+        const ST gh_yzz_r = f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) -
+            (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
+        const ST gh_yzz_i = f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) -
+            (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
+        const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r -
+            kZ * kZ * kZ * val_i;
+        const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i +
+            kZ * kZ * kZ * val_r;
+
+        grad_grad_grad_psi[psiIndex][0][0] =
+            ComplexT(c * gh_xxx_r - s * gh_xxx_i, c * gh_xxx_i + s * gh_xxx_r);
+        grad_grad_grad_psi[psiIndex][0][1] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][0][2] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][0][3] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][0][4] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][0][5] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][0][6] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][0][7] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][0][8] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+
+        grad_grad_grad_psi[psiIndex][1][0] =
+            ComplexT(c * gh_xxy_r - s * gh_xxy_i, c * gh_xxy_i + s * gh_xxy_r);
+        grad_grad_grad_psi[psiIndex][1][1] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][1][2] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][1][3] =
+            ComplexT(c * gh_xyy_r - s * gh_xyy_i, c * gh_xyy_i + s * gh_xyy_r);
+        grad_grad_grad_psi[psiIndex][1][4] =
+            ComplexT(c * gh_yyy_r - s * gh_yyy_i, c * gh_yyy_i + s * gh_yyy_r);
+        grad_grad_grad_psi[psiIndex][1][5] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][1][6] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][1][7] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][1][8] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+
+        grad_grad_grad_psi[psiIndex][2][0] =
+            ComplexT(c * gh_xxz_r - s * gh_xxz_i, c * gh_xxz_i + s * gh_xxz_r);
+        grad_grad_grad_psi[psiIndex][2][1] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][2][2] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+        grad_grad_grad_psi[psiIndex][2][3] =
+            ComplexT(c * gh_xyz_r - s * gh_xyz_i, c * gh_xyz_i + s * gh_xyz_r);
+        grad_grad_grad_psi[psiIndex][2][4] =
+            ComplexT(c * gh_yyz_r - s * gh_yyz_i, c * gh_yyz_i + s * gh_yyz_r);
+        grad_grad_grad_psi[psiIndex][2][5] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+        grad_grad_grad_psi[psiIndex][2][6] =
+            ComplexT(c * gh_xzz_r - s * gh_xzz_i, c * gh_xzz_i + s * gh_xzz_r);
+        grad_grad_grad_psi[psiIndex][2][7] =
+            ComplexT(c * gh_yzz_r - s * gh_yzz_i, c * gh_yzz_i + s * gh_yzz_r);
+        grad_grad_grad_psi[psiIndex][2][8] =
+            ComplexT(c * gh_zzz_r - s * gh_zzz_i, c * gh_zzz_i + s * gh_zzz_r);
+    }
 }
 
-template<class T>
-void SplineC2CT<T>::evaluateVGHGH(const ParticleSetT<T>& P,
-                                  const int iat,
-                                  ValueVector& psi,
-                                  GradVector& dpsi,
-                                  HessVector& grad_grad_psi,
-                                  GGGVector& grad_grad_grad_psi)
+template <typename ST, typename VT>
+void
+SplineC2CT<ST, VT>::evaluateVGHGH(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
+    GGGVector& grad_grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 #pragma omp parallel
-  {
-    int first, last;
-    // Factor of 2 because psi is complex and the spline storage and evaluation uses a real type
-    FairDivideAligned(2 * psi.size(), getAlignment<T>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
-    assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
-  }
+    {
+        int first, last;
+        // Factor of 2 because psi is complex while the spline storage and
+        // evaluation use a real type
+        FairDivideAligned(2 * psi.size(), getAlignment<ST>(),
+            omp_get_num_threads(), omp_get_thread_num(), first, last);
+
+        spline2::evaluate3d_vghgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
+        assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2,
+            last / 2);
+    }
 }
 
-template class SplineC2CT<std::complex<float>>;
-template class SplineC2CT<std::complex<double>>;
+template class SplineC2CT<float, std::complex<double>>;
+template class SplineC2CT<float, std::complex<float>>;
+template class SplineC2CT<double, std::complex<double>>;
+template class SplineC2CT<double, std::complex<float>>;
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.h
index a7ba99e272..e48a285ef1 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2CT.h
@@ -1,6 +1,6 @@
 //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
@@ -10,227 +10,262 @@
 // File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
 //////////////////////////////////////////////////////////////////////////////////////
 
-
 /** @file
  *
- * class to handle complex splines to complex orbitals with splines of arbitrary precision
+ * class to handle complex splines to complex orbitals with splines of arbitrary
+ * precision
  */
 #ifndef QMCPLUSPLUS_SPLINE_C2CT_H
 #define QMCPLUSPLUS_SPLINE_C2CT_H
 
-#include <memory>
-#include "BsplineSetT.h"
 #include "OhmmsSoA/VectorSoaContainer.h"
-#include "spline2/MultiBspline.hpp"
+#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
 #include "Utilities/FairDivide.h"
+#include "spline2/MultiBspline.hpp"
+
+#include <memory>
 
 namespace qmcplusplus
 {
-/** class to match std::complex<T> spline with BsplineSet::ValueType (complex) SPOs
- * @tparam T precision of spline
+/** class to match std::complex<ST> spline with BsplineSet::ValueType (complex)
+ * SPOs
+ * @tparam ST precision of spline
  *
  * Requires temporage storage and multiplication of phase vectors
- * The internal storage of complex spline coefficients uses double sized real arrays of T type, aligned and padded.
- * All the output orbitals are complex.
+ * The internal storage of complex spline coefficients uses double sized real
+ * arrays of ST type, aligned and padded. All the output orbitals are complex.
  */
-template<class T>
-class SplineC2CT : public BsplineSetT<T>
+template <typename ST, typename VT>
+class SplineC2CT : public BsplineSetT<VT>
 {
 public:
-  using RealType         = typename BsplineSetT<T>::RealType;
-  using SplineType       = typename bspline_traits<RealType, 3>::SplineType;
-  using BCType           = typename bspline_traits<RealType, 3>::BCType;
-  using DataType         = RealType;
-  using PointType        = TinyVector<RealType, 3>;
-  using SingleSplineType = UBspline_3d_d;
-
-
-  // types for evaluation results
-  // only works for Complex
-  using ComplexT    = T;
-  using ValueType   = typename BsplineSetT<T>::ValueType;
-  using GGGVector   = typename BsplineSetT<T>::GGGVector;
-  using GradVector  = typename BsplineSetT<T>::GradVector;
-  using HessVector  = typename BsplineSetT<T>::HessVector;
-  using ValueVector = typename BsplineSetT<T>::ValueVector;
-  using ValueMatrix = typename BsplineSetT<T>::ValueMatrix;
-
-  using vContainer_type  = Vector<RealType, aligned_allocator<RealType>>;
-  using gContainer_type  = VectorSoaContainer<RealType, 3>;
-  using hContainer_type  = VectorSoaContainer<RealType, 6>;
-  using ghContainer_type = VectorSoaContainer<RealType, 10>;
+    using SplineType = typename bspline_traits<ST, 3>::SplineType;
+    using BCType = typename bspline_traits<ST, 3>::BCType;
+    using DataType = ST;
+    using PointType = TinyVector<ST, 3>;
+    using SingleSplineType = UBspline_3d_d;
+
+    // types for evaluation results
+    using ComplexT = typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::IndexType;
+    using typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::RealType;
+    using typename BsplineSetT<VT>::GGGVector;
+    using typename BsplineSetT<VT>::GradVector;
+    using typename BsplineSetT<VT>::HessVector;
+    using typename BsplineSetT<VT>::ValueVector;
+    using typename BsplineSetT<VT>::ValueMatrix;
+
+    using vContainer_type = Vector<ST, aligned_allocator<ST>>;
+    using gContainer_type = VectorSoaContainer<ST, 3>;
+    using hContainer_type = VectorSoaContainer<ST, 6>;
+    using ghContainer_type = VectorSoaContainer<ST, 10>;
+
+private:
+    /// primitive cell
+    CrystalLattice<ST, 3> PrimLattice;
+    /// \f$GGt=G^t G \f$, transformation for tensors from LatticeUnit to
+    /// CartesianUnit, e.g. the Hessian
+    Tensor<ST, 3> GGt;
+    /// multi bspline set
+    std::shared_ptr<MultiBspline<ST>> SplineInst;
+
+    /// Copy of original splines for orbital rotation
+    std::shared_ptr<std::vector<ST>> coef_copy_;
+
+    vContainer_type mKK;
+    VectorSoaContainer<ST, 3> myKcart;
+
+    /// thread private ratios for reduction when using nested threading, numVP x
+    /// numThread
+    Matrix<ComplexT> ratios_private;
+
+protected:
+    /// intermediate result vectors
+    vContainer_type myV;
+    vContainer_type myL;
+    gContainer_type myG;
+    hContainer_type myH;
+    ghContainer_type mygH;
 
 public:
-  SplineC2CT<T>(const std::string& my_name) : BsplineSetT<T>(my_name) {}
-
-  SplineC2CT<T>(const SplineC2CT<T>& in);
-  virtual std::string getClassName() const final { return "SplineC2C"; }
-  virtual std::string getKeyword() const final { return "SplineC2C"; }
-  bool isComplex() const final { return true; };
-
-  std::unique_ptr<SPOSetT<T>> makeClone() const final { return std::make_unique<SplineC2CT<T>>(*this); }
-
-  bool isRotationSupported() const final { return true; }
-
-  /// Store an original copy of the spline coefficients for orbital rotation
-  void storeParamsBeforeRotation() final;
-
-  /*
-    Implements orbital rotations via [1,2].
-    Should be called by RotatedSPOs::apply_rotation()
-    This implementation requires that NSPOs > Nelec. In other words,
-    if you want to run a orbopt wfn, you must include some virtual orbitals!
-    Some results (using older Berkeley branch) were published in [3].
-    [1] Filippi & Fahy, JCP 112, (2000)
-    [2] Toulouse & Umrigar, JCP 126, (2007)
-    [3] Townsend et al., PRB 102, (2020)
-  */
-  void applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy) final;
-
-  inline void resizeStorage(size_t n, size_t nvals)
-  {
-    this->init_base(n);
-    size_t npad = getAlignedSize<T>(2 * n);
-    myV.resize(npad);
-    myG.resize(npad);
-    myL.resize(npad);
-    myH.resize(npad);
-    mygH.resize(npad);
-  }
-
-  void bcast_tables(Communicate* comm) { chunked_bcast(comm, SplineInst->getSplinePtr()); }
-
-  void gather_tables(Communicate* comm)
-  {
-    if (comm->size() == 1)
-      return;
-    const int Nbands      = this->kPoints.size();
-    const int Nbandgroups = comm->size();
-
-    auto& offset = this->offset;
-    offset.resize(Nbandgroups + 1, 0);
-    FairDivideLow(Nbands, Nbandgroups, offset);
-    for (size_t ib = 0; ib < offset.size(); ib++)
-      offset[ib] *= 2;
-    gatherv(comm, SplineInst->getSplinePtr(), SplineInst->getSplinePtr()->z_stride, offset);
-  }
-
-  template<typename GT, typename BCT>
-  void create_spline(GT& xyz_g, BCT& xyz_bc)
-  {
-    resize_kpoints();
-    SplineInst = std::make_shared<MultiBspline<T>>();
-    SplineInst->create(xyz_g, xyz_bc, myV.size());
-    app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20) << " MB allocated "
-              << "for the coefficients in 3D spline orbital representation" << std::endl;
-  }
-
-  inline void flush_zero() { SplineInst->flush_zero(); }
-
-  /** remap kPoints to pack the double copy */
-  inline void resize_kpoints()
-  {
-    const auto& kPoints = this->kPoints;
-    const size_t nk     = kPoints.size();
-    mKK.resize(nk);
-    myKcart.resize(nk);
-    for (size_t i = 0; i < nk; ++i)
+    SplineC2CT(const std::string& my_name) : BsplineSetT<VT>(my_name)
     {
-      mKK[i]     = -dot(kPoints[i], kPoints[i]);
-      myKcart(i) = kPoints[i];
     }
-  }
-
-  void set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i, int twist, int ispline, int level);
-
-  bool read_splines(hdf_archive& h5f);
-
-  bool write_splines(hdf_archive& h5f);
-
-  void assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi, int first, int last) const;
-
-  void evaluateValue(const ParticleSetT<T>& P, const int iat, ValueVector& psi) override;
-
-  void evaluateDetRatios(const VirtualParticleSetT<T>& VP,
-                         ValueVector& psi,
-                         const ValueVector& psiinv,
-                         std::vector<ValueType>& ratios) override;
-
-  /** assign_vgl
-   */
-  void assign_vgl(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi, int first, int last)
-      const;
-
-  /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-  void assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi);
-
-  void evaluateVGL(const ParticleSetT<T>& P,
-                   const int iat,
-                   ValueVector& psi,
-                   GradVector& dpsi,
-                   ValueVector& d2psi) override;
-
-  void assign_vgh(const PointType& r,
-                  ValueVector& psi,
-                  GradVector& dpsi,
-                  HessVector& grad_grad_psi,
-                  int first,
-                  int last) const;
-
-  void evaluateVGH(const ParticleSetT<T>& P,
-                   const int iat,
-                   ValueVector& psi,
-                   GradVector& dpsi,
-                   HessVector& grad_grad_psi) override;
-
-  void assign_vghgh(const PointType& r,
-                    ValueVector& psi,
-                    GradVector& dpsi,
-                    HessVector& grad_grad_psi,
-                    GGGVector& grad_grad_grad_psi,
-                    int first = 0,
-                    int last  = -1) const;
-
-  void evaluateVGHGH(const ParticleSetT<T>& P,
-                     const int iat,
-                     ValueVector& psi,
-                     GradVector& dpsi,
-                     HessVector& grad_grad_psi,
-                     GGGVector& grad_grad_grad_psi) override;
-
-  template<class BSPLINESPO>
-  friend struct SplineSetReader;
-  friend struct BsplineReaderBase;
 
-protected:
-  /// intermediate result vectors
-  vContainer_type myV;
-  vContainer_type myL;
-  gContainer_type myG;
-  hContainer_type myH;
-  ghContainer_type mygH;
+    SplineC2CT(const SplineC2CT& in);
+    virtual std::string
+    getClassName() const override
+    {
+        return "SplineC2C";
+    }
+    virtual std::string
+    getKeyword() const override
+    {
+        return "SplineC2C";
+    }
+    bool
+    isComplex() const override
+    {
+        return true;
+    };
 
-private:
-  ///primitive cell
-  CrystalLattice<RealType, 3> PrimLattice;
-  ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to CartesianUnit, e.g. Hessian
-  Tensor<RealType, 3> GGt;
-  ///multi bspline set
-  std::shared_ptr<MultiBspline<RealType>> SplineInst;
+    std::unique_ptr<SPOSetT<VT>>
+    makeClone() const override
+    {
+        return std::make_unique<SplineC2CT>(*this);
+    }
 
-  ///Copy of original splines for orbital rotation
-  std::shared_ptr<std::vector<RealType>> coef_copy_;
+    bool
+    isRotationSupported() const override
+    {
+        return true;
+    }
 
-  vContainer_type mKK;
-  VectorSoaContainer<RealType, 3> myKcart;
+    /// Store an original copy of the spline coefficients for orbital rotation
+    void
+    storeParamsBeforeRotation() override;
+
+    /*
+      Implements orbital rotations via [1,2].
+      Should be called by RotatedSPOs::apply_rotation()
+      This implementation requires that NSPOs > Nelec. In other words,
+      if you want to run an orbopt wfn, you must include some virtual orbitals!
+      Some results (using older Berkeley branch) were published in [3].
+      [1] Filippi & Fahy, JCP 112, (2000)
+      [2] Toulouse & Umrigar, JCP 126, (2007)
+      [3] Townsend et al., PRB 102, (2020)
+    */
+    void
+    applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy) override;
+
+    inline void
+    resizeStorage(size_t n, size_t nvals)
+    {
+        this->init_base(n);
+        size_t npad = getAlignedSize<ST>(2 * n);
+        myV.resize(npad);
+        myG.resize(npad);
+        myL.resize(npad);
+        myH.resize(npad);
+        mygH.resize(npad);
+    }
 
-  ///thread private ratios for reduction when using nested threading, numVP x numThread
-  Matrix<ComplexT> ratios_private;
-};
+    void
+    bcast_tables(Communicate* comm)
+    {
+        chunked_bcast(comm, SplineInst->getSplinePtr());
+    }
+
+    void
+    gather_tables(Communicate* comm)
+    {
+        if (comm->size() == 1)
+            return;
+        const int Nbands = this->kPoints.size();
+        const int Nbandgroups = comm->size();
+        this->offset.resize(Nbandgroups + 1, 0);
+        FairDivideLow(Nbands, Nbandgroups, this->offset);
+        for (size_t ib = 0; ib < this->offset.size(); ib++)
+            this->offset[ib] *= 2;
+        gatherv(comm, SplineInst->getSplinePtr(),
+            SplineInst->getSplinePtr()->z_stride, this->offset);
+    }
 
-extern template class SplineC2CT<float>;
-extern template class SplineC2CT<double>;
+    template <typename GT, typename BCT>
+    void
+    create_spline(GT& xyz_g, BCT& xyz_bc)
+    {
+        resize_kpoints();
+        SplineInst = std::make_shared<MultiBspline<ST>>();
+        SplineInst->create(xyz_g, xyz_bc, myV.size());
+        app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20)
+                  << " MB allocated "
+                  << "for the coefficients in 3D spline orbital representation"
+                  << std::endl;
+    }
+
+    inline void
+    flush_zero()
+    {
+        SplineInst->flush_zero();
+    }
+
+    /** remap kPoints to pack the double copy */
+    inline void
+    resize_kpoints()
+    {
+        const size_t nk = this->kPoints.size();
+        mKK.resize(nk);
+        myKcart.resize(nk);
+        for (size_t i = 0; i < nk; ++i) {
+            mKK[i] = -dot(this->kPoints[i], this->kPoints[i]);
+            myKcart(i) = this->kPoints[i];
+        }
+    }
+
+    void
+    set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i,
+        int twist, int ispline, int level);
+
+    bool
+    read_splines(hdf_archive& h5f);
+
+    bool
+    write_splines(hdf_archive& h5f);
+
+    void
+    assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi,
+        int first, int last) const;
+
+    void
+    evaluateValue(
+        const ParticleSetT<VT>& P, const int iat, ValueVector& psi) override;
+
+    void
+    evaluateDetRatios(const VirtualParticleSetT<VT>& VP, ValueVector& psi,
+        const ValueVector& psiinv, std::vector<ValueType>& ratios) override;
+
+    /** assign_vgl
+     */
+    void
+    assign_vgl(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        ValueVector& d2psi, int first, int last) const;
+
+    /** assign_vgl_from_l can be used when myL is precomputed and myV, myG and
+     * myL are all in Cartesian units
+     */
+    void
+    assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        ValueVector& d2psi);
+
+    void
+    evaluateVGL(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, ValueVector& d2psi) override;
+
+    void
+    assign_vgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, int first, int last) const;
+
+    void
+    evaluateVGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi) override;
+
+    void
+    assign_vghgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi, int first = 0,
+        int last = -1) const;
+
+    void
+    evaluateVGHGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi,
+        GGGVector& grad_grad_grad_psi) override;
+
+    template <class BSPLINESPO>
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
+};
 
 } // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.cpp
similarity index 96%
rename from src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.cpp
rename to src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.cpp
index 93ada7660a..1e3e02cd6a 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.cpp
@@ -9,7 +9,7 @@
 // File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
 //////////////////////////////////////////////////////////////////////////////////////
 
-#include "SplineC2RTOMPTarget.h"
+#include "SplineC2ROMPTargetT.h"
 
 #include "ApplyPhaseC2R.hpp"
 #include "Concurrency/OpenMP.h"
@@ -19,22 +19,22 @@
 
 namespace qmcplusplus
 {
-template <typename ST>
-SplineC2RTOMPTarget<ST>::SplineC2RTOMPTarget(
-    const SplineC2RTOMPTarget& in) = default;
+template <typename ST, typename VT>
+SplineC2ROMPTargetT<ST, VT>::SplineC2ROMPTargetT(
+    const SplineC2ROMPTargetT& in) = default;
 
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineC2RTOMPTarget<ST>::set_spline(SingleSplineType* spline_r,
+SplineC2ROMPTargetT<ST, VT>::set_spline(SingleSplineType* spline_r,
     SingleSplineType* spline_i, int twist, int ispline, int level)
 {
     SplineInst->copy_spline(spline_r, 2 * ispline);
     SplineInst->copy_spline(spline_i, 2 * ispline + 1);
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 bool
-SplineC2RTOMPTarget<ST>::read_splines(hdf_archive& h5f)
+SplineC2ROMPTargetT<ST, VT>::read_splines(hdf_archive& h5f)
 {
     std::ostringstream o;
     o << "spline_" << this->MyIndex;
@@ -42,9 +42,9 @@ SplineC2RTOMPTarget<ST>::read_splines(hdf_archive& h5f)
     return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 bool
-SplineC2RTOMPTarget<ST>::write_splines(hdf_archive& h5f)
+SplineC2ROMPTargetT<ST, VT>::write_splines(hdf_archive& h5f)
 {
     std::ostringstream o;
     o << "spline_" << this->MyIndex;
@@ -52,9 +52,9 @@ SplineC2RTOMPTarget<ST>::write_splines(hdf_archive& h5f)
     return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineC2RTOMPTarget<ST>::assign_v(const PointType& r,
+SplineC2ROMPTargetT<ST, VT>::assign_v(const PointType& r,
     const vContainer_type& myV, ValueVector& psi, int first, int last) const
 {
     // protect last
@@ -89,10 +89,10 @@ SplineC2RTOMPTarget<ST>::assign_v(const PointType& r,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateValue(
-    const ParticleSetT<ST>& P, const int iat, ValueVector& psi)
+SplineC2ROMPTargetT<ST, VT>::evaluateValue(
+    const ParticleSetT<VT>& P, const int iat, ValueVector& psi)
 {
     const PointType& r = P.activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
@@ -166,10 +166,11 @@ SplineC2RTOMPTarget<ST>::evaluateValue(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
-    ValueVector& psi, const ValueVector& psiinv, std::vector<ValueType>& ratios)
+SplineC2ROMPTargetT<ST, VT>::evaluateDetRatios(
+    const VirtualParticleSetT<VT>& VP, ValueVector& psi,
+    const ValueVector& psiinv, std::vector<ValueType>& ratios)
 {
     const int nVP = VP.getTotalNum();
     psiinv_pos_copy.resize(psiinv.size() + nVP * 6);
@@ -271,18 +272,17 @@ SplineC2RTOMPTarget<ST>::evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::mw_evaluateDetRatios(
-    const RefVectorWithLeader<SPOSetT<ST>>& spo_list,
-    const RefVectorWithLeader<const VirtualParticleSetT<ST>>& vp_list,
+SplineC2ROMPTargetT<ST, VT>::mw_evaluateDetRatios(
+    const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+    const RefVectorWithLeader<const VirtualParticleSetT<VT>>& vp_list,
     const RefVector<ValueVector>& psi_list,
     const std::vector<const ValueType*>& invRow_ptr_list,
     std::vector<std::vector<ValueType>>& ratios_list) const
 {
     assert(this == &spo_list.getLeader());
-    auto& phi_leader =
-        spo_list.template getCastedLeader<SplineC2RTOMPTarget<ST>>();
+    auto& phi_leader = spo_list.template getCastedLeader<SplineC2ROMPTargetT>();
     auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
     auto& det_ratios_buffer_H2D = mw_mem.det_ratios_buffer_H2D;
     auto& mw_ratios_private = mw_mem.mw_ratios_private;
@@ -292,7 +292,7 @@ SplineC2RTOMPTarget<ST>::mw_evaluateDetRatios(
     const size_t requested_orb_size = phi_leader.size();
 
     size_t mw_nVP = 0;
-    for (const VirtualParticleSetT<ST>& VP : vp_list)
+    for (const VirtualParticleSetT<VT>& VP : vp_list)
         mw_nVP += VP.getTotalNum();
 
     const size_t packed_size =
@@ -312,7 +312,7 @@ SplineC2RTOMPTarget<ST>::mw_evaluateDetRatios(
         nw * sizeof(ValueType*) + mw_nVP * 6 * sizeof(TT));
     size_t iVP = 0;
     for (size_t iw = 0; iw < nw; iw++) {
-        const VirtualParticleSetT<ST>& VP = vp_list[iw];
+        const VirtualParticleSetT<VT>& VP = vp_list[iw];
         assert(ratios_list[iw].size() == VP.getTotalNum());
         for (size_t iat = 0; iat < VP.getTotalNum(); ++iat, ++iVP) {
             ref_id_ptr[iVP] = iw;
@@ -420,9 +420,9 @@ SplineC2RTOMPTarget<ST>::mw_evaluateDetRatios(
 /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in
  * cartesian
  */
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineC2RTOMPTarget<ST>::assign_vgl_from_l(
+SplineC2ROMPTargetT<ST, VT>::assign_vgl_from_l(
     const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
     constexpr ST two(2);
@@ -539,10 +539,10 @@ SplineC2RTOMPTarget<ST>::assign_vgl_from_l(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateVGL(const ParticleSetT<ST>& P, const int iat,
-    ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
+SplineC2ROMPTargetT<ST, VT>::evaluateVGL(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
     const PointType& r = P.activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
@@ -643,9 +643,9 @@ SplineC2RTOMPTarget<ST>::evaluateVGL(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateVGLMultiPos(
+SplineC2ROMPTargetT<ST, VT>::evaluateVGLMultiPos(
     const Vector<ST, OffloadPinnedAllocator<ST>>& multi_pos,
     Vector<ST, OffloadPinnedAllocator<ST>>& offload_scratch,
     Vector<TT, OffloadPinnedAllocator<TT>>& results_scratch,
@@ -771,18 +771,17 @@ SplineC2RTOMPTarget<ST>::evaluateVGLMultiPos(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::mw_evaluateVGL(
-    const RefVectorWithLeader<SPOSetT<ST>>& sa_list,
-    const RefVectorWithLeader<ParticleSetT<ST>>& P_list, int iat,
+SplineC2ROMPTargetT<ST, VT>::mw_evaluateVGL(
+    const RefVectorWithLeader<SPOSetT<VT>>& sa_list,
+    const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
     const RefVector<ValueVector>& psi_v_list,
     const RefVector<GradVector>& dpsi_v_list,
     const RefVector<ValueVector>& d2psi_v_list) const
 {
     assert(this == &sa_list.getLeader());
-    auto& phi_leader =
-        sa_list.template getCastedLeader<SplineC2RTOMPTarget<ST>>();
+    auto& phi_leader = sa_list.template getCastedLeader<SplineC2ROMPTargetT>();
     auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
     auto& mw_pos_copy = mw_mem.mw_pos_copy;
     auto& mw_offload_scratch = mw_mem.mw_offload_scratch;
@@ -806,18 +805,17 @@ SplineC2RTOMPTarget<ST>::mw_evaluateVGL(
         mw_results_scratch, psi_v_list, dpsi_v_list, d2psi_v_list);
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(
-    const RefVectorWithLeader<SPOSetT<ST>>& spo_list,
-    const RefVectorWithLeader<ParticleSetT<ST>>& P_list, int iat,
+SplineC2ROMPTargetT<ST, VT>::mw_evaluateVGLandDetRatioGrads(
+    const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
     const std::vector<const ValueType*>& invRow_ptr_list,
     OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
     std::vector<GradType>& grads) const
 {
     assert(this == &spo_list.getLeader());
-    auto& phi_leader =
-        spo_list.template getCastedLeader<SplineC2RTOMPTarget<ST>>();
+    auto& phi_leader = spo_list.template getCastedLeader<SplineC2ROMPTargetT>();
     auto& mw_mem = phi_leader.mw_mem_handle_.getResource();
     auto& buffer_H2D = mw_mem.buffer_H2D;
     auto& rg_private = mw_mem.rg_private;
@@ -1012,9 +1010,9 @@ SplineC2RTOMPTarget<ST>::mw_evaluateVGLandDetRatioGrads(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::assign_vgh(const PointType& r, ValueVector& psi,
+SplineC2ROMPTargetT<ST, VT>::assign_vgh(const PointType& r, ValueVector& psi,
     GradVector& dpsi, HessVector& grad_grad_psi, int first, int last) const
 {
     // protect last
@@ -1269,10 +1267,11 @@ SplineC2RTOMPTarget<ST>::assign_vgh(const PointType& r, ValueVector& psi,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateVGH(const ParticleSetT<ST>& P, const int iat,
-    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi)
+SplineC2ROMPTargetT<ST, VT>::evaluateVGH(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi,
+    HessVector& grad_grad_psi)
 {
     const PointType& r = P.activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
@@ -1288,16 +1287,15 @@ SplineC2RTOMPTarget<ST>::evaluateVGH(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::assign_vghgh(const PointType& r, ValueVector& psi,
+SplineC2ROMPTargetT<ST, VT>::assign_vghgh(const PointType& r, ValueVector& psi,
     GradVector& dpsi, HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi,
     int first, int last) const
 {
     // protect last
-    last = last < 0 ?
-        this->kPoints.size() :
-        (last > this->kPoints.size() ? this->kPoints.size() : last);
+    last = last < 0 ? this->kPoints.size() :
+                      (last > this->kPoints.size() ? this->kPoints.size() : last);
 
     const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
              g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
@@ -1847,11 +1845,11 @@ SplineC2RTOMPTarget<ST>::assign_vghgh(const PointType& r, ValueVector& psi,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluateVGHGH(const ParticleSetT<ST>& P, const int iat,
-    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
-    GGGVector& grad_grad_grad_psi)
+SplineC2ROMPTargetT<ST, VT>::evaluateVGHGH(const ParticleSetT<VT>& P,
+    const int iat, ValueVector& psi, GradVector& dpsi,
+    HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi)
 {
     const PointType& r = P.activeR(iat);
     PointType ru(PrimLattice.toUnit_floor(r));
@@ -1868,9 +1866,9 @@ SplineC2RTOMPTarget<ST>::evaluateVGHGH(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineC2RTOMPTarget<ST>::evaluate_notranspose(const ParticleSetT<ST>& P,
+SplineC2ROMPTargetT<ST, VT>::evaluate_notranspose(const ParticleSetT<VT>& P,
     int first, int last, ValueMatrix& logdet, GradMatrix& dlogdet,
     ValueMatrix& d2logdet)
 {
@@ -1928,7 +1926,9 @@ SplineC2RTOMPTarget<ST>::evaluate_notranspose(const ParticleSetT<ST>& P,
     }
 }
 
-template class SplineC2RTOMPTarget<float>;
-template class SplineC2RTOMPTarget<double>;
+template class SplineC2ROMPTargetT<float, float>;
+template class SplineC2ROMPTargetT<float, double>;
+template class SplineC2ROMPTargetT<double, float>;
+template class SplineC2ROMPTargetT<double, double>;
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.h
similarity index 82%
rename from src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.h
rename to src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.h
index e79ee57450..0d3aef1f2d 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2RTOMPTarget.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.h
@@ -15,8 +15,8 @@
  * precision splines storage and computation is offloaded to accelerators using
  * OpenMP target
  */
-#ifndef QMCPLUSPLUS_SPLINE_C2RT_OMPTARGET_H
-#define QMCPLUSPLUS_SPLINE_C2RT_OMPTARGET_H
+#ifndef QMCPLUSPLUS_SPLINE_C2R_OMPTARGETT_H
+#define QMCPLUSPLUS_SPLINE_C2R_OMPTARGETT_H
 
 #include "OMPTarget/OffloadAlignedAllocators.hpp"
 #include "OhmmsSoA/VectorSoaContainer.h"
@@ -42,8 +42,8 @@ namespace qmcplusplus
  * orbital. All the output orbitals are real (C2R). The maximal number of output
  * orbitals is OrbitalSetSize.
  */
-template <typename ST>
-class SplineC2RTOMPTarget : public BsplineSetT<ST>
+template <typename ST, typename VT>
+class SplineC2ROMPTargetT : public BsplineSetT<VT>
 {
 public:
     using SplineType = typename bspline_traits<ST, 3>::SplineType;
@@ -52,16 +52,16 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
     using PointType = TinyVector<ST, 3>;
     using SingleSplineType = UBspline_3d_d;
     // types for evaluation results
-    using TT = typename BsplineSetT<ST>::ValueType;
-    using ValueType = typename BsplineSetT<ST>::ValueType;
-    using GradType = typename BsplineSetT<ST>::GradType;
-    using GGGVector = typename BsplineSetT<ST>::GGGVector;
-    using GradVector = typename BsplineSetT<ST>::GradVector;
-    using HessVector = typename BsplineSetT<ST>::HessVector;
-    using ValueVector = typename BsplineSetT<ST>::ValueVector;
-    using ValueMatrix = typename BsplineSetT<ST>::ValueMatrix;
-    using GradMatrix = typename BsplineSetT<ST>::GradMatrix;
-    using OffloadMWVGLArray = typename BsplineSetT<ST>::OffloadMWVGLArray;
+    using TT = typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::GradType;
+    using typename BsplineSetT<VT>::GGGVector;
+    using typename BsplineSetT<VT>::GradVector;
+    using typename BsplineSetT<VT>::GradMatrix;
+    using typename BsplineSetT<VT>::HessVector;
+    using typename BsplineSetT<VT>::ValueVector;
+    using typename BsplineSetT<VT>::ValueMatrix;
+    using typename BsplineSetT<VT>::OffloadMWVGLArray;
 
     using vContainer_type = Vector<ST, aligned_allocator<ST>>;
     using gContainer_type = VectorSoaContainer<ST, 3>;
@@ -126,8 +126,8 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
     ghContainer_type mygH;
 
 public:
-    SplineC2RTOMPTarget(const std::string& my_name) :
-        BsplineSetT<ST>(my_name),
+    SplineC2ROMPTargetT(const std::string& my_name) :
+        BsplineSetT<VT>(my_name),
         offload_timer_(
             createGlobalTimer("SplineC2ROMPTarget::offload", timer_level_fine)),
         nComplexBands(0),
@@ -136,7 +136,7 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
     {
     }
 
-    SplineC2RTOMPTarget(const SplineC2RTOMPTarget& in);
+    SplineC2ROMPTargetT(const SplineC2ROMPTargetT& in);
 
     virtual std::string
     getClassName() const override
@@ -168,27 +168,29 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
 
     void
     acquireResource(ResourceCollection& collection,
-        const RefVectorWithLeader<SPOSetT<ST>>& spo_list) const override
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list) const override
     {
         assert(this == &spo_list.getLeader());
-        auto& phi_leader = spo_list.template getCastedLeader<SplineC2RTOMPTarget<ST>>();
+        auto& phi_leader =
+            spo_list.template getCastedLeader<SplineC2ROMPTargetT>();
         phi_leader.mw_mem_handle_ =
             collection.lendResource<SplineOMPTargetMultiWalkerMem<ST, TT>>();
     }
 
     void
     releaseResource(ResourceCollection& collection,
-        const RefVectorWithLeader<SPOSetT<ST>>& spo_list) const override
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list) const override
     {
         assert(this == &spo_list.getLeader());
-        auto& phi_leader = spo_list.template getCastedLeader<SplineC2RTOMPTarget<ST>>();
+        auto& phi_leader =
+            spo_list.template getCastedLeader<SplineC2ROMPTargetT>();
         collection.takebackResource(phi_leader.mw_mem_handle_);
     }
 
-    std::unique_ptr<SPOSetT<ST>>
+    std::unique_ptr<SPOSetT<VT>>
     makeClone() const override
     {
-        return std::make_unique<SplineC2RTOMPTarget>(*this);
+        return std::make_unique<SplineC2ROMPTargetT>(*this);
     }
 
     inline void
@@ -248,8 +250,9 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
         auto* MultiSpline = SplineInst->getSplinePtr();
         auto* restrict coefs = MultiSpline->coefs;
         // attach pointers on the device to achieve deep copy
-        PRAGMA_OFFLOAD("omp target map(always, to: MultiSpline[0:1], \
-                       coefs[0:MultiSpline->coefs_size])")
+        PRAGMA_OFFLOAD("omp target \
+                map(always, to: MultiSpline[0:1], \
+                    coefs[0:MultiSpline->coefs_size])")
         {
             MultiSpline->coefs = coefs;
         }
@@ -306,15 +309,15 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
 
     virtual void
     evaluateValue(
-        const ParticleSetT<ST>& P, const int iat, ValueVector& psi) override;
+        const ParticleSetT<VT>& P, const int iat, ValueVector& psi) override;
 
     virtual void
-    evaluateDetRatios(const VirtualParticleSetT<ST>& VP, ValueVector& psi,
+    evaluateDetRatios(const VirtualParticleSetT<VT>& VP, ValueVector& psi,
         const ValueVector& psiinv, std::vector<ValueType>& ratios) override;
 
     virtual void
-    mw_evaluateDetRatios(const RefVectorWithLeader<SPOSetT<ST>>& spo_list,
-        const RefVectorWithLeader<const VirtualParticleSetT<ST>>& vp_list,
+    mw_evaluateDetRatios(const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+        const RefVectorWithLeader<const VirtualParticleSetT<VT>>& vp_list,
         const RefVector<ValueVector>& psi_list,
         const std::vector<const ValueType*>& invRow_ptr_list,
         std::vector<std::vector<ValueType>>& ratios_list) const override;
@@ -327,20 +330,20 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
         ValueVector& d2psi);
 
     virtual void
-    evaluateVGL(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGL(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, ValueVector& d2psi) override;
 
     virtual void
-    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<ST>>& sa_list,
-        const RefVectorWithLeader<ParticleSetT<ST>>& P_list, int iat,
+    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<VT>>& sa_list,
+        const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
         const RefVector<ValueVector>& psi_v_list,
         const RefVector<GradVector>& dpsi_v_list,
         const RefVector<ValueVector>& d2psi_v_list) const override;
 
     virtual void
     mw_evaluateVGLandDetRatioGrads(
-        const RefVectorWithLeader<SPOSetT<ST>>& spo_list,
-        const RefVectorWithLeader<ParticleSetT<ST>>& P_list, int iat,
+        const RefVectorWithLeader<SPOSetT<VT>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<VT>>& P_list, int iat,
         const std::vector<const ValueType*>& invRow_ptr_list,
         OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
         std::vector<GradType>& grads) const override;
@@ -350,7 +353,7 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
         HessVector& grad_grad_psi, int first, int last) const;
 
     virtual void
-    evaluateVGH(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, HessVector& grad_grad_psi) override;
 
     void
@@ -359,18 +362,20 @@ class SplineC2RTOMPTarget : public BsplineSetT<ST>
         int last = -1) const;
 
     virtual void
-    evaluateVGHGH(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGHGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, HessVector& grad_grad_psi,
         GGGVector& grad_grad_grad_psi) override;
 
     virtual void
-    evaluate_notranspose(const ParticleSetT<ST>& P, int first, int last,
+    evaluate_notranspose(const ParticleSetT<VT>& P, int first, int last,
         ValueMatrix& logdet, GradMatrix& dlogdet,
         ValueMatrix& d2logdet) override;
 
     template <class BSPLINESPO>
-    friend struct SplineSetReader;
-    friend struct BsplineReaderBase;
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
 };
+
 } // namespace qmcplusplus
 #endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.cpp
index fd0e182bc0..6e5bf82b72 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.cpp
@@ -1,1191 +1,1328 @@
 //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
-// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
-//                    Jeongnim Kim, jeongnim.kim@intel.com, University of Illinois at Urbana-Champaign
-//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
-//                    Anouar Benali, benali@anl.gov, Argonne National Laboratory
-//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+//                      Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@intel.com, University of
+//                      Illinois at Urbana-Champaign
+//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//                    Anouar Benali, benali@anl.gov, Argonne National Laboratory
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
 //
-// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
 //////////////////////////////////////////////////////////////////////////////////////
 
+#include "SplineC2RT.h"
 
+#include "CPU/math.hpp"
 #include "Concurrency/OpenMP.h"
-#include "SplineC2RT.h"
-#include "spline2/MultiBsplineEval.hpp"
 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
-#include "CPU/math.hpp"
+#include "spline2/MultiBsplineEval.hpp"
 
 namespace qmcplusplus
 {
-template<typename ST>
-SplineC2RT<ST>::SplineC2RT(const SplineC2RT& in) = default;
-
-template<typename ST>
-inline void SplineC2RT<ST>::set_spline(SingleSplineType* spline_r,
-                                       SingleSplineType* spline_i,
-                                       int twist,
-                                       int ispline,
-                                       int level)
+template <typename ST, typename VT>
+SplineC2RT<ST, VT>::SplineC2RT(const SplineC2RT& in) = default;
+
+template <typename ST, typename VT>
+inline void
+SplineC2RT<ST, VT>::set_spline(SingleSplineType* spline_r,
+    SingleSplineType* spline_i, int twist, int ispline, int level)
 {
-  SplineInst->copy_spline(spline_r, 2 * ispline);
-  SplineInst->copy_spline(spline_i, 2 * ispline + 1);
+    SplineInst->copy_spline(spline_r, 2 * ispline);
+    SplineInst->copy_spline(spline_i, 2 * ispline + 1);
 }
 
-template<typename ST>
-bool SplineC2RT<ST>::read_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2RT<ST, VT>::read_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex;
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template<typename ST>
-bool SplineC2RT<ST>::write_splines(hdf_archive& h5f)
+template <typename ST, typename VT>
+bool
+SplineC2RT<ST, VT>::write_splines(hdf_archive& h5f)
 {
-  std::ostringstream o;
-  o << "spline_" << this->MyIndex;
-  einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
-  return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
+    std::ostringstream o;
+    o << "spline_" << this->MyIndex;
+    einspline_engine<SplineType> bigtable(SplineInst->getSplinePtr());
+    return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template<typename ST>
-inline void SplineC2RT<ST>::assign_v(const PointType& r,
-                                     const vContainer_type& myV,
-                                     ValueVector& psi,
-                                     int first,
-                                     int last) const
+template <typename ST, typename VT>
+inline void
+SplineC2RT<ST, VT>::assign_v(const PointType& r, const vContainer_type& myV,
+    ValueVector& psi, int first, int last) const
 {
-  // protect last
-  last = last > this->kPoints.size() ? this->kPoints.size() : last;
+    // protect last
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
 
-  const ST x = r[0], y = r[1], z = r[2];
-  const ST* restrict kx = myKcart.data(0);
-  const ST* restrict ky = myKcart.data(1);
-  const ST* restrict kz = myKcart.data(2);
+    const ST x = r[0], y = r[1], z = r[2];
+    const ST* restrict kx = myKcart.data(0);
+    const ST* restrict ky = myKcart.data(1);
+    const ST* restrict kz = myKcart.data(2);
 
-  TT* restrict psi_s              = psi.data() + this->first_spo;
-  const size_t requested_orb_size = psi.size();
+    TT* restrict psi_s = psi.data() + this->first_spo;
+    const size_t requested_orb_size = psi.size();
 #pragma omp simd
-  for (size_t j = first; j < std::min(nComplexBands, last); j++)
-  {
-    ST s, c;
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-    const ST val_r  = myV[jr];
-    const ST val_i  = myV[ji];
-    qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
-    if (jr < requested_orb_size)
-      psi_s[jr] = val_r * c - val_i * s;
-    if (ji < requested_orb_size)
-      psi_s[ji] = val_i * c + val_r * s;
-  }
-
-  psi_s += nComplexBands;
+    for (size_t j = first; j < std::min(nComplexBands, last); j++) {
+        ST s, c;
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+        qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
+        if (jr < requested_orb_size)
+            psi_s[jr] = val_r * c - val_i * s;
+        if (ji < requested_orb_size)
+            psi_s[ji] = val_i * c + val_r * s;
+    }
+
+    psi_s += nComplexBands;
 #pragma omp simd
-  for (size_t j = std::max(nComplexBands, first); j < last; j++)
-  {
-    ST s, c;
-    const ST val_r = myV[2 * j];
-    const ST val_i = myV[2 * j + 1];
-    qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
-    if (j < requested_orb_size)
-      psi_s[j] = val_r * c - val_i * s;
-  }
+    for (size_t j = std::max(nComplexBands, first); j < last; j++) {
+        ST s, c;
+        const ST val_r = myV[2 * j];
+        const ST val_i = myV[2 * j + 1];
+        qmcplusplus::sincos(-(x * kx[j] + y * ky[j] + z * kz[j]), &s, &c);
+        if (j < requested_orb_size)
+            psi_s[j] = val_r * c - val_i * s;
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::evaluateValue(const ParticleSetT<ST>& P, const int iat, ValueVector& psi)
+template <typename ST, typename VT>
+void
+SplineC2RT<ST, VT>::evaluateValue(
+    const ParticleSetT<VT>& P, const int iat, ValueVector& psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r));
 
 #pragma omp parallel
-  {
-    int first, last;
-    FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
+    {
+        int first, last;
+        FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(),
+            omp_get_thread_num(), first, last);
 
-    spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
-    assign_v(r, myV, psi, first / 2, last / 2);
-  }
+        spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
+        assign_v(r, myV, psi, first / 2, last / 2);
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
-                                       ValueVector& psi,
-                                       const ValueVector& psiinv,
-                                       std::vector<TT>& ratios)
+template <typename ST, typename VT>
+void
+SplineC2RT<ST, VT>::evaluateDetRatios(const VirtualParticleSetT<VT>& VP,
+    ValueVector& psi, const ValueVector& psiinv, std::vector<TT>& ratios)
 {
-  const bool need_resize = ratios_private.rows() < VP.getTotalNum();
+    const bool need_resize = ratios_private.rows() < VP.getTotalNum();
 
 #pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    // initialize thread private ratios
-    if (need_resize)
     {
-      if (tid == 0) // just like #pragma omp master, but one fewer call to the runtime
-        ratios_private.resize(VP.getTotalNum(), omp_get_num_threads());
+        int tid = omp_get_thread_num();
+        // initialize thread private ratios
+        if (need_resize) {
+            if (tid == 0) // just like #pragma omp master, but one fewer call to
+                          // the runtime
+                ratios_private.resize(VP.getTotalNum(), omp_get_num_threads());
 #pragma omp barrier
+        }
+        int first, last;
+        FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(),
+            tid, first, last);
+        const int first_cplx = first / 2;
+        const int last_cplx =
+            this->kPoints.size() < last / 2 ? this->kPoints.size() : last / 2;
+
+        for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+            const PointType& r = VP.activeR(iat);
+            PointType ru(PrimLattice.toUnit_floor(r));
+
+            spline2::evaluate3d(
+                SplineInst->getSplinePtr(), ru, myV, first, last);
+            assign_v(r, myV, psi, first_cplx, last_cplx);
+
+            const int first_real =
+                first_cplx + std::min(nComplexBands, first_cplx);
+            const int last_real =
+                last_cplx + std::min(nComplexBands, last_cplx);
+            ratios_private[iat][tid] = simd::dot(psi.data() + first_real,
+                psiinv.data() + first_real, last_real - first_real);
+        }
     }
-    int first, last;
-    FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), tid, first, last);
-    const int first_cplx = first / 2;
-    const int last_cplx  = this->kPoints.size() < last / 2 ? this->kPoints.size() : last / 2;
 
-    for (int iat = 0; iat < VP.getTotalNum(); ++iat)
-    {
-      const PointType& r = VP.activeR(iat);
-      PointType ru(PrimLattice.toUnit_floor(r));
-
-      spline2::evaluate3d(SplineInst->getSplinePtr(), ru, myV, first, last);
-      assign_v(r, myV, psi, first_cplx, last_cplx);
-
-      const int first_real     = first_cplx + std::min(nComplexBands, first_cplx);
-      const int last_real      = last_cplx + std::min(nComplexBands, last_cplx);
-      ratios_private[iat][tid] = simd::dot(psi.data() + first_real, psiinv.data() + first_real, last_real - first_real);
+    // do the reduction manually
+    for (int iat = 0; iat < VP.getTotalNum(); ++iat) {
+        ratios[iat] = TT(0);
+        for (int tid = 0; tid < ratios_private.cols(); tid++)
+            ratios[iat] += ratios_private[iat][tid];
     }
-  }
-
-  // do the reduction manually
-  for (int iat = 0; iat < VP.getTotalNum(); ++iat)
-  {
-    ratios[iat] = TT(0);
-    for (int tid = 0; tid < ratios_private.cols(); tid++)
-      ratios[iat] += ratios_private[iat][tid];
-  }
 }
 
 /** assign_vgl
-   */
-template<typename ST>
-inline void SplineC2RT<ST>::assign_vgl(const PointType& r,
-                                       ValueVector& psi,
-                                       GradVector& dpsi,
-                                       ValueVector& d2psi,
-                                       int first,
-                                       int last) const
+ */
+template <typename ST, typename VT>
+inline void
+SplineC2RT<ST, VT>::assign_vgl(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, ValueVector& d2psi, int first, int last) const
 {
-  // protect last
-  last = last > this->kPoints.size() ? this->kPoints.size() : last;
-
-  constexpr ST two(2);
-  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-           g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-           g22 = PrimLattice.G(8);
-  const ST x = r[0], y = r[1], z = r[2];
-  const ST symGG[6] = {GGt[0], GGt[1] + GGt[3], GGt[2] + GGt[6], GGt[4], GGt[5] + GGt[7], GGt[8]};
-
-  const ST* restrict k0 = myKcart.data(0);
-  ASSUME_ALIGNED(k0);
-  const ST* restrict k1 = myKcart.data(1);
-  ASSUME_ALIGNED(k1);
-  const ST* restrict k2 = myKcart.data(2);
-  ASSUME_ALIGNED(k2);
-
-  const ST* restrict g0 = myG.data(0);
-  ASSUME_ALIGNED(g0);
-  const ST* restrict g1 = myG.data(1);
-  ASSUME_ALIGNED(g1);
-  const ST* restrict g2 = myG.data(2);
-  ASSUME_ALIGNED(g2);
-  const ST* restrict h00 = myH.data(0);
-  ASSUME_ALIGNED(h00);
-  const ST* restrict h01 = myH.data(1);
-  ASSUME_ALIGNED(h01);
-  const ST* restrict h02 = myH.data(2);
-  ASSUME_ALIGNED(h02);
-  const ST* restrict h11 = myH.data(3);
-  ASSUME_ALIGNED(h11);
-  const ST* restrict h12 = myH.data(4);
-  ASSUME_ALIGNED(h12);
-  const ST* restrict h22 = myH.data(5);
-  ASSUME_ALIGNED(h22);
-
-  const size_t requested_orb_size = psi.size();
+    // protect last
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    constexpr ST two(2);
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+    const ST symGG[6] = {GGt[0], GGt[1] + GGt[3], GGt[2] + GGt[6], GGt[4],
+        GGt[5] + GGt[7], GGt[8]};
+
+    const ST* restrict k0 = myKcart.data(0);
+    ASSUME_ALIGNED(k0);
+    const ST* restrict k1 = myKcart.data(1);
+    ASSUME_ALIGNED(k1);
+    const ST* restrict k2 = myKcart.data(2);
+    ASSUME_ALIGNED(k2);
+
+    const ST* restrict g0 = myG.data(0);
+    ASSUME_ALIGNED(g0);
+    const ST* restrict g1 = myG.data(1);
+    ASSUME_ALIGNED(g1);
+    const ST* restrict g2 = myG.data(2);
+    ASSUME_ALIGNED(g2);
+    const ST* restrict h00 = myH.data(0);
+    ASSUME_ALIGNED(h00);
+    const ST* restrict h01 = myH.data(1);
+    ASSUME_ALIGNED(h01);
+    const ST* restrict h02 = myH.data(2);
+    ASSUME_ALIGNED(h02);
+    const ST* restrict h11 = myH.data(3);
+    ASSUME_ALIGNED(h11);
+    const ST* restrict h12 = myH.data(4);
+    ASSUME_ALIGNED(h12);
+    const ST* restrict h22 = myH.data(5);
+    ASSUME_ALIGNED(h22);
+
+    const size_t requested_orb_size = psi.size();
 #pragma omp simd
-  for (size_t j = first; j < std::min(nComplexBands, last); j++)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const ST lcart_r = SymTrace(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
-    const ST lcart_i = SymTrace(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
-    const ST lap_r   = lcart_r + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const ST lap_i   = lcart_i + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-
-    const size_t psiIndex = this->first_spo + jr;
-    if (psiIndex < requested_orb_size)
-    {
-      psi[psiIndex]     = c * val_r - s * val_i;
-      dpsi[psiIndex][0] = c * gX_r - s * gX_i;
-      dpsi[psiIndex][1] = c * gY_r - s * gY_i;
-      dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
-      d2psi[psiIndex]   = c * lap_r - s * lap_i;
+    for (size_t j = first; j < std::min(nComplexBands, last); j++) {
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const ST lcart_r = SymTrace(
+            h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
+        const ST lcart_i = SymTrace(
+            h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
+        const ST lap_r = lcart_r + mKK[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = lcart_i + mKK[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+
+        const size_t psiIndex = this->first_spo + jr;
+        if (psiIndex < requested_orb_size) {
+            psi[psiIndex] = c * val_r - s * val_i;
+            dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+            dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+            dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+            d2psi[psiIndex] = c * lap_r - s * lap_i;
+        }
+        if (psiIndex + 1 < requested_orb_size) {
+            psi[psiIndex + 1] = c * val_i + s * val_r;
+            dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
+            dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
+            dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
+            d2psi[psiIndex + 1] = c * lap_i + s * lap_r;
+        }
     }
-    if (psiIndex + 1 < requested_orb_size)
-    {
-      psi[psiIndex + 1]     = c * val_i + s * val_r;
-      dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
-      dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
-      dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
-      d2psi[psiIndex + 1]   = c * lap_i + s * lap_r;
-    }
-  }
 
 #pragma omp simd
-  for (size_t j = std::max(nComplexBands, first); j < last; j++)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    if (const size_t psiIndex = this->first_spo + nComplexBands + j; psiIndex < requested_orb_size)
-    {
-      psi[psiIndex]     = c * val_r - s * val_i;
-      dpsi[psiIndex][0] = c * gX_r - s * gX_i;
-      dpsi[psiIndex][1] = c * gY_r - s * gY_i;
-      dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
-
-      const ST lcart_r = SymTrace(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
-      const ST lcart_i = SymTrace(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
-      const ST lap_r   = lcart_r + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-      const ST lap_i   = lcart_i + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-      d2psi[psiIndex]  = c * lap_r - s * lap_i;
+    for (size_t j = std::max(nComplexBands, first); j < last; j++) {
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        if (const size_t psiIndex = this->first_spo + nComplexBands + j;
+            psiIndex < requested_orb_size) {
+            psi[psiIndex] = c * val_r - s * val_i;
+            dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+            dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+            dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+            const ST lcart_r = SymTrace(
+                h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], symGG);
+            const ST lcart_i = SymTrace(
+                h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], symGG);
+            const ST lap_r = lcart_r + mKK[j] * val_r +
+                two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+            const ST lap_i = lcart_i + mKK[j] * val_i -
+                two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+            d2psi[psiIndex] = c * lap_r - s * lap_i;
+        }
     }
-  }
 }
 
-/** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-template<typename ST>
-inline void SplineC2RT<ST>::assign_vgl_from_l(const PointType& r,
-                                              ValueVector& psi,
-                                              GradVector& dpsi,
-                                              ValueVector& d2psi)
+/** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in
+ * cartesian
+ */
+template <typename ST, typename VT>
+inline void
+SplineC2RT<ST, VT>::assign_vgl_from_l(
+    const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  constexpr ST two(2);
-  const ST x = r[0], y = r[1], z = r[2];
+    constexpr ST two(2);
+    const ST x = r[0], y = r[1], z = r[2];
 
-  const ST* restrict k0 = myKcart.data(0);
-  ASSUME_ALIGNED(k0);
-  const ST* restrict k1 = myKcart.data(1);
-  ASSUME_ALIGNED(k1);
-  const ST* restrict k2 = myKcart.data(2);
-  ASSUME_ALIGNED(k2);
+    const ST* restrict k0 = myKcart.data(0);
+    ASSUME_ALIGNED(k0);
+    const ST* restrict k1 = myKcart.data(1);
+    ASSUME_ALIGNED(k1);
+    const ST* restrict k2 = myKcart.data(2);
+    ASSUME_ALIGNED(k2);
 
-  const ST* restrict g0 = myG.data(0);
-  ASSUME_ALIGNED(g0);
-  const ST* restrict g1 = myG.data(1);
-  ASSUME_ALIGNED(g1);
-  const ST* restrict g2 = myG.data(2);
-  ASSUME_ALIGNED(g2);
+    const ST* restrict g0 = myG.data(0);
+    ASSUME_ALIGNED(g0);
+    const ST* restrict g1 = myG.data(1);
+    ASSUME_ALIGNED(g1);
+    const ST* restrict g2 = myG.data(2);
+    ASSUME_ALIGNED(g2);
 
-  const size_t N = this->kPoints.size();
+    const size_t N = this->kPoints.size();
 
 #pragma omp simd
-  for (size_t j = 0; j < nComplexBands; j++)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g0[jr];
-    const ST dY_r = g1[jr];
-    const ST dZ_r = g2[jr];
-
-    const ST dX_i = g0[ji];
-    const ST dY_i = g1[ji];
-    const ST dZ_i = g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const ST lap_r = myL[jr] + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const ST lap_i = myL[ji] + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-
-    const size_t psiIndex = this->first_spo + jr;
-    psi[psiIndex]         = c * val_r - s * val_i;
-    psi[psiIndex + 1]     = c * val_i + s * val_r;
-    d2psi[psiIndex]       = c * lap_r - s * lap_i;
-    d2psi[psiIndex + 1]   = c * lap_i + s * lap_r;
-    dpsi[psiIndex][0]     = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1]     = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2]     = c * gZ_r - s * gZ_i;
-    dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
-    dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
-    dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
-  }
+    for (size_t j = 0; j < nComplexBands; j++) {
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // myG is used as stored here; no dot(PrimLattice.G,myG[j]) is applied
+        const ST dX_r = g0[jr];
+        const ST dY_r = g1[jr];
+        const ST dZ_r = g2[jr];
+
+        const ST dX_i = g0[ji];
+        const ST dY_i = g1[ji];
+        const ST dZ_i = g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const ST lap_r = myL[jr] + mKK[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = myL[ji] + mKK[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+
+        const size_t psiIndex = this->first_spo + jr;
+        psi[psiIndex] = c * val_r - s * val_i;
+        psi[psiIndex + 1] = c * val_i + s * val_r;
+        d2psi[psiIndex] = c * lap_r - s * lap_i;
+        d2psi[psiIndex + 1] = c * lap_i + s * lap_r;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+        dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
+        dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
+        dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
+    }
 
 #pragma omp simd
-  for (size_t j = nComplexBands; j < N; j++)
-  {
-    const size_t jr = j << 1;
-    const size_t ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g0[jr];
-    const ST dY_r = g1[jr];
-    const ST dZ_r = g2[jr];
-
-    const ST dX_i = g0[ji];
-    const ST dY_i = g1[ji];
-    const ST dZ_i = g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r         = dX_r + val_i * kX;
-    const ST gY_r         = dY_r + val_i * kY;
-    const ST gZ_r         = dZ_r + val_i * kZ;
-    const ST gX_i         = dX_i - val_r * kX;
-    const ST gY_i         = dY_i - val_r * kY;
-    const ST gZ_i         = dZ_i - val_r * kZ;
-    const size_t psiIndex = this->first_spo + nComplexBands + j;
-    psi[psiIndex]         = c * val_r - s * val_i;
-    dpsi[psiIndex][0]     = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1]     = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2]     = c * gZ_r - s * gZ_i;
-
-    const ST lap_r  = myL[jr] + mKK[j] * val_r + two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
-    const ST lap_i  = myL[ji] + mKK[j] * val_i - two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
-    d2psi[psiIndex] = c * lap_r - s * lap_i;
-  }
+    for (size_t j = nComplexBands; j < N; j++) {
+        const size_t jr = j << 1;
+        const size_t ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // myG is used as stored here; no dot(PrimLattice.G,myG[j]) is applied
+        const ST dX_r = g0[jr];
+        const ST dY_r = g1[jr];
+        const ST dZ_r = g2[jr];
+
+        const ST dX_i = g0[ji];
+        const ST dY_i = g1[ji];
+        const ST dZ_i = g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+        const size_t psiIndex = this->first_spo + nComplexBands + j;
+        psi[psiIndex] = c * val_r - s * val_i;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+        const ST lap_r = myL[jr] + mKK[j] * val_r +
+            two * (kX * dX_i + kY * dY_i + kZ * dZ_i);
+        const ST lap_i = myL[ji] + mKK[j] * val_i -
+            two * (kX * dX_r + kY * dY_r + kZ * dZ_r);
+        d2psi[psiIndex] = c * lap_r - s * lap_i;
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::evaluateVGL(const ParticleSetT<ST>& P,
-                                 const int iat,
-                                 ValueVector& psi,
-                                 GradVector& dpsi,
-                                 ValueVector& d2psi)
+template <typename ST, typename VT> // spline evaluation at particle iat; assign_vgl receives psi, dpsi, d2psi to fill
+void
+SplineC2RT<ST, VT>::evaluateVGL(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r)); // presumably reduced (unit-cell) coordinates of r - confirm
 
#pragma omp parallel
-  {
-    int first, last;
-    FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
+    {
+        int first, last; // outputs of FairDivideAligned for this thread
+        FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(),
+            omp_get_thread_num(), first, last);
 
-    spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
-    assign_vgl(r, psi, dpsi, d2psi, first / 2, last / 2);
-  }
+        spline2::evaluate3d_vgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
+        assign_vgl(r, psi, dpsi, d2psi, first / 2, last / 2); // halved: presumably two myV entries (re, im) per band - confirm
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::assign_vgh(const PointType& r,
-                                ValueVector& psi,
-                                GradVector& dpsi,
-                                HessVector& grad_grad_psi,
-                                int first,
-                                int last) const
+template <typename ST, typename VT> // writes psi, dpsi, grad_grad_psi for band indices [first, last) from myV, myG, myH
+void
+SplineC2RT<ST, VT>::assign_vgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, int first, int last) const
 {
-  // protect last
-  last = last > this->kPoints.size() ? this->kPoints.size() : last;
-
-  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-           g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-           g22 = PrimLattice.G(8);
-  const ST x = r[0], y = r[1], z = r[2];
-
-  const ST* restrict k0 = myKcart.data(0);
-  const ST* restrict k1 = myKcart.data(1);
-  const ST* restrict k2 = myKcart.data(2);
-
-  const ST* restrict g0  = myG.data(0);
-  const ST* restrict g1  = myG.data(1);
-  const ST* restrict g2  = myG.data(2);
-  const ST* restrict h00 = myH.data(0);
-  const ST* restrict h01 = myH.data(1);
-  const ST* restrict h02 = myH.data(2);
-  const ST* restrict h11 = myH.data(3);
-  const ST* restrict h12 = myH.data(4);
-  const ST* restrict h22 = myH.data(5);
+    // protect last: clamp it to the number of k-points
+    last = last > this->kPoints.size() ? this->kPoints.size() : last;
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
 
#pragma omp simd
-  for (size_t j = first; j < std::min(nComplexBands, last); j++)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = this->first_spo + jr;
-
-    psi[psiIndex]     = c * val_r - s * val_i;
-    dpsi[psiIndex][0] = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1] = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
-
-    psi[psiIndex + 1]     = c * val_i + s * val_r;
-    dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
-    dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
-    dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
-
-    const ST h_xx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
-    const ST h_xy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
-    const ST h_xz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
-    const ST h_yx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
-    const ST h_yy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
-    const ST h_yz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
-    const ST h_zx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
-    const ST h_zy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
-    const ST h_zz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
-
-    const ST h_xx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
-    const ST h_xy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
-    const ST h_xz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
-    const ST h_yx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
-    const ST h_yy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
-    const ST h_yz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
-    const ST h_zx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
-    const ST h_zy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
-    const ST h_zz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
-
-    grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
-    grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
-    grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
-    grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
-    grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
-    grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
-
-    grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
-    grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
-    grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
-    grad_grad_psi[psiIndex + 1][3] = c * h_yx_i + s * h_yx_r;
-    grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
-    grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
-    grad_grad_psi[psiIndex + 1][6] = c * h_zx_i + s * h_zx_r;
-    grad_grad_psi[psiIndex + 1][7] = c * h_zy_i + s * h_zy_r;
-    grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
-  }
+    for (size_t j = first; j < std::min(nComplexBands, last); j++) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase: sincos of -(k . r), results written to s and c
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = this->first_spo + jr; // bands below nComplexBands fill two slots: psiIndex and psiIndex + 1
+
+        psi[psiIndex] = c * val_r - s * val_i;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+        psi[psiIndex + 1] = c * val_i + s * val_r;
+        dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
+        dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
+        dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
+
+        const ST h_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g00, g01, g02) +
+            kX * (gX_i + dX_i);
+        const ST h_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g10, g11, g12) +
+            kX * (gY_i + dY_i);
+        const ST h_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g20, g21, g22) +
+            kX * (gZ_i + dZ_i);
+        const ST h_yx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g00, g01, g02) +
+            kY * (gX_i + dX_i);
+        const ST h_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g10, g11, g12) +
+            kY * (gY_i + dY_i);
+        const ST h_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g20, g21, g22) +
+            kY * (gZ_i + dZ_i);
+        const ST h_zx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g00, g01, g02) +
+            kZ * (gX_i + dX_i);
+        const ST h_zy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g10, g11, g12) +
+            kZ * (gY_i + dY_i);
+        const ST h_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g20, g21, g22) +
+            kZ * (gZ_i + dZ_i);
+
+        const ST h_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g00, g01, g02) -
+            kX * (gX_r + dX_r);
+        const ST h_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g10, g11, g12) -
+            kX * (gY_r + dY_r);
+        const ST h_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g20, g21, g22) -
+            kX * (gZ_r + dZ_r);
+        const ST h_yx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g00, g01, g02) -
+            kY * (gX_r + dX_r);
+        const ST h_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g10, g11, g12) -
+            kY * (gY_r + dY_r);
+        const ST h_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g20, g21, g22) -
+            kY * (gZ_r + dZ_r);
+        const ST h_zx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g00, g01, g02) -
+            kZ * (gX_r + dX_r);
+        const ST h_zy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g10, g11, g12) -
+            kZ * (gY_r + dY_r);
+        const ST h_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g20, g21, g22) -
+            kZ * (gZ_r + dZ_r);
+
+        grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
+        grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
+        grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
+        grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
+        grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
+        grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
+
+        grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
+        grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
+        grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
+        grad_grad_psi[psiIndex + 1][3] = c * h_yx_i + s * h_yx_r;
+        grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
+        grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
+        grad_grad_psi[psiIndex + 1][6] = c * h_zx_i + s * h_zx_r;
+        grad_grad_psi[psiIndex + 1][7] = c * h_zy_i + s * h_zy_r;
+        grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
+    }
 
#pragma omp simd
-  for (size_t j = std::max(nComplexBands, first); j < last; j++)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = this->first_spo + nComplexBands + j;
-
-    psi[psiIndex]     = c * val_r - s * val_i;
-    dpsi[psiIndex][0] = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1] = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
-
-    const ST h_xx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02) + kX * (gX_i + dX_i);
-    const ST h_xy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12) + kX * (gY_i + dY_i);
-    const ST h_xz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22) + kX * (gZ_i + dZ_i);
-    const ST h_yx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g00, g01, g02) + kY * (gX_i + dX_i);
-    const ST h_yy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12) + kY * (gY_i + dY_i);
-    const ST h_yz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22) + kY * (gZ_i + dZ_i);
-    const ST h_zx_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g00, g01, g02) + kZ * (gX_i + dX_i);
-    const ST h_zy_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g10, g11, g12) + kZ * (gY_i + dY_i);
-    const ST h_zz_r =
-        v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22) + kZ * (gZ_i + dZ_i);
-
-    const ST h_xx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02) - kX * (gX_r + dX_r);
-    const ST h_xy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12) - kX * (gY_r + dY_r);
-    const ST h_xz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22) - kX * (gZ_r + dZ_r);
-    const ST h_yx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g00, g01, g02) - kY * (gX_r + dX_r);
-    const ST h_yy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12) - kY * (gY_r + dY_r);
-    const ST h_yz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22) - kY * (gZ_r + dZ_r);
-    const ST h_zx_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g00, g01, g02) - kZ * (gX_r + dX_r);
-    const ST h_zy_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g10, g11, g12) - kZ * (gY_r + dY_r);
-    const ST h_zz_i =
-        v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22) - kZ * (gZ_r + dZ_r);
-
-    grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
-    grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
-    grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
-    grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
-    grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
-    grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
-  }
+    for (size_t j = std::max(nComplexBands, first); j < last; j++) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase: sincos of -(k . r), results written to s and c
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = this->first_spo + nComplexBands + j; // one slot per band here, after the 2 * nComplexBands slots above
+
+        psi[psiIndex] = c * val_r - s * val_i;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+        const ST h_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g00, g01, g02) +
+            kX * (gX_i + dX_i);
+        const ST h_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g10, g11, g12) +
+            kX * (gY_i + dY_i);
+        const ST h_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g00, g01, g02, g20, g21, g22) +
+            kX * (gZ_i + dZ_i);
+        const ST h_yx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g00, g01, g02) +
+            kY * (gX_i + dX_i);
+        const ST h_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g10, g11, g12) +
+            kY * (gY_i + dY_i);
+        const ST h_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g10, g11, g12, g20, g21, g22) +
+            kY * (gZ_i + dZ_i);
+        const ST h_zx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g00, g01, g02) +
+            kZ * (gX_i + dX_i);
+        const ST h_zy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g10, g11, g12) +
+            kZ * (gY_i + dY_i);
+        const ST h_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+                              h22[jr], g20, g21, g22, g20, g21, g22) +
+            kZ * (gZ_i + dZ_i);
+
+        const ST h_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g00, g01, g02) -
+            kX * (gX_r + dX_r);
+        const ST h_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g10, g11, g12) -
+            kX * (gY_r + dY_r);
+        const ST h_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g00, g01, g02, g20, g21, g22) -
+            kX * (gZ_r + dZ_r);
+        const ST h_yx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g00, g01, g02) -
+            kY * (gX_r + dX_r);
+        const ST h_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g10, g11, g12) -
+            kY * (gY_r + dY_r);
+        const ST h_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g10, g11, g12, g20, g21, g22) -
+            kY * (gZ_r + dZ_r);
+        const ST h_zx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g00, g01, g02) -
+            kZ * (gX_r + dX_r);
+        const ST h_zy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g10, g11, g12) -
+            kZ * (gY_r + dY_r);
+        const ST h_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+                              h22[ji], g20, g21, g22, g20, g21, g22) -
+            kZ * (gZ_r + dZ_r);
+
+        grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
+        grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][3] = c * h_yx_r - s * h_yx_i;
+        grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
+        grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][6] = c * h_zx_r - s * h_zx_i;
+        grad_grad_psi[psiIndex][7] = c * h_zy_r - s * h_zy_i;
+        grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::evaluateVGH(const ParticleSetT<ST>& P,
-                                 const int iat,
-                                 ValueVector& psi,
-                                 GradVector& dpsi,
-                                 HessVector& grad_grad_psi)
+template <typename ST, typename VT> // spline evaluation at particle iat; assign_vgh then fills psi, dpsi, grad_grad_psi
+void
+SplineC2RT<ST, VT>::evaluateVGH(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat);
+    PointType ru(PrimLattice.toUnit_floor(r)); // presumably reduced (unit-cell) coordinates of r - confirm
#pragma omp parallel
-  {
-    int first, last;
-    FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
+    {
+        int first, last; // outputs of FairDivideAligned for this thread
+        FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(),
+            omp_get_thread_num(), first, last);
 
-    spline2::evaluate3d_vgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
-    assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2);
-  }
+        spline2::evaluate3d_vgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, first, last);
+        assign_vgh(r, psi, dpsi, grad_grad_psi, first / 2, last / 2); // halved: assign_vgh reads two myV entries (jr, ji) per band index
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::assign_vghgh(const PointType& r,
-                                  ValueVector& psi,
-                                  GradVector& dpsi,
-                                  HessVector& grad_grad_psi,
-                                  GGGVector& grad_grad_grad_psi,
-                                  int first,
-                                  int last) const
+template <typename ST, typename VT>
+void
+SplineC2RT<ST, VT>::assign_vghgh(const PointType& r, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi,
+    int first, int last) const
 {
-  // protect last
-  last = last < 0 ? this->kPoints.size() : (last > this->kPoints.size() ? this->kPoints.size() : last);
-
-  const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1), g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
-           g11 = PrimLattice.G(4), g12 = PrimLattice.G(5), g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
-           g22 = PrimLattice.G(8);
-  const ST x = r[0], y = r[1], z = r[2];
-
-  const ST* restrict k0 = myKcart.data(0);
-  const ST* restrict k1 = myKcart.data(1);
-  const ST* restrict k2 = myKcart.data(2);
-
-  const ST* restrict g0  = myG.data(0);
-  const ST* restrict g1  = myG.data(1);
-  const ST* restrict g2  = myG.data(2);
-  const ST* restrict h00 = myH.data(0);
-  const ST* restrict h01 = myH.data(1);
-  const ST* restrict h02 = myH.data(2);
-  const ST* restrict h11 = myH.data(3);
-  const ST* restrict h12 = myH.data(4);
-  const ST* restrict h22 = myH.data(5);
-
-  const ST* restrict gh000 = mygH.data(0);
-  const ST* restrict gh001 = mygH.data(1);
-  const ST* restrict gh002 = mygH.data(2);
-  const ST* restrict gh011 = mygH.data(3);
-  const ST* restrict gh012 = mygH.data(4);
-  const ST* restrict gh022 = mygH.data(5);
-  const ST* restrict gh111 = mygH.data(6);
-  const ST* restrict gh112 = mygH.data(7);
-  const ST* restrict gh122 = mygH.data(8);
-  const ST* restrict gh222 = mygH.data(9);
-
-//SIMD doesn't work quite right yet.  Comment out until further debugging.
+    // protect last
+    last = last < 0 ? this->kPoints.size() :
+                      (last > this->kPoints.size() ? this->kPoints.size() : last);
+
+    const ST g00 = PrimLattice.G(0), g01 = PrimLattice.G(1),
+             g02 = PrimLattice.G(2), g10 = PrimLattice.G(3),
+             g11 = PrimLattice.G(4), g12 = PrimLattice.G(5),
+             g20 = PrimLattice.G(6), g21 = PrimLattice.G(7),
+             g22 = PrimLattice.G(8);
+    const ST x = r[0], y = r[1], z = r[2];
+
+    const ST* restrict k0 = myKcart.data(0);
+    const ST* restrict k1 = myKcart.data(1);
+    const ST* restrict k2 = myKcart.data(2);
+
+    const ST* restrict g0 = myG.data(0);
+    const ST* restrict g1 = myG.data(1);
+    const ST* restrict g2 = myG.data(2);
+    const ST* restrict h00 = myH.data(0);
+    const ST* restrict h01 = myH.data(1);
+    const ST* restrict h02 = myH.data(2);
+    const ST* restrict h11 = myH.data(3);
+    const ST* restrict h12 = myH.data(4);
+    const ST* restrict h22 = myH.data(5);
+
+    const ST* restrict gh000 = mygH.data(0);
+    const ST* restrict gh001 = mygH.data(1);
+    const ST* restrict gh002 = mygH.data(2);
+    const ST* restrict gh011 = mygH.data(3);
+    const ST* restrict gh012 = mygH.data(4);
+    const ST* restrict gh022 = mygH.data(5);
+    const ST* restrict gh111 = mygH.data(6);
+    const ST* restrict gh112 = mygH.data(7);
+    const ST* restrict gh122 = mygH.data(8);
+    const ST* restrict gh222 = mygH.data(9);
+
+// SIMD doesn't work quite right yet.  Comment out until further debugging. NOTE(review): the pragma below is still active -- confirm intent.
 #pragma omp simd
-  for (size_t j = first; j < std::min(nComplexBands, last); j++)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = this->first_spo + jr;
-    psi[psiIndex]         = c * val_r - s * val_i;
-    dpsi[psiIndex][0]     = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1]     = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2]     = c * gZ_r - s * gZ_i;
-
-    psi[psiIndex + 1]     = c * val_i + s * val_r;
-    dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
-    dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
-    dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
-
-    //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
-    const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
-
-    const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
-
-    const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
-    const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
-    const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
-    const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
-    const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
-    const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
-
-    const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
-    const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
-    const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
-    const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
-    const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
-    const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
-
-    grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
-    grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
-    grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
-
-    grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
-    grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
-    grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
-    grad_grad_psi[psiIndex + 1][3] = c * h_xy_i + s * h_xy_r;
-    grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
-    grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
-    grad_grad_psi[psiIndex + 1][6] = c * h_xz_i + s * h_xz_r;
-    grad_grad_psi[psiIndex + 1][7] = c * h_yz_i + s * h_yz_r;
-    grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
-
-    //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
-    // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
-
-    const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
-    const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
-    const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
-    const ST gh_xxy_r =
-        f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
-    const ST gh_xxy_i =
-        f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
-    const ST gh_xxz_r =
-        f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
-    const ST gh_xxz_i =
-        f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
-    const ST gh_xyy_r =
-        f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
-    const ST gh_xyy_i =
-        f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
-    const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
-        (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
-    const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
-        (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
-    const ST gh_xzz_r =
-        f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
-    const ST gh_xzz_i =
-        f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
-    const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
-    const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
-    const ST gh_yyz_r =
-        f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
-    const ST gh_yyz_i =
-        f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
-    const ST gh_yzz_r =
-        f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
-    const ST gh_yzz_i =
-        f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
-    const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
-    const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
-
-    grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
-    grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
-
-    grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
-    grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
-
-    grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
-    grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
-    grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
-    grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
-    grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
-
-    grad_grad_grad_psi[psiIndex + 1][0][0] = c * gh_xxx_i + s * gh_xxx_r;
-    grad_grad_grad_psi[psiIndex + 1][0][1] = c * gh_xxy_i + s * gh_xxy_r;
-    grad_grad_grad_psi[psiIndex + 1][0][2] = c * gh_xxz_i + s * gh_xxz_r;
-    grad_grad_grad_psi[psiIndex + 1][0][3] = c * gh_xxy_i + s * gh_xxy_r;
-    grad_grad_grad_psi[psiIndex + 1][0][4] = c * gh_xyy_i + s * gh_xyy_r;
-    grad_grad_grad_psi[psiIndex + 1][0][5] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][0][6] = c * gh_xxz_i + s * gh_xxz_r;
-    grad_grad_grad_psi[psiIndex + 1][0][7] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][0][8] = c * gh_xzz_i + s * gh_xzz_r;
-
-    grad_grad_grad_psi[psiIndex + 1][1][0] = c * gh_xxy_i + s * gh_xxy_r;
-    grad_grad_grad_psi[psiIndex + 1][1][1] = c * gh_xyy_i + s * gh_xyy_r;
-    grad_grad_grad_psi[psiIndex + 1][1][2] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][1][3] = c * gh_xyy_i + s * gh_xyy_r;
-    grad_grad_grad_psi[psiIndex + 1][1][4] = c * gh_yyy_i + s * gh_yyy_r;
-    grad_grad_grad_psi[psiIndex + 1][1][5] = c * gh_yyz_i + s * gh_yyz_r;
-    grad_grad_grad_psi[psiIndex + 1][1][6] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][1][7] = c * gh_yyz_i + s * gh_yyz_r;
-    grad_grad_grad_psi[psiIndex + 1][1][8] = c * gh_yzz_i + s * gh_yzz_r;
-
-    grad_grad_grad_psi[psiIndex + 1][2][0] = c * gh_xxz_i + s * gh_xxz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][1] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][2] = c * gh_xzz_i + s * gh_xzz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][3] = c * gh_xyz_i + s * gh_xyz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][4] = c * gh_yyz_i + s * gh_yyz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][5] = c * gh_yzz_i + s * gh_yzz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][6] = c * gh_xzz_i + s * gh_xzz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][7] = c * gh_yzz_i + s * gh_yzz_r;
-    grad_grad_grad_psi[psiIndex + 1][2][8] = c * gh_zzz_i + s * gh_zzz_r;
-  }
+    for (size_t j = first; j < std::min(nComplexBands, last); j++) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = this->first_spo + jr;
+        psi[psiIndex] = c * val_r - s * val_i;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+        psi[psiIndex + 1] = c * val_i + s * val_r;
+        dpsi[psiIndex + 1][0] = c * gX_i + s * gX_r;
+        dpsi[psiIndex + 1][1] = c * gY_i + s * gY_r;
+        dpsi[psiIndex + 1][2] = c * gZ_i + s * gZ_r;
+
+        // intermediates for computation of hessian. \partial_i \partial_j phi
+        // in cartesian coordinates.
+        const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g20, g21, g22, g20, g21, g22);
+
+        const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g20, g21, g22, g20, g21, g22);
+
+        const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
+        const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
+        const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
+        const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
+        const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
+        const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
+
+        const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
+        const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
+        const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
+        const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
+        const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
+        const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
+
+        grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
+        grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
+        grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
+
+        grad_grad_psi[psiIndex + 1][0] = c * h_xx_i + s * h_xx_r;
+        grad_grad_psi[psiIndex + 1][1] = c * h_xy_i + s * h_xy_r;
+        grad_grad_psi[psiIndex + 1][2] = c * h_xz_i + s * h_xz_r;
+        grad_grad_psi[psiIndex + 1][3] = c * h_xy_i + s * h_xy_r;
+        grad_grad_psi[psiIndex + 1][4] = c * h_yy_i + s * h_yy_r;
+        grad_grad_psi[psiIndex + 1][5] = c * h_yz_i + s * h_yz_r;
+        grad_grad_psi[psiIndex + 1][6] = c * h_xz_i + s * h_xz_r;
+        grad_grad_psi[psiIndex + 1][7] = c * h_yz_i + s * h_yz_r;
+        grad_grad_psi[psiIndex + 1][8] = c * h_zz_i + s * h_zz_r;
+
+        // These are the real and imaginary components of the third SPO
+        // derivative.  _xxx denotes
+        //  third derivative w.r.t. x, _xyz, a derivative with respect to x,y,
+        //  and z, and so on.
+
+        const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        // Here is where we build up the components of the physical hessian
+        // gradient, namely, d^3/dx^3 (e^{-ik*r}\phi(r))
+        const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r -
+            kX * kX * kX * val_i;
+        const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i +
+            kX * kX * kX * val_r;
+        const ST gh_xxy_r = f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) -
+            (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
+        const ST gh_xxy_i = f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) -
+            (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
+        const ST gh_xxz_r = f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) -
+            (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
+        const ST gh_xxz_i = f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) -
+            (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
+        const ST gh_xyy_r = f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) -
+            (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
+        const ST gh_xyy_i = f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) -
+            (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
+        const ST gh_xyz_r = f3_xyz_r +
+            (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
+            (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) -
+            kX * kY * kZ * val_i;
+        const ST gh_xyz_i = f3_xyz_i -
+            (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
+            (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) +
+            kX * kY * kZ * val_r;
+        const ST gh_xzz_r = f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) -
+            (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
+        const ST gh_xzz_i = f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) -
+            (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
+        const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r -
+            kY * kY * kY * val_i;
+        const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i +
+            kY * kY * kY * val_r;
+        const ST gh_yyz_r = f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) -
+            (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
+        const ST gh_yyz_i = f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) -
+            (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
+        const ST gh_yzz_r = f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) -
+            (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
+        const ST gh_yzz_i = f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) -
+            (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
+        const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r -
+            kZ * kZ * kZ * val_i;
+        const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i +
+            kZ * kZ * kZ * val_r;
+
+        grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
+        grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
+
+        grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
+        grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
+
+        grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
+        grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
+        grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
+        grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
+        grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
+
+        grad_grad_grad_psi[psiIndex + 1][0][0] = c * gh_xxx_i + s * gh_xxx_r;
+        grad_grad_grad_psi[psiIndex + 1][0][1] = c * gh_xxy_i + s * gh_xxy_r;
+        grad_grad_grad_psi[psiIndex + 1][0][2] = c * gh_xxz_i + s * gh_xxz_r;
+        grad_grad_grad_psi[psiIndex + 1][0][3] = c * gh_xxy_i + s * gh_xxy_r;
+        grad_grad_grad_psi[psiIndex + 1][0][4] = c * gh_xyy_i + s * gh_xyy_r;
+        grad_grad_grad_psi[psiIndex + 1][0][5] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][0][6] = c * gh_xxz_i + s * gh_xxz_r;
+        grad_grad_grad_psi[psiIndex + 1][0][7] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][0][8] = c * gh_xzz_i + s * gh_xzz_r;
+
+        grad_grad_grad_psi[psiIndex + 1][1][0] = c * gh_xxy_i + s * gh_xxy_r;
+        grad_grad_grad_psi[psiIndex + 1][1][1] = c * gh_xyy_i + s * gh_xyy_r;
+        grad_grad_grad_psi[psiIndex + 1][1][2] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][1][3] = c * gh_xyy_i + s * gh_xyy_r;
+        grad_grad_grad_psi[psiIndex + 1][1][4] = c * gh_yyy_i + s * gh_yyy_r;
+        grad_grad_grad_psi[psiIndex + 1][1][5] = c * gh_yyz_i + s * gh_yyz_r;
+        grad_grad_grad_psi[psiIndex + 1][1][6] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][1][7] = c * gh_yyz_i + s * gh_yyz_r;
+        grad_grad_grad_psi[psiIndex + 1][1][8] = c * gh_yzz_i + s * gh_yzz_r;
+
+        grad_grad_grad_psi[psiIndex + 1][2][0] = c * gh_xxz_i + s * gh_xxz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][1] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][2] = c * gh_xzz_i + s * gh_xzz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][3] = c * gh_xyz_i + s * gh_xyz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][4] = c * gh_yyz_i + s * gh_yyz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][5] = c * gh_yzz_i + s * gh_yzz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][6] = c * gh_xzz_i + s * gh_xzz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][7] = c * gh_yzz_i + s * gh_yzz_r;
+        grad_grad_grad_psi[psiIndex + 1][2][8] = c * gh_zzz_i + s * gh_zzz_r;
+    }
 #pragma omp simd
-  for (size_t j = std::max(nComplexBands, first); j < last; j++)
-  {
-    int jr = j << 1;
-    int ji = jr + 1;
-
-    const ST kX    = k0[j];
-    const ST kY    = k1[j];
-    const ST kZ    = k2[j];
-    const ST val_r = myV[jr];
-    const ST val_i = myV[ji];
-
-    //phase
-    ST s, c;
-    qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
-
-    //dot(PrimLattice.G,myG[j])
-    const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
-    const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
-    const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
-
-    const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
-    const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
-    const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
-
-    // \f$\nabla \psi_r + {\bf k}\psi_i\f$
-    const ST gX_r = dX_r + val_i * kX;
-    const ST gY_r = dY_r + val_i * kY;
-    const ST gZ_r = dZ_r + val_i * kZ;
-    const ST gX_i = dX_i - val_r * kX;
-    const ST gY_i = dY_i - val_r * kY;
-    const ST gZ_i = dZ_i - val_r * kZ;
-
-    const size_t psiIndex = this->first_spo + nComplexBands + j;
-    psi[psiIndex]         = c * val_r - s * val_i;
-    dpsi[psiIndex][0]     = c * gX_r - s * gX_i;
-    dpsi[psiIndex][1]     = c * gY_r - s * gY_i;
-    dpsi[psiIndex][2]     = c * gZ_r - s * gZ_i;
-
-    //intermediates for computation of hessian. \partial_i \partial_j phi in cartesian coordinates.
-    const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr], h22[jr], g20, g21, g22, g20, g21, g22);
-
-    const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g00, g01, g02);
-    const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g10, g11, g12);
-    const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g00, g01, g02, g20, g21, g22);
-    const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g10, g11, g12);
-    const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g10, g11, g12, g20, g21, g22);
-    const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji], h22[ji], g20, g21, g22, g20, g21, g22);
-
-    const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
-    const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
-    const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
-    const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
-    const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
-    const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
-
-    const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
-    const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
-    const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
-    const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
-    const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
-    const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
-
-    grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
-    grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
-    grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
-    grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
-    grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
-    grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
-
-    //These are the real and imaginary components of the third SPO derivative.  _xxx denotes
-    // third derivative w.r.t. x, _xyz, a derivative with resepect to x,y, and z, and so on.
-
-    const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr], gh011[jr], gh012[jr], gh022[jr], gh111[jr],
-                                    gh112[jr], gh122[jr], gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
-    const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
-    const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
-    const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
-    const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
-    const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
-    const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
-    const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
-    const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
-    const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji], gh011[ji], gh012[ji], gh022[ji], gh111[ji],
-                                    gh112[ji], gh122[ji], gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
-
-    //Here is where we build up the components of the physical hessian gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r)
-    const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r - kX * kX * kX * val_i;
-    const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i + kX * kX * kX * val_r;
-    const ST gh_xxy_r =
-        f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) - (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
-    const ST gh_xxy_i =
-        f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) - (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
-    const ST gh_xxz_r =
-        f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) - (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
-    const ST gh_xxz_i =
-        f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) - (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
-    const ST gh_xyy_r =
-        f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) - (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
-    const ST gh_xyy_i =
-        f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) - (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
-    const ST gh_xyz_r = f3_xyz_r + (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
-        (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) - kX * kY * kZ * val_i;
-    const ST gh_xyz_i = f3_xyz_i - (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
-        (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) + kX * kY * kZ * val_r;
-    const ST gh_xzz_r =
-        f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) - (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
-    const ST gh_xzz_i =
-        f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) - (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
-    const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r - kY * kY * kY * val_i;
-    const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i + kY * kY * kY * val_r;
-    const ST gh_yyz_r =
-        f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) - (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
-    const ST gh_yyz_i =
-        f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) - (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
-    const ST gh_yzz_r =
-        f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) - (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
-    const ST gh_yzz_i =
-        f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) - (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
-    const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r - kZ * kZ * kZ * val_i;
-    const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i + kZ * kZ * kZ * val_r;
-    //[x][xx] //These are the unique entries
-    grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
-    grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
-
-    grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
-    grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
-    grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
-    grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
-
-    grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
-    grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
-    grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
-    grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
-    grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
-    grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
-    grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
-    grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
-  }
+    for (size_t j = std::max(nComplexBands, first); j < last; j++) {
+        int jr = j << 1;
+        int ji = jr + 1;
+
+        const ST kX = k0[j];
+        const ST kY = k1[j];
+        const ST kZ = k2[j];
+        const ST val_r = myV[jr];
+        const ST val_i = myV[ji];
+
+        // phase
+        ST s, c;
+        qmcplusplus::sincos(-(x * kX + y * kY + z * kZ), &s, &c);
+
+        // dot(PrimLattice.G,myG[j])
+        const ST dX_r = g00 * g0[jr] + g01 * g1[jr] + g02 * g2[jr];
+        const ST dY_r = g10 * g0[jr] + g11 * g1[jr] + g12 * g2[jr];
+        const ST dZ_r = g20 * g0[jr] + g21 * g1[jr] + g22 * g2[jr];
+
+        const ST dX_i = g00 * g0[ji] + g01 * g1[ji] + g02 * g2[ji];
+        const ST dY_i = g10 * g0[ji] + g11 * g1[ji] + g12 * g2[ji];
+        const ST dZ_i = g20 * g0[ji] + g21 * g1[ji] + g22 * g2[ji];
+
+        // \f$\nabla \psi_r + {\bf k}\psi_i\f$
+        const ST gX_r = dX_r + val_i * kX;
+        const ST gY_r = dY_r + val_i * kY;
+        const ST gZ_r = dZ_r + val_i * kZ;
+        const ST gX_i = dX_i - val_r * kX;
+        const ST gY_i = dY_i - val_r * kY;
+        const ST gZ_i = dZ_i - val_r * kZ;
+
+        const size_t psiIndex = this->first_spo + nComplexBands + j;
+        psi[psiIndex] = c * val_r - s * val_i;
+        dpsi[psiIndex][0] = c * gX_r - s * gX_i;
+        dpsi[psiIndex][1] = c * gY_r - s * gY_i;
+        dpsi[psiIndex][2] = c * gZ_r - s * gZ_i;
+
+        // intermediates for computation of hessian. \partial_i \partial_j phi
+        // in cartesian coordinates.
+        const ST f_xx_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_r = v_m_v(h00[jr], h01[jr], h02[jr], h11[jr], h12[jr],
+            h22[jr], g20, g21, g22, g20, g21, g22);
+
+        const ST f_xx_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g00, g01, g02);
+        const ST f_xy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g10, g11, g12);
+        const ST f_xz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g00, g01, g02, g20, g21, g22);
+        const ST f_yy_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g10, g11, g12);
+        const ST f_yz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g10, g11, g12, g20, g21, g22);
+        const ST f_zz_i = v_m_v(h00[ji], h01[ji], h02[ji], h11[ji], h12[ji],
+            h22[ji], g20, g21, g22, g20, g21, g22);
+
+        const ST h_xx_r = f_xx_r + 2 * kX * dX_i - kX * kX * val_r;
+        const ST h_xy_r = f_xy_r + (kX * dY_i + kY * dX_i) - kX * kY * val_r;
+        const ST h_xz_r = f_xz_r + (kX * dZ_i + kZ * dX_i) - kX * kZ * val_r;
+        const ST h_yy_r = f_yy_r + 2 * kY * dY_i - kY * kY * val_r;
+        const ST h_yz_r = f_yz_r + (kY * dZ_i + kZ * dY_i) - kY * kZ * val_r;
+        const ST h_zz_r = f_zz_r + 2 * kZ * dZ_i - kZ * kZ * val_r;
+
+        const ST h_xx_i = f_xx_i - 2 * kX * dX_r - kX * kX * val_i;
+        const ST h_xy_i = f_xy_i - (kX * dY_r + kY * dX_r) - kX * kY * val_i;
+        const ST h_xz_i = f_xz_i - (kX * dZ_r + kZ * dX_r) - kX * kZ * val_i;
+        const ST h_yy_i = f_yy_i - 2 * kY * dY_r - kY * kY * val_i;
+        const ST h_yz_i = f_yz_i - (kZ * dY_r + kY * dZ_r) - kZ * kY * val_i;
+        const ST h_zz_i = f_zz_i - 2 * kZ * dZ_r - kZ * kZ * val_i;
+
+        grad_grad_psi[psiIndex][0] = c * h_xx_r - s * h_xx_i;
+        grad_grad_psi[psiIndex][1] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][2] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][3] = c * h_xy_r - s * h_xy_i;
+        grad_grad_psi[psiIndex][4] = c * h_yy_r - s * h_yy_i;
+        grad_grad_psi[psiIndex][5] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][6] = c * h_xz_r - s * h_xz_i;
+        grad_grad_psi[psiIndex][7] = c * h_yz_r - s * h_yz_i;
+        grad_grad_psi[psiIndex][8] = c * h_zz_r - s * h_zz_i;
+
+        // These are the real and imaginary components of the third SPO
+        // derivative.  _xxx denotes
+        //  third derivative w.r.t. x, _xyz, a derivative with respect to x,y,
+        //  and z, and so on.
+
+        const ST f3_xxx_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_r = t3_contract(gh000[jr], gh001[jr], gh002[jr],
+            gh011[jr], gh012[jr], gh022[jr], gh111[jr], gh112[jr], gh122[jr],
+            gh222[jr], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        const ST f3_xxx_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g00, g01, g02);
+        const ST f3_xxy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g10, g11, g12);
+        const ST f3_xxz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g00, g01, g02, g20, g21, g22);
+        const ST f3_xyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g10, g11, g12);
+        const ST f3_xyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g10, g11, g12, g20, g21, g22);
+        const ST f3_xzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g00, g01, g02, g20, g21, g22, g20, g21, g22);
+        const ST f3_yyy_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g10, g11, g12);
+        const ST f3_yyz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g10, g11, g12, g20, g21, g22);
+        const ST f3_yzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g10, g11, g12, g20, g21, g22, g20, g21, g22);
+        const ST f3_zzz_i = t3_contract(gh000[ji], gh001[ji], gh002[ji],
+            gh011[ji], gh012[ji], gh022[ji], gh111[ji], gh112[ji], gh122[ji],
+            gh222[ji], g20, g21, g22, g20, g21, g22, g20, g21, g22);
+
+        // Here is where we build up the components of the physical hessian
+        // gradient, namely, d^3/dx^3(e^{-ik*r}\phi(r))
+        const ST gh_xxx_r = f3_xxx_r + 3 * kX * f_xx_i - 3 * kX * kX * dX_r -
+            kX * kX * kX * val_i;
+        const ST gh_xxx_i = f3_xxx_i - 3 * kX * f_xx_r - 3 * kX * kX * dX_i +
+            kX * kX * kX * val_r;
+        const ST gh_xxy_r = f3_xxy_r + (kY * f_xx_i + 2 * kX * f_xy_i) -
+            (kX * kX * dY_r + 2 * kX * kY * dX_r) - kX * kX * kY * val_i;
+        const ST gh_xxy_i = f3_xxy_i - (kY * f_xx_r + 2 * kX * f_xy_r) -
+            (kX * kX * dY_i + 2 * kX * kY * dX_i) + kX * kX * kY * val_r;
+        const ST gh_xxz_r = f3_xxz_r + (kZ * f_xx_i + 2 * kX * f_xz_i) -
+            (kX * kX * dZ_r + 2 * kX * kZ * dX_r) - kX * kX * kZ * val_i;
+        const ST gh_xxz_i = f3_xxz_i - (kZ * f_xx_r + 2 * kX * f_xz_r) -
+            (kX * kX * dZ_i + 2 * kX * kZ * dX_i) + kX * kX * kZ * val_r;
+        const ST gh_xyy_r = f3_xyy_r + (2 * kY * f_xy_i + kX * f_yy_i) -
+            (2 * kX * kY * dY_r + kY * kY * dX_r) - kX * kY * kY * val_i;
+        const ST gh_xyy_i = f3_xyy_i - (2 * kY * f_xy_r + kX * f_yy_r) -
+            (2 * kX * kY * dY_i + kY * kY * dX_i) + kX * kY * kY * val_r;
+        const ST gh_xyz_r = f3_xyz_r +
+            (kX * f_yz_i + kY * f_xz_i + kZ * f_xy_i) -
+            (kX * kY * dZ_r + kY * kZ * dX_r + kZ * kX * dY_r) -
+            kX * kY * kZ * val_i;
+        const ST gh_xyz_i = f3_xyz_i -
+            (kX * f_yz_r + kY * f_xz_r + kZ * f_xy_r) -
+            (kX * kY * dZ_i + kY * kZ * dX_i + kZ * kX * dY_i) +
+            kX * kY * kZ * val_r;
+        const ST gh_xzz_r = f3_xzz_r + (2 * kZ * f_xz_i + kX * f_zz_i) -
+            (2 * kX * kZ * dZ_r + kZ * kZ * dX_r) - kX * kZ * kZ * val_i;
+        const ST gh_xzz_i = f3_xzz_i - (2 * kZ * f_xz_r + kX * f_zz_r) -
+            (2 * kX * kZ * dZ_i + kZ * kZ * dX_i) + kX * kZ * kZ * val_r;
+        const ST gh_yyy_r = f3_yyy_r + 3 * kY * f_yy_i - 3 * kY * kY * dY_r -
+            kY * kY * kY * val_i;
+        const ST gh_yyy_i = f3_yyy_i - 3 * kY * f_yy_r - 3 * kY * kY * dY_i +
+            kY * kY * kY * val_r;
+        const ST gh_yyz_r = f3_yyz_r + (kZ * f_yy_i + 2 * kY * f_yz_i) -
+            (kY * kY * dZ_r + 2 * kY * kZ * dY_r) - kY * kY * kZ * val_i;
+        const ST gh_yyz_i = f3_yyz_i - (kZ * f_yy_r + 2 * kY * f_yz_r) -
+            (kY * kY * dZ_i + 2 * kY * kZ * dY_i) + kY * kY * kZ * val_r;
+        const ST gh_yzz_r = f3_yzz_r + (2 * kZ * f_yz_i + kY * f_zz_i) -
+            (2 * kY * kZ * dZ_r + kZ * kZ * dY_r) - kY * kZ * kZ * val_i;
+        const ST gh_yzz_i = f3_yzz_i - (2 * kZ * f_yz_r + kY * f_zz_r) -
+            (2 * kY * kZ * dZ_i + kZ * kZ * dY_i) + kY * kZ * kZ * val_r;
+        const ST gh_zzz_r = f3_zzz_r + 3 * kZ * f_zz_i - 3 * kZ * kZ * dZ_r -
+            kZ * kZ * kZ * val_i;
+        const ST gh_zzz_i = f3_zzz_i - 3 * kZ * f_zz_r - 3 * kZ * kZ * dZ_i +
+            kZ * kZ * kZ * val_r;
+        //[x][xx] //These are the unique entries
+        grad_grad_grad_psi[psiIndex][0][0] = c * gh_xxx_r - s * gh_xxx_i;
+        grad_grad_grad_psi[psiIndex][0][1] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][0][2] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][0][3] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][0][4] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][0][5] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][0][6] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][0][7] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][0][8] = c * gh_xzz_r - s * gh_xzz_i;
+
+        grad_grad_grad_psi[psiIndex][1][0] = c * gh_xxy_r - s * gh_xxy_i;
+        grad_grad_grad_psi[psiIndex][1][1] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][1][2] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][1][3] = c * gh_xyy_r - s * gh_xyy_i;
+        grad_grad_grad_psi[psiIndex][1][4] = c * gh_yyy_r - s * gh_yyy_i;
+        grad_grad_grad_psi[psiIndex][1][5] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][1][6] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][1][7] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][1][8] = c * gh_yzz_r - s * gh_yzz_i;
+
+        grad_grad_grad_psi[psiIndex][2][0] = c * gh_xxz_r - s * gh_xxz_i;
+        grad_grad_grad_psi[psiIndex][2][1] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][2][2] = c * gh_xzz_r - s * gh_xzz_i;
+        grad_grad_grad_psi[psiIndex][2][3] = c * gh_xyz_r - s * gh_xyz_i;
+        grad_grad_grad_psi[psiIndex][2][4] = c * gh_yyz_r - s * gh_yyz_i;
+        grad_grad_grad_psi[psiIndex][2][5] = c * gh_yzz_r - s * gh_yzz_i;
+        grad_grad_grad_psi[psiIndex][2][6] = c * gh_xzz_r - s * gh_xzz_i;
+        grad_grad_grad_psi[psiIndex][2][7] = c * gh_yzz_r - s * gh_yzz_i;
+        grad_grad_grad_psi[psiIndex][2][8] = c * gh_zzz_r - s * gh_zzz_i;
+    }
 }
 
-template<typename ST>
-void SplineC2RT<ST>::evaluateVGHGH(const ParticleSetT<ST>& P,
-                                   const int iat,
-                                   ValueVector& psi,
-                                   GradVector& dpsi,
-                                   HessVector& grad_grad_psi,
-                                   GGGVector& grad_grad_grad_psi)
+template <typename ST, typename VT> // VGHGH: fills psi, dpsi, grad_grad_psi
+void // and grad_grad_grad_psi for particle `iat` of P (via assign_vghgh).
+SplineC2RT<ST, VT>::evaluateVGHGH(const ParticleSetT<VT>& P, const int iat,
+    ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
+    GGGVector& grad_grad_grad_psi)
 {
-  const PointType& r = P.activeR(iat);
-  PointType ru(PrimLattice.toUnit_floor(r));
+    const PointType& r = P.activeR(iat); // position passed on to assign_vghgh
+    PointType ru(PrimLattice.toUnit_floor(r)); // NOTE(review): presumably reduced (unit-cell) coordinates - confirm
 #pragma omp parallel
-  {
-    int first, last;
-    FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(), omp_get_thread_num(), first, last);
-
-    spline2::evaluate3d_vghgh(SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
-    assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2, last / 2);
-  }
+    { // every thread of the parallel region works on its own index range
+        int first, last; // range bounds, written by FairDivideAligned below
+        FairDivideAligned(myV.size(), getAlignment<ST>(), omp_get_num_threads(),
+            omp_get_thread_num(), first, last); // splits myV.size() entries among the threads
+        // Evaluate the spline at ru for entries first..last; presumably this fills myV/myG/myH/mygH - confirm.
+        spline2::evaluate3d_vghgh(
+            SplineInst->getSplinePtr(), ru, myV, myG, myH, mygH, first, last);
+        assign_vghgh(r, psi, dpsi, grad_grad_psi, grad_grad_grad_psi, first / 2,
+            last / 2); // halved: assign_vghgh reads myV at 2*j (real) and 2*j+1 (imag) per band j
+    }
 }
 
-template class SplineC2RT<float>;
-template class SplineC2RT<double>;
+template class SplineC2RT<float, float>; // explicit instantiations: every
+template class SplineC2RT<float, double>; // float/double pairing of the two
+template class SplineC2RT<double, float>; // template parameters <ST, VT>
+template class SplineC2RT<double, double>;
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.h b/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.h
index cd6b45c240..b7cf9e109d 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineC2RT.h
@@ -1,214 +1,255 @@
 //////////////////////////////////////////////////////////////////////////////////////
-// This file is distributed under the University of Illinois/NCSA Open Source License.
-// See LICENSE file in top directory for details.
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
 //
 // Copyright (c) 2019 QMCPACK developers.
 //
-// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
-//                    Jeongnim Kim, jeongnim.kim@intel.com, University of Illinois at Urbana-Champaign
-//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
-//                    Anouar Benali, benali@anl.gov, Argonne National Laboratory
-//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+//                    Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@intel.com, University of
+//                    Illinois at Urbana-Champaign
+//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//                    Anouar Benali, benali@anl.gov, Argonne National Laboratory
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
 //
-// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
 //////////////////////////////////////////////////////////////////////////////////////
 
-
 /** @file
  *
- * class to handle complex splines to real orbitals with splines of arbitrary precision
+ * class to handle complex splines to real orbitals with splines of arbitrary
+ * precision
  */
 #ifndef QMCPLUSPLUS_SPLINE_C2RT_H
 #define QMCPLUSPLUS_SPLINE_C2RT_H
 
-#include <memory>
-#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
 #include "OhmmsSoA/VectorSoaContainer.h"
-#include "spline2/MultiBspline.hpp"
+#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
 #include "Utilities/FairDivide.h"
+#include "spline2/MultiBspline.hpp"
+
+#include <memory>
 
 namespace qmcplusplus
 {
-/** class to match std::complex<ST> spline with BsplineSet::ValueType (real) SPOs
+/** class to match std::complex<ST> spline with BsplineSetT<VT>::ValueType
+ * (real) SPOs
  * @tparam ST precision of spline
  *
  * Requires temporage storage and multiplication of phase vectors
- * The internal storage of complex spline coefficients uses double sized real arrays of ST type, aligned and padded.
- * The first nComplexBands complex splines produce 2 real orbitals.
- * The rest complex splines produce 1 real orbital.
- * All the output orbitals are real (C2R). The maximal number of output orbitals is OrbitalSetSize.
+ * The complex spline coefficients are stored internally as real arrays of ST
+ * type of twice the size, aligned and padded. Each of the first nComplexBands
+ * complex splines produces 2 real orbitals; each remaining complex spline
+ * produces 1 real orbital. All output orbitals are real (C2R). The maximal
+ * number of output orbitals is OrbitalSetSize.
  */
-template<typename ST>
-class SplineC2RT : public BsplineSetT<ST>
+template <typename ST, typename VT>
+class SplineC2RT : public BsplineSetT<VT>
 {
 public:
-  using SplineType       = typename bspline_traits<ST, 3>::SplineType;
-  using BCType           = typename bspline_traits<ST, 3>::BCType;
-  using DataType         = ST;
-  using PointType        = TinyVector<ST, 3>;
-  using SingleSplineType = UBspline_3d_d;
-  // types for evaluation results
-  using TT          = typename BsplineSetT<ST>::ValueType;
-  using ValueVector = typename BsplineSetT<ST>::ValueVector;
-  using GGGVector   = typename BsplineSetT<ST>::GGGVector;
-  using GradVector  = typename BsplineSetT<ST>::GradVector;
-  using HessVector  = typename BsplineSetT<ST>::HessVector;
-
-  using vContainer_type = Vector<ST, aligned_allocator<ST>>;
-  using gContainer_type = VectorSoaContainer<ST, 3>;
-  using hContainer_type = VectorSoaContainer<ST, 6>;
-
-  using ghContainer_type = VectorSoaContainer<ST, 10>;
+    using SplineType = typename bspline_traits<ST, 3>::SplineType;
+    using BCType = typename bspline_traits<ST, 3>::BCType;
+    using DataType = ST;
+    using PointType = TinyVector<ST, 3>;
+    using SingleSplineType = UBspline_3d_d;
+    // types for evaluation results
+    using TT = typename BsplineSetT<VT>::ValueType;
+    using typename BsplineSetT<VT>::GGGVector;
+    using typename BsplineSetT<VT>::GradVector;
+    using typename BsplineSetT<VT>::HessVector;
+    using typename BsplineSetT<VT>::ValueVector;
+
+    using vContainer_type = Vector<ST, aligned_allocator<ST>>;
+    using gContainer_type = VectorSoaContainer<ST, 3>;
+    using hContainer_type = VectorSoaContainer<ST, 6>;
+    using ghContainer_type = VectorSoaContainer<ST, 10>;
 
 private:
-  ///primitive cell
-  CrystalLattice<ST, 3> PrimLattice;
-  ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to CartesianUnit, e.g. Hessian
-  Tensor<ST, 3> GGt;
-  ///number of complex bands
-  int nComplexBands;
-  ///multi bspline set
-  std::shared_ptr<MultiBspline<ST>> SplineInst;
-
-  vContainer_type mKK;
-  VectorSoaContainer<ST, 3> myKcart;
-
-  ///thread private ratios for reduction when using nested threading, numVP x numThread
-  Matrix<TT> ratios_private;
+    /// primitive cell
+    CrystalLattice<ST, 3> PrimLattice;
+    ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to
+    ///CartesianUnit, e.g. Hessian
+    Tensor<ST, 3> GGt;
+    /// number of complex bands
+    int nComplexBands;
+    /// multi bspline set
+    std::shared_ptr<MultiBspline<ST>> SplineInst;
+
+    vContainer_type mKK;
+    VectorSoaContainer<ST, 3> myKcart;
+
+    /// thread private ratios for reduction when using nested threading, numVP x
+    /// numThread
+    Matrix<TT> ratios_private;
 
 protected:
-  /// intermediate result vectors
-  vContainer_type myV;
-  vContainer_type myL;
-  gContainer_type myG;
-  hContainer_type myH;
-  ghContainer_type mygH;
+    /// intermediate result vectors
+    vContainer_type myV;
+    vContainer_type myL;
+    gContainer_type myG;
+    hContainer_type myH;
+    ghContainer_type mygH;
 
 public:
-  SplineC2RT(const std::string& my_name) : BsplineSetT<ST>(my_name), nComplexBands(0) {}
-
-  SplineC2RT(const SplineC2RT& in);
-  virtual std::string getClassName() const override { return "SplineC2R"; }
-  virtual std::string getKeyword() const override { return "SplineC2R"; }
-  bool isComplex() const override { return true; };
-
-  std::unique_ptr<SPOSetT<ST>> makeClone() const override { return std::make_unique<SplineC2RT<ST>>(*this); }
-
-  inline void resizeStorage(size_t n, size_t nvals)
-  {
-    this->init_base(n);
-    size_t npad = getAlignedSize<ST>(2 * n);
-    myV.resize(npad);
-    myG.resize(npad);
-    myL.resize(npad);
-    myH.resize(npad);
-    mygH.resize(npad);
-  }
-
-  void bcast_tables(Communicate* comm) { chunked_bcast(comm, SplineInst->getSplinePtr()); }
-
-  void gather_tables(Communicate* comm)
-  {
-    if (comm->size() == 1)
-      return;
-    const int Nbands      = this->kPoints.size();
-    const int Nbandgroups = comm->size();
-    this->offset.resize(Nbandgroups + 1, 0);
-    FairDivideLow(Nbands, Nbandgroups, this->offset);
-
-    for (size_t ib = 0; ib < this->offset.size(); ib++)
-      this->offset[ib] = this->offset[ib] * 2;
-    gatherv(comm, SplineInst->getSplinePtr(), SplineInst->getSplinePtr()->z_stride, this->offset);
-  }
-
-  template<typename GT, typename BCT>
-  void create_spline(GT& xyz_g, BCT& xyz_bc)
-  {
-    resize_kpoints();
-    SplineInst = std::make_shared<MultiBspline<ST>>();
-    SplineInst->create(xyz_g, xyz_bc, myV.size());
-
-    app_log() << "MEMORY " << SplineInst->sizeInByte() / (1 << 20) << " MB allocated "
-              << "for the coefficients in 3D spline orbital representation" << std::endl;
-  }
-
-  inline void flush_zero() { SplineInst->flush_zero(); }
-
-  /** remap kPoints to pack the double copy */
-  inline void resize_kpoints()
-  {
-    nComplexBands = this->remap_kpoints();
-    const int nk  = this->kPoints.size();
-    mKK.resize(nk);
-    myKcart.resize(nk);
-    for (size_t i = 0; i < nk; ++i)
+    SplineC2RT(const std::string& my_name) :
+        BsplineSetT<VT>(my_name),
+        nComplexBands(0)
+    {
+    }
+
+    SplineC2RT(const SplineC2RT& in);
+    std::string
+    getClassName() const override
+    {
+        return "SplineC2R"; // fixed name, independent of ST and VT
+    }
+    std::string
+    getKeyword() const override
+    {
+        return "SplineC2R"; // fixed keyword, independent of ST and VT
+    }
+    bool
+    isComplex() const override
+    {
+        return true; // this class is built on std::complex<ST> splines
+    }
+
+    std::unique_ptr<SPOSetT<VT>>
+    makeClone() const override
     {
-      mKK[i]     = -dot(this->kPoints[i], this->kPoints[i]);
-      myKcart(i) = this->kPoints[i];
+        return std::make_unique<SplineC2RT>(*this);
     }
-  }
-
-  void set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i, int twist, int ispline, int level);
-
-  bool read_splines(hdf_archive& h5f);
-
-  bool write_splines(hdf_archive& h5f);
-
-  void assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi, int first, int last) const;
-
-  void evaluateValue(const ParticleSetT<ST>& P, const int iat, ValueVector& psi) override;
-
-  void evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
-                         ValueVector& psi,
-                         const ValueVector& psiinv,
-                         std::vector<TT>& ratios) override;
-
-  /** assign_vgl
-   */
-  void assign_vgl(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi, int first, int last)
-      const;
-
-  /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in cartesian
-   */
-  void assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi);
-
-  void evaluateVGL(const ParticleSetT<ST>& P,
-                   const int iat,
-                   ValueVector& psi,
-                   GradVector& dpsi,
-                   ValueVector& d2psi) override;
-
-  void assign_vgh(const PointType& r,
-                  ValueVector& psi,
-                  GradVector& dpsi,
-                  HessVector& grad_grad_psi,
-                  int first,
-                  int last) const;
-
-  void evaluateVGH(const ParticleSetT<ST>& P,
-                   const int iat,
-                   ValueVector& psi,
-                   GradVector& dpsi,
-                   HessVector& grad_grad_psi) override;
-
-  void assign_vghgh(const PointType& r,
-                    ValueVector& psi,
-                    GradVector& dpsi,
-                    HessVector& grad_grad_psi,
-                    GGGVector& grad_grad_grad_psi,
-                    int first = 0,
-                    int last  = -1) const;
-
-  void evaluateVGHGH(const ParticleSetT<ST>& P,
-                     const int iat,
-                     ValueVector& psi,
-                     GradVector& dpsi,
-                     HessVector& grad_grad_psi,
-                     GGGVector& grad_grad_grad_psi) override;
-
-  template<class BSPLINESPO>
-  friend struct SplineSetReader;
-  friend struct BsplineReaderBase;
+
+    inline void
+    resizeStorage(size_t n, size_t nvals)
+    {
+        this->init_base(n);
+        const size_t padded = getAlignedSize<ST>(2 * n); // nvals is unused
+        myV.resize(padded);
+        myL.resize(padded);
+        myG.resize(padded);
+        myH.resize(padded);
+        mygH.resize(padded);
+    }
+
+    void
+    bcast_tables(Communicate* comm)
+    {
+        chunked_bcast(comm, SplineInst->getSplinePtr());
+    }
+
+    void
+    gather_tables(Communicate* comm)
+    {
+        const int num_groups = comm->size();
+        if (num_groups == 1)
+            return; // size-1 communicator: nothing to gather
+        const int num_bands = this->kPoints.size();
+        this->offset.resize(num_groups + 1, 0);
+        FairDivideLow(num_bands, num_groups, this->offset);
+        // offsets are doubled (presumably two real splines per complex band)
+        for (auto& band_offset : this->offset)
+            band_offset *= 2;
+        gatherv(comm, SplineInst->getSplinePtr(),
+            SplineInst->getSplinePtr()->z_stride, this->offset);
+    }
+
+    template <typename GT, typename BCT>
+    void
+    create_spline(GT& xyz_g, BCT& xyz_bc)
+    {
+        resize_kpoints();
+        SplineInst = std::make_shared<MultiBspline<ST>>();
+        SplineInst->create(xyz_g, xyz_bc, myV.size());
+        // log the allocated coefficient size in units of 2^20 bytes
+        const auto size_mb = SplineInst->sizeInByte() / (1 << 20);
+        app_log() << "MEMORY " << size_mb << " MB allocated "
+                  << "for the coefficients in 3D spline orbital representation"
+                  << std::endl;
+    }
+
+    inline void
+    flush_zero()
+    {
+        SplineInst->flush_zero();
+    }
+
+    /** remap kPoints to pack the double copy; fill mKK (-k.k) and myKcart */
+    inline void
+    resize_kpoints()
+    {
+        nComplexBands = this->remap_kpoints();
+        const size_t nk = this->kPoints.size(); // unsigned, matches index i
+        mKK.resize(nk);
+        myKcart.resize(nk);
+        for (size_t i = 0; i < nk; ++i) {
+            mKK[i] = -dot(this->kPoints[i], this->kPoints[i]); // -|k|^2
+            myKcart(i) = this->kPoints[i];
+        }
+    }
+
+    void
+    set_spline(SingleSplineType* spline_r, SingleSplineType* spline_i,
+        int twist, int ispline, int level);
+
+    bool
+    read_splines(hdf_archive& h5f);
+
+    bool
+    write_splines(hdf_archive& h5f);
+
+    void
+    assign_v(const PointType& r, const vContainer_type& myV, ValueVector& psi,
+        int first, int last) const;
+
+    void
+    evaluateValue(
+        const ParticleSetT<VT>& P, const int iat, ValueVector& psi) override;
+
+    void
+    evaluateDetRatios(const VirtualParticleSetT<VT>& VP, ValueVector& psi,
+        const ValueVector& psiinv, std::vector<TT>& ratios) override;
+
+    /** assign_vgl
+     */
+    void
+    assign_vgl(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        ValueVector& d2psi, int first, int last) const;
+
+    /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in
+     * cartesian
+     */
+    void
+    assign_vgl_from_l(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        ValueVector& d2psi);
+
+    void
+    evaluateVGL(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, ValueVector& d2psi) override;
+
+    void
+    assign_vgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, int first, int last) const;
+
+    void
+    evaluateVGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi) override;
+
+    void
+    assign_vghgh(const PointType& r, ValueVector& psi, GradVector& dpsi,
+        HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi, int first = 0,
+        int last = -1) const;
+
+    void
+    evaluateVGHGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
+        GradVector& dpsi, HessVector& grad_grad_psi,
+        GGGVector& grad_grad_grad_psi) override;
+
+    template <class BSPLINESPO>
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
 };
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
index 176cb5dee8..ce4bb5e8aa 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
@@ -23,20 +23,20 @@
 
 namespace qmcplusplus
 {
-template <typename ST>
-SplineR2RT<ST>::SplineR2RT(const SplineR2RT& in) = default;
+template <typename ST, typename VT>
+SplineR2RT<ST, VT>::SplineR2RT(const SplineR2RT& in) = default;
 
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineR2RT<ST>::set_spline(SingleSplineType* spline_r,
+SplineR2RT<ST, VT>::set_spline(SingleSplineType* spline_r,
     SingleSplineType* spline_i, int twist, int ispline, int level)
 {
     SplineInst->copy_spline(spline_r, ispline);
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 bool
-SplineR2RT<ST>::read_splines(hdf_archive& h5f)
+SplineR2RT<ST, VT>::read_splines(hdf_archive& h5f)
 {
     std::ostringstream o;
     o << "spline_" << this->MyIndex;
@@ -44,9 +44,9 @@ SplineR2RT<ST>::read_splines(hdf_archive& h5f)
     return h5f.readEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 bool
-SplineR2RT<ST>::write_splines(hdf_archive& h5f)
+SplineR2RT<ST, VT>::write_splines(hdf_archive& h5f)
 {
     std::ostringstream o;
     o << "spline_" << this->MyIndex;
@@ -54,13 +54,13 @@ SplineR2RT<ST>::write_splines(hdf_archive& h5f)
     return h5f.writeEntry(bigtable, o.str().c_str()); //"spline_0");
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::storeParamsBeforeRotation()
+SplineR2RT<ST, VT>::storeParamsBeforeRotation()
 {
     const auto spline_ptr = SplineInst->getSplinePtr();
     const auto coefs_tot_size = spline_ptr->coefs_size;
-    coef_copy_ = std::make_shared<std::vector<RealType>>(coefs_tot_size);
+    coef_copy_ = std::make_shared<std::vector<ST>>(coefs_tot_size);
 
     std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin());
 }
@@ -104,9 +104,10 @@ SplineR2RT<ST>::storeParamsBeforeRotation()
   NB: For splines (typically) BasisSetSize >> OrbitalSetSize, so the spl_coefs
   "matrix" is very tall and skinny.
 */
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy)
+SplineR2RT<ST, VT>::applyRotation(
+    const ValueMatrix& rot_mat, bool use_stored_copy)
 {
     // SplineInst is a MultiBspline. See src/spline2/MultiBspline.hpp
     const auto spline_ptr = SplineInst->getSplinePtr();
@@ -138,9 +139,9 @@ SplineR2RT<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_copy)
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineR2RT<ST>::assign_v(int bc_sign, const vContainer_type& myV,
+SplineR2RT<ST, VT>::assign_v(int bc_sign, const vContainer_type& myV,
     ValueVector& psi, int first, int last) const
 {
     // protect last
@@ -152,10 +153,10 @@ SplineR2RT<ST>::assign_v(int bc_sign, const vContainer_type& myV,
         psi[this->first_spo + j] = signed_one * myV[j];
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::evaluateValue(
-    const ParticleSetT<ST>& P, const int iat, ValueVector& psi)
+SplineR2RT<ST, VT>::evaluateValue(
+    const ParticleSetT<VT>& P, const int iat, ValueVector& psi)
 {
     const PointType& r = P.activeR(iat);
     PointType ru;
@@ -172,9 +173,9 @@ SplineR2RT<ST>::evaluateValue(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
+SplineR2RT<ST, VT>::evaluateDetRatios(const VirtualParticleSetT<VT>& VP,
     ValueVector& psi, const ValueVector& psiinv, std::vector<TT>& ratios)
 {
     const bool need_resize = ratios_private.rows() < VP.getTotalNum();
@@ -216,9 +217,9 @@ SplineR2RT<ST>::evaluateDetRatios(const VirtualParticleSetT<ST>& VP,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineR2RT<ST>::assign_vgl(int bc_sign, ValueVector& psi, GradVector& dpsi,
+SplineR2RT<ST, VT>::assign_vgl(int bc_sign, ValueVector& psi, GradVector& dpsi,
     ValueVector& d2psi, int first, int last) const
 {
     // protect last
@@ -261,9 +262,9 @@ SplineR2RT<ST>::assign_vgl(int bc_sign, ValueVector& psi, GradVector& dpsi,
 /** assign_vgl_from_l can be used when myL is precomputed and myV,myG,myL in
  * cartesian
  */
-template <typename ST>
+template <typename ST, typename VT>
 inline void
-SplineR2RT<ST>::assign_vgl_from_l(
+SplineR2RT<ST, VT>::assign_vgl_from_l(
     int bc_sign, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
     const ST signed_one = (bc_sign & 1) ? -1 : 1;
@@ -283,9 +284,9 @@ SplineR2RT<ST>::assign_vgl_from_l(
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::evaluateVGL(const ParticleSetT<ST>& P, const int iat,
+SplineR2RT<ST, VT>::evaluateVGL(const ParticleSetT<VT>& P, const int iat,
     ValueVector& psi, GradVector& dpsi, ValueVector& d2psi)
 {
     const PointType& r = P.activeR(iat);
@@ -304,9 +305,9 @@ SplineR2RT<ST>::evaluateVGL(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::assign_vgh(int bc_sign, ValueVector& psi, GradVector& dpsi,
+SplineR2RT<ST, VT>::assign_vgh(int bc_sign, ValueVector& psi, GradVector& dpsi,
     HessVector& grad_grad_psi, int first, int last) const
 {
     // protect last
@@ -373,9 +374,9 @@ SplineR2RT<ST>::assign_vgh(int bc_sign, ValueVector& psi, GradVector& dpsi,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::evaluateVGH(const ParticleSetT<ST>& P, const int iat,
+SplineR2RT<ST, VT>::evaluateVGH(const ParticleSetT<VT>& P, const int iat,
     ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi)
 {
     const PointType& r = P.activeR(iat);
@@ -394,11 +395,11 @@ SplineR2RT<ST>::evaluateVGH(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::assign_vghgh(int bc_sign, ValueVector& psi, GradVector& dpsi,
-    HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi, int first,
-    int last) const
+SplineR2RT<ST, VT>::assign_vghgh(int bc_sign, ValueVector& psi,
+    GradVector& dpsi, HessVector& grad_grad_psi, GGGVector& grad_grad_grad_psi,
+    int first, int last) const
 {
     // protect last
     last = last < 0 ?
@@ -574,9 +575,9 @@ SplineR2RT<ST>::assign_vghgh(int bc_sign, ValueVector& psi, GradVector& dpsi,
     }
 }
 
-template <typename ST>
+template <typename ST, typename VT>
 void
-SplineR2RT<ST>::evaluateVGHGH(const ParticleSetT<ST>& P, const int iat,
+SplineR2RT<ST, VT>::evaluateVGHGH(const ParticleSetT<VT>& P, const int iat,
     ValueVector& psi, GradVector& dpsi, HessVector& grad_grad_psi,
     GGGVector& grad_grad_grad_psi)
 {
@@ -597,7 +598,9 @@ SplineR2RT<ST>::evaluateVGHGH(const ParticleSetT<ST>& P, const int iat,
     }
 }
 
-template class SplineR2RT<float>;
-template class SplineR2RT<double>;
+template class SplineR2RT<float, float>;
+template class SplineR2RT<float, double>; // do we need this one?
+template class SplineR2RT<double, float>;
+template class SplineR2RT<double, double>;
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
index f265561e18..ece156ac1a 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
@@ -33,8 +33,8 @@ namespace qmcplusplus
  * Requires temporage storage and multiplication of the sign of the real part of
  * the phase Internal storage ST type arrays are aligned and padded.
  */
-template <typename ST>
-class SplineR2RT : public BsplineSetT<ST>
+template <typename ST, typename VT>
+class SplineR2RT : public BsplineSetT<VT>
 {
 public:
     using SplineType = typename bspline_traits<ST, 3>::SplineType;
@@ -43,19 +43,19 @@ class SplineR2RT : public BsplineSetT<ST>
     using PointType = TinyVector<ST, 3>;
     using SingleSplineType = UBspline_3d_d;
     // types for evaluation results
-    using TT = typename BsplineSetT<ST>::ValueType;
-    using GGGVector = typename BsplineSetT<ST>::GGGVector;
-    using ValueMatrix = typename BsplineSetT<ST>::ValueMatrix;
-    using GradVector = typename BsplineSetT<ST>::GradVector;
-    using HessVector = typename BsplineSetT<ST>::HessVector;
-    using ValueVector = typename BsplineSetT<ST>::ValueVector;
+    using TT = typename BsplineSetT<VT>::ValueType;
+    using GGGVector = typename BsplineSetT<VT>::GGGVector;
+    using ValueMatrix = typename BsplineSetT<VT>::ValueMatrix;
+    using GradVector = typename BsplineSetT<VT>::GradVector;
+    using HessVector = typename BsplineSetT<VT>::HessVector;
+    using ValueVector = typename BsplineSetT<VT>::ValueVector;
 
     using vContainer_type = Vector<ST, aligned_allocator<ST>>;
     using gContainer_type = VectorSoaContainer<ST, 3>;
     using hContainer_type = VectorSoaContainer<ST, 6>;
     using ghContainer_type = VectorSoaContainer<ST, 10>;
 
-    using RealType = typename SPOSetT<ST>::RealType;
+    using RealType = typename SPOSetT<VT>::RealType;
 
 private:
     bool IsGamma;
@@ -66,7 +66,7 @@ class SplineR2RT : public BsplineSetT<ST>
     std::shared_ptr<MultiBspline<ST>> SplineInst;
 
     /// Copy of original splines for orbital rotation
-    std::shared_ptr<std::vector<RealType>> coef_copy_;
+    std::shared_ptr<std::vector<ST>> coef_copy_;
 
     /// thread private ratios for reduction when using nested threading, numVP x
     /// numThread
@@ -83,7 +83,7 @@ class SplineR2RT : public BsplineSetT<ST>
     ghContainer_type mygH;
 
 public:
-    SplineR2RT(const std::string& my_name) : BsplineSetT<ST>(my_name)
+    SplineR2RT(const std::string& my_name) : BsplineSetT<VT>(my_name)
     {
     }
 
@@ -109,10 +109,10 @@ class SplineR2RT : public BsplineSetT<ST>
         return true;
     }
 
-    std::unique_ptr<SPOSetT<ST>>
+    std::unique_ptr<SPOSetT<VT>>
     makeClone() const override
     {
-        return std::make_unique<SplineR2RT<ST>>(*this);
+        return std::make_unique<SplineR2RT<ST, VT>>(*this);
     }
 
     /// Store an original copy of the spline coefficients for orbital rotation
@@ -222,10 +222,10 @@ class SplineR2RT : public BsplineSetT<ST>
 
     void
     evaluateValue(
-        const ParticleSetT<ST>& P, const int iat, ValueVector& psi) override;
+        const ParticleSetT<VT>& P, const int iat, ValueVector& psi) override;
 
     void
-    evaluateDetRatios(const VirtualParticleSetT<ST>& VP, ValueVector& psi,
+    evaluateDetRatios(const VirtualParticleSetT<VT>& VP, ValueVector& psi,
         const ValueVector& psiinv, std::vector<TT>& ratios) override;
 
     void
@@ -240,7 +240,7 @@ class SplineR2RT : public BsplineSetT<ST>
         int bc_sign, ValueVector& psi, GradVector& dpsi, ValueVector& d2psi);
 
     void
-    evaluateVGL(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGL(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, ValueVector& d2psi) override;
 
     void
@@ -248,7 +248,7 @@ class SplineR2RT : public BsplineSetT<ST>
         HessVector& grad_grad_psi, int first, int last) const;
 
     void
-    evaluateVGH(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, HessVector& grad_grad_psi) override;
 
     void
@@ -257,13 +257,14 @@ class SplineR2RT : public BsplineSetT<ST>
         int last = -1) const;
 
     void
-    evaluateVGHGH(const ParticleSetT<ST>& P, const int iat, ValueVector& psi,
+    evaluateVGHGH(const ParticleSetT<VT>& P, const int iat, ValueVector& psi,
         GradVector& dpsi, HessVector& grad_grad_psi,
         GGGVector& grad_grad_grad_psi) override;
 
     template <class BSPLINESPO>
-    friend struct SplineSetReader;
-    friend struct BsplineReaderBase;
+    friend class SplineSetReaderT;
+    template <typename>
+    friend class BsplineReaderBaseT;
 };
 
 } // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineSetReader.h b/src/QMCWaveFunctions/BsplineFactory/SplineSetReader.h
index dbeb68ff3c..97ba261ddd 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineSetReader.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineSetReader.h
@@ -28,6 +28,10 @@
 #include "mpi/collectives.h"
 #include "mpi/point2point.h"
 #include "Utilities/FairDivide.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineReaderBase.h"
+#include "Utilities/ProgressReportEngine.h"
+#include "QMCWaveFunctions/einspline_helper.hpp"
+#include <fftw3.h>
 
 namespace qmcplusplus
 {
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h b/src/QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h
new file mode 100644
index 0000000000..816561008c
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h
@@ -0,0 +1,322 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Jeremy McMinnis, jmcminis@gmail.com, University of
+//                    Illinois at Urbana-Champaign
+//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of
+//                    Illinois at Urbana-Champaign
+//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//                    Mark A. Berrill, berrillma@ornl.gov, Oak Ridge National Laboratory
+//                    Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
+//
+// File created by: Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois
+// at Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_SPLINESET_READERT_H
+#define QMCPLUSPLUS_SPLINESET_READERT_H
+#include "BsplineFactory/BsplineReaderBaseT.h"
+#include "Utilities/FairDivide.h"
+#include "mpi/collectives.h"
+#include "mpi/point2point.h"
+
+namespace qmcplusplus
+{
+/** General SplineSetReader to handle any unitcell
+ */
+template <typename SA>
+class SplineSetReaderT : public BsplineReaderBaseT<typename SA::ValueType>
+{
+public:
+    using splineset_t = SA;
+    using DataType = typename splineset_t::DataType;
+    using SplineType = typename splineset_t::SplineType;
+    using ValueType = typename splineset_t::ValueType;
+
+    Array<std::complex<double>, 3> FFTbox;
+    Array<double, 3> splineData_r, splineData_i;
+    double rotate_phase_r, rotate_phase_i;
+    UBspline_3d_d* spline_r;
+    UBspline_3d_d* spline_i;
+    splineset_t* bspline;
+    fftw_plan FFTplan;
+
+    SplineSetReaderT(EinsplineSetBuilderT<ValueType>* e) :
+        BsplineReaderBaseT<ValueType>(e),
+        spline_r(nullptr),
+        spline_i(nullptr),
+        bspline(nullptr),
+        FFTplan(nullptr)
+    {
+    }
+
+    ~SplineSetReaderT() override
+    {
+        clear();
+    }
+
+    void
+    clear()
+    {
+        einspline::destroy(spline_r);
+        einspline::destroy(spline_i);
+        if (FFTplan != nullptr)
+            fftw_destroy_plan(FFTplan);
+        FFTplan = nullptr;
+    }
+
+    // set info for Hybrid
+    virtual void
+    initialize_hybridrep_atomic_centers()
+    {
+    }
+    // transform cG to radial functions
+    virtual void
+    create_atomic_centers_Gspace(Vector<std::complex<double>>& cG,
+        Communicate& band_group_comm, int iorb)
+    {
+    }
+
+    std::unique_ptr<SPOSetT<ValueType>>
+    create_spline_set(const std::string& my_name, int spin,
+        const BandInfoGroup& bandgroup) override
+    {
+        ReportEngine PRE("SplineSetReader", "create_spline_set(spin,SPE*)");
+        // Builds the spline SPO set: restores coefficients from the cached
+        // .h5 file when root finds a matching one, otherwise FFTs psi_g into
+        // splines and, if saveSplineCoefs is set, root writes the cache.
+        bspline = new splineset_t(my_name);
+        app_log() << "  ClassName = " << bspline->getClassName() << std::endl;
+        if (bspline->isComplex())
+            app_log() << "  Using complex einspline table" << std::endl;
+        else
+            app_log() << "  Using real einspline table" << std::endl;
+
+        // set info for Hybrid
+        this->initialize_hybridrep_atomic_centers();
+
+        // baseclass handles twists
+        this->check_twists(bspline, bandgroup);
+
+        Ugrid xyz_grid[3];
+
+        typename splineset_t::BCType xyz_bc[3];
+        bool havePsig = this->set_grid(bspline->HalfG, xyz_grid, xyz_bc);
+        if (!havePsig)
+            this->myComm->barrier_and_abort(
+                "SplineSetReader needs psi_g. Set precision=\"double\".");
+        bspline->create_spline(xyz_grid, xyz_bc);
+
+        std::ostringstream oo;
+        oo << bandgroup.myName << ".g" << this->MeshSize[0] << "x"
+           << this->MeshSize[1] << "x" << this->MeshSize[2] << ".h5";
+
+        const std::string splinefile(oo.str());
+        bool root = (this->myComm->rank() == 0);
+        int foundspline = 0;
+        Timer now;
+        if (root) {
+            now.restart();
+            hdf_archive h5f(this->myComm);
+            foundspline = h5f.open(splinefile, H5F_ACC_RDONLY);
+            if (foundspline) {
+                std::string aname("none");
+                foundspline = h5f.readEntry(aname, "class_name");
+                foundspline =
+                    (aname.find(bspline->getKeyword()) != std::string::npos);
+            }
+            if (foundspline) {
+                int sizeD = 0;
+                foundspline = h5f.readEntry(sizeD, "sizeof");
+                foundspline = (sizeD == sizeof(DataType));
+            }
+            if (foundspline) {
+                foundspline = bspline->read_splines(h5f);
+                if (foundspline)
+                    app_log() << "  Successfully restored coefficients from "
+                              << splinefile << ". The reading time is "
+                              << now.elapsed() << " sec." << std::endl;
+            }
+            h5f.close();
+        }
+        this->myComm->bcast(foundspline);
+        if (foundspline) {
+            now.restart();
+            bspline->bcast_tables(this->myComm);
+            app_log() << "  SplineSetReader bcast the full table "
+                      << now.elapsed() << " sec." << std::endl;
+            app_log().flush();
+        }
+        else {
+            bspline->flush_zero();
+
+            int nx = this->MeshSize[0];
+            int ny = this->MeshSize[1];
+            int nz = this->MeshSize[2];
+            if (havePsig) // perform FFT using FFTW
+            {
+                FFTbox.resize(nx, ny, nz);
+                FFTplan = fftw_plan_dft_3d(nx, ny, nz,
+                    reinterpret_cast<fftw_complex*>(FFTbox.data()),
+                    reinterpret_cast<fftw_complex*>(FFTbox.data()), +1,
+                    FFTW_ESTIMATE);
+                splineData_r.resize(nx, ny, nz);
+                if (bspline->isComplex())
+                    splineData_i.resize(nx, ny, nz);
+
+                TinyVector<double, 3> start(0.0);
+                TinyVector<double, 3> end(1.0);
+                spline_r = einspline::create(
+                    spline_r, start, end, this->MeshSize, bspline->HalfG);
+                if (bspline->isComplex())
+                    spline_i = einspline::create(
+                        spline_i, start, end, this->MeshSize, bspline->HalfG);
+
+                now.restart();
+                initialize_spline_pio_gather(spin, bandgroup);
+                app_log() << "  SplineSetReader initialize_spline_pio "
+                          << now.elapsed() << " sec" << std::endl;
+
+                fftw_destroy_plan(FFTplan);
+                FFTplan = nullptr;
+            }
+            else // not expected to run: !havePsig already aborts above
+                initialize_spline_psi_r(spin, bandgroup);
+            if (this->saveSplineCoefs && root) {
+                now.restart();
+                hdf_archive h5f;
+                h5f.create(splinefile);
+                std::string classname = bspline->getClassName();
+                h5f.write(classname, "class_name");
+                int sizeD = sizeof(DataType);
+                h5f.write(sizeD, "sizeof");
+                bspline->write_splines(h5f);
+                h5f.close();
+                app_log() << "  Stored spline coefficients in " << splinefile
+                          << " for potential reuse. The writing time is "
+                          << now.elapsed() << " sec." << std::endl;
+            }
+        }
+
+        clear();
+        return std::unique_ptr<SPOSetT<ValueType>>{bspline};
+    }
+
+    /** fft and spline cG
+     * @param cG psi_g to be processed
+     * @param ti twist index; indexes primcell_kpoints for the phase fix
+     *
+     * Unpacks cG into FFTbox, executes FFTplan, then sets spline_r (and
+     * spline_i when bspline->isComplex()) from the resulting grid data.
+     */
+    inline void
+    fft_spline(Vector<std::complex<double>>& cG, int ti)
+    {
+        unpack4fftw(cG, this->mybuilder->Gvecs[0], this->MeshSize, FFTbox);
+        fftw_execute(FFTplan);
+        if (bspline->isComplex()) {
+            if (this->rotate)
+                fix_phase_rotate_c2c(FFTbox, splineData_r, splineData_i,
+                    this->mybuilder->primcell_kpoints[ti], rotate_phase_r,
+                    rotate_phase_i);
+            else {
+                split_real_components_c2c(FFTbox, splineData_r, splineData_i);
+                rotate_phase_r = 1.0;
+                rotate_phase_i = 0.0;
+            }
+            einspline::set(spline_r, splineData_r.data());
+            einspline::set(spline_i, splineData_i.data());
+        }
+        else {
+            fix_phase_rotate_c2r(FFTbox, splineData_r,
+                this->mybuilder->primcell_kpoints[ti], rotate_phase_r,
+                rotate_phase_i);
+            einspline::set(spline_r, splineData_r.data());
+        }
+    }
+
+    /** Each band-group leader reads its share of psi_g and FFTs it into
+     *  splines; tables are then gathered across leaders and bcast on myComm. */
+    void
+    initialize_spline_pio_gather(int spin, const BandInfoGroup& bandgroup)
+    {
+        // distribute bands over processor groups
+        int Nbands = bandgroup.getNumDistinctOrbitals();
+        const int Nprocs = this->myComm->size();
+        const int Nbandgroups = std::min(Nbands, Nprocs);
+        Communicate band_group_comm(*this->myComm, Nbandgroups);
+        std::vector<int> band_groups(Nbandgroups + 1, 0);
+        FairDivideLow(Nbands, Nbandgroups, band_groups);
+        int iorb_first = band_groups[band_group_comm.getGroupID()];
+        int iorb_last = band_groups[band_group_comm.getGroupID() + 1];
+
+        app_log() << "Start transforming plane waves to 3D B-Splines."
+                  << std::endl;
+        hdf_archive h5f(&band_group_comm, false);
+        Vector<std::complex<double>> cG(this->mybuilder->Gvecs[0].size());
+        const std::vector<BandInfo>& cur_bands = bandgroup.myBands;
+        if (band_group_comm.isGroupLeader())
+            h5f.open(this->mybuilder->H5FileName, H5F_ACC_RDONLY);
+        for (int iorb = iorb_first; iorb < iorb_last; iorb++) {
+            if (band_group_comm.isGroupLeader()) {
+                int iorb_h5 = bspline->BandIndexMap[iorb];
+                int ti = cur_bands[iorb_h5].TwistIndex;
+                std::string s =
+                    this->psi_g_path(ti, spin, cur_bands[iorb_h5].BandIndex);
+                if (!h5f.readEntry(cG, s)) {
+                    std::ostringstream msg;
+                    msg << "SplineSetReader Failed to read band(s) from h5 "
+                           "file. "
+                        << "Attempted dataset " << s << " with " << cG.size()
+                        << " complex numbers." << std::endl;
+                    throw std::runtime_error(msg.str());
+                }
+                double total_norm = compute_norm(cG);
+                if ((this->checkNorm) &&
+                    (std::abs(total_norm - 1.0) > PW_COEFF_NORM_TOLERANCE)) {
+                    std::ostringstream msg;
+                    msg << "SplineSetReader The orbital " << iorb_h5
+                        << " has a wrong norm " << total_norm
+                        << ", computed from plane wave coefficients!"
+                        << std::endl
+                        << "This may indicate a problem with the HDF5 library "
+                           "versions used "
+                        << "during wavefunction conversion or read."
+                        << std::endl;
+                    throw std::runtime_error(msg.str());
+                }
+                fft_spline(cG, ti);
+                bspline->set_spline(
+                    spline_r, spline_i, cur_bands[iorb_h5].TwistIndex, iorb, 0);
+            }
+            this->create_atomic_centers_Gspace(cG, band_group_comm, iorb);
+        }
+
+        this->myComm->barrier();
+        Timer now;
+        if (band_group_comm.isGroupLeader()) {
+            now.restart();
+            bspline->gather_tables(band_group_comm.getGroupLeaderComm());
+            app_log() << "  Time to gather the table = " << now.elapsed()
+                      << std::endl;
+        }
+        now.restart();
+        bspline->bcast_tables(this->myComm);
+        app_log() << "  Time to bcast the table = " << now.elapsed()
+                  << std::endl;
+    }
+
+    void
+    initialize_spline_psi_r(int spin, const BandInfoGroup& bandgroup)
+    {
+        // old implementation buried in the history
+        this->myComm->barrier_and_abort(
+            "SplineSetReaderP initialize_spline_psi_r "
+            "implementation not finished.");
+    }
+};
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.cpp b/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.cpp
new file mode 100644
index 0000000000..f7b5e17c77
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.cpp
@@ -0,0 +1,331 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by:
+//
+// File created by: Jeongnim Kim, jeongnim.kim@intel.com, Intel Corp.
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h"
+
+#include "CPU/SIMD/vmath.hpp"
+#include "CPU/e2iphi.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
+#include "QMCWaveFunctions/BsplineFactory/HybridRepCplxT.h"
+#include "QMCWaveFunctions/BsplineFactory/HybridRepRealT.h"
+#include "QMCWaveFunctions/BsplineFactory/HybridRepSetReaderT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineC2COMPTargetT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineC2CT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineC2ROMPTargetT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineC2RT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineR2RT.h"
+#include "QMCWaveFunctions/BsplineFactory/SplineSetReaderT.h"
+#include "QMCWaveFunctions/EinsplineSetBuilderT.h"
+#include "QMCWaveFunctions/einspline_helper.hpp"
+#include "Utilities/ProgressReportEngine.h"
+#include <PlatformSelector.hpp>
+#include <fftw3.h>
+
+namespace qmcplusplus
+{
+template <typename T>
+struct CreateComplexHelper
+{
+    static inline std::unique_ptr<BsplineReaderBaseT<T>>
+    createDouble(
+        EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+    {
+        using RealType = typename EinsplineSetBuilderT<T>::RealType;
+        std::unique_ptr<BsplineReaderBaseT<T>> aReader;
+
+        app_summary()
+            << "    Using real valued spline SPOs with complex double "
+               "precision storage (C2R)."
+            << std::endl;
+        if (CPUOMPTargetSelector::selectPlatform(useGPU) ==
+            PlatformKind::OMPTARGET) {
+            app_summary() << "    Running OpenMP offload code path."
+                          << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2ROMPTargetT<double, T>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2ROMPTargetT<double, T>>>(e);
+        }
+        else {
+            app_summary() << "    Running on CPU." << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<
+                    HybridRepSetReaderT<HybridRepCplxT<SplineC2RT<double, T>>>>(
+                    e);
+            }
+            else
+                aReader =
+                    std::make_unique<SplineSetReaderT<SplineC2RT<double, T>>>(
+                        e);
+        }
+
+        return aReader;
+    }
+
+    static inline std::unique_ptr<BsplineReaderBaseT<T>>
+    createSingle(
+        EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+    {
+        using RealType = typename EinsplineSetBuilderT<T>::RealType;
+        std::unique_ptr<BsplineReaderBaseT<T>> aReader;
+
+        app_summary()
+            << "    Using real valued spline SPOs with complex single "
+               "precision storage (C2R)."
+            << std::endl;
+        if (CPUOMPTargetSelector::selectPlatform(useGPU) ==
+            PlatformKind::OMPTARGET) {
+            app_summary() << "    Running OpenMP offload code path."
+                          << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2ROMPTargetT<float, T>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2ROMPTargetT<float, T>>>(e);
+        }
+        else {
+            app_summary() << "    Running on CPU." << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<
+                    HybridRepSetReaderT<HybridRepCplxT<SplineC2RT<float, T>>>>(
+                    e);
+            }
+            else
+                aReader =
+                    std::make_unique<SplineSetReaderT<SplineC2RT<float, T>>>(e);
+        }
+
+        return aReader;
+    }
+};
+
+template <typename T>
+struct CreateComplexHelper<std::complex<T>>
+{
+    using ValueType = std::complex<T>;
+    using RealType = typename EinsplineSetBuilderT<ValueType>::RealType;
+
+    static inline std::unique_ptr<BsplineReaderBaseT<ValueType>>
+    createDouble(EinsplineSetBuilderT<ValueType>* e, bool hybrid_rep,
+        const std::string& useGPU)
+    { // C2C reader with double-precision storage on offload and CPU paths
+        std::unique_ptr<BsplineReaderBaseT<ValueType>> aReader;
+
+        app_summary()
+            << "    Using complex valued spline SPOs with complex double "
+               "precision storage (C2C)."
+            << std::endl;
+        if (CPUOMPTargetSelector::selectPlatform(useGPU) ==
+            PlatformKind::OMPTARGET) {
+            app_summary() << "    Running OpenMP offload code path."
+                          << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2COMPTargetT<double, ValueType>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2COMPTargetT<double, ValueType>>>(e);
+        }
+        else {
+            app_summary() << "    Running on CPU." << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2CT<double, ValueType>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2CT<double, ValueType>>>(e);
+        }
+
+        return aReader;
+    }
+
+    static inline std::unique_ptr<BsplineReaderBaseT<ValueType>>
+    createSingle(EinsplineSetBuilderT<ValueType>* e, bool hybrid_rep,
+        const std::string& useGPU)
+    {
+        std::unique_ptr<BsplineReaderBaseT<ValueType>> aReader;
+
+        app_summary()
+            << "    Using complex valued spline SPOs with complex single "
+               "precision storage (C2C)."
+            << std::endl;
+        if (CPUOMPTargetSelector::selectPlatform(useGPU) ==
+            PlatformKind::OMPTARGET) {
+            app_summary() << "    Running OpenMP offload code path."
+                          << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2COMPTargetT<float, ValueType>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2COMPTargetT<float, ValueType>>>(e);
+        }
+        else {
+            app_summary() << "    Running on CPU." << std::endl;
+            if (hybrid_rep) {
+                app_summary()
+                    << "    Using hybrid orbital representation." << std::endl;
+                aReader = std::make_unique<HybridRepSetReaderT<
+                    HybridRepCplxT<SplineC2CT<float, ValueType>>>>(e);
+            }
+            else
+                aReader = std::make_unique<
+                    SplineSetReaderT<SplineC2CT<float, ValueType>>>(e);
+        }
+
+        return aReader;
+    }
+};
+
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineComplexDoubleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+{
+    return CreateComplexHelper<T>::createDouble(e, hybrid_rep, useGPU);
+}
+
+template std::unique_ptr<BsplineReaderBaseT<std::complex<float>>>
+createBsplineComplexDoubleT<std::complex<float>>(
+    EinsplineSetBuilderT<std::complex<float>>* e, bool hybrid_rep,
+    const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<std::complex<double>>>
+createBsplineComplexDoubleT<std::complex<double>>(
+    EinsplineSetBuilderT<std::complex<double>>* e, bool hybrid_rep,
+    const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<float>>
+createBsplineComplexDoubleT<float>(
+    EinsplineSetBuilderT<float>* e, bool hybrid_rep, const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<double>>
+createBsplineComplexDoubleT<double>(EinsplineSetBuilderT<double>* e,
+    bool hybrid_rep, const std::string& useGPU);
+
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineComplexSingleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+{
+    return CreateComplexHelper<T>::createSingle(e, hybrid_rep, useGPU);
+}
+
+template std::unique_ptr<BsplineReaderBaseT<std::complex<float>>>
+createBsplineComplexSingleT<std::complex<float>>(
+    EinsplineSetBuilderT<std::complex<float>>* e, bool hybrid_rep,
+    const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<std::complex<double>>>
+createBsplineComplexSingleT<std::complex<double>>(
+    EinsplineSetBuilderT<std::complex<double>>* e, bool hybrid_rep,
+    const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<float>>
+createBsplineComplexSingleT<float>(
+    EinsplineSetBuilderT<float>* e, bool hybrid_rep, const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<double>>
+createBsplineComplexSingleT<double>(EinsplineSetBuilderT<double>* e,
+    bool hybrid_rep, const std::string& useGPU);
+
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineRealDoubleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+{
+    app_summary() << "    Using real valued spline SPOs with real double "
+                     "precision storage (R2R)."
+                  << std::endl;
+    if (CPUOMPTargetSelector::selectPlatform(useGPU) == PlatformKind::OMPTARGET)
+        app_summary() << "OpenMP offload has not been implemented to support "
+                         "real valued spline SPOs with real storage!"
+                      << std::endl;
+    app_summary() << "    Running on CPU." << std::endl;
+
+    std::unique_ptr<BsplineReaderBaseT<T>> aReader;
+    if (hybrid_rep) {
+        app_summary() << "    Using hybrid orbital representation."
+                      << std::endl;
+        aReader = std::make_unique<
+            HybridRepSetReaderT<HybridRepRealT<SplineR2RT<double, T>>>>(e);
+    }
+    else
+        aReader = std::make_unique<SplineSetReaderT<SplineR2RT<double, T>>>(e);
+    return aReader;
+}
+
+template std::unique_ptr<BsplineReaderBaseT<float>>
+createBsplineRealDoubleT<float>(
+    EinsplineSetBuilderT<float>* e, bool hybrid_rep, const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<double>>
+createBsplineRealDoubleT<double>(EinsplineSetBuilderT<double>* e,
+    bool hybrid_rep, const std::string& useGPU);
+
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineRealSingleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU)
+{
+    app_summary() << "    Using real valued spline SPOs with real single "
+                     "precision storage (R2R)."
+                  << std::endl;
+    if (CPUOMPTargetSelector::selectPlatform(useGPU) == PlatformKind::OMPTARGET)
+        app_summary() << "OpenMP offload has not been implemented to support "
+                         "real valued spline SPOs with real storage!"
+                      << std::endl;
+    app_summary() << "    Running on CPU." << std::endl;
+
+    std::unique_ptr<BsplineReaderBaseT<T>> aReader;
+    if (hybrid_rep) {
+        app_summary() << "    Using hybrid orbital representation."
+                      << std::endl;
+        aReader = std::make_unique<
+            HybridRepSetReaderT<HybridRepRealT<SplineR2RT<float, T>>>>(e);
+    }
+    else
+        aReader = std::make_unique<SplineSetReaderT<SplineR2RT<float, T>>>(e);
+    return aReader;
+}
+
+template std::unique_ptr<BsplineReaderBaseT<float>>
+createBsplineRealSingleT<float>(
+    EinsplineSetBuilderT<float>* e, bool hybrid_rep, const std::string& useGPU);
+
+template std::unique_ptr<BsplineReaderBaseT<double>>
+createBsplineRealSingleT<double>(EinsplineSetBuilderT<double>* e,
+    bool hybrid_rep, const std::string& useGPU);
+
+} // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h b/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h
new file mode 100644
index 0000000000..898d8f2a2e
--- /dev/null
+++ b/src/QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h
@@ -0,0 +1,59 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2019 QMCPACK developers.
+//
+// File developed by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//
+// File created by: Ye Luo, yeluo@anl.gov, Argonne National Laboratory
+//////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef QMCPLUSPLUS_CREATE_BSPLINE_READERT_H
+#define QMCPLUSPLUS_CREATE_BSPLINE_READERT_H
+
+#include <memory>
+#include <string>
+
+namespace qmcplusplus
+{
+/// forward declaration
+template <typename T>
+class BsplineReaderBaseT;
+template <typename T>
+class EinsplineSetBuilderT;
+
+/** create a reader which handles complex (double size real) splines, C2R or
+ * C2C case; spline storage and computation precision is double
+ */
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineComplexDoubleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU);
+
+/** create a reader which handles complex (double size real) splines, C2R or
+ * C2C case; spline storage and computation precision is float
+ */
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineComplexSingleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU);
+
+/** create a reader which handles real splines, R2R case
+ *  spline storage and computation precision is double
+ */
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineRealDoubleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU);
+
+/** create a reader which handles real splines, R2R case
+ *  spline storage and computation precision is float
+ */
+template <typename T>
+std::unique_ptr<BsplineReaderBaseT<T>>
+createBsplineRealSingleT(
+    EinsplineSetBuilderT<T>* e, bool hybrid_rep, const std::string& useGPU);
+
+} // namespace qmcplusplus
+#endif
diff --git a/src/QMCWaveFunctions/CMakeLists.txt b/src/QMCWaveFunctions/CMakeLists.txt
index 909893abe6..05c1fe018b 100644
--- a/src/QMCWaveFunctions/CMakeLists.txt
+++ b/src/QMCWaveFunctions/CMakeLists.txt
@@ -108,28 +108,43 @@ if(OHMMS_DIM MATCHES 3)
   if(HAVE_EINSPLINE)
     set(FERMION_SRCS
         ${FERMION_SRCS}
+        EinsplineSetBuilderT.cpp
         EinsplineSetBuilderCommon.cpp
         EinsplineSetBuilderOld.cpp
         EinsplineSetBuilderReadBands_ESHDF.cpp
         EinsplineSetBuilderESHDF.fft.cpp
         EinsplineSetBuilder_createSPOs.cpp
+        BsplineFactory/createBsplineReaderT.cpp
         BsplineFactory/createComplexDouble.cpp
         BsplineFactory/createComplexSingle.cpp
         BsplineFactory/HybridRepCenterOrbitals.cpp
+        BsplineFactory/HybridRepCenterOrbitalsT.cpp
         BandInfo.cpp
-        BsplineFactory/BsplineReaderBase.cpp)
+        BsplineFactory/SplineC2RT.cpp
+        BsplineFactory/SplineR2RT.cpp
+        BsplineFactory/SplineC2CT.cpp
+        BsplineFactory/BsplineReaderBase.cpp
+        BsplineFactory/BsplineReaderBaseT.cpp)
     set(FERMION_OMPTARGET_SRCS
         Fermion/DiracDeterminantBatched.cpp
         Fermion/MultiDiracDeterminant.2.cpp
-        BsplineFactory/SplineC2RTOMPTarget.cpp
+        BsplineFactory/SplineC2ROMPTargetT.cpp
+        BsplineFactory/SplineC2COMPTargetT.cpp
     )
     if(QMC_COMPLEX)
-      set(FERMION_SRCS ${FERMION_SRCS} EinsplineSpinorSetBuilder.cpp BsplineFactory/SplineC2C.cpp BsplineFactory/SplineC2CT.cpp)
-      set(FERMION_OMPTARGET_SRCS ${FERMION_OMPTARGET_SRCS} BsplineFactory/SplineC2COMPTarget.cpp BsplineFactory/SplineC2COMPTargetT.cpp)
+      set(FERMION_SRCS ${FERMION_SRCS}
+        EinsplineSpinorSetBuilder.cpp 
+        BsplineFactory/SplineC2C.cpp)
+      set(FERMION_OMPTARGET_SRCS ${FERMION_OMPTARGET_SRCS}
+        BsplineFactory/SplineC2COMPTarget.cpp)
     else(QMC_COMPLEX)
-      set(FERMION_SRCS ${FERMION_SRCS} BsplineFactory/createRealSingle.cpp BsplineFactory/createRealDouble.cpp
-        BsplineFactory/SplineC2R.cpp BsplineFactory/SplineC2RT.cpp BsplineFactory/SplineR2R.cpp BsplineFactory/SplineR2RT.cpp)
-      set(FERMION_OMPTARGET_SRCS ${FERMION_OMPTARGET_SRCS} BsplineFactory/SplineC2ROMPTarget.cpp)
+      set(FERMION_SRCS ${FERMION_SRCS}
+        BsplineFactory/createRealSingle.cpp
+        BsplineFactory/createRealDouble.cpp
+        BsplineFactory/SplineC2R.cpp
+        BsplineFactory/SplineR2R.cpp)
+      set(FERMION_OMPTARGET_SRCS ${FERMION_OMPTARGET_SRCS}
+        BsplineFactory/SplineC2ROMPTarget.cpp)
     endif(QMC_COMPLEX)
 
   endif(HAVE_EINSPLINE)
diff --git a/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp b/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
new file mode 100644
index 0000000000..f48ea6348a
--- /dev/null
+++ b/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
@@ -0,0 +1,1815 @@
+
+#include "QMCWaveFunctions/EinsplineSetBuilderT.h"
+
+#include "CPU/SIMD/vmath.hpp"
+#include "CPU/e2iphi.h"
+#include "CPU/math.hpp"
+#include "Message/CommOperators.h"
+#include "Message/Communicate.h"
+#include "OhmmsData/AttributeSet.h"
+#include "Particle/DistanceTableT.h"
+#include "ParticleBase/RandomSeqGenerator.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineReaderBaseT.h"
+#include "QMCWaveFunctions/BsplineFactory/BsplineSetT.h"
+#include "QMCWaveFunctions/BsplineFactory/createBsplineReaderT.h"
+#include "QMCWaveFunctions/WaveFunctionComponentBuilder.h"
+#include "QMCWaveFunctions/einspline_helper.hpp"
+#include "Utilities/ProgressReportEngine.h"
+#include "Utilities/Timer.h"
+#include "Utilities/qmc_common.h"
+#include <Message/UniformCommunicateError.h>
+#include <PlatformSelector.hpp>
+#include <fftw3.h>
+
+#include <array>
+#include <string_view>
+#include <vector>
+
+namespace qmcplusplus
+{
+// std::map<H5OrbSet,SPOSet*,H5OrbSet>  EinsplineSetBuilder::SPOSetMap;
+// std::map<TinyVector<int,4>,EinsplineSetBuilder::OrbType*,Int4less>
+// EinsplineSetBuilder::OrbitalMap;
+////std::map<H5OrbSet,multi_UBspline_3d_z*,H5OrbSet>
+/// EinsplineSetBuilder::ExtendedMap_z;
+////std::map<H5OrbSet,multi_UBspline_3d_d*,H5OrbSet>
+/// EinsplineSetBuilder::ExtendedMap_d;
+
+/// Builds a "spline" SPO set builder for the target particle set p.
+template <typename T>
+EinsplineSetBuilderT<T>::EinsplineSetBuilderT(ParticleSetT<T>& p,
+    const PSetMap& psets, Communicate* comm, xmlNodePtr cur) :
+    SPOSetBuilderT<T>("spline", comm),
+    ParticleSets(psets),
+    TargetPtcl(p),
+    XMLRoot(cur),
+    Format(QMCPACK),
+    NumBands(0),
+    NumElectrons(0),
+    NumSpins(0),
+    NumTwists(0),
+    MeshFactor(1.0),
+    MeshSize(0, 0, 0),
+    twist_num_(-1),
+    LastSpinSet(-1),
+    NumOrbitalsRead(-1),
+    makeRotations(false)
+{
+    this->ClassName = "EinsplineSetBuilder";
+    MatchingTol = 10 * std::numeric_limits<float>::epsilon();
+
+    // Start from an all-zero tiling matrix.
+    for (int row = 0; row < 3; ++row)
+        for (int col = 0; col < 3; ++col)
+            TileMatrix(row, col) = 0;
+
+    // Discard the states set up by the base class; one slot per group.
+    this->states.clear();
+    this->states.resize(p.groups());
+    FullBands.resize(p.groups()); // one entry per group, created as nullptr
+}
+
+/// Component-wise round(x - 1e-6); the small shift sends x = n + 0.5 to n.
+template <typename T>
+inline TinyVector<T, 3> IntPart(const TinyVector<T, 3>& twist)
+{
+    return TinyVector<T, 3>(round(twist[0] - 1.0e-6),
+        round(twist[1] - 1.0e-6), round(twist[2] - 1.0e-6));
+}
+
+/// Remainder of twist after removing IntPart(twist), component by component.
+template <typename T>
+inline TinyVector<T, 3> FracPart(const TinyVector<T, 3>& twist)
+{
+    return twist - IntPart(twist);
+}
+
+/// Destructor; the body only invokes the DEBUG_MEMORY trace macro.
+template <typename T> EinsplineSetBuilderT<T>::~EinsplineSetBuilderT()
+{
+    DEBUG_MEMORY("EinsplineSetBuilder::~EinsplineSetBuilder");
+}
+
+/// Verify that SuperLattice matches the lattice of the target particle set.
+/// @return false (after logging both lattices) when the largest relative
+///         element-wise mismatch exceeds MatchingTol, true otherwise.
+template <typename T>
+bool EinsplineSetBuilderT<T>::CheckLattice()
+{
+    const auto& target = TargetPtcl.getLattice().R;
+    double worst = 0.0;
+    for (int i = 0; i < OHMMS_DIM; i++)
+        for (int j = 0; j < OHMMS_DIM; j++) {
+            const double scale = std::max(std::abs(SuperLattice(i, j)),
+                static_cast<double>(std::abs(target(i, j))));
+            if (!(scale > MatchingTol))
+                continue; // both entries negligible: no relative error
+            worst = std::max(
+                worst, std::abs(SuperLattice(i, j) - target(i, j)) / scale);
+        }
+
+    if (!(worst > MatchingTol))
+        return true;
+
+    // Mismatch: report both lattices, their difference and the tolerance.
+    std::ostringstream o;
+    o.setf(std::ios::scientific, std::ios::floatfield);
+    o.precision(6);
+    o << "EinsplineSetBuilder::ReadOrbitalInfo_ESHDF \n"
+      << "Mismatched supercell lattices.\n"
+      << " Lattice in ESHDF5 " << std::endl << SuperLattice << std::endl
+      << " Lattice in xml" << std::endl << target << std::endl
+      << " Difference " << std::endl << SuperLattice - target << std::endl
+      << " Max relative error = " << worst << std::endl
+      << " Tolerance      = " << MatchingTol << std::endl;
+    app_error() << o.str();
+    return false;
+}
+
+/// Share the orbital-file metadata unpacked from rank 0 with all other ranks.
+template <typename T>
+void EinsplineSetBuilderT<T>::BroadcastOrbitalInfo()
+{
+    if (this->myComm->size() == 1)
+        return; // single rank: nothing to share
+    int numIons = IonTypes.size();
+    int numDensityGvecs = TargetPtcl.DensityReducedGvecs.size();
+    PooledData<double> abuffer;
+    PooledData<int> aibuffer;
+    aibuffer.add(Version.begin(), Version.end()); // myComm->bcast(Version);
+    aibuffer.add(Format);
+    abuffer.add(Lattice.begin(), Lattice.end()); // myComm->bcast(Lattice);
+    abuffer.add(RecipLattice.begin(),
+        RecipLattice.end()); // myComm->bcast(RecipLattice);
+    abuffer.add(SuperLattice.begin(),
+        SuperLattice.end()); // myComm->bcast(SuperLattice);
+    abuffer.add(
+        LatticeInv.begin(), LatticeInv.end()); // myComm->bcast(LatticeInv);
+    aibuffer.add(NumBands); // myComm->bcast(NumBands);
+    aibuffer.add(NumElectrons); // myComm->bcast(NumElectrons);
+    aibuffer.add(NumSpins); // myComm->bcast(NumSpins);
+    aibuffer.add(NumTwists); // myComm->bcast(NumTwists);
+    aibuffer.add(numIons); // myComm->bcast(numIons);
+    aibuffer.add(numDensityGvecs);
+    aibuffer.add(HaveOrbDerivs);
+    this->myComm->bcast(abuffer);
+    this->myComm->bcast(aibuffer);
+    if (this->myComm->rank()) { // non-zero ranks unpack in packing order
+        abuffer.rewind();
+        aibuffer.rewind();
+        aibuffer.get(Version.begin(), Version.end());
+        aibuffer.get(Format);
+        abuffer.get(Lattice.begin(), Lattice.end());
+        abuffer.get(RecipLattice.begin(), RecipLattice.end());
+        abuffer.get(SuperLattice.begin(), SuperLattice.end());
+        abuffer.get(LatticeInv.begin(), LatticeInv.end());
+        aibuffer.get(NumBands);
+        aibuffer.get(NumElectrons);
+        aibuffer.get(NumSpins);
+        aibuffer.get(NumTwists);
+        aibuffer.get(numIons);
+        aibuffer.get(numDensityGvecs);
+        aibuffer.get(HaveOrbDerivs);
+        TargetPtcl.DensityReducedGvecs.resize(numDensityGvecs);
+        TargetPtcl.Density_G.resize(numDensityGvecs);
+    }
+    if (IonTypes.size() != numIons) {
+        IonTypes.resize(numIons);
+        IonPos.resize(numIons);
+    }
+    // second round: arrays whose sizes were fixed by the counts sent above
+    PooledData<double> bbuffer;
+    PooledData<int> bibuffer;
+    for (int i = 0; i < numIons; ++i)
+        bibuffer.add(IonTypes[i]);
+    // myComm->bcast(IonTypes);
+    bbuffer.add(&IonPos[0][0], &IonPos[0][0] + OHMMS_DIM * numIons);
+    // myComm->bcast(IonPos);
+    if (primcell_kpoints.size() != NumTwists)
+        primcell_kpoints.resize(NumTwists);
+    bbuffer.add(&primcell_kpoints[0][0],
+        &primcell_kpoints[0][0] + OHMMS_DIM * NumTwists);
+    bibuffer.add(&(TargetPtcl.DensityReducedGvecs[0][0]),
+        &(TargetPtcl.DensityReducedGvecs[0][0]) + numDensityGvecs * OHMMS_DIM);
+    bbuffer.add(&(TargetPtcl.Density_G[0]),
+        &(TargetPtcl.Density_G[0]) + numDensityGvecs);
+    this->myComm->bcast(bbuffer);
+    this->myComm->bcast(bibuffer);
+    if (this->myComm->rank()) {
+        bbuffer.rewind();
+        bibuffer.rewind();
+        for (int i = 0; i < numIons; ++i)
+            bibuffer.get(IonTypes[i]);
+        bbuffer.get(&IonPos[0][0], &IonPos[0][0] + OHMMS_DIM * numIons);
+        bbuffer.get(&primcell_kpoints[0][0],
+            &primcell_kpoints[0][0] + OHMMS_DIM * NumTwists);
+        bibuffer.get(&(TargetPtcl.DensityReducedGvecs[0][0]),
+            &(TargetPtcl.DensityReducedGvecs[0][0]) +
+                numDensityGvecs * OHMMS_DIM);
+        bbuffer.get(&(TargetPtcl.Density_G[0]),
+            &(TargetPtcl.Density_G[0]) + numDensityGvecs);
+    }
+    // buffer to bcast hybrid representation atomic orbital info
+    PooledData<double> cbuffer;
+    PooledData<int> cibuffer;
+    this->myComm->bcast(cbuffer); // NOTE(review): still empty here; redundant?
+    this->myComm->bcast(cibuffer); // NOTE(review): same as above - confirm
+    AtomicCentersInfo.resize(numIons);
+    Super2Prim.resize(SourcePtcl->R.size());
+    cbuffer.add(AtomicCentersInfo.inner_cutoff.begin(),
+        AtomicCentersInfo.inner_cutoff.end());
+    cbuffer.add(AtomicCentersInfo.non_overlapping_radius.begin(),
+        AtomicCentersInfo.non_overlapping_radius.end());
+    cbuffer.add(
+        AtomicCentersInfo.cutoff.begin(), AtomicCentersInfo.cutoff.end());
+    cbuffer.add(AtomicCentersInfo.spline_radius.begin(),
+        AtomicCentersInfo.spline_radius.end());
+    cibuffer.add(Super2Prim.begin(), Super2Prim.end());
+    cibuffer.add(AtomicCentersInfo.lmax.begin(), AtomicCentersInfo.lmax.end());
+    cibuffer.add(
+        AtomicCentersInfo.GroupID.begin(), AtomicCentersInfo.GroupID.end());
+    cibuffer.add(AtomicCentersInfo.spline_npoints.begin(),
+        AtomicCentersInfo.spline_npoints.end());
+    this->myComm->bcast(cbuffer);
+    this->myComm->bcast(cibuffer);
+    if (this->myComm->rank()) {
+        cbuffer.rewind();
+        cibuffer.rewind();
+        cbuffer.get(AtomicCentersInfo.inner_cutoff.begin(),
+            AtomicCentersInfo.inner_cutoff.end());
+        cbuffer.get(AtomicCentersInfo.non_overlapping_radius.begin(),
+            AtomicCentersInfo.non_overlapping_radius.end());
+        cbuffer.get(
+            AtomicCentersInfo.cutoff.begin(), AtomicCentersInfo.cutoff.end());
+        cbuffer.get(AtomicCentersInfo.spline_radius.begin(),
+            AtomicCentersInfo.spline_radius.end());
+        cibuffer.get(Super2Prim.begin(), Super2Prim.end());
+        cibuffer.get(
+            AtomicCentersInfo.lmax.begin(), AtomicCentersInfo.lmax.end());
+        cibuffer.get(
+            AtomicCentersInfo.GroupID.begin(), AtomicCentersInfo.GroupID.end());
+        cibuffer.get(AtomicCentersInfo.spline_npoints.begin(),
+            AtomicCentersInfo.spline_npoints.end());
+        for (int i = 0; i < numIons; i++)
+            AtomicCentersInfo.ion_pos[i] = IonPos[i];
+    }
+}
+
+////////////////////////////////////////////////////////////////
+//// Create the ion ParticleSet from the data in the HDF file //
+////////////////////////////////////////////////////////////////
+// void
+// EinsplineSetBuilder::CreateIonParticleSet( std::string sourceName)
+//{
+//   //    ParticleSet &pTemp = *(new MCWalkerConfiguration);
+//   ParticleSet &pTemp = *(new ParticleSet);
+//   pTemp.setName (sourceName);
+//   SpeciesSet& tspecies(pTemp.getSpeciesSet());
+//   ParticleSets[sourceName] = &pTemp;
+// }
+//
+
+/// Set the primitive lattice of SourcePtcl and refresh IonPos/IonTypes from it.
+template <typename T>
+void EinsplineSetBuilderT<T>::TileIons()
+{
+    // set the primitive lattice
+    SourcePtcl->getPrimitiveLattice().set(Lattice);
+    // replace each stored position by FracPart of its toUnit() image
+    for (int j = 0; j < IonPos.size(); ++j)
+        IonPos[j] =
+            FracPart(SourcePtcl->getPrimitiveLattice().toUnit(IonPos[j]));
+    // NOTE(review): the std::copy below appears to overwrite these values.
+    IonPos.resize(SourcePtcl->getTotalNum());
+    IonTypes.resize(SourcePtcl->getTotalNum());
+    std::copy(SourcePtcl->R.begin(), SourcePtcl->R.end(), IonPos.begin());
+    std::copy(SourcePtcl->GroupID.begin(), SourcePtcl->GroupID.end(),
+        IonTypes.begin());
+
+    // app_log() << "  Primitive Cell\n";
+    // SourcePtcl->getPrimitiveLattice().print(app_log());
+    // app_log() << "  Super Cell\n";
+    // SourcePtcl->Lattice.print(app_log());
+
+    // Don't need to do this, already done by ParticleSetPool.cpp
+    //   Vector<TinyVector<double, OHMMS_DIM> > primPos   = IonPos;
+    //   Vector<int>                            primTypes = IonTypes;
+    //   int numCopies = std::abs(det(TileMatrix));
+    //   IonTypes.resize(primPos.size()*numCopies);
+    //   IonPos.resize  (primPos.size()*numCopies);
+    //   int maxCopies = 10;
+    //   using Vec3 = TinyVector<double,3>;
+    //   int index=0;
+    //   for (int i0=-maxCopies; i0<=maxCopies; i0++)
+    //     for (int i1=-maxCopies; i1<=maxCopies; i1++)
+    //       for (int i2=-maxCopies; i2<=maxCopies; i2++)
+    //         for (int iat=0; iat < primPos.size(); iat++)
+    //         {
+    //           Vec3 r     = primPos[iat];
+    //           Vec3 uPrim = PrimCell.toUnit(r);
+    //           for (int i=0; i<3; i++)
+    //             uPrim[i] -= std::floor(uPrim[i]);
+    //           r = PrimCell.toCart(uPrim) + (double)i0*PrimCell.a(0) +
+    //               (double)i1*PrimCell.a(1) + (double)i2*PrimCell.a(2);
+    //           Vec3 uSuper = SuperCell.toUnit(r);
+    //           if ((uSuper[0] >= -1.0e-4) && (uSuper[0] < 0.9999) &&
+    //               (uSuper[1] >= -1.0e-4) && (uSuper[1] < 0.9999) &&
+    //               (uSuper[2] >= -1.0e-4) && (uSuper[2] < 0.9999))
+    //           {
+    //             IonPos[index]= r;
+    //             IonTypes[index]= primTypes[iat];
+    //             index++;
+    //           }
+    //         }
+    //   if (index != primPos.size()*numCopies)
+    //   {
+    //     app_error() << "The number of tiled ions, " << IonPos.size()
+    //                 << ", does not match the expected number of "
+    //                 << primPos.size()*numCopies << " or the index "<< index
+    //                 <<".  Aborting.\n";
+    //     APP_ABORT("EinsplineSetBuilder::TileIons()");
+    //   }
+    //   if (myComm->rank() == 0)
+    //   {
+    //     char buf[1000];
+    //     snprintf (buf, 1000, "Supercell reduced ion positions = \n");
+    //     app_log() << buf;
+    //     app_log().flush();
+    //     for (int i=0; i<IonPos.size(); i++)
+    //     {
+    //       PosType u = SuperCell.toUnit(IonPos[i]);
+    //       char buf2[1000];
+    //       snprintf (buf2, 1000, "   %14.10f %14.10f %14.10f\n",
+    //                u[0], u[1], u[2]);
+    //       app_log() << buf2;
+    //       app_log().flush();
+    //       //		 IonPos[i][0], IonPos[i][1], IonPos[i][2]);
+    //     }
+    //   }
+}
+
+/// Test whether twists a and b add up to an integer vector, i.e. every
+/// component of a + b lies within MatchingTol of its nearest integer.
+template <typename T>
+bool EinsplineSetBuilderT<T>::TwistPair(PosType a, PosType b) const
+{
+    for (int dim = 0; dim < OHMMS_DIM; ++dim) {
+        const double sum = a[dim] + b[dim];
+        if (std::abs(sum - round(sum)) > MatchingTol)
+            return false;
+    }
+    return true;
+}
+
+/// Group the primitive-cell k-points by the supercell twist they fold onto,
+/// pick the twist to use (twist_inp if given, else twist_num_inp, else Gamma)
+/// and fill twist_num_, IncludeTwists, DistinctTwists, MakeTwoCopies and
+/// use_real_splines_. Throws UniformCommunicateError on invalid twist input.
+template <typename T>
+void EinsplineSetBuilderT<T>::AnalyzeTwists2(
+    const int twist_num_inp, const TinyVector<double, OHMMS_DIM>& twist_inp)
+{
+    Tensor<double, 3> S;
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            S(i, j) = (double)TileMatrix(i, j);
+
+    const int num_prim_kpoints = primcell_kpoints.size();
+
+    // build a list of unique super twists that all the primitive cell k-points
+    // correspond to.
+    std::vector<PosType> superFracs; // fractional super twist coordinates
+    // superIndex[ki]: index into superFracs of the twist of k-point ki
+    std::vector<int> superIndex;
+    {
+        // scan all the primitive cell k-points
+        for (int ki = 0; ki < num_prim_kpoints; ki++) {
+            PosType primTwist = primcell_kpoints[ki];
+            PosType superTwist = dot(S, primTwist);
+            PosType kp = PrimCell.k_cart(primTwist);
+            PosType ks = SuperCell.k_cart(superTwist);
+            // check the consistency of tiling, primitive and super cells.
+            if (dot(ks - kp, ks - kp) > 1.0e-6) {
+                app_error() << "Primitive and super k-points do not agree.  "
+                               "Error in coding.\n";
+                APP_ABORT("EinsplineSetBuilder::AnalyzeTwists2");
+            }
+            PosType frac = FracPart(superTwist);
+            // verify if the super twist that corresponds to this primitive cell
+            // k-point exists in the unique list or not.
+            bool found = false;
+            for (int j = 0; j < superFracs.size(); j++) {
+                PosType diff = frac - superFracs[j];
+                if (dot(diff, diff) < 1.0e-6) {
+                    found = true;
+                    superIndex.push_back(j);
+                    break; // record each k-point once, at its first match
+                }
+            }
+            if (!found) {
+                superIndex.push_back(superFracs.size());
+                superFracs.push_back(frac);
+            }
+        }
+        assert(superIndex.size() == num_prim_kpoints);
+    }
+
+    const int numSuperTwists = superFracs.size();
+    {
+        app_log() << "Found " << numSuperTwists << " distinct supercell twist"
+                  << (numSuperTwists > 1 ? "s" : "") << " based on "
+                  << num_prim_kpoints << " primitive cell k-point"
+                  << (num_prim_kpoints > 1 ? "s" : "") << std::endl;
+        if (this->myComm->rank() == 0) {
+            for (int si = 0; si < numSuperTwists; si++) {
+                std::array<char, 1000> buf;
+                int length = std::snprintf(buf.data(), buf.size(),
+                    "Super twist #%d:  [ %9.5f %9.5f %9.5f ]\n", si,
+                    superFracs[si][0], superFracs[si][1], superFracs[si][2]);
+                if (length < 0)
+                    throw std::runtime_error(
+                        "Error converting Super twist to a string");
+                app_log() << std::string_view(buf.data(), length);
+                app_log().flush();
+            }
+        }
+    }
+
+    // For each supercell twist, create a list of primitive twists which
+    // correspond to it.
+    std::vector<std::vector<int>> superSets;
+    {
+        superSets.resize(numSuperTwists);
+        for (int ki = 0; ki < num_prim_kpoints; ki++)
+            superSets[superIndex[ki]].push_back(ki);
+    }
+
+    { // look up a super cell twist and return its index in the unique list of
+      // super cell twists.
+        const auto find_twist =
+            [&](const TinyVector<double, OHMMS_DIM>& twist) {
+                int twist_num = -1;
+                PosType gtFrac = FracPart(twist);
+                float eps = 1e-5;
+                for (int si = 0; si < numSuperTwists; si++) {
+                    PosType locDiff = gtFrac - superFracs[si];
+                    if (dot(locDiff, locDiff) < eps)
+                        twist_num = si;
+                }
+
+                if (twist_num < 0) {
+                    std::array<char, 1000> buf;
+                    int length = std::snprintf(buf.data(), buf.size(),
+                        "AnalyzeTwists2. Input twist [ %9.5f %9.5f %9.5f] not "
+                        "found in the list of super twists above.\n",
+                        twist[0], twist[1], twist[2]);
+                    if (length < 0)
+                        throw std::runtime_error(
+                            "Error generating error message");
+                    throw UniformCommunicateError(buf.data());
+                }
+                return twist_num;
+            };
+
+        if (twist_inp[0] > TWIST_NO_INPUT || twist_inp[1] > TWIST_NO_INPUT ||
+            twist_inp[2] > TWIST_NO_INPUT) {
+            if (twist_num_inp != TWISTNUM_NO_INPUT)
+                app_warning()
+                    << "twist attribute exists. twistnum attribute ignored. "
+                       "To prevent this message, remove twistnum from input."
+                    << std::endl;
+
+            twist_num_ = find_twist(twist_inp);
+        }
+        else if (twist_num_inp != TWISTNUM_NO_INPUT) {
+            app_warning() << "twist attribute does't exist but twistnum "
+                             "attribute was found. "
+                          << "This is potentially ambiguous. Specifying twist "
+                             "attribute is preferred."
+                          << std::endl;
+            if (twist_num_inp < 0 || twist_num_inp >= numSuperTwists) {
+                std::ostringstream msg;
+                msg << "AnalyzeTwists2. twistnum input value " << twist_num_inp
+                    << " is outside the acceptable range [0, " << numSuperTwists
+                    << ")." << std::endl;
+                throw UniformCommunicateError(msg.str());
+            }
+            twist_num_ = twist_num_inp;
+        }
+        else {
+            app_log() << "twist attribte does't exist. Set Gamma point."
+                      << std::endl;
+            twist_num_ = find_twist({0, 0, 0});
+        }
+
+        assert(twist_num_ >= 0 && twist_num_ < numSuperTwists);
+
+        std::array<char, 1000> buf;
+        int length = std::snprintf(buf.data(), buf.size(),
+            "  Using supercell twist %d:  [ %9.5f %9.5f %9.5f]", twist_num_,
+            superFracs[twist_num_][0], superFracs[twist_num_][1],
+            superFracs[twist_num_][2]);
+        if (length < 0)
+            throw std::runtime_error(
+                "Error converting supercell twist to a string");
+        app_log() << std::string_view(buf.data(), length) << std::endl;
+    }
+
+    TargetPtcl.setTwist(superFracs[twist_num_]);
+#ifndef QMC_COMPLEX
+    // Check to see if supercell twist is okay to use with real wave
+    // functions
+    for (int dim = 0; dim < OHMMS_DIM; dim++) {
+        double t = 2.0 * superFracs[twist_num_][dim];
+        if (std::abs(t - round(t)) > MatchingTol * 100) {
+            app_error()
+                << "Cannot use this super twist with real wavefunctions.\n"
+                << "Please recompile with QMC_COMPLEX=1.\n";
+            APP_ABORT("EinsplineSetBuilder::AnalyzeTwists2");
+        }
+    }
+#endif
+    // Now check to see that each supercell twist has the right twists
+    // to tile the primitive cell orbitals.
+    const int numTwistsNeeded = std::abs(det(TileMatrix));
+    for (int si = 0; si < numSuperTwists; si++) {
+        // First make sure we have enough points
+        if (superSets[si].size() != numTwistsNeeded) {
+            std::array<char, 1000> buf;
+            int length = std::snprintf(buf.data(), buf.size(),
+                "Super twist %d should own %d k-points, but owns %d.\n", si,
+                numTwistsNeeded, static_cast<int>(superSets[si].size()));
+            if (length < 0)
+                throw std::runtime_error("Error generating Super twist string");
+            app_error() << std::string_view(buf.data(), length);
+            if (si == twist_num_) {
+                APP_ABORT("EinsplineSetBuilder::AnalyzeTwists2");
+            }
+            else
+                continue;
+        }
+        // Now, make sure they are all distinct
+        int N = superSets[si].size();
+        for (int i = 0; i < N; i++) {
+            PosType twistPrim_i = primcell_kpoints[superSets[si][i]];
+            PosType twistSuper_i = dot(S, twistPrim_i);
+            PosType superInt_i = IntPart(twistSuper_i);
+            for (int j = i + 1; j < N; j++) {
+                PosType twistPrim_j = primcell_kpoints[superSets[si][j]];
+                PosType twistSuper_j = dot(S, twistPrim_j);
+                PosType superInt_j = IntPart(twistSuper_j);
+                if (dot(superInt_i - superInt_j, superInt_i - superInt_j) <
+                    1.0e-6) {
+                    app_error()
+                        << "Identical k-points detected in super twist set "
+                        << si << std::endl;
+                    APP_ABORT_TRACE(__FILE__, __LINE__, "AnalyzeTwists2");
+                }
+            }
+        }
+    }
+    app_log().flush();
+    // Finally, record which k-points to include on this group of
+    // processors, which have been assigned supercell twist twist_num_
+    IncludeTwists.clear();
+    for (int i = 0; i < superSets[twist_num_].size(); i++)
+        IncludeTwists.push_back(superSets[twist_num_][i]);
+    // Now, find out which twists are distinct
+    DistinctTwists.clear();
+#ifndef QMC_COMPLEX
+    std::vector<int> copyTwists;
+    for (int i = 0; i < IncludeTwists.size(); i++) {
+        int ti = IncludeTwists[i];
+        PosType twist_i = primcell_kpoints[ti];
+        bool distinct = true;
+        for (int j = i + 1; j < IncludeTwists.size(); j++) {
+            int tj = IncludeTwists[j];
+            PosType twist_j = primcell_kpoints[tj];
+            if (TwistPair(twist_i, twist_j))
+                distinct = false;
+        }
+        if (distinct)
+            DistinctTwists.push_back(ti);
+        else
+            copyTwists.push_back(ti);
+    }
+    // Now determine which distinct twists require two copies
+    MakeTwoCopies.resize(DistinctTwists.size());
+    for (int i = 0; i < DistinctTwists.size(); i++) {
+        MakeTwoCopies[i] = false;
+        int ti = DistinctTwists[i];
+        PosType twist_i = primcell_kpoints[ti];
+        for (int j = 0; j < copyTwists.size(); j++) {
+            int tj = copyTwists[j];
+            PosType twist_j = primcell_kpoints[tj];
+            if (TwistPair(twist_i, twist_j))
+                MakeTwoCopies[i] = true;
+        }
+        if (this->myComm->rank() == 0) {
+            std::array<char, 1000> buf;
+            int length = std::snprintf(buf.data(), buf.size(),
+                "Using %d copies of twist angle [%6.3f, %6.3f, %6.3f]\n",
+                MakeTwoCopies[i] ? 2 : 1, twist_i[0], twist_i[1], twist_i[2]);
+            if (length < 0)
+                throw std::runtime_error("Error generating string");
+            app_log() << std::string_view(buf.data(), length);
+            app_log().flush();
+        }
+    }
+    // Find out if we can make real orbitals
+    use_real_splines_ = true;
+    for (int i = 0; i < DistinctTwists.size(); i++) {
+        int ti = DistinctTwists[i];
+        PosType twist = primcell_kpoints[ti];
+        for (int j = 0; j < OHMMS_DIM; j++)
+            if (std::abs(twist[j] - 0.0) > MatchingTol &&
+                std::abs(twist[j] - 0.5) > MatchingTol &&
+                std::abs(twist[j] + 0.5) > MatchingTol)
+                use_real_splines_ = false;
+    }
+    if (use_real_splines_ && (DistinctTwists.size() > 1)) {
+        app_log() << "***** Use of real orbitals is possible, but not "
+                     "currently implemented\n"
+                  << "      with more than one twist angle.\n";
+        use_real_splines_ = false;
+    }
+    if (use_real_splines_)
+        app_log() << "Using real splines.\n";
+    else
+        app_log() << "Using complex splines.\n";
+#else
+    DistinctTwists.resize(IncludeTwists.size());
+    MakeTwoCopies.resize(IncludeTwists.size());
+    for (int i = 0; i < IncludeTwists.size(); i++) {
+        DistinctTwists[i] = IncludeTwists[i];
+        MakeTwoCopies[i] = false;
+    }
+    use_real_splines_ = false;
+#endif
+}
+
+/// Select the bands to read for spin group @p spin (non-ESHDF files; ESHDF
+/// input is delegated to OccupyBands_ESHDF()). Only rank 0 does any work.
+/// Fills FullBands[spin] and sets NumDistinctOrbitals so that the leading
+/// bands, counting two-copy bands twice, cover at least @p numOrbs orbitals.
+template <typename T>
+void EinsplineSetBuilderT<T>::OccupyBands(
+    int spin, int sortBands, int numOrbs, bool skipChecks)
+{
+    if (this->myComm->rank() != 0)
+        return;
+    if (spin >= NumSpins && !skipChecks) {
+        app_error() << "To developer: User is requesting for orbitals in an "
+                       "invalid spin group "
+                    << spin << ". Current h5 file only contains spin groups "
+                    << "[0.." << NumSpins - 1 << "]." << std::endl;
+        app_error() << "To user: Orbital H5 file contains no spin down data "
+                       "and is appropriate only for spin unpolarized "
+                       "calculations. "
+                    << "If this is your intent, please replace 'spindataset=1' "
+                       "with 'spindataset=0' in the input file."
+                    << std::endl;
+        abort();
+    }
+    if (Format == ESHDF) {
+        OccupyBands_ESHDF(spin, sortBands, numOrbs);
+        return;
+    }
+    std::string eigenstatesGroup;
+    if (Version[0] == 0 && Version[1] == 11)
+        eigenstatesGroup = "/eigenstates_3";
+    else if (Version[0] == 0 && Version[1] == 20)
+        eigenstatesGroup = "/eigenstates";
+
+    if (FullBands[spin]->size()) {
+        app_log() << "  FullBand[" << spin << "] exists. Reuse it. "
+                  << std::endl;
+        return;
+    }
+
+    std::vector<BandInfo>& SortBands(*FullBands[spin]);
+    SortBands.clear();
+    for (int ti = 0; ti < DistinctTwists.size(); ti++) {
+        int tindex = DistinctTwists[ti];
+        // First, read valence states
+        for (int bi = 0; bi < NumBands; bi++) {
+            BandInfo band;
+            band.TwistIndex = tindex;
+            band.BandIndex = bi;
+            band.MakeTwoCopies = MakeTwoCopies[ti];
+            // HDF5 group of this band; its layout depends on the file version
+            // and on how many twists and bands the file holds.
+            std::ostringstream bandPath;
+            bandPath << eigenstatesGroup;
+            if ((Version[0] == 0 && Version[1] == 11) || NumTwists > 1)
+                bandPath << "/twist_" << tindex << "/band_" << bi;
+            else if (NumBands > 1)
+                bandPath << "/twist/band_" << bi;
+            else
+                bandPath << "/twist/band";
+            // Sentinel: the band is kept only if read() stores a larger value.
+            band.Energy = -1.01e100;
+            H5File.read(band.Energy, bandPath.str() + "/eigenvalue");
+            if (band.Energy > -1.0e100) {
+                H5File.read(band.Spin, bandPath.str() + "/spin");
+                if (band.Spin == spin)
+                    SortBands.push_back(band);
+            }
+        }
+    }
+    // Walk the band list until enough orbitals are covered; never past its end.
+    int orbIndex = 0;
+    int numOrbs_counter = 0;
+    while (numOrbs_counter < numOrbs) {
+        if (orbIndex >= static_cast<int>(SortBands.size())) {
+            app_error() << "OccupyBands: " << numOrbs
+                        << " orbitals requested, but the " << SortBands.size()
+                        << " available bands do not provide that many.\n";
+            APP_ABORT("EinsplineSetBuilder::OccupyBands");
+        }
+        numOrbs_counter += SortBands[orbIndex].MakeTwoCopies ? 2 : 1;
+        orbIndex++;
+    }
+    NumDistinctOrbitals = orbIndex;
+    app_log() << "We will read " << NumDistinctOrbitals
+              << " distinct orbitals.\n";
+}
+
+/// Broadcast the band list FullBands[spin] so every rank holds the same data.
+/// @param n    orbital count; after the broadcast of (band count, n) both n
+///             and NumDistinctOrbitals take the received value
+/// @param root true on the rank that packs its list; all others unpack
+template <typename T>
+void EinsplineSetBuilderT<T>::bcastSortBands(int spin, int n, bool root)
+{
+    std::vector<BandInfo>& SortBands(*FullBands[spin]);
+
+    TinyVector<int, 2> nbands(int(SortBands.size()), n);
+    mpi::bcast(*this->myComm, nbands);
+
+    // flat buffer holding four serialized values per band
+    PooledData<OHMMS_PRECISION_FULL> misc(nbands[0] * 4);
+    n = NumDistinctOrbitals = nbands[1];
+
+    // (de)serialize one BandInfo; both use the same field order
+    auto pack = [&misc](BandInfo& band) {
+        misc.put(band.TwistIndex);
+        misc.put(band.BandIndex);
+        misc.put(band.Energy);
+        misc.put(band.MakeTwoCopies);
+    };
+    auto unpack = [&misc](BandInfo& band) {
+        misc.get(band.TwistIndex);
+        misc.get(band.BandIndex);
+        misc.get(band.Energy);
+        misc.get(band.MakeTwoCopies);
+    };
+
+    if (root) {
+        misc.rewind();
+        for (int i = 0; i < n; ++i)
+            pack(SortBands[i]);
+        for (int i = n; i < SortBands.size(); ++i)
+            pack(SortBands[i]);
+    }
+    this->myComm->bcast(misc);
+
+    if (!root) {
+        SortBands.resize(nbands[0]);
+        misc.rewind();
+        for (int i = 0; i < n; ++i)
+            unpack(SortBands[i]);
+        for (int i = n; i < SortBands.size(); ++i)
+            unpack(SortBands[i]);
+    }
+}
+
+/// Ordering for BandInfo: by band index, then by energy, with energies
+/// closer than 1e-6 treated as equal and ordered by twist index instead.
+inline bool sortByIndex(BandInfo leftB, BandInfo rightB)
+{
+    if (leftB.BandIndex != rightB.BandIndex)
+        return leftB.BandIndex < rightB.BandIndex;
+
+    const bool same_energy = (leftB.Energy < rightB.Energy + 1e-6) &&
+        (leftB.Energy > rightB.Energy - 1e-6);
+    if (same_energy)
+        return leftB.TwistIndex < rightB.TwistIndex;
+    return leftB.Energy < rightB.Energy;
+}
+
+// Reads the metadata of an ESHDF orbital file from H5File: version, lattice,
+// band/spin/twist counts, ion species and positions, and twist angles. On
+// first use it also builds Super2Prim and AtomicCentersInfo, and for bulk
+// cells with qmc_common.use_density set it loads the density and VHXC.
+// skipChecks: log ion-matching inconsistencies instead of aborting on them.
+// Returns true; throws std::runtime_error if CheckLattice() fails.
+template <typename T>
+bool
+EinsplineSetBuilderT<T>::ReadOrbitalInfo_ESHDF(bool skipChecks)
+{
+    app_log() << "  Reading orbital file in ESHDF format.\n";
+    H5File.read(Version, "/version");
+    app_log() << "  ESHDF orbital file version " << Version[0] << "."
+              << Version[1] << "." << Version[2] << std::endl;
+    H5File.read(Lattice, "/supercell/primitive_vectors");
+    RecipLattice = 2.0 * M_PI * inverse(Lattice);
+    SuperLattice = dot(TileMatrix, Lattice);
+    std::array<char, 1000> buff;
+    int length = std::snprintf(buff.data(), buff.size(),
+        "  Lattice = \n    [ %9.6f %9.6f %9.6f\n"
+        "      %9.6f %9.6f %9.6f\n"
+        "      %9.6f %9.6f %9.6f ]\n",
+        Lattice(0, 0), Lattice(0, 1), Lattice(0, 2), Lattice(1, 0),
+        Lattice(1, 1), Lattice(1, 2), Lattice(2, 0), Lattice(2, 1),
+        Lattice(2, 2));
+    if (length < 0)
+        throw std::runtime_error("Error converting lattice to a string");
+    app_log() << std::string_view(buff.data(), length);
+    length = std::snprintf(buff.data(), buff.size(),
+        "  SuperLattice = \n    [ %9.6f %9.6f %9.6f\n"
+        "      %9.6f %9.6f %9.6f\n"
+        "      %9.6f %9.6f %9.6f ]\n",
+        SuperLattice(0, 0), SuperLattice(0, 1), SuperLattice(0, 2),
+        SuperLattice(1, 0), SuperLattice(1, 1), SuperLattice(1, 2),
+        SuperLattice(2, 0), SuperLattice(2, 1), SuperLattice(2, 2));
+    if (length < 0)
+        throw std::runtime_error("Error converting SuperLattice to a string");
+    app_log() << std::string_view(buff.data(), length) << std::endl;
+    if (!CheckLattice())
+        throw std::runtime_error("CheckLattice failed");
+    PrimCell.set(Lattice);
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++)
+            LatticeInv(i, j) = RecipLattice(i, j) / (2.0 * M_PI);
+    int have_dpsi = false;
+    NumTwists = NumSpins = NumBands = 0;
+    NumElectrons = TargetPtcl.getTotalNum();
+    H5File.read(NumBands, "/electrons/kpoint_0/spin_0/number_of_states");
+    H5File.readEntry(NumSpins, "/electrons/number_of_spins");
+    H5File.read(NumTwists, "/electrons/number_of_kpoints");
+    H5File.readEntry(have_dpsi, "/electrons/have_dpsi");
+    HaveOrbDerivs = have_dpsi;
+    app_log() << "bands=" << NumBands << ", elecs=" << NumElectrons
+              << ", spins=" << NumSpins << ", twists=" << NumTwists
+              << std::endl;
+    //////////////////////////////////
+    // Read ion types and locations //
+    //////////////////////////////////
+    Vector<int> species_ids;
+    H5File.read(species_ids, "/atoms/species_ids");
+    int num_species;
+    H5File.read(num_species, "/atoms/number_of_species");
+    std::vector<int> atomic_numbers(num_species);
+    for (int isp = 0; isp < num_species; isp++) {
+        std::ostringstream name;
+        name << "/atoms/species_" << isp << "/atomic_number";
+        H5File.readEntry(atomic_numbers[isp], name.str());
+    }
+    IonTypes.resize(species_ids.size());
+    for (int i = 0; i < species_ids.size(); i++)
+        IonTypes[i] = atomic_numbers[species_ids[i]];
+    H5File.read(IonPos, "/atoms/positions");
+    for (int i = 0; i < IonTypes.size(); i++)
+        app_log() << "Atom type(" << i << ") = " << IonTypes[i] << std::endl;
+    /////////////////////////////////////
+    // Read atom orbital info from xml //
+    /////////////////////////////////////
+    // construct Super2Prim mapping.
+    if (Super2Prim.size() == 0) {
+        // SourcePtcl->convert2Cart(SourcePtcl->R);
+        Super2Prim.resize(SourcePtcl->R.size(), -1);
+        std::vector<int> prim_atom_counts;
+        prim_atom_counts.resize(IonPos.size(), 0);
+        for (int i = 0; i < SourcePtcl->R.size(); i++) {
+            PosType ref = PrimCell.toUnit_floor(SourcePtcl->R[i]);
+            for (int j = 0; j < IonPos.size(); j++) {
+                PosType dr = PrimCell.toUnit_floor(IonPos[j]) - ref;
+                for (int k = 0; k < OHMMS_DIM; k++)
+                    dr[k] -= round(dr[k]);
+                if (dot(dr, dr) < MatchingTol) {
+                    if (Super2Prim[i] < 0) {
+                        Super2Prim[i] = j;
+                        prim_atom_counts[j]++;
+                    }
+                    else {
+                        app_error()
+                            << "Supercell ion " << i << " at "
+                            << SourcePtcl->R[i]
+                            << " was found twice in the primitive cell as ion "
+                            << Super2Prim[i] << " and " << j << std::endl;
+                        if (!skipChecks)
+                            abort();
+                    }
+                }
+            }
+            if (Super2Prim[i] < 0) {
+                app_error() << "Supercell ion " << i
+                            << " not found in the primitive cell" << std::endl;
+                if (!skipChecks)
+                    abort();
+            }
+        }
+        const int tiling_size = std::abs(det(TileMatrix));
+        for (int i = 0; i < IonPos.size(); i++)
+            if (prim_atom_counts[i] != tiling_size) {
+                app_error() << "Primitive cell ion " << i << " was found only "
+                            << prim_atom_counts[i]
+                            << " times in the supercell rather than "
+                            << tiling_size << std::endl;
+                if (!skipChecks)
+                    abort();
+            }
+        // construct AtomicCentersInfo
+        AtomicCentersInfo.resize(IonPos.size());
+        for (int i = 0; i < IonPos.size(); i++)
+            AtomicCentersInfo.ion_pos[i] = IonPos[i];
+        const auto& source_species = SourcePtcl->getSpeciesSet();
+        int Zind = source_species.findAttribute("atomicnumber");
+        const int table_id = SourcePtcl->addTable(*SourcePtcl);
+        const auto& ii_table = SourcePtcl->getDistTable(table_id);
+        SourcePtcl->update(true);
+        for (int i = 0; i < IonPos.size(); i++) {
+            AtomicCentersInfo.non_overlapping_radius[i] =
+                std::numeric_limits<RealType>::max();
+            // get_first_neighbor should only be called to set
+            // non_overlapping_radius if there is more than one atom in the cell
+            if (Super2Prim.size() == 1)
+                continue;
+            for (int j = 0; j < Super2Prim.size(); j++)
+                if (Super2Prim[j] == i) {
+                    // set GroupID for each ion in primitive cell
+                    if ((Zind < 0) ||
+                        (source_species(Zind, SourcePtcl->GroupID[j]) ==
+                            IonTypes[i]))
+                        AtomicCentersInfo.GroupID[i] = SourcePtcl->GroupID[j];
+                    else {
+                        app_error()
+                            << "Primitive cell ion " << i
+                            << " vs supercell ion " << j
+                            << " atomic number not matching: " << IonTypes[i]
+                            << " vs "
+                            << source_species(Zind, SourcePtcl->GroupID[j])
+                            << std::endl;
+                        if (!skipChecks)
+                            abort();
+                    }
+                    // set non_overlapping_radius for each ion in primitive cell
+                    RealType r(0);
+                    PosType dr;
+                    ii_table.get_first_neighbor(j, r, dr, false);
+                    if (r < 1e-3)
+                        APP_ABORT("EinsplineSetBuilder::ReadOrbitalInfo_ESHDF "
+                                  "too close ions <1e-3 bohr!");
+                    AtomicCentersInfo.non_overlapping_radius[i] = 0.5 * r;
+                    break;
+                }
+        }
+
+        // load cutoff_radius, spline_radius, spline_npoints, lmax if exists.
+        const int inner_cutoff_ind =
+            source_species.findAttribute("inner_cutoff");
+        const int cutoff_radius_ind =
+            source_species.findAttribute("cutoff_radius");
+        const int spline_radius_ind =
+            source_species.findAttribute("spline_radius");
+        const int spline_npoints_ind =
+            source_species.findAttribute("spline_npoints");
+        const int lmax_ind = source_species.findAttribute("lmax");
+
+        for (int center_idx = 0; center_idx < AtomicCentersInfo.Ncenters;
+             center_idx++) {
+            const int my_GroupID = AtomicCentersInfo.GroupID[center_idx];
+            if (inner_cutoff_ind >= 0)
+                AtomicCentersInfo.inner_cutoff[center_idx] =
+                    source_species(inner_cutoff_ind, my_GroupID);
+            if (cutoff_radius_ind >= 0)
+                AtomicCentersInfo.cutoff[center_idx] =
+                    source_species(cutoff_radius_ind, my_GroupID);
+            if (spline_radius_ind >= 0)
+                AtomicCentersInfo.spline_radius[center_idx] =
+                    source_species(spline_radius_ind, my_GroupID);
+            if (spline_npoints_ind >= 0)
+                AtomicCentersInfo.spline_npoints[center_idx] =
+                    source_species(spline_npoints_ind, my_GroupID);
+            if (lmax_ind >= 0)
+                AtomicCentersInfo.lmax[center_idx] =
+                    source_species(lmax_ind, my_GroupID);
+        }
+    }
+    ///////////////////////////
+    // Read the twist angles //
+    ///////////////////////////
+    primcell_kpoints.resize(NumTwists);
+    for (int ti = 0; ti < NumTwists; ti++) {
+        std::ostringstream path;
+        path << "/electrons/kpoint_" << ti << "/reduced_k";
+        TinyVector<double, OHMMS_DIM> primcell_kpoints_DP;
+        H5File.read(primcell_kpoints_DP, path.str());
+        primcell_kpoints[ti] = primcell_kpoints_DP;
+    }
+    if (qmc_common.use_density) {
+        //////////////////////////////////////////////////////////
+        // Only for bulk cells: if the density has not been set //
+        // in TargetPtcl and it is available in the file, read  //
+        // it in and save it in TargetPtcl.                     //
+        //////////////////////////////////////////////////////////
+        if (TargetPtcl.getLattice().SuperCellEnum == SUPERCELL_BULK) {
+            // FIXME:  add support for more than one spin density
+            if (TargetPtcl.Density_G.empty()) {
+                Array<double, OHMMS_DIM> Density_r_DP;
+                H5File.read(TargetPtcl.DensityReducedGvecs,
+                    "/electrons/density/gvectors");
+                int numG = TargetPtcl.DensityReducedGvecs.size();
+// Convert primitive G-vectors to supercell G-vectors
+// Also, flip sign since ESHDF format uses opposite sign convention
+#pragma omp parallel for
+                for (int iG = 0; iG < numG; iG++)
+                    TargetPtcl.DensityReducedGvecs[iG] = -1 *
+                        dot(TileMatrix, TargetPtcl.DensityReducedGvecs[iG]);
+                app_log() << "  Read " << numG << " density G-vectors.\n";
+                for (int ispin = 0; ispin < NumSpins; ispin++) {
+                    std::ostringstream density_r_path, density_g_path;
+                    density_r_path << "/electrons/density/spin_" << ispin
+                                   << "/density_r";
+                    density_g_path << "/electrons/density/spin_" << ispin
+                                   << "/density_g";
+                    H5File.readEntry(Density_r_DP, density_r_path.str());
+                    TargetPtcl.Density_r = Density_r_DP;
+                    if (TargetPtcl.DensityReducedGvecs.size()) {
+                        app_log() << "  EinsplineSetBuilder found density in "
+                                     "the HDF5 file.\n";
+                        std::vector<ComplexType> density_G;
+                        std::vector<std::complex<double>> Density_G_DP;
+                        H5File.read(Density_G_DP, density_g_path.str());
+                        density_G.assign(
+                            Density_G_DP.begin(), Density_G_DP.end());
+                        if (!density_G.size()) {
+                            app_error() << "  Density reduced G-vectors "
+                                           "defined, but not the"
+                                        << " density.\n";
+                            abort();
+                        }
+                        else {
+                            if (ispin == 0)
+                                TargetPtcl.Density_G = density_G;
+                            else
+                                for (int iG = 0; iG < density_G.size(); iG++)
+                                    TargetPtcl.Density_G[iG] += density_G[iG];
+                        }
+                    }
+                }
+            }
+            //////////////////////////////////////////////////////////
+            // If the VHXC potential has not been set in TargetPtcl //
+            // and it is available, read it in and save it in       //
+            // TargetPtcl.                                          //
+            //////////////////////////////////////////////////////////
+            // FIXME:  add support for more than one spin potential
+            if (!TargetPtcl.VHXC_r[0].size()) {
+                H5File.readEntry(
+                    TargetPtcl.VHXCReducedGvecs, "/electrons/VHXC/gvectors");
+                int numG = TargetPtcl.VHXCReducedGvecs.size();
+// Convert primitive G-vectors to supercell G-vectors
+// Also, flip sign since ESHDF format uses opposite sign convention
+#pragma omp parallel for
+                for (int iG = 0; iG < numG; iG++)
+                    TargetPtcl.VHXCReducedGvecs[iG] =
+                        -1 * dot(TileMatrix, TargetPtcl.VHXCReducedGvecs[iG]);
+                app_log() << "  Read " << numG << " VHXC G-vectors.\n";
+                for (int ispin = 0; ispin < NumSpins; ispin++) {
+                    Array<double, OHMMS_DIM> VHXC_r_DP;
+                    std::ostringstream VHXC_r_path, VHXC_g_path;
+                    VHXC_r_path << "/electrons/VHXC/spin_" << ispin
+                                << "/VHXC_r";
+                    VHXC_g_path << "/electrons/VHXC/spin_" << ispin
+                                << "/VHXC_g";
+                    H5File.readEntry(VHXC_r_DP, VHXC_r_path.str());
+                    TargetPtcl.VHXC_r[ispin] = VHXC_r_DP;
+                    if (TargetPtcl.VHXCReducedGvecs.size()) {
+                        app_log() << "  EinsplineSetBuilder found VHXC in the "
+                                     "HDF5 file.\n";
+                        std::vector<std::complex<double>> VHXC_G_DP;
+                        std::vector<ComplexType> VHXC_G;
+                        H5File.read(VHXC_G_DP, VHXC_g_path.str());
+                        VHXC_G.assign(VHXC_G_DP.begin(), VHXC_G_DP.end());
+                        if (!VHXC_G.size()) {
+                            app_error() << "  VHXC reduced G-vectors defined, "
+                                           "but not the"
+                                        << " VHXC.\n";
+                            abort();
+                        }
+                        else
+                            TargetPtcl.VHXC_G[ispin] = VHXC_G;
+                    }
+                }
+            }
+        }
+    }
+    else {
+        app_log() << "   Skip initialization of the density" << std::endl;
+    }
+    return true;
+}
+
+// Builds the occupied-band list for one spin channel on rank 0 (other ranks
+// return immediately). Reads eigenvalues for every distinct twist from
+// H5File, optionally sorts the bands, applies excitations given in Occ, and
+// sets NumDistinctOrbitals to the number of leading entries of
+// *FullBands[spin] needed to supply numOrbs orbitals.
+// sortBands: 1 = sort with BandInfo's operator<, 2 = sortByIndex, else none.
+// numOrbs: number of orbitals requested by the input.
+template <typename T>
+void
+EinsplineSetBuilderT<T>::OccupyBands_ESHDF(int spin, int sortBands, int numOrbs)
+{
+    if (this->myComm->rank() != 0)
+        return;
+
+    std::vector<BandInfo>& SortBands(*FullBands[spin]);
+    SortBands.clear(); //??? can exit if SortBands is already made?
+    int maxOrbs(0);
+    for (int ti = 0; ti < DistinctTwists.size(); ti++) {
+        int tindex = DistinctTwists[ti];
+        // First, read valence states
+        std::ostringstream ePath;
+        ePath << "/electrons/kpoint_" << tindex << "/spin_" << spin
+              << "/eigenvalues";
+        std::vector<double> eigvals;
+        H5File.read(eigvals, ePath.str());
+        for (int bi = 0; bi < NumBands; bi++) {
+            BandInfo band;
+            band.TwistIndex = tindex;
+            band.BandIndex = bi;
+            band.MakeTwoCopies = MakeTwoCopies[ti];
+            band.Energy = eigvals[bi];
+            if (band.Energy > -1.0e100)
+                SortBands.push_back(band);
+            maxOrbs += MakeTwoCopies[ti] ? 2 : 1;
+        }
+    }
+
+    app_log()
+        << SortBands.size()
+        << " complex-valued orbitals supplied by h5 can be expanded up to "
+        << maxOrbs << " SPOs." << std::endl;
+    if (maxOrbs < numOrbs)
+        this->myComm->barrier_and_abort(
+            "EinsplineSetBuilder::OccupyBands_ESHDF user input requests "
+            "more orbitals than what the h5 file supplies.");
+
+    // Now sort the bands by energy
+    if (sortBands == 2) {
+        app_log() << "Sorting the bands by index now:\n";
+        sort(SortBands.begin(), SortBands.end(), sortByIndex);
+    }
+    else if (sortBands == 1) {
+        app_log() << "Sorting the bands now:\n";
+        sort(SortBands.begin(), SortBands.end());
+    }
+
+    std::vector<int> gsOcc(maxOrbs);
+    int N_gs_orbs = numOrbs;
+    int nocced(0);
+    for (int ti = 0; ti < SortBands.size(); ti++) {
+        if (nocced < N_gs_orbs) {
+            if (SortBands[ti].MakeTwoCopies && (N_gs_orbs - nocced > 1)) {
+                nocced += 2;
+                gsOcc[ti] = 2;
+            }
+            else if ((SortBands[ti].MakeTwoCopies &&
+                         (N_gs_orbs - nocced == 1)) ||
+                !SortBands[ti].MakeTwoCopies) {
+                nocced += 1;
+                gsOcc[ti] = 1;
+            }
+        }
+    }
+    if (occ_format == "energy") {
+        app_log() << "  Occupying bands based on energy in mode "
+                  << (Occ.size() > 0 ? "\"excited\"" : "\"ground\"")
+                  << std::endl;
+        // To get the occupations right.
+        std::vector<int> Removed(0, 0);
+        std::vector<int> Added(0, 0);
+        for (int ien = 0; ien < Occ.size(); ien++) {
+            if (Occ[ien] < 0)
+                Removed.push_back(-Occ[ien]);
+            else if (Occ[ien] > 0)
+                Added.push_back(Occ[ien]);
+        }
+        if (Added.size() != Removed.size()) {
+            app_log() << "need to add and remove same number of orbitals. "
+                      << Added.size() << " " << Removed.size() << std::endl;
+            APP_ABORT("ChangedOccupations");
+        }
+        std::vector<int> DiffOcc(maxOrbs, 0);
+        // at(): a user-supplied index beyond maxOrbs throws, not overruns.
+        for (int i = 0; i < Removed.size(); i++)
+            DiffOcc.at(Removed[i] - 1) -= 1;
+        for (int i = 0; i < Added.size(); i++)
+            DiffOcc.at(Added[i] - 1) += 1;
+        std::vector<int> SumOrb(SortBands.size(), 0);
+        int doi(0);
+        for (int i = 0; i < SumOrb.size(); i++) {
+            if (SortBands[i].MakeTwoCopies) {
+                SumOrb[i] = gsOcc[i] + DiffOcc[doi++];
+                SumOrb[i] += DiffOcc[doi++];
+            }
+            else
+                SumOrb[i] = gsOcc[i] + DiffOcc[doi++];
+        }
+        std::vector<BandInfo> ReOrderedBands;
+        std::vector<BandInfo> RejectedBands;
+        for (int i = 0; i < SumOrb.size(); i++) {
+            if (SumOrb[i] == 2) {
+                SortBands[i].MakeTwoCopies = true;
+                ReOrderedBands.push_back(SortBands[i]);
+            }
+            else if (SumOrb[i] == 1) {
+                SortBands[i].MakeTwoCopies = false;
+                ReOrderedBands.push_back(SortBands[i]);
+            }
+            else if (SumOrb[i] == 0) {
+                SortBands[i].MakeTwoCopies = false;
+                RejectedBands.push_back(SortBands[i]);
+            }
+            else {
+                app_log() << " Trying to add the same orbital (" << i
+                          << ") less than zero or more than 2 times."
+                          << std::endl;
+                APP_ABORT("Sorting Excitation");
+            }
+        }
+        ReOrderedBands.insert(
+            ReOrderedBands.end(), RejectedBands.begin(), RejectedBands.end());
+        SortBands = ReOrderedBands;
+    }
+    else if (occ_format == "band") {
+        app_log() << "  Occupying bands based on (ti,bi) data." << std::endl;
+        if (Occ.size() != particle_hole_pairs * 4) {
+            app_log()
+                << " Need Occ = pairs*4. Occ is (ti,bi) of removed, then added."
+                << std::endl;
+            app_log() << Occ.size() << " " << particle_hole_pairs << std::endl;
+            APP_ABORT("ChangedOccupations");
+        }
+        int cnt(0);
+        // Occ[cnt] and Occ[cnt + 1] are only read while both are in range
+        for (int ien = 0; ien < SortBands.size() && cnt + 1 < Occ.size();
+             ien++) {
+            if ((Occ[cnt] == SortBands[ien].TwistIndex) &&
+                (Occ[cnt + 1] == SortBands[ien].BandIndex)) {
+                if (cnt < particle_hole_pairs * 2) {
+                    gsOcc[ien] -= 1;
+                    cnt += 2;
+                    app_log() << "removing orbital " << ien << std::endl;
+                }
+                else {
+                    gsOcc[ien] += 1;
+                    app_log() << "adding orbital " << ien << std::endl;
+                    cnt += 2;
+                }
+            }
+        }
+        std::vector<BandInfo> ReOrderedBands;
+        std::vector<BandInfo> RejectedBands;
+        for (int i = 0; i < SortBands.size(); i++) {
+            if (gsOcc[i] == 2) {
+                SortBands[i].MakeTwoCopies = true;
+                ReOrderedBands.push_back(SortBands[i]);
+            }
+            else if (gsOcc[i] == 1) {
+                SortBands[i].MakeTwoCopies = false;
+                ReOrderedBands.push_back(SortBands[i]);
+            }
+            else if (gsOcc[i] == 0) {
+                SortBands[i].MakeTwoCopies = false;
+                RejectedBands.push_back(SortBands[i]);
+            }
+            else {
+                app_log() << " Trying to add the same orbital (" << i
+                          << ") less than zero or more than 2 times."
+                          << std::endl;
+                APP_ABORT("Sorting Excitation");
+            }
+        }
+        ReOrderedBands.insert(
+            ReOrderedBands.end(), RejectedBands.begin(), RejectedBands.end());
+        SortBands = ReOrderedBands;
+    }
+    int orbIndex = 0;
+    int numOrbs_counter = 0;
+    while (numOrbs_counter < numOrbs) {
+        if (orbIndex >= SortBands.size())
+            APP_ABORT("EinsplineSetBuilder::OccupyBands_ESHDF ran out of bands "
+                      "before reaching the requested number of orbitals.");
+        numOrbs_counter += SortBands[orbIndex].MakeTwoCopies ? 2 : 1;
+        orbIndex++;
+    }
+    NumDistinctOrbitals = orbIndex;
+    app_log() << "We will read " << NumDistinctOrbitals
+              << " distinct complex-valued orbitals from h5.\n";
+}
+
+template <typename T>
+void
+EinsplineSetBuilderT<T>::set_metadata(int numOrbs, int twist_num_inp,
+    const TinyVector<double, OHMMS_DIM>& twist_inp, bool skipChecks)
+{
+    // Two jobs are done here:
+    //  - internal parameters of the builder are fixed (e.g. TileMatrix,
+    //    use_real_splines_, DistinctTwists, MakeTwoCopies);
+    //  - orbital metadata is read from the wavefunction hdf5 file on rank 0
+    //    and then broadcast to the other ranks by BroadcastOrbitalInfo()
+    //    (the broadcast variables are listed in that function).
+    const bool is_root = (this->myComm->rank() == 0);
+    Timer orb_info_timer;
+
+    // The tiling may be given as a simple vector (e.g. 2x2x2) or as a full
+    // 3x3 integer matrix. An all-zero matrix means the input file did not
+    // set it, in which case the identity is used.
+    bool tile_unset = true;
+    for (int row = 0; row < 3; row++)
+        for (int col = 0; col < 3; col++)
+            if (TileMatrix(row, col) != 0)
+                tile_unset = false;
+    if (tile_unset)
+        for (int row = 0; row < 3; row++)
+            for (int col = 0; col < 3; col++)
+                TileMatrix(row, col) = (row == col) ? 1 : 0;
+
+    if (is_root) {
+        std::array<char, 1000> text;
+        const int nchars = std::snprintf(text.data(), text.size(),
+            "  TileMatrix = \n [ %2d %2d %2d\n   %2d %2d %2d\n   %2d %2d %2d "
+            "]\n",
+            TileMatrix(0, 0), TileMatrix(0, 1), TileMatrix(0, 2),
+            TileMatrix(1, 0), TileMatrix(1, 1), TileMatrix(1, 2),
+            TileMatrix(2, 0), TileMatrix(2, 1), TileMatrix(2, 2));
+        if (nchars < 0)
+            throw std::runtime_error("Error converting TileMatrix to a string");
+        app_log() << std::string_view(text.data(), nchars);
+    }
+
+    if (numOrbs != 0)
+        app_log() << "  Reading " << numOrbs << " orbitals from HDF5 file.\n";
+    else
+        this->myComm->barrier_and_abort(
+            "EinsplineSetBuilder::createSPOSet You must specify the number of "
+            "orbitals in the input file.");
+
+    // Read the basic orbital information on the root rank only, without
+    // reading all the orbitals themselves.
+    orb_info_timer.restart();
+    if (is_root && !ReadOrbitalInfo(skipChecks))
+        throw std::runtime_error("EinsplineSetBuilder::set_metadata Error "
+                                 "reading orbital info from HDF5 file.");
+    app_log() << "TIMER  EinsplineSetBuilder::ReadOrbitalInfo "
+              << orb_info_timer.elapsed() << std::endl;
+    this->myComm->barrier();
+
+    // Hand the orbital info over to BroadcastOrbitalInfo() and time it.
+    orb_info_timer.restart();
+    BroadcastOrbitalInfo();
+    app_log() << "TIMER  EinsplineSetBuilder::BroadcastOrbitalInfo "
+              << orb_info_timer.elapsed() << std::endl;
+    app_log().flush();
+
+    // Set up the primitive cell and the supercell from their lattices.
+    PrimCell.set(Lattice);
+    SuperCell.set(SuperLattice);
+    GGt = dot(transpose(PrimCell.G), PrimCell.G);
+
+    // Finally, analyze the k-point mesh to figure out which k-points are
+    // needed.
+    AnalyzeTwists2(twist_num_inp, twist_inp);
+}
+
+// Builds one spline SPO set from the XML node cur: parses attributes and
+// occupation data, sets up the builder metadata on the spinSet==0 pass,
+// occupies bands, and lets MixedSplineReader create the spline set.
+template <typename T>
+std::unique_ptr<SPOSetT<T>>
+EinsplineSetBuilderT<T>::createSPOSetFromXML(xmlNodePtr cur)
+{
+    // Defaults for everything that can be overridden from the XML input.
+    // Attributes are read from XMLRoot first and then from cur.
+    std::string sourceName;
+    std::string spo_prec("double");
+    std::string truncate("no");
+    std::string hybrid_rep("no");
+    std::string skip_checks("no");
+    // old spline library for high-order derivatives, e.g. needed for
+    // backflow optimization; not referenced further in this function
+    std::string use_einspline_set_extended("no");
+    std::string useGPU;
+    std::string GPUsharing = "no";
+    std::string spo_object_name;
+
+    int numOrbs = 0;
+    int sortBands(1);
+    int spinSet = 0;
+    int twist_num_inp = TWISTNUM_NO_INPUT;
+    TinyVector<double, OHMMS_DIM> twist_inp(TWIST_NO_INPUT);
+
+    ScopedTimer spo_timer_scope(createGlobalTimer(
+        "einspline::CreateSPOSetFromXML", timer_level_medium));
+
+    {
+        TinyVector<int, OHMMS_DIM> TileFactor_do_not_use;
+        OhmmsAttributeSet a;
+        a.add(H5FileName, "href");
+        a.add(TileFactor_do_not_use, "tile", {}, TagStatus::DELETED);
+        a.add(sortBands, "sort");
+        a.add(TileMatrix, "tilematrix");
+        a.add(twist_num_inp, "twistnum");
+        a.add(twist_inp, "twist");
+        a.add(sourceName, "source");
+        a.add(MeshFactor, "meshfactor");
+        a.add(hybrid_rep, "hybridrep");
+        a.add(useGPU, "gpu", CPUOMPTargetSelector::candidate_values);
+        // split spline across GPUs visible per rank
+        a.add(GPUsharing, "gpusharing");
+        a.add(spo_prec, "precision");
+        a.add(truncate, "truncate");
+        a.add(this->myName, "tag");
+        a.add(skip_checks, "skip_checks");
+        a.put(XMLRoot);
+
+        // these four are registered after XMLRoot was parsed, so only cur
+        a.add(numOrbs, "size");
+        a.add(numOrbs, "norbs");
+        a.add(spinSet, "spindataset");
+        a.add(spinSet, "group");
+        a.put(cur);
+
+        if (this->myName.empty())
+            this->myName = "einspline";
+    }
+
+    // Only the exact string "yes" enables skipChecks.
+    bool skipChecks = (skip_checks == "yes");
+
+    // the source particle set must already be present in ParticleSets
+    auto pit(ParticleSets.find(sourceName));
+    if (pit != ParticleSets.end())
+        SourcePtcl = pit->second.get();
+    else
+        this->myComm->barrier_and_abort(
+            "Einspline needs the source particleset");
+
+    // Occupation information comes from <occupation> children of cur;
+    // an empty Occ corresponds to the ground state.
+    const std::vector<int> last_occ(Occ);
+    Occ.resize(0, 0);
+
+    {
+        OhmmsAttributeSet oAttrib;
+        oAttrib.add(spinSet, "spindataset");
+        oAttrib.add(spo_object_name, "name");
+        oAttrib.add(spo_object_name, "id");
+        oAttrib.put(cur);
+    }
+
+    // Scan the children of cur for <occupation> elements.
+    for (xmlNodePtr child = cur->children; child != nullptr;
+         child = child->next) {
+        const std::string cname(reinterpret_cast<const char*>(child->name));
+        if (cname != "occupation")
+            continue;
+        std::string occ_mode("ground");
+        occ_format = "energy";
+        particle_hole_pairs = 0;
+        OhmmsAttributeSet oAttrib;
+        oAttrib.add(occ_mode, "mode");
+        oAttrib.add(spinSet, "spindataset");
+        oAttrib.add(occ_format, "format");
+        oAttrib.add(particle_hole_pairs, "pairs");
+        oAttrib.put(child);
+        if (occ_mode == "excited")
+            putContent(Occ, child);
+        else if (occ_mode != "ground")
+            this->myComm->barrier_and_abort(
+                "EinsplineSetBuilder::createSPOSet Only ground state "
+                "occupation "
+                "currently supported in EinsplineSetBuilder.");
+    }
+
+    // Did the occupation change relative to what Occ held on entry?
+    const bool NewOcc = (Occ != last_occ);
+#if defined(MIXED_PRECISION)
+    app_log() << "\t  MIXED_PRECISION=1 Overwriting the einspline storage to "
+                 "single precision.\n";
+    spo_prec = "single"; // overwrite
+#endif
+    // Warn when a set with the same (file, spin dataset, size) key is already
+    // registered in SPOSetMap and the occupation is unchanged.
+    H5OrbSet aset(H5FileName, spinSet, numOrbs);
+    const bool seen_before = (SPOSetMap.find(aset) != SPOSetMap.end());
+    if (seen_before && !NewOcc)
+        app_warning()
+            << "!!!!!!! Identical SPOSets are detected by EinsplineSetBuilder! "
+               "Implicit sharing one SPOSet for spin-up and spin-down "
+               "electrons has been removed. "
+               "Each determinant creates its own SPOSet with dedicated memory "
+               "for spline coefficients. "
+               "To avoid increasing the memory footprint of spline "
+               "coefficients, "
+               "create a single SPOset outside the determinantset using "
+               "'sposet_collection' "
+               "and reference it by name on the determinant line."
+            << std::endl;
+
+    // allocate the band list for this spin dataset if it does not exist yet
+    if (FullBands[spinSet] == nullptr)
+        FullBands[spinSet] = std::make_unique<std::vector<BandInfo>>();
+
+    // Key data of the builder is initialized by the spinSet==0 pass, so
+    // the very first SPO set has to be that one.
+    if (SPOSetMap.size() == 0 && spinSet != 0)
+        this->myComm->barrier_and_abort(
+            "The first SPO set must have spindataset=\"0\"");
+
+    // set the internal parameters
+    if (spinSet == 0)
+        set_metadata(numOrbs, twist_num_inp, twist_inp, skipChecks);
+
+    // Create the OrbitalSet object (timed from band occupation onwards).
+    Timer mytimer;
+    mytimer.restart();
+    OccupyBands(spinSet, sortBands, numOrbs, skipChecks);
+    if (spinSet == 0)
+        TileIons();
+
+    // safeguard for a removed feature
+    if (truncate == "yes")
+        this->myComm->barrier_and_abort(
+            "The 'truncate' feature of spline SPO has been removed. Please use "
+            "hybrid orbital representation.");
+
+    // both "single" and "float" request the single-precision reader
+    bool use_single = (spo_prec == "single" || spo_prec == "float");
+    createBsplineReader(use_single, hybrid_rep == "yes", useGPU);
+
+    MixedSplineReader->setCommon(XMLRoot);
+    // The call below is temporarily disabled (Ye Luo):
+    // RotateBands_ESHDF(spinSet,
+    // dynamic_cast<EinsplineSetExtended<std::complex<double> >*>(OrbitalSet));
+    bcastSortBands(spinSet, NumDistinctOrbitals, this->myComm->rank() == 0);
+    auto OrbitalSet = MixedSplineReader->create_spline_set(spinSet, cur);
+    if (!OrbitalSet)
+        this->myComm->barrier_and_abort("Failed to create SPOSet*");
+    app_log() << "Time spent in creating B-spline SPOs " << mytimer.elapsed()
+              << "sec" << std::endl;
+    OrbitalSet->finalizeConstruction();
+    SPOSetMap[aset] = OrbitalSet.get();
+    return OrbitalSet;
+}
+
+/// Lazily create the spline reader held in MixedSplineReader; an existing
+/// reader is kept. use_real_splines_ picks the Real vs Complex factory and
+/// useSingle picks the Single vs Double variant.
+template <typename T>
+void
+EinsplineSetBuilderT<T>::createBsplineReader(
+    bool useSingle, bool hybridRep, const std::string& useGPU)
+{
+    // A reader created by an earlier call is reused as-is.
+    if (MixedSplineReader)
+        return;
+
+    if (!use_real_splines_) {
+        if (!useSingle)
+            MixedSplineReader =
+                createBsplineComplexDoubleT(this, hybridRep, useGPU);
+        else
+            MixedSplineReader =
+                createBsplineComplexSingleT(this, hybridRep, useGPU);
+        return;
+    }
+
+    // Real-valued splines.
+    if (!useSingle)
+        MixedSplineReader = createBsplineRealDoubleT(this, hybridRep, useGPU);
+    else
+        MixedSplineReader = createBsplineRealSingleT(this, hybridRep, useGPU);
+}
+
+template <> // complex<float>: only the Complex reader factories are used
+void
+EinsplineSetBuilderT<std::complex<float>>::createBsplineReader(
+    bool useSingle, bool hybridRep, const std::string& useGPU)
+{
+    if (MixedSplineReader)
+        return; // a reader created by an earlier call is kept
+    if (!useSingle)
+        MixedSplineReader =
+            createBsplineComplexDoubleT(this, hybridRep, useGPU);
+    else
+        MixedSplineReader =
+            createBsplineComplexSingleT(this, hybridRep, useGPU);
+}
+
+template <> // complex<double>: only the Complex reader factories are used
+void
+EinsplineSetBuilderT<std::complex<double>>::createBsplineReader(
+    bool useSingle, bool hybridRep, const std::string& useGPU)
+{
+    if (MixedSplineReader)
+        return; // a reader created by an earlier call is kept
+    if (!useSingle)
+        MixedSplineReader =
+            createBsplineComplexDoubleT(this, hybridRep, useGPU);
+    else
+        MixedSplineReader =
+            createBsplineComplexSingleT(this, hybridRep, useGPU);
+}
+
+/// Create a spline SPOSet for the orbital indices in input_info with the
+/// reader set up earlier (createSPOSetFromXML creates it); aborts if there
+/// is none. Returns the reader's result, which may be null.
+template <typename T>
+std::unique_ptr<SPOSetT<T>>
+EinsplineSetBuilderT<T>::createSPOSet(
+    xmlNodePtr cur, SPOSetInputInfo& input_info)
+{
+    if (!MixedSplineReader)
+        this->myComm->barrier_and_abort(
+            "EinsplineSetExtended<T> cannot create a SPOSet");
+
+    // "spindataset" and "group" are both parsed into spinSet (default 0).
+    int spinSet = 0;
+    OhmmsAttributeSet attribs;
+    attribs.add(spinSet, "spindataset");
+    attribs.add(spinSet, "group");
+    attribs.put(cur);
+
+    // non-overlapping index sets only; the max index is the identifier
+    const H5OrbSet aset(H5FileName, spinSet, input_info.max_index());
+    auto spline_set =
+        MixedSplineReader->create_spline_set(spinSet, cur, input_info);
+    if (spline_set)
+        SPOSetMap[aset] = spline_set.get(); // non-owning entry
+    return spline_set;
+}
+
+/// Open the HDF5 orbital file, log its version and read the ESHDF metadata.
+/// Returns false if the file cannot be opened or is not in ESHDF format;
+/// otherwise returns the result of ReadOrbitalInfo_ESHDF(skipChecks).
+template <typename T>
+bool
+EinsplineSetBuilderT<T>::ReadOrbitalInfo(bool skipChecks)
+{
+    if (!H5File.open(H5FileName, H5F_ACC_RDONLY)) {
+        app_error() << "Could not open HDF5 file \"" << H5FileName
+                    << "\" in EinsplineSetBuilder::ReadOrbitalInfo.\n";
+        return false;
+    }
+    std::string format;
+    H5File.read(format, "/format");
+    H5File.read(Version, "/version");
+    app_log() << "  HDF5 orbital file version " << Version[0] << "."
+              << Version[1] << "." << Version[2] << "\n";
+    if (format.find("ES") == std::string::npos) {
+        // newline-terminated so later output starts on its own line
+        app_error() << "EinsplineSetBuilder::ReadOrbitalInfo too old h5 file "
+                       "which is not in ESHDF format! Regenerate the h5 file\n";
+        return false;
+    }
+    Format = ESHDF;
+    return ReadOrbitalInfo_ESHDF(skipChecks);
+}
+
+template <typename T>
+bool
+EinsplineSetBuilderT<T>::ReadGvectors_ESHDF()
+{
+    bool root = this->myComm->rank() == 0;
+    // Mesh size: read on the root rank only, then broadcast to every rank.
+    MeshSize = 0;
+    int hasPsig = 1;
+    if (root) {
+        H5File.readEntry(MeshSize, "/electrons/psi_r_mesh");
+        H5File.readEntry(MeshSize, "/electrons/mesh");
+    }
+    this->myComm->bcast(MeshSize);
+    hasPsig = (MeshSize[0] == 0); // still 0 => derive the mesh from G-vectors
+    if (hasPsig) {
+        int nallowed = 257; // NOTE(review): a size > 65536 runs past allowed[]
+        int allowed[] = {72, 75, 80, 81, 90, 96, 100, 108, 120, 125, 128, 135,
+            144, 150, 160, 162, 180, 192, 200, 216, 225, 240, 243, 250, 256,
+            270, 288, 300, 320, 324, 360, 375, 384, 400, 405, 432, 450, 480,
+            486, 500, 512, 540, 576, 600, 625, 640, 648, 675, 720, 729, 750,
+            768, 800, 810, 864, 900, 960, 972, 1000, 1024, 1080, 1125, 1152,
+            1200, 1215, 1250, 1280, 1296, 1350, 1440, 1458, 1500, 1536, 1600,
+            1620, 1728, 1800, 1875, 1920, 1944, 2000, 2025, 2048, 2160, 2187,
+            2250, 2304, 2400, 2430, 2500, 2560, 2592, 2700, 2880, 2916, 3000,
+            3072, 3125, 3200, 3240, 3375, 3456, 3600, 3645, 3750, 3840, 3888,
+            4000, 4050, 4096, 4320, 4374, 4500, 4608, 4800, 4860, 5000, 5120,
+            5184, 5400, 5625, 5760, 5832, 6000, 6075, 6144, 6250, 6400, 6480,
+            6561, 6750, 6912, 7200, 7290, 7500, 7680, 7776, 8000, 8100, 8192,
+            8640, 8748, 9000, 9216, 9375, 9600, 9720, 10000, 10125, 10240,
+            10368, 10800, 10935, 11250, 11520, 11664, 12000, 12150, 12288,
+            12500, 12800, 12960, 13122, 13500, 13824, 14400, 14580, 15000,
+            15360, 15552, 15625, 16000, 16200, 16384, 16875, 17280, 17496,
+            18000, 18225, 18432, 18750, 19200, 19440, 19683, 20000, 20250,
+            20480, 20736, 21600, 21870, 22500, 23040, 23328, 24000, 24300,
+            24576, 25000, 25600, 25920, 26244, 27000, 27648, 28125, 28800,
+            29160, 30000, 30375, 30720, 31104, 31250, 32000, 32400, 32768,
+            32805, 33750, 34560, 34992, 36000, 36450, 36864, 37500, 38400,
+            38880, 39366, 40000, 40500, 40960, 41472, 43200, 43740, 45000,
+            46080, 46656, 46875, 48000, 48600, 49152, 50000, 50625, 51200,
+            51840, 52488, 54000, 54675, 55296, 56250, 57600, 58320, 59049,
+            60000, 60750, 61440, 62208, 62500, 64000, 64800, 65536};
+        MaxNumGvecs = 0;
+        // Read the G-vectors of kpoint_0 (root reads, all ranks receive them)
+        // and track the largest |index| per axis in maxIndex.
+        TinyVector<int, 3> maxIndex(0, 0, 0);
+        Gvecs.resize(NumTwists);
+        {
+            int numg = 0;
+            if (root) {
+                std::ostringstream Gpath;
+                Gpath << "/electrons/kpoint_0/gvectors";
+                H5File.read(Gvecs[0], Gpath.str());
+                numg = Gvecs[0].size();
+            }
+            this->myComm->bcast(numg);
+            if (!root)
+                Gvecs[0].resize(numg);
+            this->myComm->bcast(Gvecs[0]);
+            MaxNumGvecs = Gvecs[0].size();
+            for (int ig = 0; ig < Gvecs[0].size(); ig++) {
+                maxIndex[0] = std::max(maxIndex[0], std::abs(Gvecs[0][ig][0]));
+                maxIndex[1] = std::max(maxIndex[1], std::abs(Gvecs[0][ig][1]));
+                maxIndex[2] = std::max(maxIndex[2], std::abs(Gvecs[0][ig][2]));
+            }
+            // Only kpoint_0 contributes to maxIndex, so the mesh size computed
+            // below is derived from the kpoint_0 G-vectors alone (the
+            // Version[0] < 2 branch reuses that mesh for the other twists).
+        } // done with kpoint_0
+        MeshSize[0] = (int)std::ceil(4.0 * MeshFactor * maxIndex[0]);
+        MeshSize[1] = (int)std::ceil(4.0 * MeshFactor * maxIndex[1]);
+        MeshSize[2] = (int)std::ceil(4.0 * MeshFactor * maxIndex[2]);
+        // Round up: >128 -> next allowed[] entry (2^a 3^b 5^c); else -> even.
+        int* ix = std::lower_bound(allowed, allowed + nallowed, MeshSize[0]);
+        int* iy = std::lower_bound(allowed, allowed + nallowed, MeshSize[1]);
+        int* iz = std::lower_bound(allowed, allowed + nallowed, MeshSize[2]);
+        MeshSize[0] =
+            (MeshSize[0] > 128) ? *ix : (MeshSize[0] + MeshSize[0] % 2);
+        MeshSize[1] =
+            (MeshSize[1] > 128) ? *iy : (MeshSize[1] + MeshSize[1] % 2);
+        MeshSize[2] =
+            (MeshSize[2] > 128) ? *iz : (MeshSize[2] + MeshSize[2] % 2);
+        if (Version[0] < 2) {
+            // get the map for each twist, but use the MeshSize from kpoint_0
+            app_log() << "  ESHDF::Version " << Version << std::endl;
+            app_log() << "  Assumes distinct Gvecs set for different twists. "
+                         "Regenerate orbital files using updated QE."
+                      << std::endl;
+            for (int k = 0; k < DistinctTwists.size(); ++k) {
+                int ik = DistinctTwists[k];
+                if (ik == 0)
+                    continue; // already done
+                int numg = 0;
+                if (root) {
+                    std::ostringstream Gpath;
+                    Gpath << "/electrons/kpoint_" << ik << "/gvectors";
+                    H5File.read(Gvecs[ik], Gpath.str());
+                    numg = Gvecs[ik].size();
+                }
+                this->myComm->bcast(numg);
+                if (numg == 0) {
+                    // copy kpoint_0, default
+                    Gvecs[ik] = Gvecs[0];
+                }
+                else {
+                    if (numg != MaxNumGvecs) {
+                        std::ostringstream o;
+                        o << "Twist " << ik
+                          << ": The number of Gvecs is different from kpoint_0."
+                          << " This is not supported anymore. Rerun "
+                             "pw2qmcpack.x or equivalent";
+                        APP_ABORT(o.str());
+                    }
+                    if (!root)
+                        Gvecs[ik].resize(numg);
+                    this->myComm->bcast(Gvecs[ik]);
+                }
+            }
+        }
+    }
+    app_log() << "B-spline mesh factor is " << MeshFactor << std::endl;
+    app_log() << "B-spline mesh size is (" << MeshSize[0] << ", " << MeshSize[1]
+              << ", " << MeshSize[2] << ")\n";
+    app_log() << "Maxmimum number of Gvecs " << MaxNumGvecs << std::endl;
+    app_log().flush();
+    return hasPsig;
+}
+
+template class EinsplineSetBuilderT<double>;
+template class EinsplineSetBuilderT<float>;
+template class EinsplineSetBuilderT<std::complex<double>>;
+template class EinsplineSetBuilderT<std::complex<float>>;
+
+} // namespace qmcplusplus
diff --git a/src/QMCWaveFunctions/EinsplineSetBuilderT.h b/src/QMCWaveFunctions/EinsplineSetBuilderT.h
new file mode 100644
index 0000000000..b7d6e3658e
--- /dev/null
+++ b/src/QMCWaveFunctions/EinsplineSetBuilderT.h
@@ -0,0 +1,334 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
+//
+// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at
+// Urbana-Champaign
+//                    Jeremy McMinnis, jmcminis@gmail.com, University of
+//                    Illinois at Urbana-Champaign Jaron T. Krogel,
+//                    krogeljt@ornl.gov, Oak Ridge National Laboratory Jeongnim
+//                    Kim, jeongnim.kim@gmail.com, University of Illinois at
+//                    Urbana-Champaign Ye Luo, yeluo@anl.gov, Argonne National
+//                    Laboratory Raymond Clay III, j.k.rofling@gmail.com,
+//                    Lawrence Livermore National Laboratory Mark A. Berrill,
+//                    berrillma@ornl.gov, Oak Ridge National Laboratory
+//
+// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at
+// Urbana-Champaign
+//////////////////////////////////////////////////////////////////////////////////////
+
+/** @file EinsplineSetBuilderT.h
+ *
+ * Builder class for einspline-based SPOSet objects.
+ */
+#ifndef QMCPLUSPLUS_EINSPLINE_SET_BUILDERT_H
+#define QMCPLUSPLUS_EINSPLINE_SET_BUILDERT_H
+
+#include "QMCWaveFunctions/BandInfo.h"
+#include "QMCWaveFunctions/EinsplineSetBuilder.h"
+#include "QMCWaveFunctions/SPOSetBuilderT.h"
+
+#include <filesystem>
+#include <map>
+
+// #define PW_COEFF_NORM_TOLERANCE 1e-6
+
+class Communicate;
+
+namespace qmcplusplus
+{
+/// forward declaration of BsplineReaderBase
+template <typename T>
+class BsplineReaderBaseT;
+
+// Helper needed for TwistMap
+// struct Int3less
+// {
+//     bool
+//     operator()(const TinyVector<int, 3>& a, const TinyVector<int, 3>& b)
+//     const
+//     {
+//         if (a[0] > b[0])
+//             return false;
+//         if (a[0] < b[0])
+//             return true;
+//         if (a[1] > b[1])
+//             return false;
+//         if (a[1] < b[1])
+//             return true;
+//         if (a[2] > b[2])
+//             return false;
+//         if (a[2] < b[2])
+//             return true;
+//         return false;
+//     }
+// };
+// struct Int4less
+// {
+//     bool
+//     operator()(const TinyVector<int, 4>& a, const TinyVector<int, 4>& b)
+//     const
+//     {
+//         for (int i = 0; i < 4; i++) {
+//             if (a[i] > b[i])
+//                 return false;
+//             if (a[i] < b[i])
+//                 return true;
+//         }
+//         return false;
+//     }
+// };
+
+/** construct a name for spline SPO set
+ */
+// struct H5OrbSet
+// {
+//     /// index for the spin set
+//     int SpinSet;
+//     /// number of orbitals that belong to this set
+//     int NumOrbs;
+//     /// name of the HDF5 file
+//     std::filesystem::path FileName;
+//     /** true if a < b
+//      *
+//      * The ordering
+//      * - name
+//      * - spin set
+//      * - number of orbitals
+//      */
+//     bool
+//     operator()(const H5OrbSet& a, const H5OrbSet& b) const
+//     {
+//         if (a.FileName == b.FileName) {
+//             if (a.SpinSet == b.SpinSet)
+//                 return a.NumOrbs < b.NumOrbs;
+//             else
+//                 return a.SpinSet < b.SpinSet;
+//         }
+//         else
+//             return a.FileName < b.FileName;
+//     }
+
+//     H5OrbSet(std::filesystem::path name, int spinSet, int numOrbs) :
+//         SpinSet(spinSet),
+//         NumOrbs(numOrbs),
+//         FileName(std::move(name))
+//     {
+//     }
+//     H5OrbSet() = default;
+// };
+
+/** Builder of einspline-based SPOSetT<T> objects from an HDF5 orbital file
+ */
+template <typename T>
+class EinsplineSetBuilderT : public SPOSetBuilderT<T>
+{
+public:
+    static constexpr auto DIM = ParticleSetT<T>::DIM;
+
+    using PSetMap =
+        std::map<std::string, const std::unique_ptr<ParticleSetT<T>>>;
+    using UnitCellType =
+        CrystalLattice<typename ParticleSetT<T>::Scalar_t, DIM>;
+    using RealType = typename SPOSetBuilderT<T>::RealType;
+    using PosType = typename SPOSetBuilderT<T>::PosType;
+    using ComplexType = typename SPOSetT<T>::ComplexType;
+
+    /// reference to the particleset pool
+    const PSetMap& ParticleSets;
+    /// quantum particle set
+    ParticleSetT<T>& TargetPtcl;
+    /// ionic system
+    ParticleSetT<T>* SourcePtcl;
+
+    /**  Helper vector for sorting bands
+     */
+    std::vector<std::unique_ptr<std::vector<BandInfo>>> FullBands;
+
+    /// reader to use BsplineReaderBase
+    std::unique_ptr<BsplineReaderBaseT<T>> MixedSplineReader;
+
+    /// This is true if we have the orbital derivatives w.r.t. the ion positions
+    bool HaveOrbDerivs;
+    /// root XML node with href, sort, tilematrix, twistnum, source,
+    /// precision,truncate,version
+    xmlNodePtr XMLRoot;
+
+    std::map<H5OrbSet, SPOSetT<T>*, H5OrbSet> SPOSetMap; // non-owning values
+
+    /// constructor
+    EinsplineSetBuilderT(ParticleSetT<T>& p, const PSetMap& psets,
+        Communicate* comm, xmlNodePtr cur);
+
+    /// destructor
+    ~EinsplineSetBuilderT() override;
+
+    /** create a spline SPOSet from an XML node
+     * @param cur the current xml node
+     */
+    std::unique_ptr<SPOSetT<T>>
+    createSPOSetFromXML(xmlNodePtr cur) override;
+
+    /** create a SPOSet with the already existing spline reader */
+    std::unique_ptr<SPOSetT<T>>
+    createSPOSet(xmlNodePtr cur, SPOSetInputInfo& input_info) override;
+
+    //////////////////////////////////////
+    // HDF5-related data  and functions //
+    //////////////////////////////////////
+    hdf_archive H5File;
+    std::filesystem::path H5FileName;
+    // HDF5 orbital file version
+    typedef enum
+    {
+        QMCPACK,
+        ESHDF
+    } FormatType;
+    FormatType Format;
+    TinyVector<int, 3> Version;
+    std::string parameterGroup, ionsGroup, eigenstatesGroup;
+    std::vector<int> Occ;
+    bool
+    ReadOrbitalInfo(bool skipChecks = false);
+    bool
+    ReadOrbitalInfo_ESHDF(bool skipChecks = false);
+    void
+    BroadcastOrbitalInfo();
+    bool
+    CheckLattice();
+
+    /** read gvectors for each twist
+     * @return true, if psi_g is found
+     */
+    bool
+    ReadGvectors_ESHDF();
+
+    Tensor<double, OHMMS_DIM> Lattice, RecipLattice, LatticeInv, SuperLattice,
+        GGt;
+    UnitCellType SuperCell, PrimCell, PrimCellInv;
+    int NumBands, NumElectrons, NumSpins, NumTwists;
+    int MaxNumGvecs;
+    double MeshFactor;
+    RealType MatchingTol;
+    TinyVector<int, 3> MeshSize;
+    std::vector<std::vector<TinyVector<int, 3>>> Gvecs;
+
+    Vector<int> IonTypes;
+    Vector<TinyVector<double, OHMMS_DIM>> IonPos;
+    // mapping the ions in the supercell to the primitive cell
+    std::vector<int> Super2Prim;
+
+    /////////////////////////////
+    // Twist angle information //
+    /////////////////////////////
+    // The "true" twist number after analyzing twistnum, twist XML input and h5
+    int twist_num_;
+    // primitive cell k-points from DFT calculations
+    std::vector<TinyVector<double, OHMMS_DIM>> primcell_kpoints;
+    // primitive cell to supercell tiling matrix
+    Tensor<int, OHMMS_DIM> TileMatrix;
+    // This vector stores which twist indices will be used by this clone
+    std::vector<TinyVector<int, OHMMS_DIM>> UseTwists;
+    std::vector<int> IncludeTwists, DistinctTwists;
+    /// if false, splines are conceptually complex valued
+    bool use_real_splines_;
+    int NumDistinctOrbitals;
+    // This is true if the corresponding twist in DistinctTwists
+    // should be used to generate two distinct orbitals from the real and
+    // imaginary parts.
+    std::vector<bool> MakeTwoCopies;
+    // This maps a 3-integer twist index into the twist number in the file
+    std::map<TinyVector<int, OHMMS_DIM>, int, Int3less> TwistMap;
+
+    bool
+    TwistPair(PosType a, PosType b) const;
+    void
+    TileIons();
+    void
+    OccupyBands(int spin, int sortBands, int numOrbs, bool skipChecks = false);
+    void
+    OccupyBands_ESHDF(int spin, int sortBands, int numOrbs);
+
+    ////////////////////////////////
+    // Atomic orbital information //
+    ////////////////////////////////
+    struct CenterInfo
+    {
+        std::vector<int> lmax, spline_npoints, GroupID;
+        std::vector<double> spline_radius, cutoff, inner_cutoff,
+            non_overlapping_radius;
+        std::vector<TinyVector<double, OHMMS_DIM>> ion_pos;
+        int Ncenters;
+
+        CenterInfo() : Ncenters(0){};
+
+        void
+        resize(int ncenters)
+        {
+            Ncenters = ncenters;
+            lmax.resize(ncenters, -1);
+            spline_npoints.resize(ncenters, -1);
+            GroupID.resize(ncenters, 0);
+            spline_radius.resize(ncenters, -1.0);
+            inner_cutoff.resize(ncenters, -1.0);
+            non_overlapping_radius.resize(ncenters, -1.0);
+            cutoff.resize(ncenters, -1.0);
+            ion_pos.resize(ncenters);
+        }
+    } AtomicCentersInfo;
+
+    // This returns the path in the HDF5 file to the group for orbital
+    // with twist ti and band bi
+    std::string
+    OrbitalPath(int ti, int bi);
+
+    /////////////////////////////////////////////////////////////
+    // Information to avoid storing the same orbitals twice in //
+    // spin-restricted calculations.                           //
+    /////////////////////////////////////////////////////////////
+    int LastSpinSet, NumOrbitalsRead;
+
+    std::string occ_format;
+    int particle_hole_pairs;
+    bool makeRotations;
+
+protected:
+    /** broadcast SortBands for spin set splin
+     * @param N number of states
+     * @param root true if it is the i/o node
+     */
+    void
+    bcastSortBands(int splin, int N, bool root);
+
+    /** a specific but clean code path in createSPOSetFromXML, for PBC, double,
+     * ESHDF
+     * @param twist_num_inp twistnum XML input (see AnalyzeTwists2)
+     */
+    void
+    set_metadata(int numOrbs, int twist_num_inp,
+        const TinyVector<double, OHMMS_DIM>& twist_inp,
+        bool skipChecks = false);
+
+    void
+    createBsplineReader(
+        bool useSingle, bool hybridRep, const std::string& useGPU);
+
+    /** analyze twists of orbitals in h5 and determine twist_num_
+     * @param twist_num_inp twistnum XML input
+     * @param twist_inp twist XML input
+     */
+    void
+    AnalyzeTwists2(const int twist_num_inp,
+        const TinyVector<double, OHMMS_DIM>& twist_inp);
+
+    /// twist_num_inp == -9999 to indicate no given input after parsing XML
+    static constexpr int TWISTNUM_NO_INPUT = -9999;
+    /// twist_inp[i] <= -9999 to indicate no given input after parsing XML
+    static constexpr double TWIST_NO_INPUT = -9999;
+};
+
+} // namespace qmcplusplus
+
+#endif
diff --git a/src/QMCWaveFunctions/OrbitalSetTraits.h b/src/QMCWaveFunctions/OrbitalSetTraits.h
index 7b35937067..881532fcef 100644
--- a/src/QMCWaveFunctions/OrbitalSetTraits.h
+++ b/src/QMCWaveFunctions/OrbitalSetTraits.h
@@ -54,6 +54,7 @@ struct OrbitalSetTraits //: public OrbitalTraits<T>
     DIM = OHMMS_DIM
   };
   using RealType       = RealAlias<T>;
+  using ComplexType    = std::complex<RealType>;
   using ValueType      = T;
   using IndexType      = int;
   using PosType        = TinyVector<RealType, DIM>;
diff --git a/src/QMCWaveFunctions/SPOSetT.h b/src/QMCWaveFunctions/SPOSetT.h
index f3fd993c5c..98c3743f7a 100644
--- a/src/QMCWaveFunctions/SPOSetT.h
+++ b/src/QMCWaveFunctions/SPOSetT.h
@@ -78,6 +78,7 @@ class SPOSetT : public QMCTraits
         Array<T, 2, OffloadPinnedAllocator<T>>; // [walker, Orbs]
     using PosType = typename OrbitalSetTraits<T>::PosType;
     using RealType = typename OrbitalSetTraits<T>::RealType;
+    using ComplexType = typename OrbitalSetTraits<T>::ComplexType;
     using ValueType = typename OrbitalSetTraits<T>::ValueType;
     using FullRealType = typename OrbitalSetTraits<double>::RealType;
     template <typename DT>
diff --git a/src/QMCWaveFunctions/SpinorSetT.cpp b/src/QMCWaveFunctions/SpinorSetT.cpp
index 1090397ad1..bac10a6ec8 100644
--- a/src/QMCWaveFunctions/SpinorSetT.cpp
+++ b/src/QMCWaveFunctions/SpinorSetT.cpp
@@ -183,7 +183,7 @@ SpinorSetT<T>::mw_evaluateVGLWithSpin(
     const RefVector<ValueVector>& psi_v_list,
     const RefVector<GradVector>& dpsi_v_list,
     const RefVector<ValueVector>& d2psi_v_list,
-    OffloadMatrix<QMCTraits::ComplexType>& mw_dspin) const
+    OffloadMatrix<ComplexType>& mw_dspin) const
 {
     auto& spo_leader = spo_list.template getCastedLeader<SpinorSetT<T>>();
     auto& P_leader = P_list.getLeader();
diff --git a/src/QMCWaveFunctions/SpinorSetT.h b/src/QMCWaveFunctions/SpinorSetT.h
index 08990e350b..08d869b112 100644
--- a/src/QMCWaveFunctions/SpinorSetT.h
+++ b/src/QMCWaveFunctions/SpinorSetT.h
@@ -40,6 +40,7 @@ class SpinorSetT : public SPOSetT<T>
     template <typename DT>
     using OffloadMatrix = typename SPOSetT<T>::template OffloadMatrix<DT>;
     using RealType = typename SPOSetT<T>::RealType;
+    using ComplexType = typename SPOSetT<T>::ComplexType;
     using IndexType = OHMMS_INDEXTYPE;
 
     /** constructor */
@@ -129,7 +130,7 @@ class SpinorSetT : public SPOSetT<T>
         const RefVector<ValueVector>& psi_v_list,
         const RefVector<GradVector>& dpsi_v_list,
         const RefVector<ValueVector>& d2psi_v_list,
-        OffloadMatrix<QMCTraits::ComplexType>& mw_dspin) const override;
+        OffloadMatrix<ComplexType>& mw_dspin) const override;
 
     /** evaluate the values, gradients and laplacians of this single-particle
      * orbital sets and determinant ratio and grads of multiple walkers. Device
diff --git a/src/QMCWaveFunctions/tests/CMakeLists.txt b/src/QMCWaveFunctions/tests/CMakeLists.txt
index b414f0158b..ee68f38a1e 100644
--- a/src/QMCWaveFunctions/tests/CMakeLists.txt
+++ b/src/QMCWaveFunctions/tests/CMakeLists.txt
@@ -112,6 +112,7 @@ set(SPOSET_SRC
     test_pw.cpp
     test_ConstantSPOSet.cpp
     test_ConstantSPOSetT.cpp
+    test_RotatedSPOsT.cpp
     ${MO_SRCS})
 if(NiO_a16_H5_FOUND)
   set(SPOSET_SRC ${SPOSET_SRC} test_einset_NiO_a16.cpp)
diff --git a/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp b/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp
new file mode 100644
index 0000000000..24a5087f79
--- /dev/null
+++ b/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp
@@ -0,0 +1,1024 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// This file is distributed under the University of Illinois/NCSA Open Source
+// License. See LICENSE file in top directory for details.
+//
+// Copyright (c) 2022 QMCPACK developers.
+//
+// File developed by: Joshua Townsend, jptowns@sandia.gov, Sandia National
+// Laboratories
+//
+// File created by: Joshua Townsend, jptowns@sandia.gov, Sandia National
+// Laboratories
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "FakeSPOT.h"
+#include "OhmmsData/Libxml2Doc.h"
+#include "OhmmsPETE/OhmmsMatrix.h"
+#include "Particle/ParticleSetPoolT.h"
+#include "Particle/ParticleSetT.h"
+#include "QMCWaveFunctions/EinsplineSetBuilderT.h"
+#include "QMCWaveFunctions/RotatedSPOsT.h"
+#include "QMCWaveFunctions/WaveFunctionComponent.h"
+#include "catch.hpp"
+#include "checkMatrix.hpp"
+#include "type_traits/ConvertToReal.h"
+#include "type_traits/template_types.hpp"
+#include <ResourceCollection.h>
+#include <stdio.h>
+
+#include <limits>
+#include <string>
+
+using std::string;
+
+namespace qmcplusplus
+{
+/// Selects the Catch approx matcher for a value type (Approx by default).
+template <class ValueT>
+struct ValueApproxHelper
+{
+    using Type = Catch::Detail::Approx;
+};
+template <class RealT>
+struct ValueApproxHelper<std::complex<RealT>> // complex -> ComplexApprox
+{
+    using Type = Catch::Detail::ComplexApprox;
+};
+template <class ValueT>
+using ValueApprox = typename ValueApproxHelper<ValueT>::Type;
+
+namespace testing // accessors returning references to SPO-set members
+{
+OptVariablesType<float>&
+getMyVars(SPOSetT<float>& rot)
+{
+    return rot.myVars;
+}
+OptVariablesType<double>&
+getMyVars(SPOSetT<double>& rot)
+{
+    return rot.myVars;
+}
+OptVariablesType<float>&
+getMyVarsFull(RotatedSPOsT<float>& rot)
+{
+    return rot.myVarsFull;
+}
+OptVariablesType<double>&
+getMyVarsFull(RotatedSPOsT<double>& rot)
+{
+    return rot.myVarsFull;
+}
+std::vector<std::vector<float>>&
+getHistoryParams(RotatedSPOsT<float>& rot)
+{
+    return rot.history_params_;
+}
+
+std::vector<std::vector<double>>&
+getHistoryParams(RotatedSPOsT<double>& rot)
+{
+    return rot.history_params_;
+}
+} // namespace testing
+
+/*
+  JPT 04.01.2022: Adapted from test_einset.cpp
+  Test the spline rotated machinery for SplineR2R (extend to others later).
+*/
+TEMPLATE_TEST_CASE(
+    "RotatedSPOs via SplineR2R", "[wavefunction][template]", double, float)
+{
+    using RealType = typename SPOSetT<TestType>::RealType;
+
+    /*
+      BEGIN Boilerplate stuff to make a simple SPOSet. Copied from
+      test_einset.cpp
+    */
+
+    Communicate* c = OHMMS::Controller;
+
+    // We get a "Mismatched supercell lattices" error due to default ctor?
+    typename ParticleSetT<TestType>::ParticleLayout lattice;
+
+    // diamondC_1x1x1
+    lattice.R = {3.37316115, 3.37316115, 0.0, 0.0, 3.37316115, 3.37316115,
+        3.37316115, 0.0, 3.37316115};
+
+    ParticleSetPoolT<TestType> ptcl = ParticleSetPoolT<TestType>(c);
+    ptcl.setSimulationCell(lattice);
+    // Lattice seems fine after this point...
+
+    auto ions_uptr =
+        std::make_unique<ParticleSetT<TestType>>(ptcl.getSimulationCell());
+    auto elec_uptr =
+        std::make_unique<ParticleSetT<TestType>>(ptcl.getSimulationCell());
+    ParticleSetT<TestType>& ions_(*ions_uptr);
+    ParticleSetT<TestType>& elec_(*elec_uptr);
+
+    ions_.setName("ion");
+    ptcl.addParticleSet(std::move(ions_uptr));
+    ions_.create({2});
+    ions_.R[0] = {0.0, 0.0, 0.0};
+    ions_.R[1] = {1.68658058, 1.68658058, 1.68658058};
+    elec_.setName("elec");
+    ptcl.addParticleSet(std::move(elec_uptr));
+    elec_.create({2});
+    elec_.R[0] = {0.0, 0.0, 0.0};
+    elec_.R[1] = {0.0, 1.0, 0.0};
+    SpeciesSet& tspecies = elec_.getSpeciesSet();
+    int upIdx = tspecies.addSpecies("u");
+    int chargeIdx = tspecies.addAttribute("charge");
+    tspecies(chargeIdx, upIdx) = -1;
+
+    // diamondC_1x1x1 - 8 bands available
+    const char* particles = R"(<tmp>
+<determinantset type="einspline" href="diamondC_1x1x1.pwscf.h5" tilematrix="1 0 0 0 1 0 0 0 1" twistnum="0" source="ion" meshfactor="1.0" precision="float" size="8"/>
+</tmp>
+)";
+
+    Libxml2Document doc;
+    bool okay = doc.parseFromString(particles);
+    REQUIRE(okay);
+
+    xmlNodePtr root = doc.getRoot();
+
+    xmlNodePtr ein1 = xmlFirstElementChild(root);
+
+    EinsplineSetBuilderT<TestType> einSet(elec_, ptcl.getPool(), c, ein1);
+    auto spo = einSet.createSPOSetFromXML(ein1);
+    REQUIRE(spo);
+
+    /*
+      END Boilerplate stuff. Now we have a SplineR2R wavefunction
+      ready for rotation. What follows is the actual test.
+    */
+
+    // SplineR2R only for the moment, so skip if QMC_COMPLEX is set
+#if !defined(QMC_COMPLEX)
+
+    spo->storeParamsBeforeRotation();
+    // 1.) Make a RotatedSPOs object so that we can use the rotation routines
+    auto rot_spo = std::make_unique<RotatedSPOsT<TestType>>(
+        "one_rotated_set", std::move(spo));
+
+    // Sanity check for orbs. Expect 2 electrons, 8 orbitals, & 79507 coefs/orb.
+    const auto orbitalsetsize = rot_spo->getOrbitalSetSize();
+    REQUIRE(orbitalsetsize == 8);
+
+    // 2.) Get data for unrotated orbitals. Check that there's no rotation
+    rot_spo->buildOptVariables(elec_.R.size());
+    typename SPOSetT<TestType>::ValueMatrix psiM_bare(
+        elec_.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::GradMatrix dpsiM_bare(
+        elec_.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::ValueMatrix d2psiM_bare(
+        elec_.R.size(), orbitalsetsize);
+    rot_spo->evaluate_notranspose(
+        elec_, 0, elec_.R.size(), psiM_bare, dpsiM_bare, d2psiM_bare);
+
+    // This stuff checks that no rotation was applied. Copied from
+    // test_einset.cpp. value
+    CHECK(std::real(psiM_bare[1][0]) == ValueApprox<TestType>(-0.8886948824));
+    CHECK(std::real(psiM_bare[1][1]) == ValueApprox<TestType>(1.4194120169));
+    // grad
+    CHECK(
+        std::real(dpsiM_bare[1][0][0]) == ValueApprox<TestType>(-0.0000183403));
+    CHECK(
+        std::real(dpsiM_bare[1][0][1]) == ValueApprox<TestType>(0.1655139178));
+    CHECK(
+        std::real(dpsiM_bare[1][0][2]) == ValueApprox<TestType>(-0.0000193077));
+    CHECK(
+        std::real(dpsiM_bare[1][1][0]) == ValueApprox<TestType>(-1.3131694794));
+    CHECK(
+        std::real(dpsiM_bare[1][1][1]) == ValueApprox<TestType>(-1.1174004078));
+    CHECK(
+        std::real(dpsiM_bare[1][1][2]) == ValueApprox<TestType>(-0.8462534547));
+    // lapl
+    CHECK(std::real(d2psiM_bare[1][0]) == ValueApprox<TestType>(1.3313053846));
+    CHECK(std::real(d2psiM_bare[1][1]) == ValueApprox<TestType>(-4.712583065));
+
+    /*
+       3.) Apply a rotation to the orbitals
+           To do this, construct a params vector and call the
+       RotatedSPOs::apply_rotation(params) method. That should do the
+       right thing for this particular spline class.
+
+       For 2 electrons in 8 orbs, we expect 2*(8-2) = 12 params.
+    */
+    const auto rot_size = rot_spo->m_act_rot_inds.size();
+    REQUIRE(rot_size == 12); // = Nelec*(Norbs - Nelec) = 2*(8-2) = 12
+    std::vector<RealType> param(rot_size);
+    for (auto i = 0; i < rot_size; i++) {
+        param[i] = 0.01 * static_cast<RealType>(i);
+    }
+    rot_spo->apply_rotation(
+        param, false); // Expect this to call SplineR2R::applyRotation()
+
+    // 4.) Get data for rotated orbitals.
+    typename SPOSetT<TestType>::ValueMatrix psiM_rot(
+        elec_.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::GradMatrix dpsiM_rot(
+        elec_.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::ValueMatrix d2psiM_rot(
+        elec_.R.size(), orbitalsetsize);
+    rot_spo->evaluate_notranspose(
+        elec_, 0, elec_.R.size(), psiM_rot, dpsiM_rot, d2psiM_rot);
+
+    /*
+       Manually encode the unitary transformation. Ugly, but it works.
+       @TODO: Use the total rotation machinery when it's implemented
+
+       NB: This is truncated to 5 sig-figs, so there is some slop here as
+           compared to what is done in the splines via apply_rotation().
+       So below we reduce the threshold for comparison. This can
+       probably be ditched once we have a way to grab the actual
+       rotation matrix...
+    */
+    typename SPOSetT<TestType>::ValueMatrix rot_mat(
+        orbitalsetsize, orbitalsetsize);
+    rot_mat[0][0] = 0.99726;
+    rot_mat[0][1] = -0.00722;
+    rot_mat[0][2] = 0.00014;
+    rot_mat[0][3] = -0.00982;
+    rot_mat[0][4] = -0.01979;
+    rot_mat[0][5] = -0.02976;
+    rot_mat[0][6] = -0.03972;
+    rot_mat[0][7] = -0.04969;
+    rot_mat[1][0] = -0.00722;
+    rot_mat[1][1] = 0.97754;
+    rot_mat[1][2] = -0.05955;
+    rot_mat[1][3] = -0.06945;
+    rot_mat[1][4] = -0.07935;
+    rot_mat[1][5] = -0.08925;
+    rot_mat[1][6] = -0.09915;
+    rot_mat[1][7] = -0.10905;
+    rot_mat[2][0] = -0.00014;
+    rot_mat[2][1] = 0.05955;
+    rot_mat[2][2] = 0.99821;
+    rot_mat[2][3] = -0.00209;
+    rot_mat[2][4] = -0.00239;
+    rot_mat[2][5] = -0.00269;
+    rot_mat[2][6] = -0.00299;
+    rot_mat[2][7] = -0.00329;
+    rot_mat[3][0] = 0.00982;
+    rot_mat[3][1] = 0.06945;
+    rot_mat[3][2] = -0.00209;
+    rot_mat[3][3] = 0.99751;
+    rot_mat[3][4] = -0.00289;
+    rot_mat[3][5] = -0.00329;
+    rot_mat[3][6] = -0.00368;
+    rot_mat[3][7] = -0.00408;
+    rot_mat[4][0] = 0.01979;
+    rot_mat[4][1] = 0.07935;
+    rot_mat[4][2] = -0.00239;
+    rot_mat[4][3] = -0.00289;
+    rot_mat[4][4] = 0.99661;
+    rot_mat[4][5] = -0.00388;
+    rot_mat[4][6] = -0.00438;
+    rot_mat[4][7] = -0.00488;
+    rot_mat[5][0] = 0.02976;
+    rot_mat[5][1] = 0.08925;
+    rot_mat[5][2] = -0.00269;
+    rot_mat[5][3] = -0.00329;
+    rot_mat[5][4] = -0.00388;
+    rot_mat[5][5] = 0.99552;
+    rot_mat[5][6] = -0.00508;
+    rot_mat[5][7] = -0.00568;
+    rot_mat[6][0] = 0.03972;
+    rot_mat[6][1] = 0.09915;
+    rot_mat[6][2] = -0.00299;
+    rot_mat[6][3] = -0.00368;
+    rot_mat[6][4] = -0.00438;
+    rot_mat[6][5] = -0.00508;
+    rot_mat[6][6] = 0.99422;
+    rot_mat[6][7] = -0.00647;
+    rot_mat[7][0] = 0.04969;
+    rot_mat[7][1] = 0.10905;
+    rot_mat[7][2] = -0.00329;
+    rot_mat[7][3] = -0.00408;
+    rot_mat[7][4] = -0.00488;
+    rot_mat[7][5] = -0.00568;
+    rot_mat[7][6] = -0.00647;
+    rot_mat[7][7] = 0.99273;
+
+    // Now compute the expected values by hand using the transformation above
+    double val1 = 0.;
+    double val2 = 0.;
+    for (auto i = 0; i < rot_mat.size1(); i++) {
+        val1 += psiM_bare[0][i] * rot_mat[i][0];
+        val2 += psiM_bare[1][i] * rot_mat[i][0];
+    }
+
+    // value
+    CHECK(std::real(psiM_rot[0][0]) == ValueApprox<TestType>(val1));
+    CHECK(std::real(psiM_rot[1][0]) == ValueApprox<TestType>(val2));
+
+    std::vector<double> grad1(3);
+    std::vector<double> grad2(3);
+    for (auto j = 0; j < grad1.size(); j++) {
+        for (auto i = 0; i < rot_mat.size1(); i++) {
+            grad1[j] += dpsiM_bare[0][i][j] * rot_mat[i][0];
+            grad2[j] += dpsiM_bare[1][i][j] * rot_mat[i][0];
+        }
+    }
+
+    // grad
+    CHECK(
+        dpsiM_rot[0][0][0] == ValueApprox<TestType>(grad1[0]).epsilon(0.0001));
+    CHECK(
+        dpsiM_rot[0][0][1] == ValueApprox<TestType>(grad1[1]).epsilon(0.0001));
+    CHECK(
+        dpsiM_rot[0][0][2] == ValueApprox<TestType>(grad1[2]).epsilon(0.0001));
+    CHECK(
+        dpsiM_rot[1][0][0] == ValueApprox<TestType>(grad2[0]).epsilon(0.0001));
+    CHECK(
+        dpsiM_rot[1][0][1] == ValueApprox<TestType>(grad2[1]).epsilon(0.0001));
+    CHECK(
+        dpsiM_rot[1][0][2] == ValueApprox<TestType>(grad2[2]).epsilon(0.0001));
+
+    double lap1 = 0.;
+    double lap2 = 0.;
+    for (auto i = 0; i < rot_mat.size1(); i++) {
+        lap1 += d2psiM_bare[0][i] * rot_mat[i][0];
+        lap2 += d2psiM_bare[1][i] * rot_mat[i][0];
+    }
+
+    // Lapl
+    CHECK(std::real(d2psiM_rot[0][0]) ==
+        ValueApprox<TestType>(lap1).epsilon(0.0001));
+    CHECK(std::real(d2psiM_rot[1][0]) ==
+        ValueApprox<TestType>(lap2).epsilon(0.0001));
+
+#endif
+}
+
+TEMPLATE_TEST_CASE("RotatedSPOs createRotationIndices",
+    "[wavefunction][template]", double, float)
+{
+    // Restricted index set: active-virtual pairs only (no active-active or
+    // virtual-virtual rotations), i.e. nel*(nmo-nel) entries.
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind;
+    int nel = 1;
+    int nmo = 3;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind);
+    CHECK(rot_ind.size() == 2); // 1*(3-1)
+
+    // Full rotation contains all rotations
+    // Size should be number of pairs of orbitals: nmo*(nmo-1)/2
+    typename RotatedSPOsT<TestType>::RotationIndices full_rot_ind;
+    RotatedSPOsT<TestType>::createRotationIndicesFull(nel, nmo, full_rot_ind);
+    CHECK(full_rot_ind.size() == 3); // 3*2/2
+
+    nel = 2; // nel = 2, nmo still 3
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind2;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind2);
+    CHECK(rot_ind2.size() == 2); // 2*(3-2)
+
+    typename RotatedSPOsT<TestType>::RotationIndices full_rot_ind2;
+    RotatedSPOsT<TestType>::createRotationIndicesFull(nel, nmo, full_rot_ind2);
+    CHECK(full_rot_ind2.size() == 3); // same as for nel = 1
+
+    nmo = 4; // nel = 2, nmo = 4
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind3;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind3);
+    CHECK(rot_ind3.size() == 4); // 2*(4-2)
+
+    typename RotatedSPOsT<TestType>::RotationIndices full_rot_ind3;
+    RotatedSPOsT<TestType>::createRotationIndicesFull(nel, nmo, full_rot_ind3);
+    CHECK(full_rot_ind3.size() == 6); // 4*3/2
+}
+
+TEMPLATE_TEST_CASE("RotatedSPOs constructAntiSymmetricMatrix",
+    "[wavefunction][template]", double, float)
+{
+    using ValueType = typename SPOSetT<TestType>::ValueType;
+    using ValueMatrix = typename SPOSetT<TestType>::ValueMatrix;
+
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind;
+    int nel = 1;
+    int nmo = 3;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind);
+    // One parameter per index pair; start from a zeroed nmo x nmo matrix.
+    ValueMatrix m3(nmo, nmo);
+    m3 = ValueType(0);
+    std::vector<ValueType> params = {0.1, 0.2};
+
+    RotatedSPOsT<TestType>::constructAntiSymmetricMatrix(rot_ind, params, m3);
+    // Expected: +param below the diagonal, -param mirrored above it.
+    // clang-format off
+  std::vector<ValueType> expected_data = { 0.0,  -0.1, -0.2,
+                                           0.1,   0.0,  0.0,
+                                           0.2,   0.0,  0.0 };
+    // clang-format on
+
+    ValueMatrix expected_m3(expected_data.data(), 3, 3);
+
+    CheckMatrixResult check_matrix_result = checkMatrix(m3, expected_m3, true);
+    CHECKED_ELSE(check_matrix_result.result)
+    {
+        FAIL(check_matrix_result.result_message);
+    }
+    // Round trip: the same parameters must be recoverable from m3.
+    std::vector<ValueType> params_out(2);
+    RotatedSPOsT<TestType>::extractParamsFromAntiSymmetricMatrix(
+        rot_ind, m3, params_out);
+    CHECK(params_out[0] == ValueApprox<TestType>(0.1));
+    CHECK(params_out[1] == ValueApprox<TestType>(0.2));
+}
+
+// Expected values of the matrix exponential come from gen_matrix_ops.py
+TEMPLATE_TEST_CASE("RotatedSPOs exponentiate matrix",
+    "[wavefunction][template]", double, float)
+{
+    using ValueType = typename SPOSetT<TestType>::ValueType;
+    using ValueMatrix = typename SPOSetT<TestType>::ValueMatrix;
+    // exponentiate_antisym_matrix overwrites its argument with the result.
+    std::vector<typename SPOSetT<TestType>::ValueType> mat1_data = {0.0};
+    typename SPOSetT<TestType>::ValueMatrix m1(mat1_data.data(), 1, 1);
+    RotatedSPOsT<TestType>::exponentiate_antisym_matrix(m1);
+    // exp(0) == 1 (the only possible anti-symmetric 1x1 matrix is 0)
+    CHECK(m1(0, 0) == ValueApprox<TestType>(1.0));
+
+    // clang-format off
+  std::vector<typename SPOSetT<TestType>::ValueType> mat2_data = { 0.0, -0.1,
+                                               0.1,  0.0 };
+    // clang-format on
+
+    typename SPOSetT<TestType>::ValueMatrix m2(mat2_data.data(), 2, 2);
+    RotatedSPOsT<TestType>::exponentiate_antisym_matrix(m2);
+    // Result is a plane rotation by 0.1: [[cos, -sin], [sin, cos]].
+    // clang-format off
+  std::vector<ValueType> expected_rot2 = {  0.995004165278026,  -0.0998334166468282,
+                                            0.0998334166468282,  0.995004165278026 };
+    // clang-format on
+
+    ValueMatrix expected_m2(expected_rot2.data(), 2, 2);
+    CheckMatrixResult check_matrix_result2 = checkMatrix(m2, expected_m2, true);
+    CHECKED_ELSE(check_matrix_result2.result)
+    {
+        FAIL(check_matrix_result2.result_message);
+    }
+    // 3x3 case: general anti-symmetric input and its reference exponential.
+    // clang-format off
+  std::vector<ValueType> m3_input_data = { 0.0,  -0.3, -0.1,
+                                           0.3,   0.0, -0.2,
+                                           0.1,   0.2,  0.0 };
+
+
+  std::vector<ValueType> expected_rot3 = {  0.950580617906092, -0.302932713402637, -0.0680313164049401,
+                                            0.283164960565074,  0.935754803277919, -0.210191705950743,
+                                            0.127334574917630,  0.180540076694398,  0.975290308953046 };
+
+    // clang-format on
+
+    ValueMatrix m3(m3_input_data.data(), 3, 3);
+    ValueMatrix expected_m3(expected_rot3.data(), 3, 3);
+
+    RotatedSPOsT<TestType>::exponentiate_antisym_matrix(m3);
+
+    CheckMatrixResult check_matrix_result3 = checkMatrix(m3, expected_m3, true);
+    CHECKED_ELSE(check_matrix_result3.result)
+    {
+        FAIL(check_matrix_result3.result_message);
+    }
+}
+
+TEMPLATE_TEST_CASE(
+    "RotatedSPOs log matrix", "[wavefunction][template]", double, float)
+{
+    using ValueType = typename SPOSetT<TestType>::ValueType;
+    using ValueMatrix = typename SPOSetT<TestType>::ValueMatrix;
+    // log_antisym_matrix(in, out): out receives the log of the rotation in.
+    std::vector<typename SPOSetT<TestType>::ValueType> mat1_data = {1.0};
+    typename SPOSetT<TestType>::ValueMatrix m1(mat1_data.data(), 1, 1);
+    typename SPOSetT<TestType>::ValueMatrix out_m1(1, 1);
+    RotatedSPOsT<TestType>::log_antisym_matrix(m1, out_m1);
+    // Should always be 0.0 (the only possible anti-symmetric 1x1 matrix is 0)
+    CHECK(out_m1(0, 0) == ValueApprox<TestType>(0.0));
+
+    // clang-format off
+  std::vector<ValueType> start_rot2 = {  0.995004165278026,  -0.0998334166468282,
+                                         0.0998334166468282,  0.995004165278026 };
+
+  std::vector<typename SPOSetT<TestType>::ValueType> mat2_data = { 0.0, -0.1,
+                                               0.1,  0.0 };
+    // clang-format on
+    // The log of the rotation by 0.1 must give back the generator mat2_data.
+    ValueMatrix rot_m2(start_rot2.data(), 2, 2);
+    ValueMatrix out_m2(2, 2);
+    RotatedSPOsT<TestType>::log_antisym_matrix(rot_m2, out_m2);
+
+    typename SPOSetT<TestType>::ValueMatrix m2(mat2_data.data(), 2, 2);
+    CheckMatrixResult check_matrix_result2 = checkMatrix(m2, out_m2, true);
+    CHECKED_ELSE(check_matrix_result2.result)
+    {
+        FAIL(check_matrix_result2.result_message);
+    }
+    // 3x3 case: same matrices as the exponentiate test, roles swapped.
+    // clang-format off
+  std::vector<ValueType> start_rot3 = {  0.950580617906092, -0.302932713402637, -0.0680313164049401,
+                                         0.283164960565074,  0.935754803277919, -0.210191705950743,
+                                         0.127334574917630,  0.180540076694398,  0.975290308953046 };
+
+  std::vector<ValueType> m3_input_data = { 0.0,  -0.3, -0.1,
+                                           0.3,   0.0, -0.2,
+                                           0.1,   0.2,  0.0 };
+    // clang-format on
+    ValueMatrix rot_m3(start_rot3.data(), 3, 3);
+    ValueMatrix out_m3(3, 3);
+    RotatedSPOsT<TestType>::log_antisym_matrix(rot_m3, out_m3);
+
+    typename SPOSetT<TestType>::ValueMatrix m3(m3_input_data.data(), 3, 3);
+    CheckMatrixResult check_matrix_result3 = checkMatrix(m3, out_m3, true);
+    CHECKED_ELSE(check_matrix_result3.result)
+    {
+        FAIL(check_matrix_result3.result_message);
+    }
+}
+
+// Test round trip A -> exp(A) -> log(exp(A))
+// The log is multi-valued, so this test may fail if the rotation parameters
+// are too large. The exponentials will be the same, though:
+//   exp(log(exp(A))) == exp(A)
+TEMPLATE_TEST_CASE(
+    "RotatedSPOs exp-log matrix", "[wavefunction][template]", double, float)
+{
+    using ValueType = typename SPOSetT<TestType>::ValueType;
+    using ValueMatrix = typename SPOSetT<TestType>::ValueMatrix;
+
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind;
+    int nel = 2;
+    int nmo = 4;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind);
+
+    ValueMatrix rot_m4(nmo, nmo);
+    rot_m4 = ValueType(0);
+    // One parameter per entry of rot_ind (4 entries for nel=2, nmo=4).
+    std::vector<ValueType> params4 = {-1.1, 1.5, 0.2, -0.15};
+
+    RotatedSPOsT<TestType>::constructAntiSymmetricMatrix(
+        rot_ind, params4, rot_m4);
+    ValueMatrix orig_rot_m4 = rot_m4; // keep A; rot_m4 is overwritten below
+    ValueMatrix out_m4(nmo, nmo);
+
+    RotatedSPOsT<TestType>::exponentiate_antisym_matrix(rot_m4);
+
+    RotatedSPOsT<TestType>::log_antisym_matrix(rot_m4, out_m4);
+
+    CheckMatrixResult check_matrix_result4 =
+        checkMatrix(out_m4, orig_rot_m4, true);
+    CHECKED_ELSE(check_matrix_result4.result)
+    {
+        FAIL(check_matrix_result4.result_message);
+    }
+    // The recovered parameters must match the ones we started from.
+    std::vector<ValueType> params4out(params4.size());
+    RotatedSPOsT<TestType>::extractParamsFromAntiSymmetricMatrix(
+        rot_ind, out_m4, params4out);
+    for (std::size_t i = 0; i < params4.size(); i++) {
+        CHECK(params4[i] == ValueApprox<TestType>(params4out[i]));
+    }
+}
+
+TEMPLATE_TEST_CASE(
+    "RotatedSPOs hcpBe", "[wavefunction][template]", double, float)
+{
+    using RealType = typename OrbitalSetTraits<TestType>::RealType;
+    Communicate* c = OHMMS::Controller;
+
+    typename ParticleSetT<TestType>::ParticleLayout lattice;
+    lattice.R = {4.32747284, 0.00000000, 0.00000000, -2.16373642, 3.74770142,
+        0.00000000, 0.00000000, 0.00000000, 6.78114995};
+
+    ParticleSetPoolT<TestType> ptcl = ParticleSetPoolT<TestType>(c);
+    ptcl.setSimulationCell(lattice);
+    auto ions_uptr =
+        std::make_unique<ParticleSetT<TestType>>(ptcl.getSimulationCell());
+    auto elec_uptr =
+        std::make_unique<ParticleSetT<TestType>>(ptcl.getSimulationCell());
+    ParticleSetT<TestType>& ions(*ions_uptr);
+    ParticleSetT<TestType>& elec(*elec_uptr);
+    // The unique_ptrs are moved into the pool; the references stay in use.
+    ions.setName("ion");
+    ptcl.addParticleSet(std::move(ions_uptr));
+    ions.create({1});
+    ions.R[0] = {0.0, 0.0, 0.0};
+
+    elec.setName("elec");
+    ptcl.addParticleSet(std::move(elec_uptr));
+    elec.create({1});
+    elec.R[0] = {0.0, 0.0, 0.0};
+
+    SpeciesSet& tspecies = elec.getSpeciesSet();
+    int upIdx = tspecies.addSpecies("u");
+    int chargeIdx = tspecies.addAttribute("charge");
+    tspecies(chargeIdx, upIdx) = -1;
+
+    // Add the attribute save_coefs="yes" to the sposet_builder tag to generate
+    // the spline file for use in eval_bspline_spo.py
+
+    const char* particles = R"(<tmp>
+<sposet_builder type="bspline" href="hcpBe.pwscf.h5" tilematrix="1 0 0 0 1 0 0 0 1" twistnum="0" source="ion" meshfactor="1.0" precision="double">
+      <sposet type="bspline" name="spo_ud" spindataset="0" size="2"/>
+</sposet_builder>
+</tmp>)";
+
+    Libxml2Document doc;
+    bool okay = doc.parseFromString(particles);
+    REQUIRE(okay);
+
+    xmlNodePtr root = doc.getRoot();
+
+    xmlNodePtr sposet_builder = xmlFirstElementChild(root);
+    xmlNodePtr sposet_ptr = xmlFirstElementChild(sposet_builder);
+
+    EinsplineSetBuilderT<TestType> einSet(elec, ptcl.getPool(), c, sposet_builder);
+    auto spo = einSet.createSPOSetFromXML(sposet_ptr);
+    REQUIRE(spo);
+    // Hand the SPO set over to a RotatedSPOsT wrapper (spo is moved from).
+    spo->storeParamsBeforeRotation();
+    auto rot_spo = std::make_unique<RotatedSPOsT<TestType>>(
+        "one_rotated_set", std::move(spo));
+
+    // Sanity check for orbs. Expect 1 electron, 2 orbitals
+    const auto orbitalsetsize = rot_spo->getOrbitalSetSize();
+    REQUIRE(orbitalsetsize == 2);
+
+    rot_spo->buildOptVariables(elec.R.size());
+
+    typename SPOSetT<TestType>::ValueMatrix psiM_bare(
+        elec.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::GradMatrix dpsiM_bare(
+        elec.R.size(), orbitalsetsize);
+    typename SPOSetT<TestType>::ValueMatrix d2psiM_bare(
+        elec.R.size(), orbitalsetsize);
+    rot_spo->evaluate_notranspose(
+        elec, 0, elec.R.size(), psiM_bare, dpsiM_bare, d2psiM_bare);
+
+    // Values generated from eval_bspline_spo.py, the
+    // generate_point_values_hcpBe function
+    CHECK(
+        std::real(psiM_bare[0][0]) == ValueApprox<TestType>(0.210221765375514));
+    CHECK(std::real(psiM_bare[0][1]) ==
+        ValueApprox<TestType>(-2.984345024542937e-06));
+
+    CHECK(std::real(d2psiM_bare[0][0]) ==
+        ValueApprox<TestType>(5.303848362116568));
+    // Register the rotation parameter and evaluate the unrotated derivatives.
+    OptVariablesType<TestType> opt_vars;
+    rot_spo->checkInVariablesExclusive(opt_vars);
+    opt_vars.resetIndex();
+    rot_spo->checkOutVariables(opt_vars);
+    rot_spo->resetParametersExclusive(opt_vars);
+
+    using ValueType = TestType;
+    Vector<ValueType> dlogpsi(1);
+    Vector<ValueType> dhpsioverpsi(1);
+    rot_spo->evaluateDerivatives(elec, opt_vars, dlogpsi, dhpsioverpsi, 0, 1);
+
+    CHECK(dlogpsi[0] == ValueApprox<TestType>(-1.41961753e-05));
+    CHECK(dhpsioverpsi[0] == ValueApprox<TestType>(-0.00060853));
+    // Apply a rotation with the single parameter set to 0.1.
+    std::vector<RealType> params = {0.1};
+    rot_spo->apply_rotation(params, false);
+    // NB: the *_bare matrices are reused below to hold the rotated values.
+    rot_spo->evaluate_notranspose(
+        elec, 0, elec.R.size(), psiM_bare, dpsiM_bare, d2psiM_bare);
+    CHECK(std::real(psiM_bare[0][0]) ==
+        ValueApprox<TestType>(0.20917123424337608));
+    CHECK(std::real(psiM_bare[0][1]) ==
+        ValueApprox<TestType>(-0.02099012652669549));
+
+    CHECK(std::real(d2psiM_bare[0][0]) ==
+        ValueApprox<TestType>(5.277362065087747));
+
+    dlogpsi[0] = 0.0; // clear the outputs before the second evaluation
+    dhpsioverpsi[0] = 0.0;
+
+    rot_spo->evaluateDerivatives(elec, opt_vars, dlogpsi, dhpsioverpsi, 0, 1);
+    CHECK(dlogpsi[0] == ValueApprox<TestType>(-0.10034901119468914));
+    CHECK(dhpsioverpsi[0] == ValueApprox<TestType>(32.96939041498753));
+}
+
+// Test construction of delta rotation
+TEMPLATE_TEST_CASE("RotatedSPOs construct delta matrix",
+    "[wavefunction][template]", double, float)
+{
+    using ValueType = typename SPOSetT<TestType>::ValueType;
+    using ValueMatrix = typename SPOSetT<TestType>::ValueMatrix;
+
+    int nel = 2;
+    int nmo = 4;
+    typename RotatedSPOsT<TestType>::RotationIndices rot_ind;
+    RotatedSPOsT<TestType>::createRotationIndices(nel, nmo, rot_ind);
+    typename RotatedSPOsT<TestType>::RotationIndices full_rot_ind;
+    RotatedSPOsT<TestType>::createRotationIndicesFull(nel, nmo, full_rot_ind);
+    // rot_ind size is 4 and full_rot_ind size is 6
+
+    ValueMatrix rot_m4(nmo, nmo);
+    rot_m4 = ValueType(0);
+
+    // When comparing with gen_matrix_ops.py, be aware of the order of indices
+    // in full_rot_ind:
+    // rot_ind is (0,2) (0,3) (1,2) (1,3)
+    // full_rot_ind is (0,2) (0,3) (1,2) (1,3) (0,1) (2,3)
+    // The extra indices go at the back
+    std::vector<ValueType> old_params = {1.5, 0.2, -0.15, 0.03, -1.1, 0.05};
+    std::vector<ValueType> delta_params = {0.1, 0.3, 0.2, -0.1};
+    std::vector<ValueType> new_params(6);
+
+    RotatedSPOsT<TestType>::constructDeltaRotation(
+        delta_params, old_params, rot_ind, full_rot_ind, new_params, rot_m4);
+    // Reference rotation matrix that rot_m4 is compared against below.
+    // clang-format off
+  std::vector<ValueType> rot_data4 =
+    { -0.371126931484737,  0.491586564957393,   -0.784780958819798,   0.0687480658200083,
+      -0.373372784561548,  0.66111547793048,     0.610450337985578,   0.225542620014052,
+       0.751270334458895,  0.566737323353515,   -0.0297901110611425, -0.336918744155143,
+       0.398058348785074,  0.00881931472604944, -0.102867783149713,   0.911531672428406 };
+    // clang-format on
+
+    ValueMatrix new_rot_m4(rot_data4.data(), 4, 4);
+
+    CheckMatrixResult check_matrix_result4 =
+        checkMatrix(rot_m4, new_rot_m4, true);
+    CHECKED_ELSE(check_matrix_result4.result)
+    {
+        FAIL(check_matrix_result4.result_message);
+    }
+
+    // Reminder: Ordering! (same order as full_rot_ind above)
+    std::vector<ValueType> expected_new_param = {1.6813965019790489,
+        0.3623564254653294, -0.05486544454559908, -0.20574472941408453,
+        -0.9542513302873077, 0.27497788909911774};
+    for (std::size_t i = 0; i < new_params.size(); i++)
+        CHECK(new_params[i] == ValueApprox<TestType>(expected_new_param[i]));
+
+    // Rotated back to original position: applying the negated delta to
+    // new_params must reproduce old_params.
+    std::vector<ValueType> new_params2(6);
+    std::vector<ValueType> reverse_delta_params = {-0.1, -0.3, -0.2, 0.1};
+    RotatedSPOsT<TestType>::constructDeltaRotation(reverse_delta_params,
+        new_params, rot_ind, full_rot_ind, new_params2, rot_m4);
+    for (std::size_t i = 0; i < new_params2.size(); i++)
+        CHECK(new_params2[i] == ValueApprox<TestType>(old_params[i]));
+}
+
+// Test using global rotation
+TEMPLATE_TEST_CASE("RotatedSPOs read and write parameters",
+    "[wavefunction][template]", double, float)
+{
+    auto fake_spo = std::make_unique<FakeSPOT<TestType>>();
+    fake_spo->setOrbitalSetSize(4);
+    RotatedSPOsT<TestType> rot("fake_rot", std::move(fake_spo));
+    int nel = 2;
+    rot.buildOptVariables(nel);
+    // Check in the rotation parameters and give the first four known values.
+    optimize::VariableSetT<TestType> vs;
+    rot.checkInVariablesExclusive(vs);
+    vs[0] = 0.1;
+    vs[1] = 0.15;
+    vs[2] = 0.2;
+    vs[3] = 0.25;
+    rot.resetParametersExclusive(vs);
+    // hout lives only inside this scope (presumably so the file gets closed).
+    {
+        hdf_archive hout;
+        vs.writeToHDF("rot_vp.h5", hout);
+
+        rot.writeVariationalParameters(hout);
+    }
+    // A second, freshly built object reads the parameters back.
+    auto fake_spo2 = std::make_unique<FakeSPOT<TestType>>();
+    fake_spo2->setOrbitalSetSize(4);
+
+    RotatedSPOsT<TestType> rot2("fake_rot", std::move(fake_spo2));
+    rot2.buildOptVariables(nel);
+
+    optimize::VariableSetT<TestType> vs2;
+    rot2.checkInVariablesExclusive(vs2);
+
+    hdf_archive hin;
+    vs2.readFromHDF("rot_vp.h5", hin);
+    rot2.readVariationalParameters(hin);
+    // The restored variables must equal the values written from vs.
+    auto& var = testing::getMyVars(rot2);
+    CHECK(var[0] == ValueApprox<TestType>(vs[0]));
+    CHECK(var[1] == ValueApprox<TestType>(vs[1]));
+    CHECK(var[2] == ValueApprox<TestType>(vs[2]));
+    CHECK(var[3] == ValueApprox<TestType>(vs[3]));
+    // Full parameter list: the same four values first, two trailing zeros.
+    auto& full_var = testing::getMyVarsFull(rot2);
+    CHECK(full_var[0] == ValueApprox<TestType>(vs[0]));
+    CHECK(full_var[1] == ValueApprox<TestType>(vs[1]));
+    CHECK(full_var[2] == ValueApprox<TestType>(vs[2]));
+    CHECK(full_var[3] == ValueApprox<TestType>(vs[3]));
+    CHECK(full_var[4] == ValueApprox<TestType>(0.0));
+    CHECK(full_var[5] == ValueApprox<TestType>(0.0));
+}
+
+// Test using history list.
+TEMPLATE_TEST_CASE("RotatedSPOs read and write parameters history",
+    "[wavefunction][template]", double, float)
+{
+    auto fake_spo = std::make_unique<FakeSPOT<TestType>>();
+    fake_spo->setOrbitalSetSize(4);
+    RotatedSPOsT<TestType> rot("fake_rot", std::move(fake_spo));
+    rot.set_use_global_rotation(false); // use the history list instead
+    int nel = 2;
+    rot.buildOptVariables(nel);
+    // Check in the rotation parameters and give the first four known values.
+    optimize::VariableSetT<TestType> vs;
+    rot.checkInVariablesExclusive(vs);
+    vs[0] = 0.1;
+    vs[1] = 0.15;
+    vs[2] = 0.2;
+    vs[3] = 0.25;
+    rot.resetParametersExclusive(vs);
+    // hout lives only inside this scope (presumably so the file gets closed).
+    {
+        hdf_archive hout;
+        vs.writeToHDF("rot_vp_hist.h5", hout);
+
+        rot.writeVariationalParameters(hout);
+    }
+
+    auto fake_spo2 = std::make_unique<FakeSPOT<TestType>>();
+    fake_spo2->setOrbitalSetSize(4);
+    // NOTE(review): rot2 keeps the default mode; presumably read restores it.
+    RotatedSPOsT<TestType> rot2("fake_rot", std::move(fake_spo2));
+    rot2.buildOptVariables(nel);
+
+    optimize::VariableSetT<TestType> vs2;
+    rot2.checkInVariablesExclusive(vs2);
+
+    hdf_archive hin;
+    vs2.readFromHDF("rot_vp_hist.h5", hin);
+    rot2.readVariationalParameters(hin);
+    // The restored variables must equal the values written from vs.
+    auto& var = testing::getMyVars(rot2);
+    CHECK(var[0] == ValueApprox<TestType>(vs[0]));
+    CHECK(var[1] == ValueApprox<TestType>(vs[1]));
+    CHECK(var[2] == ValueApprox<TestType>(vs[2]));
+    CHECK(var[3] == ValueApprox<TestType>(vs[3]));
+    // Exactly one history entry with four parameters is expected.
+    auto hist = testing::getHistoryParams(rot2);
+    REQUIRE(hist.size() == 1);
+    REQUIRE(hist[0].size() == 4);
+}
+
+template <typename T>
+class DummySPOSetWithoutMWT : public SPOSetT<T>
+{ // Test stub: single-walker evaluateValue only, no mw_ overrides.
+public:
+    using ValueVector = typename SPOSetT<T>::ValueVector;
+    using ValueMatrix = typename SPOSetT<T>::ValueMatrix;
+    using GradVector = typename SPOSetT<T>::GradVector;
+    using GradMatrix = typename SPOSetT<T>::GradMatrix;
+    // evaluateValue writes 123/456/789; the other overrides are empty.
+    DummySPOSetWithoutMWT(const std::string& my_name) : SPOSetT<T>(my_name)
+    {
+    }
+    void
+    setOrbitalSetSize(int norbs) override
+    {
+    } // intentionally empty
+    void
+    evaluateValue(const ParticleSetT<T>& P, int iat,
+        typename SPOSetT<T>::ValueVector& psi) override
+    {
+        assert(psi.size() == 3); // callers must pass a 3-element vector
+        psi[0] = 123; // fixed sentinel values; P and iat are not used
+        psi[1] = 456;
+        psi[2] = 789;
+    }
+    void
+    evaluateVGL(const ParticleSetT<T>& P, int iat, ValueVector& psi,
+        GradVector& dpsi, ValueVector& d2psi) override
+    {
+    } // intentionally empty
+    void
+    evaluate_notranspose(const ParticleSetT<T>& P, int first, int last,
+        ValueMatrix& logdet, GradMatrix& dlogdet,
+        ValueMatrix& d2logdet) override
+    {
+    } // intentionally empty
+    std::string
+    getClassName() const override
+    {
+        return this->my_name_; // reports the object name as the class name
+    }
+};
+
+template <typename T>
+class DummySPOSetWithMWT : public DummySPOSetWithoutMWT<T>
+{ // Test stub that additionally overrides the multi-walker mw_evaluateValue.
+public:
+    using ValueVector = typename DummySPOSetWithoutMWT<T>::ValueVector;
+    // mw_ values (321/654/987) differ from evaluateValue's (123/456/789).
+    DummySPOSetWithMWT(const std::string& my_name) :
+        DummySPOSetWithoutMWT<T>(my_name)
+    {
+    }
+    void
+    mw_evaluateValue(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list) const override
+    {
+        for (auto& psi : psi_v_list) { // spo_list, P_list, iat are not used
+            assert(psi.get().size() == 3);
+            psi.get()[0] = 321;
+            psi.get()[1] = 654;
+            psi.get()[2] = 987;
+        }
+    }
+};
+
+TEMPLATE_TEST_CASE(
+    "RotatedSPOs mw_ APIs", "[wavefunction][template]", double, float)
+{
+    // Checks that the mw_ API works in RotatedSPOs and does not default to
+    // the SPOSet default implementation.
+    {
+        // First check calling the mw_ APIs for RotatedSPOs, for which the
+        // implementation just calls the underlying SPOSet mw_ API.
+        // In the case that the underlying SPOSet doesn't specialize the mw_
+        // API, the underlying SPOSet will fall back to the default SPOSet mw_,
+        // which is just a loop over the single walker API.
+        RotatedSPOsT<TestType> rot_spo0("rotated0",
+            std::make_unique<DummySPOSetWithoutMWT<TestType>>("no mw 0"));
+        RotatedSPOsT<TestType> rot_spo1("rotated1",
+            std::make_unique<DummySPOSetWithoutMWT<TestType>>("no mw 1"));
+        RefVectorWithLeader<SPOSetT<TestType>> spo_list(
+            rot_spo0, {rot_spo0, rot_spo1});
+
+        ResourceCollection spo_res("test_rot_res");
+        rot_spo0.createResource(spo_res);
+        ResourceCollectionTeamLock<SPOSetT<TestType>> mw_sposet_lock(
+            spo_res, spo_list);
+
+        const SimulationCellT<TestType> simulation_cell;
+        ParticleSetT<TestType> elec0(simulation_cell);
+        ParticleSetT<TestType> elec1(simulation_cell);
+        RefVectorWithLeader<ParticleSetT<TestType>> p_list(
+            elec0, {elec0, elec1});
+
+        typename SPOSetT<TestType>::ValueVector psi0(3);
+        typename SPOSetT<TestType>::ValueVector psi1(3);
+        RefVector<typename SPOSetT<TestType>::ValueVector> psi_v_list{
+            psi0, psi1};
+
+        rot_spo0.mw_evaluateValue(spo_list, p_list, 0, psi_v_list);
+        for (std::size_t iw = 0; iw < spo_list.size(); iw++) {
+            CHECK(psi_v_list[iw].get()[0] == ValueApprox<TestType>(123));
+            CHECK(psi_v_list[iw].get()[1] == ValueApprox<TestType>(456));
+            CHECK(psi_v_list[iw].get()[2] == ValueApprox<TestType>(789));
+        }
+    }
+    {
+        // In the case that the underlying SPOSet DOES have mw_ specializations,
+        // we want to make sure that RotatedSPOs triggers them appropriately.
+        // This means that the underlying SPOSets will do the appropriate
+        // offloading.
+        // To check this, DummySPOSetWithMWT has an explicit mw_evaluateValue
+        // which sets different values than what gets set in evaluateValue.
+        // By doing this, we ensure that RotatedSPOs::mw_evaluateValue calls
+        // the specialization in the underlying SPO and does not use the
+        // default SPOSet implementation, which loops over the single walker
+        // API (whose values 123/456/789 are hard-coded in
+        // DummySPOSetWithoutMWT).
+
+        RotatedSPOsT<TestType> rot_spo0(
+            "rotated0", std::make_unique<DummySPOSetWithMWT<TestType>>("mw 0"));
+        RotatedSPOsT<TestType> rot_spo1(
+            "rotated1", std::make_unique<DummySPOSetWithMWT<TestType>>("mw 1"));
+        RefVectorWithLeader<SPOSetT<TestType>> spo_list(
+            rot_spo0, {rot_spo0, rot_spo1});
+
+        ResourceCollection spo_res("test_rot_res");
+        rot_spo0.createResource(spo_res);
+        ResourceCollectionTeamLock<SPOSetT<TestType>> mw_sposet_lock(
+            spo_res, spo_list);
+
+        const SimulationCellT<TestType> simulation_cell;
+        ParticleSetT<TestType> elec0(simulation_cell);
+        ParticleSetT<TestType> elec1(simulation_cell);
+        RefVectorWithLeader<ParticleSetT<TestType>> p_list(
+            elec0, {elec0, elec1});
+
+        typename SPOSetT<TestType>::ValueVector psi0(3);
+        typename SPOSetT<TestType>::ValueVector psi1(3);
+        RefVector<typename SPOSetT<TestType>::ValueVector> psi_v_list{
+            psi0, psi1};
+
+        rot_spo0.mw_evaluateValue(spo_list, p_list, 0, psi_v_list);
+        for (std::size_t iw = 0; iw < spo_list.size(); iw++) {
+            CHECK(psi_v_list[iw].get()[0] == ValueApprox<TestType>(321));
+            CHECK(psi_v_list[iw].get()[1] == ValueApprox<TestType>(654));
+            CHECK(psi_v_list[iw].get()[2] == ValueApprox<TestType>(987));
+        }
+    }
+}
+
+} // namespace qmcplusplus
diff --git a/src/mpi/mpi_datatype.h b/src/mpi/mpi_datatype.h
index 3750fba976..8f3c58e994 100644
--- a/src/mpi/mpi_datatype.h
+++ b/src/mpi/mpi_datatype.h
@@ -13,6 +13,8 @@
 #ifndef QMCPLUSPLUS_MPI_DATATYPEDEFINE_H
 #define QMCPLUSPLUS_MPI_DATATYPEDEFINE_H
 
+#include "Message/Communicate.h"
+
 #if defined(HAVE_MPI)
 #include <mpi.h>
 #else

From b91b2d694a2bf25b02e0323ac6140db75beaba30 Mon Sep 17 00:00:00 2001
From: Philip Fackler <facklerpw@ornl.gov>
Date: Mon, 25 Sep 2023 15:54:54 -0400
Subject: [PATCH 2/3] Add new bits to RotatedSPOsT

---
 src/QMCWaveFunctions/RotatedSPOsT.cpp | 146 ++++++++++++++++++++++++++
 src/QMCWaveFunctions/RotatedSPOsT.h   |  70 ++++++++++++
 2 files changed, 216 insertions(+)

diff --git a/src/QMCWaveFunctions/RotatedSPOsT.cpp b/src/QMCWaveFunctions/RotatedSPOsT.cpp
index 128bca9798..dabdc282a9 100644
--- a/src/QMCWaveFunctions/RotatedSPOsT.cpp
+++ b/src/QMCWaveFunctions/RotatedSPOsT.cpp
@@ -1688,6 +1688,152 @@ RotatedSPOsT<T>::makeClone() const
     return myclone;
 }
 
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateDetRatios(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<const VirtualParticleSetT<T>>& vp_list,
+    const RefVector<ValueVector>& psi_list,
+    const std::vector<const ValueType*>& invRow_ptr_list,
+    std::vector<std::vector<ValueType>>& ratios_list) const
+{
+    // Unwrap each rotated set to its Phi and let the leading Phi do the work.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateDetRatios(
+        phi_refs, vp_list, psi_list, invRow_ptr_list, ratios_list);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateValue(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+    const RefVector<ValueVector>& psi_v_list) const
+{
+    // Values come from the wrapped Phi sets' own multi-walker evaluation.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateValue(phi_refs, P_list, iat, psi_v_list);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+    const RefVector<ValueVector>& psi_v_list,
+    const RefVector<GradVector>& dpsi_v_list,
+    const RefVector<ValueVector>& d2psi_v_list) const
+{
+    // Hand the multi-walker VGL request to the wrapped Phi sets.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateVGL(
+        phi_refs, P_list, iat, psi_v_list, dpsi_v_list, d2psi_v_list);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateVGLWithSpin(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+    const RefVector<ValueVector>& psi_v_list,
+    const RefVector<GradVector>& dpsi_v_list,
+    const RefVector<ValueVector>& d2psi_v_list,
+    OffloadMatrix<ComplexType>& mw_dspin) const
+{
+    // Forward to the wrapped Phi sets; mw_dspin is passed straight through.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateVGLWithSpin(
+        phi_refs, P_list, iat, psi_v_list, dpsi_v_list, d2psi_v_list, mw_dspin);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateVGLandDetRatioGrads(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+    const std::vector<const ValueType*>& invRow_ptr_list,
+    OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+    std::vector<GradType>& grads) const
+{
+    // The wrapped Phi sets fill phi_vgl_v, ratios and grads directly.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateVGLandDetRatioGrads(
+        phi_refs, P_list, iat, invRow_ptr_list, phi_vgl_v, ratios, grads);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluateVGLandDetRatioGradsWithSpin(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+    const std::vector<const ValueType*>& invRow_ptr_list,
+    OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+    std::vector<GradType>& grads, std::vector<ValueType>& spingrads) const
+{
+    // Spin-aware variant: every output is produced by the wrapped Phi sets.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluateVGLandDetRatioGradsWithSpin(phi_refs,
+        P_list, iat, invRow_ptr_list, phi_vgl_v, ratios, grads, spingrads);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::mw_evaluate_notranspose(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+    const RefVectorWithLeader<ParticleSetT<T>>& P_list, int first, int last,
+    const RefVector<ValueMatrix>& logdet_list,
+    const RefVector<GradMatrix>& dlogdet_list,
+    const RefVector<ValueMatrix>& d2logdet_list) const
+{
+    // Range [first, last) handling is left entirely to the wrapped Phi sets.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().mw_evaluate_notranspose(phi_refs, P_list, first,
+        last, logdet_list, dlogdet_list, d2logdet_list);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::createResource(ResourceCollection& collection) const
+{
+    Phi->createResource(collection); // only the wrapped Phi adds resources
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::acquireResource(ResourceCollection& collection,
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list) const
+{
+    // Resources are acquired by the wrapped Phi sets, led by the leader's Phi.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().acquireResource(collection, phi_refs);
+}
+
+template <typename T>
+void
+RotatedSPOsT<T>::releaseResource(ResourceCollection& collection,
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list) const
+{
+    // Mirror of acquireResource: the wrapped Phi sets give resources back.
+    auto phi_refs = extractPhiRefList(spo_list);
+    phi_refs.getLeader().releaseResource(collection, phi_refs);
+}
+
+template <typename T>
+RefVectorWithLeader<SPOSetT<T>>
+RotatedSPOsT<T>::extractPhiRefList(
+    const RefVectorWithLeader<SPOSetT<T>>& spo_list)
+{
+    // Collect a reference to the wrapped Phi of every entry in spo_list.
+    auto& spo_leader = spo_list.template getCastedLeader<RotatedSPOsT>();
+    const auto nw = spo_list.size();
+    RefVectorWithLeader<SPOSetT<T>> phi_list(*spo_leader.Phi);
+    phi_list.reserve(nw);
+    // The index takes its type from nw, avoiding a signed/unsigned compare.
+    for (auto iw = decltype(nw){0}; iw < nw; iw++)
+        phi_list.emplace_back(
+            *spo_list.template getCastedElement<RotatedSPOsT>(iw).Phi);
+    return phi_list;
+}
+
 // Class concrete types from ValueType
 template class RotatedSPOsT<double>;
 template class RotatedSPOsT<float>;
diff --git a/src/QMCWaveFunctions/RotatedSPOsT.h b/src/QMCWaveFunctions/RotatedSPOsT.h
index 971d2528b3..fa4778a6f4 100644
--- a/src/QMCWaveFunctions/RotatedSPOsT.h
+++ b/src/QMCWaveFunctions/RotatedSPOsT.h
@@ -40,6 +40,8 @@ class RotatedSPOsT : public SPOSetT<T>, public OptimizableObjectT<T>
     using IndexType = typename SPOSetT<T>::IndexType;
     using RealType = typename SPOSetT<T>::RealType;
     using ValueType = typename SPOSetT<T>::ValueType;
+    using GradType = typename SPOSetT<T>::GradType;
+    using ComplexType = typename SPOSetT<T>::ComplexType;
     using FullRealType = typename SPOSetT<T>::FullRealType;
     using ValueVector = typename SPOSetT<T>::ValueVector;
     using ValueMatrix = typename SPOSetT<T>::ValueMatrix;
@@ -49,6 +51,9 @@ class RotatedSPOsT : public SPOSetT<T>, public OptimizableObjectT<T>
     using HessMatrix = typename SPOSetT<T>::HessMatrix;
     using GGGVector = typename SPOSetT<T>::GGGVector;
     using GGGMatrix = typename SPOSetT<T>::GGGMatrix;
+    using OffloadMWVGLArray = typename SPOSetT<T>::OffloadMWVGLArray;
+    template <typename DT>
+    using OffloadMatrix = Matrix<DT, OffloadPinnedAllocator<DT>>;
 
     // constructor
     RotatedSPOsT(
@@ -399,6 +404,68 @@ class RotatedSPOsT : public SPOSetT<T>, public OptimizableObjectT<T>
         use_global_rot_ = use_global_rotation;
     }
 
+    void // forwards to the wrapped Phi's mw_evaluateDetRatios
+    mw_evaluateDetRatios(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<const VirtualParticleSetT<T>>& vp_list,
+        const RefVector<ValueVector>& psi_list,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        std::vector<std::vector<ValueType>>& ratios_list) const override;
+
+    void // forwards to the wrapped Phi's mw_evaluateValue
+    mw_evaluateValue(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list) const override;
+
+    void // forwards to the wrapped Phi's mw_evaluateVGL
+    mw_evaluateVGL(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list) const override;
+
+    void // forwards to the wrapped Phi's mw_evaluateVGLWithSpin
+    mw_evaluateVGLWithSpin(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const RefVector<ValueVector>& psi_v_list,
+        const RefVector<GradVector>& dpsi_v_list,
+        const RefVector<ValueVector>& d2psi_v_list,
+        OffloadMatrix<ComplexType>& mw_dspin) const override;
+
+    void // forwards to the wrapped Phi's mw_evaluateVGLandDetRatioGrads
+    mw_evaluateVGLandDetRatioGrads(
+        const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+        std::vector<GradType>& grads) const override;
+
+    void // forwards to the wrapped Phi's spin-aware counterpart
+    mw_evaluateVGLandDetRatioGradsWithSpin(
+        const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int iat,
+        const std::vector<const ValueType*>& invRow_ptr_list,
+        OffloadMWVGLArray& phi_vgl_v, std::vector<ValueType>& ratios,
+        std::vector<GradType>& grads,
+        std::vector<ValueType>& spingrads) const override;
+
+    void // forwards to the wrapped Phi's mw_evaluate_notranspose
+    mw_evaluate_notranspose(const RefVectorWithLeader<SPOSetT<T>>& spo_list,
+        const RefVectorWithLeader<ParticleSetT<T>>& P_list, int first, int last,
+        const RefVector<ValueMatrix>& logdet_list,
+        const RefVector<GradMatrix>& dlogdet_list,
+        const RefVector<ValueMatrix>& d2logdet_list) const override;
+
+    void // calls createResource on the wrapped Phi only
+    createResource(ResourceCollection& collection) const override;
+
+    void // acquires through the wrapped Phi sets of spo_list
+    acquireResource(ResourceCollection& collection,
+        const RefVectorWithLeader<SPOSetT<T>>& spo_list) const override;
+
+    void // releases through the wrapped Phi sets of spo_list
+    releaseResource(ResourceCollection& collection,
+        const RefVectorWithLeader<SPOSetT<T>>& spo_list) const override;
+
 private:
     /// true if SPO parameters (orbital rotation parameters) have been supplied
     /// by input
@@ -415,6 +482,9 @@ class RotatedSPOsT : public SPOSetT<T>, public OptimizableObjectT<T>
     /// Use global rotation or history list
     bool use_global_rot_ = true;
 
+    static RefVectorWithLeader<SPOSetT<T>> // Phi of each entry in spo_list
+    extractPhiRefList(const RefVectorWithLeader<SPOSetT<T>>& spo_list);
+
     friend OptVariablesType<double>&
     testing::getMyVarsFull(RotatedSPOsT<double>& rot);
     friend OptVariablesType<float>&

From a0deb0047b2fb48c6a7bb95b69aa1dece14358de Mon Sep 17 00:00:00 2001
From: Philip Fackler <facklerpw@ornl.gov>
Date: Wed, 27 Sep 2023 10:32:21 -0400
Subject: [PATCH 3/3] Bugfix: removed QMC_COMPLEX conditions where no longer
 needed

---
 src/Particle/ParticleSetT.h                   |   5 +-
 .../BsplineFactory/SplineR2RT.cpp             |  31 ++--
 .../BsplineFactory/SplineR2RT.h               |   6 +-
 src/QMCWaveFunctions/CMakeLists.txt           |  12 +-
 src/QMCWaveFunctions/EinsplineSetBuilderT.cpp | 156 +++++++++---------
 src/QMCWaveFunctions/PlaneWave/PWOrbitalSet.h |   4 +-
 .../PlaneWave/PWOrbitalSetT.h                 |   4 +-
 src/QMCWaveFunctions/RotatedSPOsT.cpp         |  37 ++---
 .../tests/test_RotatedSPOsT.cpp               |   3 +-
 9 files changed, 138 insertions(+), 120 deletions(-)

diff --git a/src/Particle/ParticleSetT.h b/src/Particle/ParticleSetT.h
index 906e092adb..10b627696a 100644
--- a/src/Particle/ParticleSetT.h
+++ b/src/Particle/ParticleSetT.h
@@ -21,8 +21,6 @@
 #ifndef QMCPLUSPLUS_PARTICLESETT_H
 #define QMCPLUSPLUS_PARTICLESETT_H
 
-#include <memory>
-
 #include "DTModes.h"
 #include "DynamicCoordinatesT.h"
 #include "MCCoordsT.hpp"
@@ -38,6 +36,8 @@
 #include "Walker.h"
 #include "type_traits/template_types.hpp"
 
+#include <memory>
+
 namespace qmcplusplus
 {
 /// forward declarations
@@ -696,6 +696,7 @@ class ParticleSetT : public OhmmsElementBase
     {
         myTwist = t;
     }
+
     inline const SingleParticlePos&
     getTwist() const
     {
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
index ce4bb5e8aa..2469d3c1d2 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.cpp
@@ -17,6 +17,7 @@
 
 #include "SplineR2RT.h"
 
+#include "CPU/BLAS.hpp"
 #include "Concurrency/OpenMP.h"
 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
 #include "spline2/MultiBsplineEval.hpp"
@@ -125,17 +126,27 @@ SplineR2RT<ST, VT>::applyRotation(
         std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin());
     }
 
-    // Apply rotation the dumb way b/c I can't get BLAS::gemm to work...
-    for (auto i = 0; i < BasisSetSize; i++) {
-        for (auto j = 0; j < this->OrbitalSetSize; j++) {
-            const auto cur_elem = Nsplines * i + j;
-            auto newval{0.};
-            for (auto k = 0; k < this->OrbitalSetSize; k++) {
-                const auto index = i * Nsplines + k;
-                newval += (*coef_copy_)[index] * rot_mat[k][j];
+    if constexpr (std::is_same_v<ST, RealType>) {
+        // Here, ST should be equal to ValueType, which will be double for R2R.
+        // Using BLAS to make things faster
+        BLAS::gemm('N', 'N', this->OrbitalSetSize, BasisSetSize,
+            this->OrbitalSetSize, ST(1.0), rot_mat.data(), this->OrbitalSetSize,
+            coef_copy_->data(), Nsplines, ST(0.0), spl_coefs, Nsplines);
+    }
+    else {
+        // Here, ST is float but ValueType is double for R2R. Due to issues with
+        // type conversions, just doing naive matrix multiplication in this case
+        // to not lose precision on rot_mat
+        for (IndexType i = 0; i < BasisSetSize; i++)
+            for (IndexType j = 0; j < this->OrbitalSetSize; j++) {
+                const auto cur_elem = Nsplines * i + j;
+                FullPrecValueType newval{0.};
+                for (IndexType k = 0; k < this->OrbitalSetSize; k++) {
+                    const auto index = i * Nsplines + k;
+                    newval += (*coef_copy_)[index] * rot_mat[k][j];
+                }
+                spl_coefs[cur_elem] = newval;
             }
-            spl_coefs[cur_elem] = newval;
-        }
     }
 }
 
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
index ece156ac1a..1e2a841e13 100644
--- a/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
+++ b/src/QMCWaveFunctions/BsplineFactory/SplineR2RT.h
@@ -40,8 +40,12 @@ class SplineR2RT : public BsplineSetT<VT>
     using SplineType = typename bspline_traits<ST, 3>::SplineType;
     using BCType = typename bspline_traits<ST, 3>::BCType;
     using DataType = ST;
+    using RealType = typename SPOSetT<VT>::RealType;
+    using IndexType = typename SPOSetT<VT>::IndexType;
+    using FullPrecValueType = double;
     using PointType = TinyVector<ST, 3>;
     using SingleSplineType = UBspline_3d_d;
+
     // types for evaluation results
     using TT = typename BsplineSetT<VT>::ValueType;
     using GGGVector = typename BsplineSetT<VT>::GGGVector;
@@ -55,8 +59,6 @@ class SplineR2RT : public BsplineSetT<VT>
     using hContainer_type = VectorSoaContainer<ST, 6>;
     using ghContainer_type = VectorSoaContainer<ST, 10>;
 
-    using RealType = typename SPOSetT<VT>::RealType;
-
 private:
     bool IsGamma;
     ///\f$GGt=G^t G \f$, transformation for tensor in LatticeUnit to
diff --git a/src/QMCWaveFunctions/CMakeLists.txt b/src/QMCWaveFunctions/CMakeLists.txt
index 05c1fe018b..78cfb90d62 100644
--- a/src/QMCWaveFunctions/CMakeLists.txt
+++ b/src/QMCWaveFunctions/CMakeLists.txt
@@ -150,11 +150,17 @@ if(OHMMS_DIM MATCHES 3)
   endif(HAVE_EINSPLINE)
 
   # plane wave SPO
-  set(FERMION_SRCS ${FERMION_SRCS} PlaneWave/PWBasis.cpp PlaneWave/PWBasisT.cpp PlaneWave/PWParameterSet.cpp PlaneWave/PWOrbitalBuilder.cpp)
+  set(FERMION_SRCS ${FERMION_SRCS}
+    PlaneWave/PWBasis.cpp
+    PlaneWave/PWBasisT.cpp
+    PlaneWave/PWOrbitalSetT.cpp
+    PlaneWave/PWRealOrbitalSetT.cpp
+    PlaneWave/PWParameterSet.cpp
+    PlaneWave/PWOrbitalBuilder.cpp)
   if(QMC_COMPLEX)
-    set(FERMION_SRCS ${FERMION_SRCS} PlaneWave/PWOrbitalSet.cpp PlaneWave/PWOrbitalSetT.cpp)
+    set(FERMION_SRCS ${FERMION_SRCS} PlaneWave/PWOrbitalSet.cpp)
   else()
-    set(FERMION_SRCS ${FERMION_SRCS} PlaneWave/PWRealOrbitalSet.cpp PlaneWave/PWRealOrbitalSetT.cpp)
+    set(FERMION_SRCS ${FERMION_SRCS} PlaneWave/PWRealOrbitalSet.cpp)
   endif(QMC_COMPLEX)
 
   if(NOT QMC_COMPLEX)
diff --git a/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp b/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
index f48ea6348a..46157f9b28 100644
--- a/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
+++ b/src/QMCWaveFunctions/EinsplineSetBuilderT.cpp
@@ -514,19 +514,19 @@ EinsplineSetBuilderT<T>::AnalyzeTwists2(
     }
 
     TargetPtcl.setTwist(superFracs[twist_num_]);
-#ifndef QMC_COMPLEX
-    // Check to see if supercell twist is okay to use with real wave
-    // functions
-    for (int dim = 0; dim < OHMMS_DIM; dim++) {
-        double t = 2.0 * superFracs[twist_num_][dim];
-        if (std::abs(t - round(t)) > MatchingTol * 100) {
-            app_error()
-                << "Cannot use this super twist with real wavefunctions.\n"
-                << "Please recompile with QMC_COMPLEX=1.\n";
-            APP_ABORT("EinsplineSetBuilder::AnalyzeTwists2");
+    if constexpr (!IsComplex_t<T>{}()) {
+        // Check to see if supercell twist is okay to use with real wave
+        // functions
+        for (int dim = 0; dim < OHMMS_DIM; dim++) {
+            double t = 2.0 * superFracs[twist_num_][dim];
+            if (std::abs(t - round(t)) > MatchingTol * 100) {
+                app_error()
+                    << "Cannot use this super twist with real wavefunctions.\n"
+                    << "Please recompile with QMC_COMPLEX=1.\n";
+                APP_ABORT("EinsplineSetBuilder::AnalyzeTwists2");
+            }
         }
     }
-#endif
     // Now check to see that each supercell twist has the right twists
     // to tile the primitive cell orbitals.
     const int numTwistsNeeded = std::abs(det(TileMatrix));
@@ -574,78 +574,80 @@ EinsplineSetBuilderT<T>::AnalyzeTwists2(
         IncludeTwists.push_back(superSets[twist_num_][i]);
     // Now, find out which twists are distinct
     DistinctTwists.clear();
-#ifndef QMC_COMPLEX
-    std::vector<int> copyTwists;
-    for (int i = 0; i < IncludeTwists.size(); i++) {
-        int ti = IncludeTwists[i];
-        PosType twist_i = primcell_kpoints[ti];
-        bool distinct = true;
-        for (int j = i + 1; j < IncludeTwists.size(); j++) {
-            int tj = IncludeTwists[j];
-            PosType twist_j = primcell_kpoints[tj];
-            PosType sum = twist_i + twist_j;
-            PosType diff = twist_i - twist_j;
-            if (TwistPair(twist_i, twist_j))
-                distinct = false;
+    if constexpr (!IsComplex_t<T>{}()) {
+        std::vector<int> copyTwists;
+        for (int i = 0; i < IncludeTwists.size(); i++) {
+            int ti = IncludeTwists[i];
+            PosType twist_i = primcell_kpoints[ti];
+            bool distinct = true;
+            for (int j = i + 1; j < IncludeTwists.size(); j++) {
+                int tj = IncludeTwists[j];
+                PosType twist_j = primcell_kpoints[tj];
+                PosType sum = twist_i + twist_j;
+                PosType diff = twist_i - twist_j;
+                if (TwistPair(twist_i, twist_j))
+                    distinct = false;
+            }
+            if (distinct)
+                DistinctTwists.push_back(ti);
+            else
+                copyTwists.push_back(ti);
         }
-        if (distinct)
-            DistinctTwists.push_back(ti);
-        else
-            copyTwists.push_back(ti);
-    }
-    // Now determine which distinct twists require two copies
-    MakeTwoCopies.resize(DistinctTwists.size());
-    for (int i = 0; i < DistinctTwists.size(); i++) {
-        MakeTwoCopies[i] = false;
-        int ti = DistinctTwists[i];
-        PosType twist_i = primcell_kpoints[ti];
-        for (int j = 0; j < copyTwists.size(); j++) {
-            int tj = copyTwists[j];
-            PosType twist_j = primcell_kpoints[tj];
-            if (TwistPair(twist_i, twist_j))
-                MakeTwoCopies[i] = true;
+        // Now determine which distinct twists require two copies
+        MakeTwoCopies.resize(DistinctTwists.size());
+        for (int i = 0; i < DistinctTwists.size(); i++) {
+            MakeTwoCopies[i] = false;
+            int ti = DistinctTwists[i];
+            PosType twist_i = primcell_kpoints[ti];
+            for (int j = 0; j < copyTwists.size(); j++) {
+                int tj = copyTwists[j];
+                PosType twist_j = primcell_kpoints[tj];
+                if (TwistPair(twist_i, twist_j))
+                    MakeTwoCopies[i] = true;
+            }
+            if (this->myComm->rank() == 0) {
+                std::array<char, 1000> buf;
+                int length = std::snprintf(buf.data(), buf.size(),
+                    "Using %d copies of twist angle [%6.3f, %6.3f, %6.3f]\n",
+                    MakeTwoCopies[i] ? 2 : 1, twist_i[0], twist_i[1],
+                    twist_i[2]);
+                if (length < 0)
+                    throw std::runtime_error("Error generating string");
+                app_log() << std::string_view(buf.data(), length);
+                app_log().flush();
+            }
         }
-        if (this->myComm->rank() == 0) {
-            std::array<char, 1000> buf;
-            int length = std::snprintf(buf.data(), buf.size(),
-                "Using %d copies of twist angle [%6.3f, %6.3f, %6.3f]\n",
-                MakeTwoCopies[i] ? 2 : 1, twist_i[0], twist_i[1], twist_i[2]);
-            if (length < 0)
-                throw std::runtime_error("Error generating string");
-            app_log() << std::string_view(buf.data(), length);
-            app_log().flush();
+        // Find out if we can make real orbitals
+        use_real_splines_ = true;
+        for (int i = 0; i < DistinctTwists.size(); i++) {
+            int ti = DistinctTwists[i];
+            PosType twist = primcell_kpoints[ti];
+            for (int j = 0; j < OHMMS_DIM; j++)
+                if (std::abs(twist[j] - 0.0) > MatchingTol &&
+                    std::abs(twist[j] - 0.5) > MatchingTol &&
+                    std::abs(twist[j] + 0.5) > MatchingTol)
+                    use_real_splines_ = false;
         }
+        if (use_real_splines_ && (DistinctTwists.size() > 1)) {
+            app_log() << "***** Use of real orbitals is possible, but not "
+                         "currently implemented\n"
+                      << "      with more than one twist angle.\n";
+            use_real_splines_ = false;
+        }
+        if (use_real_splines_)
+            app_log() << "Using real splines.\n";
+        else
+            app_log() << "Using complex splines.\n";
     }
-    // Find out if we can make real orbitals
-    use_real_splines_ = true;
-    for (int i = 0; i < DistinctTwists.size(); i++) {
-        int ti = DistinctTwists[i];
-        PosType twist = primcell_kpoints[ti];
-        for (int j = 0; j < OHMMS_DIM; j++)
-            if (std::abs(twist[j] - 0.0) > MatchingTol &&
-                std::abs(twist[j] - 0.5) > MatchingTol &&
-                std::abs(twist[j] + 0.5) > MatchingTol)
-                use_real_splines_ = false;
-    }
-    if (use_real_splines_ && (DistinctTwists.size() > 1)) {
-        app_log() << "***** Use of real orbitals is possible, but not "
-                     "currently implemented\n"
-                  << "      with more than one twist angle.\n";
+    else {
+        DistinctTwists.resize(IncludeTwists.size());
+        MakeTwoCopies.resize(IncludeTwists.size());
+        for (int i = 0; i < IncludeTwists.size(); i++) {
+            DistinctTwists[i] = IncludeTwists[i];
+            MakeTwoCopies[i] = false;
+        }
         use_real_splines_ = false;
     }
-    if (use_real_splines_)
-        app_log() << "Using real splines.\n";
-    else
-        app_log() << "Using complex splines.\n";
-#else
-    DistinctTwists.resize(IncludeTwists.size());
-    MakeTwoCopies.resize(IncludeTwists.size());
-    for (int i = 0; i < IncludeTwists.size(); i++) {
-        DistinctTwists[i] = IncludeTwists[i];
-        MakeTwoCopies[i] = false;
-    }
-    use_real_splines_ = false;
-#endif
 }
 
 template <typename T>
diff --git a/src/QMCWaveFunctions/PlaneWave/PWOrbitalSet.h b/src/QMCWaveFunctions/PlaneWave/PWOrbitalSet.h
index 225033214b..5add827a86 100644
--- a/src/QMCWaveFunctions/PlaneWave/PWOrbitalSet.h
+++ b/src/QMCWaveFunctions/PlaneWave/PWOrbitalSet.h
@@ -15,8 +15,8 @@
 /** @file PWOrbitalSet.h
  * @brief Definition of member functions of Plane-wave basis set
  */
-#ifndef QMCPLUSPLUS_PLANEWAVE_ORBITALSETT_BLAS_H
-#define QMCPLUSPLUS_PLANEWAVE_ORBITALSETT_BLAS_H
+#ifndef QMCPLUSPLUS_PLANEWAVE_ORBITALSET_BLAS_H
+#define QMCPLUSPLUS_PLANEWAVE_ORBITALSET_BLAS_H
 
 #include "QMCWaveFunctions/PlaneWave/PWBasis.h"
 #include "QMCWaveFunctions/SPOSet.h"
diff --git a/src/QMCWaveFunctions/PlaneWave/PWOrbitalSetT.h b/src/QMCWaveFunctions/PlaneWave/PWOrbitalSetT.h
index d4e13de966..9103a16ee2 100644
--- a/src/QMCWaveFunctions/PlaneWave/PWOrbitalSetT.h
+++ b/src/QMCWaveFunctions/PlaneWave/PWOrbitalSetT.h
@@ -18,8 +18,8 @@
 /** @file PWOrbitalSetT.h
  * @brief Definition of member functions of Plane-wave basis set
  */
-#ifndef QMCPLUSPLUS_PLANEWAVE_ORBITALSET_BLAS_H
-#define QMCPLUSPLUS_PLANEWAVE_ORBITALSET_BLAS_H
+#ifndef QMCPLUSPLUS_PLANEWAVE_ORBITALSETT_BLAS_H
+#define QMCPLUSPLUS_PLANEWAVE_ORBITALSETT_BLAS_H
 
 #include "CPU/BLAS.hpp"
 #include "QMCWaveFunctions/PlaneWave/PWBasisT.h"
diff --git a/src/QMCWaveFunctions/RotatedSPOsT.cpp b/src/QMCWaveFunctions/RotatedSPOsT.cpp
index dabdc282a9..1aa8af8ada 100644
--- a/src/QMCWaveFunctions/RotatedSPOsT.cpp
+++ b/src/QMCWaveFunctions/RotatedSPOsT.cpp
@@ -307,7 +307,6 @@ template <typename T>
 void
 RotatedSPOsT<T>::buildOptVariables(const size_t nel)
 {
-#if !defined(QMC_COMPLEX)
     /* Only rebuild optimized variables if more after-rotation orbitals are
      * needed Consider ROHF, there is only one set of SPO for both spin up and
      * down Nup > Ndown. nel_major_ will be set Nup.
@@ -332,7 +331,6 @@ RotatedSPOsT<T>::buildOptVariables(const size_t nel)
 
         buildOptVariables(created_m_act_rot_inds, created_full_rot_inds);
     }
-#endif
 }
 
 template <typename T>
@@ -340,7 +338,6 @@ void
 RotatedSPOsT<T>::buildOptVariables(
     const RotationIndices& rotations, const RotationIndices& full_rotations)
 {
-#if !defined(QMC_COMPLEX)
     const size_t nmo = Phi->getOrbitalSetSize();
 
     // create active rotations
@@ -419,7 +416,6 @@ RotatedSPOsT<T>::buildOptVariables(
             param[i] = this->myVars[i];
         apply_rotation(param, false);
     }
-#endif
 }
 
 template <typename T>
@@ -858,33 +854,32 @@ RotatedSPOsT<T>::evaluateDerivatives(ParticleSetT<T>& P,
     // possibly replace wit BLAS calls
     for (int i = 0; i < nel; i++)
         for (int j = 0; j < nmo; j++)
-            Bbar(i, j) = d2psiM_all(i, j) +
-                2.0 * dot(myG_J[i], dpsiM_all(i, j)) +
+            Bbar(i, j) = d2psiM_all(i, j) + 2 * dot(myG_J[i], dpsiM_all(i, j)) +
                 myL_J[i] * psiM_all(i, j);
 
     //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PART2
-    const T* const A(psiM_all.data());
-    const T* const Ainv(psiM_inv.data());
-    const T* const B(Bbar.data());
-    ValueMatrix T_mat;
+    const ValueType* const A(psiM_all.data());
+    const ValueType* const Ainv(psiM_inv.data());
+    const ValueType* const B(Bbar.data());
+    ValueMatrix t;
     ValueMatrix Y1;
     ValueMatrix Y2;
     ValueMatrix Y3;
     ValueMatrix Y4;
-    T_mat.resize(nel, nmo);
+    t.resize(nel, nmo);
     Y1.resize(nel, nel);
     Y2.resize(nel, nmo);
     Y3.resize(nel, nmo);
     Y4.resize(nel, nmo);
 
-    BLAS::gemm('N', 'N', nmo, nel, nel, T(1.0), A, nmo, Ainv, nel, T(0.0),
-        T_mat.data(), nmo);
-    BLAS::gemm('N', 'N', nel, nel, nel, T(1.0), B, nmo, Ainv, nel, T(0.0),
-        Y1.data(), nel);
-    BLAS::gemm('N', 'N', nmo, nel, nel, T(1.0), T_mat.data(), nmo, Y1.data(),
-        nel, T(0.0), Y2.data(), nmo);
-    BLAS::gemm('N', 'N', nmo, nel, nel, T(1.0), B, nmo, Ainv, nel, T(0.0),
-        Y3.data(), nmo);
+    BLAS::gemm('N', 'N', nmo, nel, nel, ValueType(1.0), A, nmo, Ainv, nel,
+        ValueType(0.0), t.data(), nmo);
+    BLAS::gemm('N', 'N', nel, nel, nel, ValueType(1.0), B, nmo, Ainv, nel,
+        ValueType(0.0), Y1.data(), nel);
+    BLAS::gemm('N', 'N', nmo, nel, nel, ValueType(1.0), t.data(), nmo,
+        Y1.data(), nel, ValueType(0.0), Y2.data(), nmo);
+    BLAS::gemm('N', 'N', nmo, nel, nel, ValueType(1.0), B, nmo, Ainv, nel,
+        ValueType(0.0), Y3.data(), nmo);
 
     // possibly replace with BLAS call
     Y4 = Y3 - Y2;
@@ -894,8 +889,8 @@ RotatedSPOsT<T>::evaluateDerivatives(ParticleSetT<T>& P,
         if (kk >= 0) {
             const int p = m_act_rot_inds.at(i).first;
             const int q = m_act_rot_inds.at(i).second;
-            dlogpsi[kk] += T_mat(p, q);
-            dhpsioverpsi[kk] += T(-0.5) * Y4(p, q);
+            dlogpsi[kk] += t(p, q);
+            dhpsioverpsi[kk] += ValueType(-0.5) * Y4(p, q);
         }
     }
 }
diff --git a/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp b/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp
index 24a5087f79..e5c04d205f 100644
--- a/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp
+++ b/src/QMCWaveFunctions/tests/test_RotatedSPOsT.cpp
@@ -633,7 +633,8 @@ TEMPLATE_TEST_CASE(
     xmlNodePtr sposet_builder = xmlFirstElementChild(root);
     xmlNodePtr sposet_ptr = xmlFirstElementChild(sposet_builder);
 
-    EinsplineSetBuilderT<TestType> einSet(elec, ptcl.getPool(), c, sposet_builder);
+    EinsplineSetBuilderT<TestType> einSet(
+        elec, ptcl.getPool(), c, sposet_builder);
     auto spo = einSet.createSPOSetFromXML(sposet_ptr);
     REQUIRE(spo);