From bfd21e5eb54a357bdbd35867ba9ae64f28dbb425 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 2 Aug 2023 08:43:45 -0400
Subject: [PATCH 1/7] hush warning in madness_parsec_key_print re: format
 identifier mismatch with the type

---
 src/madness/world/parsec.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/madness/world/parsec.cc b/src/madness/world/parsec.cc
index 0267edbc15b..be8eddecbc6 100644
--- a/src/madness/world/parsec.cc
+++ b/src/madness/world/parsec.cc
@@ -6,6 +6,8 @@
 #include "thread.h"
 #include <parsec/parsec_internal.h>
 
+#include <cinttypes>
+
 // Here we initialize with the right child class
 namespace madness {
 #ifndef MADNESS_ASSERTIONS_DISABLE
@@ -66,7 +68,7 @@ namespace madness {
     }
 
     static char *madness_parsec_key_print(char *buffer, size_t buffer_size, parsec_key_t k, void *user_data) {
-        snprintf(buffer, buffer_size, "MADNESS_TASK(%" PRIx64 ")", (intptr_t)(k));
+        snprintf(buffer, buffer_size, "MADNESS_TASK(%" PRIxPTR ")", k);
         return buffer;
     }
 

From c576d84a0907d03cad375f054ff5d4ac370b26a4 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Wed, 2 Aug 2023 09:11:46 -0400
Subject: [PATCH 2/7] Derivative::do_diff2b aborts if PaRSEC is used as backend
 since it does not support recursive tasking and that's needed for making
 progress here

---
 src/madness/mra/derivative.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/madness/mra/derivative.h b/src/madness/mra/derivative.h
index bf00cf959f8..37bdc616ef9 100644
--- a/src/madness/mra/derivative.h
+++ b/src/madness/mra/derivative.h
@@ -370,7 +370,12 @@ namespace madness {
                     return;
                 }
             }
-            tensorT gcoeffs = df->parent_to_child(found_argT.get().second, found_argT.get().first,key).full_tensor_copy();
+#ifdef HAVE_PARSEC
+            std::cerr << "FATAL ERROR: PaRSEC does not support recursive task execution but Derivative::do_diff2b requires this. Use a different backend" << std::endl;
+            abort();
+#endif
+            const auto& found_argT_value = found_argT.get();  // do not recursively execute tasks to avoid making PaRSEC sad
+            tensorT gcoeffs = df->parent_to_child(found_argT_value.second, found_argT_value.first,key).full_tensor_copy();
 
             //if (this->bc.get_bc().dim(0) == 1) {
             if (NDIM == 1) {

From ff339524466d1e486b834651b18148a43f941434 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Sat, 19 Aug 2023 12:25:48 -0400
Subject: [PATCH 3/7] spin-free Dirac 1e-atom with psudo-large component

---
 src/examples/CMakeLists.txt    |   2 +-
 src/examples/hatom_sf_dirac.cc | 151 +++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 src/examples/hatom_sf_dirac.cc

diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt
index 4715fee334d..38b0b55aa00 100644
--- a/src/examples/CMakeLists.txt
+++ b/src/examples/CMakeLists.txt
@@ -1,7 +1,7 @@
 # src/examples
 
 set(EXAMPLE_SOURCES
-    madinfo h2dft hedft hello hatom_energy h2 he tdse_example heat heat2 csqrt 
+    madinfo h2dft hedft hello hatom_energy h2 he tdse_example heat heat2 csqrt hatom_sf_dirac
     sdf_shape_tester test_gmres tdse1d vnucso nonlinschro sininteg functionio 
     dataloadbal hatom_1d binaryop dielectric hehf 3dharmonic testsolver
     testspectralprop dielectric_external_field tiny h2dynamic newsolver testcomplexfunctionsolver
diff --git a/src/examples/hatom_sf_dirac.cc b/src/examples/hatom_sf_dirac.cc
new file mode 100644
index 00000000000..3f82142ed24
--- /dev/null
+++ b/src/examples/hatom_sf_dirac.cc
@@ -0,0 +1,151 @@
+#include <fstream>
+#include <iostream>
+#include <madness/mra/mra.h>
+#include <madness/mra/operator.h>
+
+using namespace madness;
+
+static const double Z = 10.0;      // nuclear charge
+static const double L = 23.0/Z;    // [-L,L] box size
+static const long k = 8;           // wavelet order
+static const double thresh = 1e-6; // precision
+static const double c = 137.035999679; // speed of light
+
+static double guess(const coord_3d& r) {
+    const double x=r[0], y=r[1], z=r[2];
+    return exp(-Z*sqrt(x*x+y*y+z*z))/sqrt(constants::pi/(Z*Z*Z));
+}
+
+static double V(const coord_3d& r) {
+    const double x=r[0], y=r[1], z=r[2];
+    return -Z/(sqrt(x*x+y*y+z*z));
+}
+
+// Applies pV.p/4m^2c^2
+real_function_3d pVp(World& world, real_function_3d& V, real_function_3d& phi) {
+    real_function_3d Wphi = real_function_3d(world);
+    for (int axis=0; axis<3; axis++) {
+        real_derivative_3d D = free_space_derivative<double,3>(world, axis);
+        D.set_bspline1(); // try bspline smoother derivative
+        Wphi -= D(V*D(phi));
+    }
+    Wphi *= 1.0/(4.0*c*c);
+    return Wphi;
+}
+
+// E<psi|psi> = <psi|T|phi> + <psi|V|psi>
+std::tuple<double,double,double,double> compute_energy(World& world,
+                                                       real_function_3d& V,
+                                                       real_function_3d& psi,
+                                                       real_function_3d& phi)
+{
+    double overlap = inner(psi,psi);
+    double potential_energy = inner(psi,V*psi) / overlap;
+    double kinetic_energy = 0.0;
+    for (int axis=0; axis<3; axis++) {
+        real_derivative_3d D = free_space_derivative<double,3>(world, axis); // more accurate to use ABGV derivative for KE
+        real_function_3d dpsi = D(psi);
+        real_function_3d dphi = D(phi);
+        kinetic_energy += 0.5*inner(dpsi,dphi);
+    }
+    kinetic_energy /= overlap;
+    double total_energy = kinetic_energy + potential_energy;
+    return {total_energy, kinetic_energy, potential_energy, overlap};
+}
+
+
+
+// (T-E) phi = -V psi + E(psi-phi)
+// and with s=1+E/2mc^2
+// (T-E) (psi-s*phi) = - pVp phi - E(psi-s*phi)
+std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
+                                                               real_function_3d& V,
+                                                               real_function_3d& psi,
+                                                               real_function_3d& phi,
+                                                               double energy)
+{
+    real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), 0.001, 1e-6);
+    
+    real_function_3d rhs = -2*(V*psi - energy*(psi-phi));
+    rhs.truncate();
+    real_function_3d phi_new = apply(op,rhs);
+
+    double s = 1.0 + energy/(2*c*c);
+    rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
+    rhs.truncate();
+    real_function_3d psimphi_new = apply(op,rhs);
+    real_function_3d psi_new = s*phi_new + psimphi_new;
+    psi_new.truncate();
+    phi_new.truncate();
+
+    double rnorm = (psi_new-psi).norm2();
+    double norm = inner(psi_new,psi_new);
+    psi_new *= 1.0/sqrt(norm);
+    phi_new *= 1.0/sqrt(norm);
+    
+    return {psi_new, phi_new, rnorm};
+}
+
+double exact_energy(double Z) {
+    if (Z<10) {
+        double a2 = Z*Z/(c*c);
+        double a4 = a2*a2;
+        double a6 = a4*a2;
+        double a8 = a4*a4;
+        return Z*Z*(-1.0/2.0 - 1.0/8.0*a2 - 1/16*a4 - 5.0/128.0*a6 - 7.0/256.0*a8);
+    }
+    double Za = Z/c;
+    double s = Za / sqrt(1.0 - Za*Za);
+    return c*c/std::sqrt(1.0 + s*s) - c*c;
+}
+
+void run(World& world) {
+    FunctionDefaults<3>::set_k(k);
+    FunctionDefaults<3>::set_thresh(thresh);
+    FunctionDefaults<3>::set_truncate_mode(1);
+    FunctionDefaults<3>::set_cubic_cell(-L,L);
+
+    real_function_3d Vnuc = real_factory_3d(world).f(V);
+    real_function_3d psi  = real_factory_3d(world).f(guess);
+    real_function_3d phi  = real_factory_3d(world).f(guess);
+    print("Initial should norm be 1:", psi.norm2());
+    print("Exact Dirac-Coulomb energy:", exact_energy(Z));
+
+    double energy = -0.5*Z*Z; // NR energy guess
+    for (int iter=0; iter<20; iter++) {
+        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, psi, phi, energy);
+        psi = psi_new;
+        phi = phi_new;
+        auto psisize = psi.size();
+        auto phisize = phi.size();
+        auto phinorm = phi.norm2();
+        if (world.rank() == 0) {
+            print("\niteration ", iter, " rnorm ", rnorm, " psi size ", psisize, " phi size ", phisize, " phi norm ", phinorm);
+        }
+
+        auto [total_energy, kinetic_energy, potential_energy, overlap] = compute_energy(world, Vnuc, psi, phi);
+        
+        if (world.rank() == 0) {
+            print("            Kinetic energy  ", kinetic_energy);
+            print(" Nuclear attraction energy ", potential_energy);
+            print("              Total energy ", total_energy, total_energy - energy);
+        }
+
+        energy = total_energy;
+
+        if (iter>2 and rnorm<1e-4) break;
+    }
+}
+
+int main(int argc, char** argv) {
+    World& world = initialize(argc, argv);
+    startup(world,argc,argv);
+    std::cout.precision(10);
+
+    run(world); // Put all functions in separate scope to force destruction before finalize
+
+    world.gop.fence();
+
+    finalize();
+    return 0;
+}

From 943c219355c90bc1c6c6c988608aa05af5191e04 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Sun, 20 Aug 2023 20:43:20 -0400
Subject: [PATCH 4/7] tweaking for accuracy and convergence, with limited
 success

---
 src/examples/hatom_sf_dirac.cc | 259 ++++++++++++++++++++++++++++-----
 1 file changed, 226 insertions(+), 33 deletions(-)

diff --git a/src/examples/hatom_sf_dirac.cc b/src/examples/hatom_sf_dirac.cc
index 3f82142ed24..66d7a325e57 100644
--- a/src/examples/hatom_sf_dirac.cc
+++ b/src/examples/hatom_sf_dirac.cc
@@ -1,33 +1,157 @@
 #include <fstream>
 #include <iostream>
+#include <algorithm>
 #include <madness/mra/mra.h>
 #include <madness/mra/operator.h>
+#include <madness/mra/lbdeux.h>
+#include <madness/mra/nonlinsol.h>
+#include <madness/chem/atomutil.h>
 
 using namespace madness;
 
 static const double Z = 10.0;      // nuclear charge
-static const double L = 23.0/Z;    // [-L,L] box size
+static const double L = 40.0/Z;    // [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
 static const long k = 8;           // wavelet order
 static const double thresh = 1e-6; // precision
 static const double c = 137.035999679; // speed of light
+static const int Vtruncmode = (Z<=10 && thresh>1e-10) ? 1 : 0; // Point nucleus potential woes
+static const double eprec = 0.0; //std::min(1e-4,thresh*Z*Z); // Note this is for NR wavefun, and for now we just want relative not absolute error
+static const double rcut = 1e-4; // typical size of heavy nucleus //smoothing_parameter(Z, eprec);
+static const double v = std::sqrt(1.0 - Z*Z/(c*c)) - 1.0;
 
-static double guess(const coord_3d& r) {
-    const double x=r[0], y=r[1], z=r[2];
-    return exp(-Z*sqrt(x*x+y*y+z*z))/sqrt(constants::pi/(Z*Z*Z));
+inline double mask1(double x) {
+    x = (x * x * (3. - 2. * x));
+    return x;
+}
+
+static double mask3(const coord_3d& ruser) {
+    coord_3d rsim;
+    user_to_sim(ruser, rsim);
+    double x = rsim[0], y = rsim[1], z = rsim[2];
+    double lo = 0.0625, hi = 1.0 - lo, result = 1.0;
+    double rlo = 1.0 / lo;
+
+    if (x < lo)
+        result *= mask1(mask1(x * rlo));
+    else if (x > hi)
+        result *= mask1(mask1((1.0 - x) * rlo));
+    if (y < lo)
+        result *= mask1(mask1(y * rlo));
+    else if (y > hi)
+        result *= mask1(mask1((1.0 - y) * rlo));
+    if (z < lo)
+        result *= mask1(mask1(z * rlo));
+    else if (z > hi)
+        result *= mask1(mask1((1.0 - z) * rlo));
+
+    return result;
+}
+
+
+struct LBCost {
+    double leaf_value;
+    double parent_value;
+    LBCost(double leaf_value=1.0, double parent_value=1.0)
+        : leaf_value(leaf_value)
+        , parent_value(parent_value)
+    {}
+
+    double operator()(const Key<3>& key, const FunctionNode<double,3>& node) const {
+        if (node.is_leaf()) return leaf_value;
+        else return parent_value;
+    }
+};
+
+// This class is used to store information for the non-linear solver
+struct F {
+    real_function_3d psi, phi;
+
+    F(const real_function_3d& psi, const real_function_3d& phi) 
+        : psi(psi), phi(phi)
+    {}
+
+    F operator-(const F& b) const {
+        return F(psi-b.psi, phi-b.phi);
+    }
+
+    F operator+=(const F& b) { // Operator+= necessarphi
+        psi += b.psi; phi += b.phi;
+        return *this;
+    }
+
+    F operator*(double a) const { // Scale bphi a constant necessarphi
+        return F(psi*a,phi*a);
+    }
+};
+
+double inner(const F& a, const F& b) {
+    return inner(a.psi,b.psi) + inner(a.phi,b.phi);
+}
+
+struct allocator {
+    World& world;
+
+    allocator(World& world) : world(world) {}
+
+    F operator()() {
+        return F(real_function_3d(world),real_function_3d(world));
+    }
+};
+
+static double psi_guess(const coord_3d& r) {
+    double R = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
+    return std::pow(R+rcut,v)*exp(-Z*R)/sqrt(constants::pi/(Z*Z*Z)); // note +rcut in singular part to smooth on nuclear scale
+}
+
+static double phi_guess(const coord_3d& r) {
+    double R = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
+    return exp(-Z*R)/sqrt(constants::pi/(Z*Z*Z));
 }
 
 static double V(const coord_3d& r) {
     const double x=r[0], y=r[1], z=r[2];
-    return -Z/(sqrt(x*x+y*y+z*z));
+    //return -Z/(sqrt(x*x+y*y+z*z));
+    return (-Z/rcut)*smoothed_potential(sqrt(x*x+y*y+z*z)/rcut);
+}
+
+class Vderiv : public FunctionFunctorInterface<double,3> {
+    const int axis;
+public:
+    Vderiv(int axis) : axis(axis) {}
+
+    double operator()(const coord_3d& r) const {
+        double R = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
+        double rc = 1.0/rcut;
+        return - Z * (r[axis] / R) * dsmoothed_potential(R * rc) * (rc * rc);
+    }
+};
+
+// Applies pV.p/4m^2c^2 phi = sum_q (-dV/dq dphi/dq - V d^2phi/dq^2)
+real_function_3d pVp(World& world, const real_function_3d& V, const real_function_3d gradV[3], const real_function_3d& phi) {
+    real_function_3d Wphi = real_function_3d(world);
+    for (int axis=0; axis<3; axis++) {
+        {
+            real_derivative_3d D = free_space_derivative<double,3>(world, axis);
+            D.set_bspline1();
+            Wphi -= D(phi)*gradV[axis];
+        }
+        {
+            real_derivative_3d D = free_space_derivative<double,3>(world, axis);
+            D.set_bspline2();
+            Wphi -= D(phi)*V;
+        }
+    }
+    Wphi *= 1.0/(4.0*c*c);
+    return Wphi;
 }
 
 // Applies pV.p/4m^2c^2
-real_function_3d pVp(World& world, real_function_3d& V, real_function_3d& phi) {
+real_function_3d pVp(World& world, const real_function_3d& V, const real_function_3d& phi) {
     real_function_3d Wphi = real_function_3d(world);
     for (int axis=0; axis<3; axis++) {
         real_derivative_3d D = free_space_derivative<double,3>(world, axis);
         D.set_bspline1(); // try bspline smoother derivative
-        Wphi -= D(V*D(phi));
+        Wphi -= D(D(phi)*V);
     }
     Wphi *= 1.0/(4.0*c*c);
     return Wphi;
@@ -53,32 +177,58 @@ std::tuple<double,double,double,double> compute_energy(World& world,
     return {total_energy, kinetic_energy, potential_energy, overlap};
 }
 
-
-
 // (T-E) phi = -V psi + E(psi-phi)
 // and with s=1+E/2mc^2
 // (T-E) (psi-s*phi) = - pVp phi - E(psi-s*phi)
+template <typename solverT>
 std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
-                                                               real_function_3d& V,
-                                                               real_function_3d& psi,
-                                                               real_function_3d& phi,
-                                                               double energy)
+                                                               const real_function_3d& V,
+                                                               const real_function_3d gradV[3],
+                                                               const real_function_3d& mask,
+                                                               const real_function_3d& psi,
+                                                               const real_function_3d& phi,
+                                                               const double energy,
+                                                               const int iter,
+                                                               solverT& solver)
 {
-    real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), 0.001, 1e-6);
-    
-    real_function_3d rhs = -2*(V*psi - energy*(psi-phi));
+    real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), rcut*0.1, thresh);
+    real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
     rhs.truncate();
     real_function_3d phi_new = apply(op,rhs);
 
     double s = 1.0 + energy/(2*c*c);
-    rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
+    rhs = -2*(pVp(world,V,gradV,phi) + energy*(psi-s*phi));
+    //rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
     rhs.truncate();
     real_function_3d psimphi_new = apply(op,rhs);
     real_function_3d psi_new = s*phi_new + psimphi_new;
+
     psi_new.truncate();
     phi_new.truncate();
 
-    double rnorm = (psi_new-psi).norm2();
+    psi_new = psi_new * mask;
+    phi_new = phi_new * mask;
+    
+    double rnorm = (psi_new-psi).norm2() + (phi_new-phi).norm2();
+
+    // Comment out next 5 lines to not use the solver (i.e., to just do fixed-point iteration)
+    // if (iter>5) {
+    //     F f = solver.update(F(psi,phi), F(psi_new-psi, phi_new-phi));
+    //     psi_new = f.psi;
+    //     phi_new = f.phi;
+    //     psi_new.truncate();
+    //     phi_new.truncate();
+    // }
+
+    double stepnorm = (psi_new-psi).norm2() + (phi_new-phi).norm2();
+    double maxstep = 1e-1;
+    if (stepnorm > maxstep) {
+        if (world.rank() == 0) print("step restriction", stepnorm);
+        double step = maxstep/stepnorm;
+        psi_new = psi + step*(psi_new-psi);
+        phi_new = phi + step*(phi_new-phi);
+    }
+
     double norm = inner(psi_new,psi_new);
     psi_new *= 1.0/sqrt(norm);
     phi_new *= 1.0/sqrt(norm);
@@ -87,7 +237,7 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
 }
 
 double exact_energy(double Z) {
-    if (Z<10) {
+    if (Z<=10) {
         double a2 = Z*Z/(c*c);
         double a4 = a2*a2;
         double a6 = a4*a2;
@@ -105,27 +255,70 @@ void run(World& world) {
     FunctionDefaults<3>::set_truncate_mode(1);
     FunctionDefaults<3>::set_cubic_cell(-L,L);
 
-    real_function_3d Vnuc = real_factory_3d(world).f(V);
-    real_function_3d psi  = real_factory_3d(world).f(guess);
-    real_function_3d phi  = real_factory_3d(world).f(guess);
-    print("Initial should norm be 1:", psi.norm2());
-    print("Exact Dirac-Coulomb energy:", exact_energy(Z));
+    if (world.rank() == 0) {
+        print("            Z:", Z);
+        print("            L:", L);
+        print("            k:", k);
+        print("            v:", v);
+        print("            c:", c);
+        print("         rcut:", rcut);
+        print("        eprec:", eprec);
+        print("       thresh:", thresh);
+        print("       Vtmode:", Vtruncmode);
+        print("     exact DC:", exact_energy(Z));
+        print("");
+    }
 
+    {
+        real_function_3d Vnuc = real_factory_3d(world).f(V).truncate_mode(Vtruncmode);
+        LoadBalanceDeux<3> lb(world);
+        lb.add_tree(Vnuc, LBCost(1.0,8.0));
+        FunctionDefaults<3>::redistribute(world, lb.load_balance(2.0,false));
+    }
+    real_function_3d Vnuc = real_factory_3d(world).f(V).truncate_mode(Vtruncmode);
+
+    real_function_3d gradV[3];
+    for (int axis=0; axis<3; axis++) {
+        std::shared_ptr<FunctionFunctorInterface<double,3>> p(new Vderiv(axis));
+        gradV[axis] = real_factory_3d(world).functor(p).truncate_mode(0);
+    }
+
+    
+    coord_3d lo = {0.0,0.0,-L}, hi = {0.0,0.0,L};
+    const int npt = 1001;
+    
+    real_function_3d mask  = real_factory_3d(world).f(mask3);
+    plot_line("mask.dat", npt, lo, hi, mask);
+    
+    real_function_3d psi  = real_factory_3d(world).f(psi_guess);
+    real_function_3d phi  = real_factory_3d(world).f(phi_guess);
+    psi = psi*mask;
+    phi = phi*mask;
+    {
+        double norm;
+        norm = psi.norm2();  psi.scale(1.0/norm);
+        norm = phi.norm2();  phi.scale(1.0/norm);
+    }
+
+    XNonlinearSolver<F,double,allocator> solver = XNonlinearSolver<F,double,allocator>(allocator(world));
+    solver.set_maxsub(10);
+    
     double energy = -0.5*Z*Z; // NR energy guess
-    for (int iter=0; iter<20; iter++) {
-        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, psi, phi, energy);
-        psi = psi_new;
-        phi = phi_new;
+    for (int iter=0; iter<100; iter++) {
+        char fname[256];
+        sprintf(fname,"psi-phi-%3.3d.dat", iter);
+        plot_line(fname, npt, lo, hi, psi, phi);
+        
+        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, gradV, mask, psi, phi, energy, iter, solver);
+        psi = psi_new*mask;
+        phi = phi_new*mask;
         auto psisize = psi.size();
         auto phisize = phi.size();
         auto phinorm = phi.norm2();
-        if (world.rank() == 0) {
-            print("\niteration ", iter, " rnorm ", rnorm, " psi size ", psisize, " phi size ", phisize, " phi norm ", phinorm);
-        }
-
         auto [total_energy, kinetic_energy, potential_energy, overlap] = compute_energy(world, Vnuc, psi, phi);
         
         if (world.rank() == 0) {
+            print("\niteration ", iter, " rnorm ", rnorm, " psi size ", psisize, " phi size ", phisize, " phi norm ", phinorm);
             print("            Kinetic energy  ", kinetic_energy);
             print(" Nuclear attraction energy ", potential_energy);
             print("              Total energy ", total_energy, total_energy - energy);
@@ -133,7 +326,7 @@ void run(World& world) {
 
         energy = total_energy;
 
-        if (iter>2 and rnorm<1e-4) break;
+        if (iter>10 and rnorm<10.0*thresh) break;
     }
 }
 

From 4b744031b49f86ef147d3d9936d9b99a833e0b23 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Tue, 22 Aug 2023 15:20:39 -0400
Subject: [PATCH 5/7] note deliberate errors that are for debugging solver

---
 src/examples/hatom_sf_dirac.cc | 64 +++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/examples/hatom_sf_dirac.cc b/src/examples/hatom_sf_dirac.cc
index 66d7a325e57..88ce7306633 100644
--- a/src/examples/hatom_sf_dirac.cc
+++ b/src/examples/hatom_sf_dirac.cc
@@ -9,8 +9,8 @@
 
 using namespace madness;
 
-static const double Z = 10.0;      // nuclear charge
-static const double L = 40.0/Z;    // [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
+static const double Z = 80.0;      // nuclear charge
+static const double L = 160.0/Z;    // L=40/Z [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
 static const long k = 8;           // wavelet order
 static const double thresh = 1e-6; // precision
 static const double c = 137.035999679; // speed of light
@@ -98,14 +98,33 @@ struct allocator {
     }
 };
 
+double exact_energy(double Z) {
+    if (Z<=10) {
+        double a2 = Z*Z/(c*c);
+        double a4 = a2*a2;
+        double a6 = a4*a2;
+        double a8 = a4*a4;
+        return Z*Z*(-1.0/2.0 - 1.0/8.0*a2 - 1/16*a4 - 5.0/128.0*a6 - 7.0/256.0*a8);
+    }
+    double Za = Z/c;
+    double s = Za / sqrt(1.0 - Za*Za);
+    return c*c/std::sqrt(1.0 + s*s) - c*c;
+}
+
 static double psi_guess(const coord_3d& r) {
     double R = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
-    return std::pow(R+rcut,v)*exp(-Z*R)/sqrt(constants::pi/(Z*Z*Z)); // note +rcut in singular part to smooth on nuclear scale
+    const double expnt = std::sqrt(-2*exact_energy(Z)*(1+exact_energy(Z)/(2*c*c)));
+    return std::pow(R+rcut,v)*exp(-expnt*R)/sqrt(constants::pi/(expnt*expnt*expnt)); // note +rcut in singular part to smooth on nuclear scale
+    //const double expnt = Z*0.5;
+    //return exp(-expnt*R)/sqrt(constants::pi/(expnt*expnt*expnt)); // note +rcut in singular part to smooth on nuclear scale
 }
 
 static double phi_guess(const coord_3d& r) {
     double R = std::sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
-    return exp(-Z*R)/sqrt(constants::pi/(Z*Z*Z));
+    const double expnt = std::sqrt(-2*exact_energy(Z)*(1+exact_energy(Z)/(2*c*c)));
+    return exp(-expnt*R)/sqrt(constants::pi/(expnt*expnt*expnt));
+    // const double expnt = Z;
+    // return exp(-expnt*R)/sqrt(constants::pi/(expnt*expnt*expnt)); // note +rcut in singular part to smooth on nuclear scale
 }
 
 static double V(const coord_3d& r) {
@@ -187,17 +206,21 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
                                                                const real_function_3d& mask,
                                                                const real_function_3d& psi,
                                                                const real_function_3d& phi,
+                                                               const real_function_3d& fakeV,
                                                                const double energy,
                                                                const int iter,
                                                                solverT& solver)
 {
     real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), rcut*0.1, thresh);
-    real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
+    //real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
+    real_function_3d rhs = -2*(0.5*(psi+phi)*V - energy*(psi-phi));
     rhs.truncate();
     real_function_3d phi_new = apply(op,rhs);
 
-    double s = 1.0 + energy/(2*c*c);
-    rhs = -2*(pVp(world,V,gradV,phi) + energy*(psi-s*phi));
+    //double s = 1.0 + energy/(2*c*c);
+    double s = 1.0;
+    rhs = -2*(energy*(psi-s*phi));
+    //rhs = -2*(pVp(world,V,gradV,phi) + energy*(psi-s*phi));
     //rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
     rhs.truncate();
     real_function_3d psimphi_new = apply(op,rhs);
@@ -209,6 +232,9 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
     psi_new = psi_new * mask;
     phi_new = phi_new * mask;
     
+    double rnormL = (psi_new-psi).norm2();
+    double rnormP = (phi_new-phi).norm2();
+    if (world.rank() == 0) print(rnormL, rnormP);
     double rnorm = (psi_new-psi).norm2() + (phi_new-phi).norm2();
 
     // Comment out next 5 lines to not use the solver (i.e., to just do fixed-point iteration)
@@ -236,19 +262,6 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
     return {psi_new, phi_new, rnorm};
 }
 
-double exact_energy(double Z) {
-    if (Z<=10) {
-        double a2 = Z*Z/(c*c);
-        double a4 = a2*a2;
-        double a6 = a4*a2;
-        double a8 = a4*a4;
-        return Z*Z*(-1.0/2.0 - 1.0/8.0*a2 - 1/16*a4 - 5.0/128.0*a6 - 7.0/256.0*a8);
-    }
-    double Za = Z/c;
-    double s = Za / sqrt(1.0 - Za*Za);
-    return c*c/std::sqrt(1.0 + s*s) - c*c;
-}
-
 void run(World& world) {
     FunctionDefaults<3>::set_k(k);
     FunctionDefaults<3>::set_thresh(thresh);
@@ -287,6 +300,8 @@ void run(World& world) {
     coord_3d lo = {0.0,0.0,-L}, hi = {0.0,0.0,L};
     const int npt = 1001;
     
+    real_function_3d fakeV = 1e-4*Vnuc;
+
     real_function_3d mask  = real_factory_3d(world).f(mask3);
     plot_line("mask.dat", npt, lo, hi, mask);
     
@@ -304,14 +319,15 @@ void run(World& world) {
     solver.set_maxsub(10);
     
     double energy = -0.5*Z*Z; // NR energy guess
-    for (int iter=0; iter<100; iter++) {
+    for (int iter=0; iter<1000; iter++) {
         char fname[256];
         sprintf(fname,"psi-phi-%3.3d.dat", iter);
         plot_line(fname, npt, lo, hi, psi, phi);
         
-        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, gradV, mask, psi, phi, energy, iter, solver);
-        psi = psi_new*mask;
-        phi = phi_new*mask;
+        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, gradV, mask, psi, phi, fakeV, energy, iter, solver);
+        psi = psi_new;
+        phi = phi_new;
+
         auto psisize = psi.size();
         auto phisize = phi.size();
         auto phinorm = phi.norm2();

From b0dc838233880b58a20dc04b6229223d28918800 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Wed, 23 Aug 2023 08:22:49 -0400
Subject: [PATCH 6/7] checking in a working version free from the hacking

---
 src/examples/hatom_sf_dirac.cc | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/examples/hatom_sf_dirac.cc b/src/examples/hatom_sf_dirac.cc
index 88ce7306633..f1ba5664ccb 100644
--- a/src/examples/hatom_sf_dirac.cc
+++ b/src/examples/hatom_sf_dirac.cc
@@ -10,7 +10,7 @@
 using namespace madness;
 
 static const double Z = 80.0;      // nuclear charge
-static const double L = 160.0/Z;    // L=40/Z [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
+static const double L = 40.0/Z;    // L=40/Z [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
 static const long k = 8;           // wavelet order
 static const double thresh = 1e-6; // precision
 static const double c = 137.035999679; // speed of light
@@ -206,22 +206,18 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
                                                                const real_function_3d& mask,
                                                                const real_function_3d& psi,
                                                                const real_function_3d& phi,
-                                                               const real_function_3d& fakeV,
                                                                const double energy,
                                                                const int iter,
                                                                solverT& solver)
 {
     real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), rcut*0.1, thresh);
-    //real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
-    real_function_3d rhs = -2*(0.5*(psi+phi)*V - energy*(psi-phi));
+    real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
     rhs.truncate();
     real_function_3d phi_new = apply(op,rhs);
 
-    //double s = 1.0 + energy/(2*c*c);
-    double s = 1.0;
-    rhs = -2*(energy*(psi-s*phi));
+    double s = 1.0 + energy/(2*c*c);
     //rhs = -2*(pVp(world,V,gradV,phi) + energy*(psi-s*phi));
-    //rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
+    rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
     rhs.truncate();
     real_function_3d psimphi_new = apply(op,rhs);
     real_function_3d psi_new = s*phi_new + psimphi_new;
@@ -300,8 +296,6 @@ void run(World& world) {
     coord_3d lo = {0.0,0.0,-L}, hi = {0.0,0.0,L};
     const int npt = 1001;
     
-    real_function_3d fakeV = 1e-4*Vnuc;
-
     real_function_3d mask  = real_factory_3d(world).f(mask3);
     plot_line("mask.dat", npt, lo, hi, mask);
     
@@ -324,7 +318,7 @@ void run(World& world) {
         sprintf(fname,"psi-phi-%3.3d.dat", iter);
         plot_line(fname, npt, lo, hi, psi, phi);
         
-        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, gradV, mask, psi, phi, fakeV, energy, iter, solver);
+        auto [psi_new, phi_new, rnorm] = iterate(world, Vnuc, gradV, mask, psi, phi, energy, iter, solver);
         psi = psi_new;
         phi = phi_new;
 

From 8fc84180334b9700b5e514f690f8de4626d6c57b Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Sat, 26 Aug 2023 14:32:03 -0400
Subject: [PATCH 7/7] what a difference -E makes

---
 src/examples/hatom_sf_dirac.cc | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/examples/hatom_sf_dirac.cc b/src/examples/hatom_sf_dirac.cc
index f1ba5664ccb..41e284ed181 100644
--- a/src/examples/hatom_sf_dirac.cc
+++ b/src/examples/hatom_sf_dirac.cc
@@ -7,7 +7,7 @@
 #include <madness/mra/nonlinsol.h>
 #include <madness/chem/atomutil.h>
 
-using namespace madness;
+using namespace madness; 
 
 static const double Z = 80.0;      // nuclear charge
 static const double L = 40.0/Z;    // L=40/Z [-L,L] box size so exp(-Zr)=1e-16 padded a bit since we are masking
@@ -198,7 +198,8 @@ std::tuple<double,double,double,double> compute_energy(World& world,
 
 // (T-E) phi = -V psi + E(psi-phi)
 // and with s=1+E/2mc^2
-// (T-E) (psi-s*phi) = - pVp phi - E(psi-s*phi)
+// T (psi-s*phi) = - pVp phi/4mc^2
+// BAD!! (T-E) (psi-s*phi) = - pVp phi - E(psi-s*phi)
 template <typename solverT>
 std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
                                                                const real_function_3d& V,
@@ -210,6 +211,7 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
                                                                const int iter,
                                                                solverT& solver)
 {
+    real_convolution_3d coulop = CoulombOperator(world, rcut*0.1, thresh);
     real_convolution_3d op = BSHOperator3D(world, sqrt(-2*energy), rcut*0.1, thresh);
     real_function_3d rhs = -2*(psi*V - energy*(psi-phi));
     rhs.truncate();
@@ -217,9 +219,9 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
 
     double s = 1.0 + energy/(2*c*c);
     //rhs = -2*(pVp(world,V,gradV,phi) + energy*(psi-s*phi));
-    rhs = -2*(pVp(world,V,phi) + energy*(psi-s*phi));
+    rhs = -(2/(4*constants::pi))*pVp(world,V,phi);
     rhs.truncate();
-    real_function_3d psimphi_new = apply(op,rhs);
+    real_function_3d psimphi_new = apply(coulop,rhs);
     real_function_3d psi_new = s*phi_new + psimphi_new;
 
     psi_new.truncate();
@@ -234,13 +236,13 @@ std::tuple<real_function_3d, real_function_3d, double> iterate(World& world,
     double rnorm = (psi_new-psi).norm2() + (phi_new-phi).norm2();
 
     // Comment out next 5 lines to not use the solver (i.e., to just do fixed-point iteration)
-    // if (iter>5) {
-    //     F f = solver.update(F(psi,phi), F(psi_new-psi, phi_new-phi));
-    //     psi_new = f.psi;
-    //     phi_new = f.phi;
-    //     psi_new.truncate();
-    //     phi_new.truncate();
-    // }
+    if (iter>1) {
+        F f = solver.update(F(psi,phi), F(psi_new-psi, phi_new-phi));
+        psi_new = f.psi;
+        phi_new = f.phi;
+        psi_new.truncate();
+        phi_new.truncate();
+    }
 
     double stepnorm = (psi_new-psi).norm2() + (phi_new-phi).norm2();
     double maxstep = 1e-1;