From 246b00e53752bd9261b1f7f4ea4cb82273351a92 Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Mon, 5 Aug 2024 13:25:04 -0400
Subject: [PATCH 01/11] fixing the timing in cloud

---
 src/madness/world/cloud.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h
index aba5f36ad38..0da3c0cf3af 100644
--- a/src/madness/world/cloud.h
+++ b/src/madness/world/cloud.h
@@ -207,9 +207,9 @@ class Cloud {
         double rtime = double(reading_time);
         double wtime = double(writing_time);
         double ptime = double(replication_time);
-        universe.gop.sum(rtime);
-        universe.gop.sum(wtime);
-        universe.gop.sum(ptime);
+        universe.gop.max(rtime);
+        universe.gop.max(wtime);
+        universe.gop.max(ptime);
         long creads = long(cache_reads);
         long cstores = long(cache_stores);
         universe.gop.sum(creads);
@@ -217,9 +217,9 @@ class Cloud {
         if (universe.rank() == 0) {
             auto precision = std::cout.precision();
             std::cout << std::fixed << std::setprecision(1);
-            print("cloud storing cpu time", wtime * 0.001);
-            print("cloud replication cpu time", ptime * 0.001);
-            print("cloud reading cpu time", rtime * 0.001, std::defaultfloat);
+            print("cloud storing wall time", wtime * 0.001);
+            print("cloud replication wall time", ptime * 0.001);
+            print("cloud reading wall time", rtime * 0.001, std::defaultfloat);
             std::cout << std::setprecision(precision) << std::scientific;
             print("cloud cache stores    ", long(cstores));
             print("cloud cache loads     ", long(creads));
@@ -239,6 +239,8 @@ class Cloud {
         cache_reads=0l;
     }
 
+    /// @param[in]  world the subworld the objects are loaded to
+    /// @param[in]  recordlist the list of records where the objects are stored
     template<typename T>
     T load(madness::World &world, const recordlistT recordlist) const {
         recordlistT rlist = recordlist;
@@ -250,6 +252,7 @@ class Cloud {
         }
     }
 
+    /// @param[in]  world presumably the universe
     template<typename T>
     recordlistT store(madness::World &world, const T &source) {
         if (is_replicated) {
@@ -270,6 +273,7 @@ class Cloud {
     void replicate(const std::size_t chunk_size=INT_MAX) {
 
         World& world=container.get_world();
+        world.gop.fence();
         cloudtimer t(world,replication_time);
         container.reset_pmap_to_local();
         is_replicated=true;
@@ -351,13 +355,13 @@ class Cloud {
 
     struct cloudtimer {
         World& world;
-        double cpu0;
+        double wall0;
         std::atomic<long> &rtime;
 
-        cloudtimer(World& world, std::atomic<long> &readtime) : world(world), cpu0(cpu_time()), rtime(readtime) {}
+        cloudtimer(World& world, std::atomic<long> &readtime) : world(world), wall0(wall_time()), rtime(readtime) {}
 
         ~cloudtimer() {
-            if (world.rank()==0) rtime += long((cpu_time() - cpu0) * 1000l);
+            if (world.rank()==0) rtime += long((wall_time() - wall0) * 1000l);
         }
     };
 

From d1fceb7c377bf90983ba4e427785edef9c8253ae Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Mon, 5 Aug 2024 13:51:01 -0400
Subject: [PATCH 02/11] fixing the timing in cloud, part II of ..

---
 src/madness/world/cloud.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h
index 0da3c0cf3af..ecc18b6bdcf 100644
--- a/src/madness/world/cloud.h
+++ b/src/madness/world/cloud.h
@@ -361,7 +361,8 @@ class Cloud {
         cloudtimer(World& world, std::atomic<long> &readtime) : world(world), wall0(wall_time()), rtime(readtime) {}
 
         ~cloudtimer() {
-            if (world.rank()==0) rtime += long((wall_time() - wall0) * 1000l);
+            long deltatime=long((wall_time() - wall0) * 1000l);
+            rtime += deltatime;
         }
     };
 

From e1843c8f5242e7d1475d2029a233c551f14fdea9 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Wed, 7 Aug 2024 11:07:45 -0500
Subject: [PATCH 03/11] remove compiler type warning

---
 src/madness/world/worldgop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h
index 490a457a115..34817a755c5 100644
--- a/src/madness/world/worldgop.h
+++ b/src/madness/world/worldgop.h
@@ -788,7 +788,7 @@ namespace madness {
           auto buf0 = std::unique_ptr<T[]>(new T[nelem_per_maxmsg]);
           auto buf1 = std::unique_ptr<T[]>(new T[nelem_per_maxmsg]);
 
-          auto reduce_impl = [&,this](T* buf, int nelem) {
+          auto reduce_impl = [&,this](T* buf, size_t nelem) {
             MADNESS_ASSERT(nelem <= nelem_per_maxmsg);
             SafeMPI::Request req0, req1;
             Tag gsum_tag = world_.mpi.unique_tag();

From 5dc6de51505861bd8262502b4504f51c1eb4557f Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Wed, 7 Aug 2024 11:08:15 -0500
Subject: [PATCH 04/11] testing optimized parallel container serialization

---
 src/madness/world/test_dc.cc | 156 +++++++++++++++++++++++++++++++++--
 1 file changed, 149 insertions(+), 7 deletions(-)

diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc
index 5197b006dac..930079dd8a5 100644
--- a/src/madness/world/test_dc.cc
+++ b/src/madness/world/test_dc.cc
@@ -29,10 +29,18 @@
   fax:   865-572-0680
 */
 
+//#define MAD_ARCHIVE_DEBUG_ENABLE
+
+#include <algorithm>
+
 #include <madness/world/MADworld.h>
 #include <madness/world/worlddc.h>
+#include <madness/world/worldmutex.h>
 #include <madness/world/atomicint.h>
 
+#include <madness/world/vector_archive.h>
+#include <madness/world/parallel_archive.h>
+
 using namespace madness;
 using namespace std;
 
@@ -240,16 +248,150 @@ void test_local(World& world) {
 
 }
 
+namespace madness {
+    namespace archive {
+        /// Write container to parallel archive
+        template <class keyT, class valueT>
+        struct ArchiveStoreImpl< ParallelOutputArchive<VectorOutputArchive>, WorldContainer<keyT,valueT> > {
+            static void store(const ParallelOutputArchive<VectorOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
+                using localarchiveT = VectorOutputArchive;
+                const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!)
+                typedef WorldContainer<keyT,valueT> dcT;
+                using const_iterator = typename dcT::const_iterator;
+
+                const size_t default_size = 100*1024*1024;
+
+                World* world = ar.get_world();
+                world->gop.fence();
+
+                std::vector<unsigned char> v;
+                v.reserve(default_size);
+                size_t count = 0;
+
+                class op : public TaskInterface {
+                    const size_t ntasks;
+                    const size_t taskid;
+                    const dcT& t;
+                    std::vector<unsigned char>& vtotal;
+                    size_t& total_count;
+                    Mutex& mutex;
+                    
+                public:
+                    op(size_t ntasks, size_t taskid, const dcT& t, std::vector<unsigned char>& vtotal, size_t& total_count, Mutex& mutex)
+                        : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {}
+                    void run(World& world) {
+                        std::vector<unsigned char> v;
+                        v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks));
+                        VectorOutputArchive var(v);
+                        const_iterator it=t.begin();
+                        size_t count = 0;
+                        size_t n = 0;
+                        while (it!=t.end()) {
+                            if ((n%ntasks) == taskid) {
+                                var & *it;
+                                ++count;
+                            }
+                            ++it;
+                            n++;
+                        }
+
+                        if (count) {
+                            mutex.lock();
+                            vtotal.insert(vtotal.end(), v.begin(), v.end());
+                            total_count += count;
+                            mutex.unlock();
+                        }
+                    }
+                };
+                
+                Mutex mutex;
+                size_t ntasks = std::max(size_t(1), ThreadPool::size());
+                for (size_t taskid=0; taskid<ntasks; taskid++)
+                    world->taskq.add(new op(ntasks, taskid, t, v, count, mutex));
+                world->gop.fence();
+
+                // Gather all buffers to process 0
+                // first gather all of the sizes and counts to a vector in process 0
+                int size = v.size();
+                std::vector<int> sizes(world->size());
+                MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm());
+                world->gop.sum(count); // just need total number of elements
+
+                // build the cumulative sum of sizes
+                std::vector<int> offsets(world->size());
+                offsets[0] = 0;
+                for (int i=1; i<world->size(); ++i) offsets[i] = offsets[i-1] + sizes[i-1];
+                int total_size = offsets.back() + sizes.back();
+
+                // gather the vector of data v from each process to process 0
+                unsigned char* all_data=0;
+                if (world->rank() == 0) {
+                    all_data = new unsigned char[total_size];
+                }
+                MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
+
+                if (world->rank() == 0) {
+                    auto& localar = ar.local_archive();
+                    localar & magic & 1; // 1 client
+                    // localar & t;
+                    ArchivePrePostImpl<localarchiveT,dcT>::preamble_store(localar);
+                    localar & -magic & count;                
+                    localar.store(all_data, total_size);
+                    ArchivePrePostImpl<localarchiveT,dcT>::postamble_store(localar);
+
+                    delete[] all_data;
+                }
+                world->gop.fence();
+            }
+        };
+    }
+}
+
+void test_florian(World& world) {
+    WorldContainer<Key,Node> c(world);
+
+    Key key1(1);
+    Node node1(1);
+
+    if (world.rank() == 0) {
+        for (int i=0; i<100; ++i) {
+            c.replace(Key(i),Node(i));
+        }
+    }
+    world.gop.fence();
+
+    std::vector<unsigned char> v;
+    {
+        archive::VectorOutputArchive var(v);
+        archive::ParallelOutputArchive ar(world,var);
+        ar & c;
+    }
+
+    WorldContainer<Key,Node> c2(world);
+    {
+        archive::VectorInputArchive var2(v);
+        archive::ParallelInputArchive ar2(world,var2);
+        ar2 & c2;
+    }
+
+    for (int i=0; i<100; ++i) {
+        MADNESS_CHECK(c2.find(Key(i)).get()->second.get() == i);
+    }
+
+    world.gop.fence();
+    print("test_florian passed");
+}
+
 int main(int argc, char** argv) {
-    initialize(argc, argv);
-    World world(SafeMPI::COMM_WORLD);
 
     try {
-        test0(world);
-        test1(world);
-        test1(world);
-        test1(world);
-        test_local(world);
+        World& world = initialize(argc, argv);
+        // test0(world);
+        // test1(world);
+        // test1(world);
+        // test1(world);
+        // test_local(world);
+        test_florian(world);
     }
     catch (const SafeMPI::Exception& e) {
         error("caught an MPI exception");

From 85f305201bdf6c6f0b9c258e506b934c78d43d55 Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Wed, 7 Aug 2024 12:21:53 -0400
Subject: [PATCH 05/11] moving serialization to VectorOutputArchive into
 worlddc.h

---
 src/madness/world/test_dc.cc | 99 ------------------------------------
 src/madness/world/worlddc.h  | 95 ++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 99 deletions(-)

diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc
index 930079dd8a5..d95154c08ad 100644
--- a/src/madness/world/test_dc.cc
+++ b/src/madness/world/test_dc.cc
@@ -248,105 +248,6 @@ void test_local(World& world) {
 
 }
 
-namespace madness {
-    namespace archive {
-        /// Write container to parallel archive
-        template <class keyT, class valueT>
-        struct ArchiveStoreImpl< ParallelOutputArchive<VectorOutputArchive>, WorldContainer<keyT,valueT> > {
-            static void store(const ParallelOutputArchive<VectorOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
-                using localarchiveT = VectorOutputArchive;
-                const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!)
-                typedef WorldContainer<keyT,valueT> dcT;
-                using const_iterator = typename dcT::const_iterator;
-
-                const size_t default_size = 100*1024*1024;
-
-                World* world = ar.get_world();
-                world->gop.fence();
-
-                std::vector<unsigned char> v;
-                v.reserve(default_size);
-                size_t count = 0;
-
-                class op : public TaskInterface {
-                    const size_t ntasks;
-                    const size_t taskid;
-                    const dcT& t;
-                    std::vector<unsigned char>& vtotal;
-                    size_t& total_count;
-                    Mutex& mutex;
-                    
-                public:
-                    op(size_t ntasks, size_t taskid, const dcT& t, std::vector<unsigned char>& vtotal, size_t& total_count, Mutex& mutex)
-                        : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {}
-                    void run(World& world) {
-                        std::vector<unsigned char> v;
-                        v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks));
-                        VectorOutputArchive var(v);
-                        const_iterator it=t.begin();
-                        size_t count = 0;
-                        size_t n = 0;
-                        while (it!=t.end()) {
-                            if ((n%ntasks) == taskid) {
-                                var & *it;
-                                ++count;
-                            }
-                            ++it;
-                            n++;
-                        }
-
-                        if (count) {
-                            mutex.lock();
-                            vtotal.insert(vtotal.end(), v.begin(), v.end());
-                            total_count += count;
-                            mutex.unlock();
-                        }
-                    }
-                };
-                
-                Mutex mutex;
-                size_t ntasks = std::max(size_t(1), ThreadPool::size());
-                for (size_t taskid=0; taskid<ntasks; taskid++)
-                    world->taskq.add(new op(ntasks, taskid, t, v, count, mutex));
-                world->gop.fence();
-
-                // Gather all buffers to process 0
-                // first gather all of the sizes and counts to a vector in process 0
-                int size = v.size();
-                std::vector<int> sizes(world->size());
-                MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm());
-                world->gop.sum(count); // just need total number of elements
-
-                // build the cumulative sum of sizes
-                std::vector<int> offsets(world->size());
-                offsets[0] = 0;
-                for (int i=1; i<world->size(); ++i) offsets[i] = offsets[i-1] + sizes[i-1];
-                int total_size = offsets.back() + sizes.back();
-
-                // gather the vector of data v from each process to process 0
-                unsigned char* all_data=0;
-                if (world->rank() == 0) {
-                    all_data = new unsigned char[total_size];
-                }
-                MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
-
-                if (world->rank() == 0) {
-                    auto& localar = ar.local_archive();
-                    localar & magic & 1; // 1 client
-                    // localar & t;
-                    ArchivePrePostImpl<localarchiveT,dcT>::preamble_store(localar);
-                    localar & -magic & count;                
-                    localar.store(all_data, total_size);
-                    ArchivePrePostImpl<localarchiveT,dcT>::postamble_store(localar);
-
-                    delete[] all_data;
-                }
-                world->gop.fence();
-            }
-        };
-    }
-}
-
 void test_florian(World& world) {
     WorldContainer<Key,Node> c(world);
 
diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h
index d375c7b9de3..2c5aa4523fc 100644
--- a/src/madness/world/worlddc.h
+++ b/src/madness/world/worlddc.h
@@ -1680,6 +1680,101 @@ namespace madness {
             }
         };
 
+             /// Write container to parallel archive
+        template <class keyT, class valueT>
+        struct ArchiveStoreImpl< ParallelOutputArchive<VectorOutputArchive>, WorldContainer<keyT,valueT> > {
+            static void store(const ParallelOutputArchive<VectorOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
+                using localarchiveT = VectorOutputArchive;
+                const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!)
+                typedef WorldContainer<keyT,valueT> dcT;
+                using const_iterator = typename dcT::const_iterator;
+
+                const size_t default_size = 100*1024*1024;
+
+                World* world = ar.get_world();
+                world->gop.fence();
+
+                std::vector<unsigned char> v;
+                v.reserve(default_size);
+                size_t count = 0;
+
+                class op : public TaskInterface {
+                    const size_t ntasks;
+                    const size_t taskid;
+                    const dcT& t;
+                    std::vector<unsigned char>& vtotal;
+                    size_t& total_count;
+                    Mutex& mutex;
+
+                public:
+                    op(size_t ntasks, size_t taskid, const dcT& t, std::vector<unsigned char>& vtotal, size_t& total_count, Mutex& mutex)
+                        : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {}
+                    void run(World& world) {
+                        std::vector<unsigned char> v;
+                        v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks));
+                        VectorOutputArchive var(v);
+                        const_iterator it=t.begin();
+                        size_t count = 0;
+                        size_t n = 0;
+                        while (it!=t.end()) {
+                            if ((n%ntasks) == taskid) {
+                                var & *it;
+                                ++count;
+                            }
+                            ++it;
+                            n++;
+                        }
+
+                        if (count) {
+                            mutex.lock();
+                            vtotal.insert(vtotal.end(), v.begin(), v.end());
+                            total_count += count;
+                            mutex.unlock();
+                        }
+                    }
+                };
+
+                Mutex mutex;
+                size_t ntasks = std::max(size_t(1), ThreadPool::size());
+                for (size_t taskid=0; taskid<ntasks; taskid++)
+                    world->taskq.add(new op(ntasks, taskid, t, v, count, mutex));
+                world->gop.fence();
+
+                // Gather all buffers to process 0
+                // first gather all of the sizes and counts to a vector in process 0
+                int size = v.size();
+                std::vector<int> sizes(world->size());
+                MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm());
+                world->gop.sum(count); // just need total number of elements
+
+                // build the cumulative sum of sizes
+                std::vector<int> offsets(world->size());
+                offsets[0] = 0;
+                for (int i=1; i<world->size(); ++i) offsets[i] = offsets[i-1] + sizes[i-1];
+                int total_size = offsets.back() + sizes.back();
+
+                // gather the vector of data v from each process to process 0
+                unsigned char* all_data=0;
+                if (world->rank() == 0) {
+                    all_data = new unsigned char[total_size];
+                }
+                MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
+
+                if (world->rank() == 0) {
+                    auto& localar = ar.local_archive();
+                    localar & magic & 1; // 1 client
+                    // localar & t;
+                    ArchivePrePostImpl<localarchiveT,dcT>::preamble_store(localar);
+                    localar & -magic & count;
+                    localar.store(all_data, total_size);
+                    ArchivePrePostImpl<localarchiveT,dcT>::postamble_store(localar);
+
+                    delete[] all_data;
+                }
+                world->gop.fence();
+            }
+        };
+
         template <class keyT, class valueT, class localarchiveT>
         struct ArchiveLoadImpl< ParallelInputArchive<localarchiveT>, WorldContainer<keyT,valueT> > {
             /// Read container from parallel archive

From b9b87d09a6109846d644a564753f78b6bb9b0f70 Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Wed, 7 Aug 2024 14:23:29 -0400
Subject: [PATCH 06/11] moving serialization to VectorOutputArchive into
 worlddc.h

---
 src/madness/world/parallel_dc_archive.h |  17 ++-
 src/madness/world/worlddc.h             | 136 ++++++++++++------------
 2 files changed, 84 insertions(+), 69 deletions(-)

diff --git a/src/madness/world/parallel_dc_archive.h b/src/madness/world/parallel_dc_archive.h
index 7ceb9210267..d178d26cba3 100644
--- a/src/madness/world/parallel_dc_archive.h
+++ b/src/madness/world/parallel_dc_archive.h
@@ -36,7 +36,10 @@ namespace madness {
             {
                 close();
             }
-            
+
+            VectorOutputArchive& get_archive() {
+                return ar;
+            }
             template <class T>
             inline
             typename std::enable_if< madness::is_trivially_serializable<T>::value, void >::type
@@ -97,7 +100,17 @@ namespace madness {
             
             void close() {}
         };
-        
+
+
+        template <class keyT, class valueT>
+        struct ArchiveStoreImpl< ParallelOutputArchive<ContainerRecordOutputArchive>, WorldContainer<keyT,valueT> > {
+            static void store(const ParallelOutputArchive<ContainerRecordOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
+                ParallelOutputArchive<VectorOutputArchive> par(*(ar.get_world()), ar.local_archive().get_archive());
+                par & t;
+
+            }
+        };
+
     }
 
 
diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h
index 2c5aa4523fc..91e57e52c9f 100644
--- a/src/madness/world/worlddc.h
+++ b/src/madness/world/worlddc.h
@@ -1613,74 +1613,8 @@ namespace madness {
     }
 
     namespace archive {
-        /// Write container to parallel archive with optional fence
 
-        /// \ingroup worlddc
-        /// Each node (process) is served by a designated IO node.
-        /// The IO node has a binary local file archive to which is
-        /// first written a cookie and the number of servers.  The IO
-        /// node then loops thru all of its clients and in turn tells
-        /// each to write its data over an MPI stream, which is copied
-        /// directly to the output file.  The stream contents are then
-        /// cookie, no. of clients, foreach client (usual sequential archive).
-        ///
-        /// If ar.dofence() is true (default) fence is invoked before and
-        /// after the IO. The fence is optional but it is of course
-        /// necessary to be sure that all updates have completed
-        /// before doing IO, and that all IO has completed before
-        /// subsequent modifications. Also, there is always at least
-        /// some synchronization between a client and its IO server.
-        template <class keyT, class valueT, class localarchiveT>
-        struct ArchiveStoreImpl< ParallelOutputArchive<localarchiveT>, WorldContainer<keyT,valueT> > {
-            static void store(const ParallelOutputArchive<localarchiveT>& ar, const WorldContainer<keyT,valueT>& t) {
-                const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!)
-                typedef WorldContainer<keyT,valueT> dcT;
-                // typedef typename dcT::const_iterator iterator; // unused?
-                typedef typename dcT::pairT pairT;
-                World* world = ar.get_world();
-                Tag tag = world->mpi.unique_tag();
-                ProcessID me = world->rank();
-                if (ar.dofence()) world->gop.fence();
-                if (ar.is_io_node()) {
-                    auto& localar = ar.local_archive();
-                    localar & magic & ar.num_io_clients();
-                    for (ProcessID p=0; p<world->size(); ++p) {
-                        if (p == me) {
-                            localar & t;
-                        }
-                        else if (ar.io_node(p) == me) {
-                            world->mpi.Send(int(1),p,tag); // Tell client to start sending
-                            archive::MPIInputArchive source(*world, p);
-                            long cookie = 0l;
-                            unsigned long count = 0ul;
-
-                            ArchivePrePostImpl<localarchiveT,dcT>::preamble_store(localar);
-
-                            source & cookie & count;
-                            localar & cookie & count;
-                            while (count--) {
-                                pairT datum;
-                                source & datum;
-                                localar & datum;
-                            }
-
-                            ArchivePrePostImpl<localarchiveT,dcT>::postamble_store(localar);
-                        }
-                    }
-                }
-                else {
-                    ProcessID p = ar.my_io_node();
-                    int flag;
-                    world->mpi.Recv(flag,p,tag);
-                    MPIOutputArchive dest(*world, p);
-                    dest & t;
-                    dest.flush();
-                }
-                if (ar.dofence()) world->gop.fence();
-            }
-        };
-
-             /// Write container to parallel archive
+        /// Write container to parallel archive
         template <class keyT, class valueT>
         struct ArchiveStoreImpl< ParallelOutputArchive<VectorOutputArchive>, WorldContainer<keyT,valueT> > {
             static void store(const ParallelOutputArchive<VectorOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
@@ -1775,6 +1709,74 @@ namespace madness {
             }
         };
 
+
+        /// Write container to parallel archive with optional fence
+
+        /// \ingroup worlddc
+        /// Each node (process) is served by a designated IO node.
+        /// The IO node has a binary local file archive to which is
+        /// first written a cookie and the number of servers.  The IO
+        /// node then loops thru all of its clients and in turn tells
+        /// each to write its data over an MPI stream, which is copied
+        /// directly to the output file.  The stream contents are then
+        /// cookie, no. of clients, foreach client (usual sequential archive).
+        ///
+        /// If ar.dofence() is true (default) fence is invoked before and
+        /// after the IO. The fence is optional but it is of course
+        /// necessary to be sure that all updates have completed
+        /// before doing IO, and that all IO has completed before
+        /// subsequent modifications. Also, there is always at least
+        /// some synchronization between a client and its IO server.
+        template <class keyT, class valueT, class localarchiveT>
+        struct ArchiveStoreImpl< ParallelOutputArchive<localarchiveT>, WorldContainer<keyT,valueT> > {
+            static void store(const ParallelOutputArchive<localarchiveT>& ar, const WorldContainer<keyT,valueT>& t) {
+                const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!)
+                typedef WorldContainer<keyT,valueT> dcT;
+                // typedef typename dcT::const_iterator iterator; // unused?
+                typedef typename dcT::pairT pairT;
+                World* world = ar.get_world();
+                Tag tag = world->mpi.unique_tag();
+                ProcessID me = world->rank();
+                if (ar.dofence()) world->gop.fence();
+                if (ar.is_io_node()) {
+                    auto& localar = ar.local_archive();
+                    localar & magic & ar.num_io_clients();
+                    for (ProcessID p=0; p<world->size(); ++p) {
+                        if (p == me) {
+                            localar & t;
+                        }
+                        else if (ar.io_node(p) == me) {
+                            world->mpi.Send(int(1),p,tag); // Tell client to start sending
+                            archive::MPIInputArchive source(*world, p);
+                            long cookie = 0l;
+                            unsigned long count = 0ul;
+
+                            ArchivePrePostImpl<localarchiveT,dcT>::preamble_store(localar);
+
+                            source & cookie & count;
+                            localar & cookie & count;
+                            while (count--) {
+                                pairT datum;
+                                source & datum;
+                                localar & datum;
+                            }
+
+                            ArchivePrePostImpl<localarchiveT,dcT>::postamble_store(localar);
+                        }
+                    }
+                }
+                else {
+                    ProcessID p = ar.my_io_node();
+                    int flag;
+                    world->mpi.Recv(flag,p,tag);
+                    MPIOutputArchive dest(*world, p);
+                    dest & t;
+                    dest.flush();
+                }
+                if (ar.dofence()) world->gop.fence();
+            }
+        };
+
         template <class keyT, class valueT, class localarchiveT>
         struct ArchiveLoadImpl< ParallelInputArchive<localarchiveT>, WorldContainer<keyT,valueT> > {
             /// Read container from parallel archive

From 41020aab73b17c3753af50a6c060986b109550d6 Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Wed, 7 Aug 2024 14:48:43 -0400
Subject: [PATCH 07/11] looking for the lost time in WorldContainer
 serialization

---
 src/madness/world/cloud.h   | 6 ++++++
 src/madness/world/worlddc.h | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h
index ecc18b6bdcf..ae05897c9f8 100644
--- a/src/madness/world/cloud.h
+++ b/src/madness/world/cloud.h
@@ -206,9 +206,11 @@ class Cloud {
     void print_timings(World &universe) const {
         double rtime = double(reading_time);
         double wtime = double(writing_time);
+        double wtime1 = double(writing_time1);
         double ptime = double(replication_time);
         universe.gop.max(rtime);
         universe.gop.max(wtime);
+        universe.gop.max(wtime1);
         universe.gop.max(ptime);
         long creads = long(cache_reads);
         long cstores = long(cache_stores);
@@ -218,6 +220,7 @@ class Cloud {
             auto precision = std::cout.precision();
             std::cout << std::fixed << std::setprecision(1);
             print("cloud storing wall time", wtime * 0.001);
+            print("cloud storing wall time inner loop", wtime1 * 0.001);
             print("cloud replication wall time", ptime * 0.001);
             print("cloud reading wall time", rtime * 0.001, std::defaultfloat);
             std::cout << std::setprecision(precision) << std::scientific;
@@ -234,6 +237,7 @@ class Cloud {
     void clear_timings() {
         reading_time=0l;
         writing_time=0l;
+        writing_time1=0l;
         replication_time=0l;
         cache_stores=0l;
         cache_reads=0l;
@@ -332,6 +336,7 @@ class Cloud {
 
     mutable std::atomic<long> reading_time=0l;    // in ms
     mutable std::atomic<long> writing_time=0l;    // in ms
+    mutable std::atomic<long> writing_time1=0l;    // in ms
     mutable std::atomic<long> replication_time=0l;    // in ms
     mutable std::atomic<long> cache_reads=0l;
     mutable std::atomic<long> cache_stores=0l;
@@ -423,6 +428,7 @@ class Cloud {
         if (is_already_present) {
             if (world.rank()==0) cache_stores++;
         } else {
+            cloudtimer t(world,writing_time1);
             madness::archive::ContainerRecordOutputArchive ar(world, container, record);
             madness::archive::ParallelOutputArchive<madness::archive::ContainerRecordOutputArchive> par(world, ar);
             par & source;
diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h
index 91e57e52c9f..125fae64efa 100644
--- a/src/madness/world/worlddc.h
+++ b/src/madness/world/worlddc.h
@@ -1615,6 +1615,10 @@ namespace madness {
     namespace archive {
 
         /// Write container to parallel archive
+
+        /// specialization for parallel serialization of a WorldContainer:
+        /// all threads on each process serialize some values into a buffer, which gets concatenated
+        /// and finally serialized to localarchive (aka VectorOutputArchive).
         template <class keyT, class valueT>
         struct ArchiveStoreImpl< ParallelOutputArchive<VectorOutputArchive>, WorldContainer<keyT,valueT> > {
             static void store(const ParallelOutputArchive<VectorOutputArchive>& ar, const WorldContainer<keyT,valueT>& t) {
@@ -1650,6 +1654,7 @@ namespace madness {
                         const_iterator it=t.begin();
                         size_t count = 0;
                         size_t n = 0;
+                        /// threads serialize round-robin over the container
                         while (it!=t.end()) {
                             if ((n%ntasks) == taskid) {
                                 var & *it;
@@ -1659,6 +1664,7 @@ namespace madness {
                             n++;
                         }
 
+                        // concatenate the buffers from each thread
                         if (count) {
                             mutex.lock();
                             vtotal.insert(vtotal.end(), v.begin(), v.end());

From fe29acfaed63ae388692a0fd32ef895b07e95261 Mon Sep 17 00:00:00 2001
From: fbischoff <florian.bischoff@hu-berlin.de>
Date: Wed, 7 Aug 2024 16:22:03 -0400
Subject: [PATCH 08/11] looking for the lost time in WorldContainer
 serialization

---
 src/madness/world/test_dc.cc       | 43 +++++++++++++++++++++++-------
 src/madness/world/vector_archive.h |  1 +
 src/madness/world/worlddc.h        | 17 +++++++++---
 3 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc
index d95154c08ad..bf03c72ee6f 100644
--- a/src/madness/world/test_dc.cc
+++ b/src/madness/world/test_dc.cc
@@ -89,6 +89,26 @@ struct Node {
     ~Node() {}
 };
 
+struct LargeNode {
+    std::vector<int> k;
+
+    LargeNode() : k() {}
+
+    LargeNode(int val) {
+        k=std::vector<int>(10000,val);
+    }
+
+    int get() const {
+        return k[0];
+    }
+
+    template <typename Archive>
+    void serialize(const Archive& ar) {
+        ar & k;
+    }
+
+    ~LargeNode() {}
+};
 ostream& operator<<(ostream&s, const Node& node) {
     s << "Node(" << node.k << ")";
     return s;
@@ -249,34 +269,37 @@ void test_local(World& world) {
 }
 
 void test_florian(World& world) {
-    WorldContainer<Key,Node> c(world);
-
-    Key key1(1);
-    Node node1(1);
+    WorldContainer<Key,LargeNode> c(world);
+    long nlarge=200000;
 
     if (world.rank() == 0) {
-        for (int i=0; i<100; ++i) {
-            c.replace(Key(i),Node(i));
+        for (int i=0; i<nlarge; ++i) {
+            c.replace(Key(i),LargeNode(i));
         }
     }
     world.gop.fence();
-
+    double wall0=wall_time();
+    printf("starting at time %8.4f with %ld items\n",wall0,nlarge);
     std::vector<unsigned char> v;
     {
         archive::VectorOutputArchive var(v);
         archive::ParallelOutputArchive ar(world,var);
         ar & c;
     }
+    double wall1=wall_time();
+    printf("ending at time %8.4f after %8.4fs\n",wall1,wall1-wall0);
 
-    WorldContainer<Key,Node> c2(world);
+    WorldContainer<Key,LargeNode> c2(world);
     {
         archive::VectorInputArchive var2(v);
         archive::ParallelInputArchive ar2(world,var2);
         ar2 & c2;
     }
 
-    for (int i=0; i<100; ++i) {
-        MADNESS_CHECK(c2.find(Key(i)).get()->second.get() == i);
+    if (world.rank()==0) {
+        for (int i=0; i<nlarge; ++i) {
+            MADNESS_CHECK(c2.find(Key(i)).get()->second.get() == i);
+        }
     }
 
     world.gop.fence();
diff --git a/src/madness/world/vector_archive.h b/src/madness/world/vector_archive.h
index 8dfa56179d5..129c35e44e9 100644
--- a/src/madness/world/vector_archive.h
+++ b/src/madness/world/vector_archive.h
@@ -53,6 +53,7 @@ namespace madness {
 
         /// Wraps an archive around an STL \c vector for output.
         class VectorOutputArchive : public BaseOutputArchive {
+        public:
             mutable std::vector<unsigned char>* v; ///< The STL vector being wrapped.
 
         public:
diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h
index 125fae64efa..f10ff355fcf 100644
--- a/src/madness/world/worlddc.h
+++ b/src/madness/world/worlddc.h
@@ -1627,7 +1627,8 @@ namespace madness {
                 typedef WorldContainer<keyT,valueT> dcT;
                 using const_iterator = typename dcT::const_iterator;
 
-                const size_t default_size = 100*1024*1024;
+                // const size_t default_size = 100*1024*1024;
+                const size_t default_size = 8ul<<30;
 
                 World* world = ar.get_world();
                 world->gop.fence();
@@ -1649,8 +1650,8 @@ namespace madness {
                         : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {}
                     void run(World& world) {
                         std::vector<unsigned char> v;
-                        v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks));
-                        VectorOutputArchive var(v);
+                        std::size_t hint_size=std::max(size_t(1024*1024),vtotal.capacity()/ntasks);
+                        VectorOutputArchive var(v,hint_size+taskid);
                         const_iterator it=t.begin();
                         size_t count = 0;
                         size_t n = 0;
@@ -1663,7 +1664,7 @@ namespace madness {
                             ++it;
                             n++;
                         }
-
+                        print("count, n, ntasks, taskid, size",count,n,ntasks,taskid,var.v->capacity());
                         // concatenate the buffers from each thread
                         if (count) {
                             mutex.lock();
@@ -1674,12 +1675,16 @@ namespace madness {
                     }
                 };
 
+                world->gop.fence();
+                double wall0=wall_time();
                 Mutex mutex;
                 size_t ntasks = std::max(size_t(1), ThreadPool::size());
                 for (size_t taskid=0; taskid<ntasks; taskid++)
                     world->taskq.add(new op(ntasks, taskid, t, v, count, mutex));
                 world->gop.fence();
 
+                double wall1=wall_time();
+                if (world->rank()==0) printf("time in the taskq: %8.4fs\n",wall1-wall0);
                 // Gather all buffers to process 0
                 // first gather all of the sizes and counts to a vector in process 0
                 int size = v.size();
@@ -1687,12 +1692,14 @@ namespace madness {
                 MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm());
                 world->gop.sum(count); // just need total number of elements
 
+                print("time 3",wall_time());
                 // build the cumulative sum of sizes
                 std::vector<int> offsets(world->size());
                 offsets[0] = 0;
                 for (int i=1; i<world->size(); ++i) offsets[i] = offsets[i-1] + sizes[i-1];
                 int total_size = offsets.back() + sizes.back();
 
+                print("time 4",wall_time());
                 // gather the vector of data v from each process to process 0
                 unsigned char* all_data=0;
                 if (world->rank() == 0) {
@@ -1700,6 +1707,7 @@ namespace madness {
                 }
                 MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
 
+                print("time 5",wall_time());
                 if (world->rank() == 0) {
                     auto& localar = ar.local_archive();
                     localar & magic & 1; // 1 client
@@ -1712,6 +1720,7 @@ namespace madness {
                     delete[] all_data;
                 }
                 world->gop.fence();
+                print("time 6",wall_time());
             }
         };
 

From c80f0e1dd044126e0c994aadd8b24849b5103b74 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Wed, 7 Aug 2024 16:13:27 -0500
Subject: [PATCH 09/11] bug in serialize

---
 src/madness/world/worlddc.h | 62 +++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h
index f10ff355fcf..671e2964071 100644
--- a/src/madness/world/worlddc.h
+++ b/src/madness/world/worlddc.h
@@ -1633,45 +1633,39 @@ namespace madness {
                 World* world = ar.get_world();
                 world->gop.fence();
 
-                std::vector<unsigned char> v;
-                v.reserve(default_size);
-                size_t count = 0;
-
-                class op : public TaskInterface {
+                class op_serialize : public TaskInterface {
                     const size_t ntasks;
                     const size_t taskid;
                     const dcT& t;
-                    std::vector<unsigned char>& vtotal;
-                    size_t& total_count;
-                    Mutex& mutex;
+                    std::vector<unsigned char>& v;
 
                 public:
-                    op(size_t ntasks, size_t taskid, const dcT& t, std::vector<unsigned char>& vtotal, size_t& total_count, Mutex& mutex)
-                        : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {}
+                    op_serialize(size_t ntasks, size_t taskid, const dcT& t, std::vector<unsigned char>& v)
+                        : ntasks(ntasks), taskid(taskid), t(t), v(v) {}
                     void run(World& world) {
-                        std::vector<unsigned char> v;
-                        std::size_t hint_size=std::max(size_t(1024*1024),vtotal.capacity()/ntasks);
-                        VectorOutputArchive var(v,hint_size+taskid);
+                        std::size_t hint_size=(1ul<<30)/ntasks;
+                        VectorOutputArchive var(v,hint_size);
                         const_iterator it=t.begin();
-                        size_t count = 0;
                         size_t n = 0;
                         /// threads serialize round-robin over the container
                         while (it!=t.end()) {
                             if ((n%ntasks) == taskid) {
                                 var & *it;
-                                ++count;
                             }
                             ++it;
                             n++;
                         }
-                        print("count, n, ntasks, taskid, size",count,n,ntasks,taskid,var.v->capacity());
-                        // concatenate the buffers from each thread
-                        if (count) {
-                            mutex.lock();
-                            vtotal.insert(vtotal.end(), v.begin(), v.end());
-                            total_count += count;
-                            mutex.unlock();
-                        }
+                    }
+                };
+
+                class op_concat : public TaskInterface {
+                    unsigned char* all_data;
+                    const std::vector<unsigned char>& v;
+                public:
+                    op_concat(unsigned char* all_data, const std::vector<unsigned char>& v)
+                        : all_data(all_data), v(v) {}
+                    void run(World& world) {
+                        memcpy(all_data, v.data(), v.size());
                     }
                 };
 
@@ -1679,15 +1673,29 @@ namespace madness {
                 double wall0=wall_time();
                 Mutex mutex;
                 size_t ntasks = std::max(size_t(1), ThreadPool::size());
+
+                std::vector<std::vector<unsigned char>> v(ntasks);
                 for (size_t taskid=0; taskid<ntasks; taskid++)
-                    world->taskq.add(new op(ntasks, taskid, t, v, count, mutex));
+                    world->taskq.add(new op_serialize(ntasks, taskid, t, v[taskid]));
                 world->gop.fence();
+                // total size of all vectors
+                size_t total_size = 0;
+                for (size_t taskid=0; taskid<ntasks; taskid++) total_size += v[taskid].size();
+                std::vector<unsigned char> vtotal(total_size);
+                
+                size_t offset = 0;
+                for (size_t taskid=0; taskid<ntasks; taskid++) {
+                    world->taskq.add(new op_concat(&vtotal[offset], v[taskid]));
+                    offset += v[taskid].size();
+                }
+                v.clear();
 
                 double wall1=wall_time();
                 if (world->rank()==0) printf("time in the taskq: %8.4fs\n",wall1-wall0);
                 // Gather all buffers to process 0
                 // first gather all of the sizes and counts to a vector in process 0
-                int size = v.size();
+                int size = vtotal.size();
+                int count = t.size();
                 std::vector<int> sizes(world->size());
                 MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm());
                 world->gop.sum(count); // just need total number of elements
@@ -1697,7 +1705,7 @@ namespace madness {
                 std::vector<int> offsets(world->size());
                 offsets[0] = 0;
                 for (int i=1; i<world->size(); ++i) offsets[i] = offsets[i-1] + sizes[i-1];
-                int total_size = offsets.back() + sizes.back();
+                MADNESS_CHECK(offsets.back() + sizes.back() == total_size);
 
                 print("time 4",wall_time());
                 // gather the vector of data v from each process to process 0
@@ -1705,7 +1713,7 @@ namespace madness {
                 if (world->rank() == 0) {
                     all_data = new unsigned char[total_size];
                 }
-                MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
+                MPI_Gatherv(vtotal.data(), vtotal.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm());
 
                 print("time 5",wall_time());
                 if (world->rank() == 0) {

From 4f0df9b68b43e4a9683054a0864aa9efe44163c7 Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Thu, 15 Aug 2024 10:24:01 -0400
Subject: [PATCH 10/11] merged gnuplot from bspline branch

---
 src/madness/misc/CMakeLists.txt  |   8 +-
 src/madness/misc/gnuplot.h       | 198 +++++++++++++++++++++++++++++++
 src/madness/misc/test_gnuplot.cc |   7 ++
 3 files changed, 209 insertions(+), 4 deletions(-)
 create mode 100644 src/madness/misc/gnuplot.h
 create mode 100644 src/madness/misc/test_gnuplot.cc

diff --git a/src/madness/misc/CMakeLists.txt b/src/madness/misc/CMakeLists.txt
index 790a0d1e09a..f825dcfa6a3 100644
--- a/src/madness/misc/CMakeLists.txt
+++ b/src/madness/misc/CMakeLists.txt
@@ -1,8 +1,8 @@
 # src/madness/misc
 
-set(MADMISC_HEADERS misc.h ran.h phandler.h interpolation_1d.h cfft.h info.h)
+set(MADMISC_HEADERS misc.h ran.h phandler.h interpolation_1d.h cfft.h info.h gnuplot.h)
 set(MADMISC_SOURCES
-    checksum_file.cc position_stream.cc gprofexit.cc ran.cc cfft.cc info.cc unique_filename.cc)
+    checksum_file.cc position_stream.cc gprofexit.cc ran.cc cfft.cc info.cc)
 # retrieve git metadata
 include(GetGitMetadata)
 vgkit_cmake_git_metadata()
@@ -20,8 +20,8 @@ add_mad_library(misc MADMISC_SOURCES MADMISC_HEADERS "world" "madness/misc/")
 if(BUILD_TESTING)
   
   # The list of unit test source files
-  set(MISC_TEST_SOURCES interp3.cc)
+  set(MISC_TEST_SOURCES interp3.cc test_gnuplot.cc)
   
   add_unittests(misc "${MISC_TEST_SOURCES}" "MADmisc;MADgtest" "unittests;short")
 
-endif()
\ No newline at end of file
+endif()
diff --git a/src/madness/misc/gnuplot.h b/src/madness/misc/gnuplot.h
new file mode 100644
index 00000000000..465d298cdb7
--- /dev/null
+++ b/src/madness/misc/gnuplot.h
@@ -0,0 +1,198 @@
+#ifndef MADNESS_GNUPLOT_H__INCUDED
+#define MADNESS_GNUPLOT_H__INCUDED
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace madness {
+    class Gnuplot {
+        FILE *f;     // pipe connection to gnuplot process
+        pid_t pid;   // pid of gnuplot process
+        FILE *ftee;  // filestream for data tee'd from gnuplot process
+        
+        // base case for unpacking datablock value
+        template <typename T>
+        void dbvalue(size_t i, const T& t) {
+            char buf[256];
+            snprintf(buf,sizeof(buf),"%16.8e",double(t[i]));
+            (*this)(buf); // newline
+        }
+        
+        // recursion case for unpacking datablock value
+        template <typename T, typename... Ts>
+        void dbvalue(size_t i, const T& t, Ts... values) {
+            char buf[256];
+            snprintf(buf,sizeof(buf),"%16.8e ",double(t[i])); // space
+            (*this)(buf,false);
+            dbvalue(i,values...);
+        }
+        
+        // base case for unpacking plot value
+        template <int n, typename T>
+        void doplot(const char* name, const T& value0) {
+            char buf[256];
+            snprintf(buf, sizeof(buf), "%s using 1:%d", name, n);
+            (*this)(buf);
+        }
+        
+        // recursion case for unpacking plot value
+        template <int n, typename T, typename... Ts>
+        void doplot(const char* name, const T& value0, Ts... values) {
+            char buf[256];
+            snprintf(buf, sizeof(buf), "%s using 1:%d, ", name, n);
+            (*this)(buf,false);
+            
+            doplot<n+1,Ts...>(name, values...);
+        }
+        
+    public:
+        Gnuplot& operator=(Gnuplot&&) = delete;
+        Gnuplot& operator=(const Gnuplot&) = delete;
+        Gnuplot(const Gnuplot&) = delete;
+        Gnuplot(const std::string& cmd = "", const std::string& teefile = "") : f(0), ftee(0) {
+            int p[2];
+            if (pipe (p)) {
+                throw "Pipe failed.";
+            }        
+            pid = fork ();
+            if (pid ==  0) { // Child process.
+                close(p[1]);
+                dup2(p[0],STDIN_FILENO);
+                close(p[0]);
+                if (execlp ("gnuplot", "gnuplot", "-persist", NULL) == -1) {
+                    //if (execlp ("cat", "cat", "-", NULL) == -1) {
+                    fprintf(stderr,"Gnuplot: execlp failed for gnuplot ... plotting disabled\n");
+                    exit(1);
+                }
+            }
+            else if (pid < (pid_t) 0)  { // Failure
+                throw "Fork failed.";
+            }
+            else { // Parent process
+                close (p[0]);
+                f = fdopen (p[1], "w");
+            }
+            if (teefile.size() > 0) {
+                ftee = fopen(teefile.c_str(),"w");
+                if (!ftee) {
+                    fprintf(stderr,"Gnuplot: fopen failed for tee file %s ... tee of plotting disabled\n",teefile.c_str());
+                }
+            }
+            
+            if (cmd.size() > 0) (*this)(cmd);
+        }
+        
+        // outputs string to gnuplot process
+        void operator()(const char* cmd, bool EOL=true) {
+            
+            if (f) {
+                if (!fprintf(f,"%s",cmd)) {
+                    fprintf(stderr,"Gnuplot: failed writing to gnuplot pipe ... plotting disabled\n");
+                    fclose(f);
+                    f = NULL;
+                }
+            }
+            if (ftee) fprintf(ftee,"%s",cmd);
+
+            const int n = strlen(cmd);
+            if (EOL && ((n==0) || (cmd[n-1] != '\n') ) ) {
+                if (f) {
+                    if (!fprintf(f,"\n")) {
+                        fprintf(stderr,"Gnuplot: failed writing newline to gnuplot pipe ... plotting disabled\n");
+                        fclose(f);
+                        f = NULL;
+                    }
+                }
+                if (ftee) fprintf(ftee,"\n");
+            }
+            if (f) fflush(f);
+        }
+        
+        // outputs string to gnuplot process
+        void operator()(const std::string& cmd, bool EOL=true) {
+            (*this)(cmd.c_str(), EOL);
+        }
+        
+        // Define a gnuplot data block with given name assuming 1-d indexing via [] with explicit size
+        template <typename T, typename... Ts>
+        void db(const std::string& name, size_t size, const T& x, Ts... values) {
+            (*this)("$",false);
+            (*this)(name,false);
+            (*this)(" << EOD");
+            for (size_t i = 0; i<size; ++i) {
+                dbvalue(i,x,values...);
+            }
+            (*this)("EOD");
+        }
+        
+        // Define a gnuplot data block with given name assuming 1-d indexing via [] with size from x (a 1-d container that supports size())
+        template <typename T, typename... Ts>
+        void db(const std::string& name, const T& x, Ts... values) {
+            db(name,(size_t) x.size(),x,values...); // have to force x.size() to be size_t since Tensor<T>::size() returns long
+        }
+        
+        // Plots data in 2 or more vectors by generating the following gnuplot commands:
+        // $data << EOD
+        // <data values>
+        // EOD
+        // plot $data using 1:2, $data using 1:3, ...
+        template <typename T, typename... Ts>
+        void plot(const T& x, Ts... values) {
+            db("data", x, values...);
+            (*this)("plot ",false);
+            doplot<2,Ts...>("$data", values...); // note we peeled off the x values
+        }
+        
+        ~Gnuplot() {
+            if (f) {
+                fclose(f);
+                waitpid(pid,0,0);
+            }
+            if (ftee) fclose(ftee);
+        }
+        
+        static void test() {
+            std::vector<double> x = {1.0,2.0,3.0};
+            std::vector<double> y = {-1.0,-2.0,3.0};
+            std::vector<double> z = {10.0,11.0,12.0};
+            {
+                Gnuplot g("set style data lp; set grid");
+                g.plot(x,y,z);
+            }
+            {
+                Gnuplot g;
+                //g("set term png");
+                //g("set output \"test.png\"");
+                g.db("xyz",x,y,z);
+                g("plot $xyz using 1:2 with linespoints");
+            }
+            {
+                // use x11 to work around bug (https://sourceforge.net/p/gnuplot/bugs/2634/) ... wxt temporarily needs GDK_BACKEND=x11
+                Gnuplot g("set term x11; set xrange [-10:10]; set yrange [0:1]; set grid; set style data l", "test3.gnuplot");
+                size_t npts = 100;
+                std::vector<double> x(npts), y(npts);
+                for (size_t i = 0; i<npts; ++i) x[i] = -10.0 + 20.0 * i / (npts-1);
+                for (int step=0; step<40; step++) {
+                    double phase = step*0.3;
+                    for (size_t i = 0; i<npts; ++i) y[i] = 0.5 + 0.5 * std::sin(x[i]+phase);
+                    char buf[256];
+                    snprintf(buf,sizeof(buf),"set title \"step %d\"",step);
+                    g(buf);
+                    g.plot(x,y);
+                    usleep(100000);
+                }
+            }
+        }
+    };
+}
+#endif
diff --git a/src/madness/misc/test_gnuplot.cc b/src/madness/misc/test_gnuplot.cc
new file mode 100644
index 00000000000..822e3ed31f2
--- /dev/null
+++ b/src/madness/misc/test_gnuplot.cc
@@ -0,0 +1,7 @@
+#include <madness/misc/gnuplot.h>
+
+int main (void)
+{
+    madness::Gnuplot::test();
+    return 0;
+}

From 21f1e5aeb6cefc513361f50e6a41700f4165442e Mon Sep 17 00:00:00 2001
From: "Robert J. Harrison" <rjharrison@gmail.com>
Date: Thu, 15 Aug 2024 10:44:50 -0400
Subject: [PATCH 11/11] beginning of test directory for periodic stuff

---
 src/examples/periodic/test.cc | 403 ++++++++++++++++++++++++++++++++++
 1 file changed, 403 insertions(+)
 create mode 100644 src/examples/periodic/test.cc

diff --git a/src/examples/periodic/test.cc b/src/examples/periodic/test.cc
new file mode 100644
index 00000000000..88854f1baf2
--- /dev/null
+++ b/src/examples/periodic/test.cc
@@ -0,0 +1,403 @@
+#include <iostream>
+#include <complex>
+#include <numeric>
+#include <vector>
+#include <cmath>
+
+#include <fftw3.h>
+#include <lapacke.h>
+
+#include <madness/misc/gnuplot.h>
+
+/*****************
+
+Use FFT to comupute the electrostatic potential due to some test
+charge densities.  Note that FFT produces a result with
+
+  a) mean value that is zero because it projects out the constant (k=0) mode.
+
+  b) an opposing electric field to cancel that arising from the periodic
+     sum of the dipole moment potential
+
+Test cases --- select by number
+1) Gaussian spheropole
+2) Gaussian dipole
+3) Gaussian quadrupole
+4) Cosine function
+
+Compile with "g++ -O3 -Wall test.cc -I.. -lfftw3 -llapacke"
+
+ *****************/
+
+const int test_case = 1; // 1, 2, 3, 4
+
+const double pi = 3.14159265358979323846;
+const double L = 1.0; // must be integer for cosine test to work, and > 1 for gaussian tests to work
+const double xshift = 0.0; // shift from origin of charge distributions to test the periodicity in [-0.5*L,0.5*L]
+const double yshift = 0.0; 
+const double zshift = 0.0;
+
+// Will be assigned by main program based on test_case selection
+double (*f)(double, double, double) = nullptr;
+double (*exact)(double, double, double) = nullptr;
+
+// Evaluate erf(a*r)/r with correct limit at origin
+double erfaroverr(double a, double r) {
+    if (a*r*r/3.0 < 1e-8) {
+        return 2*a/std::sqrt(pi);
+    }
+    else {
+        return std::erf(a*r)/r;
+    }
+}
+
+// Solve M[m,n] c[n] = f[m] in least squares sense
+// M is the fitting matrix corresponding to m observations and n parameters
+// f is the vector of m observations
+// return the vector c of n fitted parameters
+std::vector<double> solve(size_t m, size_t n, const std::vector<double>& M, const std::vector<double>& f) {
+    std::vector<double> c = f;
+    std::vector<double> M1 = M;
+    lapack_int mm = m, nn = n, nrhs = 1, lda = n, ldb = 1;
+    LAPACKE_dgels(LAPACK_ROW_MAJOR,'N',mm,nn,nrhs,M1.data(),lda,c.data(),ldb);
+    
+    return std::vector<double>(c.data(), c.data()+n);
+}
+
+// print the matrix M[m,n] in row-major order
+template <typename T>
+void print(size_t m, size_t n, const std::vector<T>& M) {
+    for (size_t i=0; i<m; i++) {
+        for (size_t j=0; j<n; j++) {
+            std::cout << i << " " << M[i*n+j] << " ";
+        }
+        std::cout << std::endl;
+    }
+}
+
+// Fit the m data points in f to a polynomial of order n in inverse powers of N
+std::vector<double> fit(size_t m, size_t n, const std::vector<double> N, const std::vector<double>& f) {
+    //print(1,n,N);
+    //print(m,1,f);
+    
+    std::vector<double> M;
+    for (size_t i=0; i<m; i++) {
+        for (size_t j=0; j<n; j++) {
+            M.push_back(pow(1.0/N[i], j));
+        }
+    }
+    //print(m,n,M);
+    return solve(m, n, M, f);
+}
+
+// A C++ procedure to tabulate f(x,y,z) at x = x(i), y = y(j), z = z(k)
+std::vector<double> tabulate(double(*f)(double, double, double), std::vector<double> x, std::vector<double> y, std::vector<double> z) {
+    const size_t nx = x.size();
+    const size_t ny = y.size();
+    const size_t nz = z.size();
+    std::vector<double> F(nx * ny * nz);
+    for (size_t i = 0; i < nx; i++) {
+        for (size_t j = 0; j < ny; j++) {
+            for (size_t k = 0; k < nz; k++) {
+                F[i*ny*nz + j*nz + k] = f(x[i], y[j], z[k]);
+            }
+        }
+    }
+    return F;
+}
+
+// Periodic sum of functions
+double periodic_sum(double x, double y, double z, double(*f)(double, double, double), const int N = 50) {
+    // N number of lattice points summed in each direction
+    double sum = 0.0;
+    for (int X=-N; X<=N; X++) {
+        for (int Y=-N; Y<=N; Y++) {
+            for (int Z=-N; Z<=N; Z++) {
+                sum += f(x+X*L,y+Y*L,z+Z*L);
+            }
+        }
+    }
+    return sum;
+}
+
+// Periodic sum of functions returning vector of values of partial sums in order of increasing cube size
+std::vector<double> periodic_sum_partial(double x, double y, double z, double(*f)(double, double, double), const int N) {
+    std::vector<double> sums;
+    double sum = f(x,y,z);
+    sums.push_back(sum);
+    int count = 1;
+    // Dumb loop structure since attempt to be smart failed!
+    for (int n = 1; n <= N; n++) {
+        for (int X=-n; X<=n; X++) {
+            for (int Y=-n; Y<=n; Y++) {
+                for (int Z=-n; Z<=n; Z++) {
+                    if (std::abs(X) == n || std::abs(Y) == n || std::abs(Z) == n) {
+                        sum += f(x+X*L,y+Y*L,z+Z*L);
+                        count++;
+                    }
+                }
+            }
+        }
+        sums.push_back(sum);
+    }
+    return sums;
+}
+
+// Periodic sum of functions accelerated by fitting t to a polynomial in 1/N
+double periodic_sum_accelerated(double x, double y, double z, double(*f)(double, double, double), const size_t N=9, const size_t p=7) {
+    std::vector<double> sums = periodic_sum_partial(x,y,z,f,N);
+    // Extract the last p terms of the series in v
+    std::vector<double> v(&sums[N-p], &sums[N]);
+    std::vector<double> n(p);
+    for (size_t j=0; j<p; j++) {
+        n[j] = N-p+j;
+    }
+    std::vector<double> c = fit(p, p, n, v);
+    return c[0];
+}
+
+double distancesq(double x, double y, double z, double x0, double y0, double z0) {
+    double dx = x - x0;
+    double dy = y - y0;
+    double dz = z - z0;
+    return dx*dx + dy*dy + dz*dz;
+}
+
+// Gaussian spheropole test function
+const double a = 100.0;
+const double b = 200.0;
+double f_spheropole(double x, double y, double z) {
+    const double afac = std::pow(a/pi, 1.5);
+    const double bfac = std::pow(b/pi, 1.5);
+    const double rsq = distancesq(x,y,z,xshift,yshift,zshift);
+    return afac*exp(-a*rsq) - bfac*exp(-b*rsq);
+}
+
+// No need for periodic summation since the potential is zero exterior to the charge density
+double exact_spheropole(double x, double y, double z) {
+    const double mean = pi*(1/b - 1/a)/(L*L*L); // the mean of the potential over the domain
+    const double rsq = distancesq(x,y,z,xshift,yshift,zshift);
+    const double r = std::sqrt(rsq);
+    //return std::erf(std::sqrt(a)*r)/r - std::erf(std::sqrt(b)*r)/r - mean;
+    return erfaroverr(std::sqrt(a),r) - erfaroverr(std::sqrt(b),r) - mean;
+};
+
+// Gaussian dipole test function
+const double offset = 0.1;
+double f_dipole(double x, double y, double z) {
+    const double bfac = std::pow(b/pi, 1.5);
+    const double r1sq = distancesq(x,y,z,xshift,yshift,zshift-offset);
+    const double r2sq = distancesq(x,y,z,xshift,yshift,zshift+offset);
+    return bfac*(std::exp(-b*r1sq) - std::exp(-b*r2sq));
+}
+
+// Potential due to single Gaussian dipole
+double exact_dipoleX(double x, double y, double z) {
+    const double r1sq = distancesq(x,y,z,xshift,yshift,zshift-offset);
+    const double r2sq = distancesq(x,y,z,xshift,yshift,zshift+offset);
+    const double r1 = std::sqrt(r1sq);
+    const double r2 = std::sqrt(r2sq);
+    //return std::erf(std::sqrt(b)*r1)/r1 - std::erf(std::sqrt(b)*r2)/r2;
+    double sb = std::sqrt(b);
+    return erfaroverr(sb,r1) - erfaroverr(sb,r2);
+};
+
+// Potential due to opposing electric field generated by FT to satisty the periodic boundary conditions and continuity
+double opposing_field_potential(double x, double y, double z) {
+    // center of charge is at (xshift,yshift,zshift)
+    const double mu = 2*offset;
+    return mu*4*pi/(3*L*L*L) * (z-zshift);
+}
+
+// Periodic sum of dipole potentials
+double exact_dipole(double x, double y, double z) {
+    // const double mean = 0.0; // the mean of the potential over the domain(zero due to dipole symmetry)
+    return periodic_sum_accelerated(x, y, z, exact_dipoleX) + opposing_field_potential(x,y,z);
+}
+
+// Gaussian quadrupole test function in yz plane
+double f_quadrupole(double x, double y, double z) {
+    const double bfac = std::pow(b/pi, 1.5);
+    const double r1sq = distancesq(x, y, z, xshift, yshift,        zshift-offset);
+    const double r2sq = distancesq(x, y, z, xshift, yshift,        zshift+offset);
+    const double r3sq = distancesq(x, y, z, xshift, yshift-offset, zshift);
+    const double r4sq = distancesq(x, y, z, xshift, yshift+offset, zshift);
+    return bfac*(std::exp(-b*r1sq) + std::exp(-b*r2sq) - std::exp(-b*r3sq) - std::exp(-b*r4sq)  );
+}
+
+// Potential due to single Gaussian quadrupole
+double exact_quadrupoleX(double x, double y, double z) {
+    const double r1sq = distancesq(x, y, z, xshift, yshift,        zshift-offset);
+    const double r2sq = distancesq(x, y, z, xshift, yshift,        zshift+offset);
+    const double r3sq = distancesq(x, y, z, xshift, yshift-offset, zshift);
+    const double r4sq = distancesq(x, y, z, xshift, yshift+offset, zshift);
+    const double r1 = std::sqrt(r1sq);
+    const double r2 = std::sqrt(r2sq);
+    const double r3 = std::sqrt(r3sq);
+    const double r4 = std::sqrt(r4sq);
+    //return std::erf(std::sqrt(b)*r1)/r1 + std::erf(std::sqrt(b)*r2)/r2  - std::erf(std::sqrt(b)*r3)/r3  - std::erf(std::sqrt(b)*r4)/r4;
+    double sb = std::sqrt(b);
+    return  erfaroverr(sb,r1) + erfaroverr(sb,r2) - erfaroverr(sb,r3) - erfaroverr(sb,r4);
+};
+
+// Periodic sum of quadrupole potentials
+double exact_quadrupole(double x, double y, double z) {
+    // const double mean = 0.0; // the mean of the potential over the domain(zero due to dipole symmetry)
+    //return periodic_sum(x, y, z, exact_quadrupoleX);
+    return periodic_sum_accelerated(x, y, z, exact_quadrupoleX);
+}
+
+
+// Cosine test function
+const double wx = 1;
+const double wy = 2;
+const double wz = 3;
+double f_cosine(double x, double y, double z) {
+    return std::cos(wx*2*pi*(x-xshift)/L)*std::cos(wy*2*pi*(y-yshift)/L)*std::cos(wz*2*pi*(z-zshift)/L);
+}
+
+double exact_cosine(double x, double y, double z) {
+    return f_cosine(x,y,z) / (pi*(wx*wx + wy*wy + wz*wz)/(L*L));
+}
+
+// Return a vector with n equally spaced values between a and b, optionally including the right endpoint
+std::vector<double> linspace(double a, double b, size_t n, bool include_right_endpoint = true) {
+    double h = (b - a) / (include_right_endpoint ? (n - 1) : n);
+    std::vector<double> v(n);
+    for (size_t i = 0; i < n; i++) {
+        v[i] = a + i*h;
+    }
+    return v;
+}
+
+int main() {
+    std::cout.precision(15);
+
+    if (test_case == 1) {
+        f = f_spheropole;
+        exact = exact_spheropole;
+    } else if (test_case == 2) {
+        f = f_dipole;
+        exact = exact_dipole;
+    } else if (test_case == 3) {
+        f = f_quadrupole;
+        exact = exact_quadrupole;
+    } else if (test_case == 4) {
+        f = f_cosine;
+        exact = exact_cosine;
+    } else {
+        std::cerr << "Invalid test case number" << std::endl;
+        return 1;
+    }
+    
+    const size_t n = 50*L; // number of lattice points in each direction
+    const size_t nh = n/2+1; // for real to complex transform last dimension
+    const size_t mid = n/2;  // for mapping hermitian symmetry in transform (and testing and plotting)
+
+    const double lo = -0.5*L;
+    const double hi = 0.5*L;
+    const double h = (hi - lo) / (n - 1);
+    std::vector<double> x = linspace(lo,hi,n,false); // exclude right endpoint
+    std::vector<double> y = x; // Presently assuming cubic domain with equal spacing in x, y, and z
+    std::vector<double> z = x;
+
+    std::vector<double> F = tabulate(f, x, y, z);
+    {
+        madness::Gnuplot g("set style data l");
+        // Extract the middle z column?
+        std::vector<double> col(&F[mid*n*n + mid*n], &F[mid*n*n + mid*n + n]);
+        g.plot(z, col);
+    }
+
+    // sum the values in F to get the total charge
+    double norm = std::reduce(F.begin(), F.end(), 0.0)*h*h*h;
+    std::cout << "norm = " << norm << std::endl;
+    
+    // Perform a 3D Fourier transform of F and store the result in G
+    std::vector<fftw_complex> G(n*n*nh);
+    {
+        fftw_plan plan = fftw_plan_dft_r2c_3d(n, n, n, F.data() , G.data(), FFTW_ESTIMATE);
+        fftw_execute(plan);
+        fftw_destroy_plan(plan);
+    }
+
+    // Navigate the way FFTW stores the Hermitian symmetric data
+    auto fudge = [=](size_t j) {return j<nh ? j : n-j;};
+
+    double kscale = 4.0*pi*L*L/(n*n);
+
+    // Apply Coulomb GF in fourier space
+    for (size_t jx=0; jx<n; jx++) {
+        double kx = 2.0*pi*fudge(jx)/n;
+        for (size_t jy=0; jy<n; jy++) {
+            double ky = 2.0*pi*fudge(jy)/n;
+            for (size_t jz=0; jz<nh; jz++) {
+                double kz = 2.0*pi*jz/n;
+                double ksq = kx*kx + ky*ky + kz*kz;
+
+                if (ksq > 1.0e-20) {
+                    double kfac = kscale/ksq;
+                    size_t j = jx*n*nh + jy*nh + jz;
+                    //if (std::abs(G[j][0]) > 1e-6) {
+                        //std::cout << jx << " (" << fudge(jx) << ") " << jy << " (" << fudge(jy) << ") " << jz << " " << ksq << " " << G[j][0] << " " << G[j][1] << std::endl;
+                    //}
+                    G[j][0] *= kfac;
+                    G[j][1] *= kfac;
+                }
+            }
+        }
+    }
+    
+    // Do the reverse transform into FF
+    std::vector<double> FF(n*n*n);
+    {
+        fftw_plan plan = fftw_plan_dft_c2r_3d(n, n, n, G.data() , FF.data(), FFTW_ESTIMATE);
+        fftw_execute(plan);
+        fftw_destroy_plan(plan);
+        const double scale = 1.0/(n*n*n); // FFTW transforms are not normalized
+        for (double& u : FF) u *= scale;
+    }
+
+    // Check the mean potential --- expected to be zero
+    double FFmean = std::reduce(FF.begin(), FF.end(), 0.0)*h*h*h / (L*L*L);
+    std::cout << "FFmean = " << FFmean << std::endl;
+    
+    size_t j = mid; //mid+3; // test point (avoid origin) -- don't need to avoid origin now we are treating erf(0)/0 correctly
+    {
+        madness::Gnuplot g("set style data l");
+        std::vector<double> col(&FF[j*n*n + j*n], &FF[j*n*n + j*n + n]);
+        std::vector<double> col2(n);
+        std::vector<double> col3(n);
+        for (size_t i = 0; i < n; i++) {
+            double v = exact(x[j],y[j],z[i]);
+            std::cout << i << " " << z[i] << " " << col[i] << " " << v << " " << col[i]-v << std::endl;
+            col2[i] = v;
+            col3[i] = col[i] - v;
+        }
+
+        //g.plot(z, col, col2);
+        g.plot(z, col3);
+        
+    }
+
+    // std::vector<double> sums = periodic_sum_partial(x[j],y[j],z[0],exact_quadrupoleX,50);
+    // //std::vector<double> sums = periodic_sum_partial(x[j],y[j],z[0],exact_dipoleX,50);
+    // for (size_t i = 0; i < sums.size(); i++) {
+    //     std::cout << i << " " << sums[i] << " " << exact(x[j],y[j],z[0]) << std::endl;
+    // }
+    // std::cout << exact(x[j],y[j],z[0]) << " " << FF[j*n*n + j*n + 0] <<  " !! " << periodic_sum_accelerated(x[j],y[j],z[0],exact_quadrupoleX) << std::endl;
+
+    // size_t p = 7; // no. of terms in the polynomial fit --- 7 is best for quadrupole
+    // for (size_t i=p+1; i<sums.size()-p; i++) {
+    //     // Extract the last p terms of the series in v starting at position i
+    //     std::vector<double> f(&sums[i], &sums[i+p]);
+    //     std::vector<double> N(p);
+    //     for (size_t j=0; j<p; j++) {
+    //         N[j] = i+j;
+    //     }
+    //     std::vector<double> c = fit(p, p, N, f);
+    //     std::cout << i << " " << c[0] << " " << sums[i]-FF[j*n*n + j*n + 0] << " " << c[0]-FF[j*n*n + j*n + 0] << std::endl;
+    // }
+
+    return 0;
+}