From 246b00e53752bd9261b1f7f4ea4cb82273351a92 Mon Sep 17 00:00:00 2001 From: fbischoff Date: Mon, 5 Aug 2024 13:25:04 -0400 Subject: [PATCH 01/11] fixing the timing in cloud --- src/madness/world/cloud.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h index aba5f36ad38..0da3c0cf3af 100644 --- a/src/madness/world/cloud.h +++ b/src/madness/world/cloud.h @@ -207,9 +207,9 @@ class Cloud { double rtime = double(reading_time); double wtime = double(writing_time); double ptime = double(replication_time); - universe.gop.sum(rtime); - universe.gop.sum(wtime); - universe.gop.sum(ptime); + universe.gop.max(rtime); + universe.gop.max(wtime); + universe.gop.max(ptime); long creads = long(cache_reads); long cstores = long(cache_stores); universe.gop.sum(creads); @@ -217,9 +217,9 @@ class Cloud { if (universe.rank() == 0) { auto precision = std::cout.precision(); std::cout << std::fixed << std::setprecision(1); - print("cloud storing cpu time", wtime * 0.001); - print("cloud replication cpu time", ptime * 0.001); - print("cloud reading cpu time", rtime * 0.001, std::defaultfloat); + print("cloud storing wall time", wtime * 0.001); + print("cloud replication wall time", ptime * 0.001); + print("cloud reading wall time", rtime * 0.001, std::defaultfloat); std::cout << std::setprecision(precision) << std::scientific; print("cloud cache stores ", long(cstores)); print("cloud cache loads ", long(creads)); @@ -239,6 +239,8 @@ class Cloud { cache_reads=0l; } + /// @param[in] world the subworld the objects are loaded to + /// @param[in] recordlist the list of records where the objects are stored template T load(madness::World &world, const recordlistT recordlist) const { recordlistT rlist = recordlist; @@ -250,6 +252,7 @@ class Cloud { } } + /// @param[in] world presumably the universe template recordlistT store(madness::World &world, const T &source) { if (is_replicated) { @@ -270,6 +273,7 @@ class Cloud { void replicate(const std::size_t chunk_size=INT_MAX) { World& world=container.get_world(); + world.gop.fence(); cloudtimer t(world,replication_time); container.reset_pmap_to_local(); is_replicated=true; @@ -351,13 +355,13 @@ class Cloud { struct cloudtimer { World& world; - double cpu0; + double wall0; std::atomic &rtime; - cloudtimer(World& world, std::atomic &readtime) : world(world), cpu0(cpu_time()), rtime(readtime) {} + cloudtimer(World& world, std::atomic &readtime) : world(world), wall0(wall_time()), rtime(readtime) {} ~cloudtimer() { - if (world.rank()==0) rtime += long((cpu_time() - cpu0) * 1000l); + if (world.rank()==0) rtime += long((wall_time() - wall0) * 1000l); } }; From d1fceb7c377bf90983ba4e427785edef9c8253ae Mon Sep 17 00:00:00 2001 From: fbischoff Date: Mon, 5 Aug 2024 13:51:01 -0400 Subject: [PATCH 02/11] fixing the timing in cloud, part II of .. --- src/madness/world/cloud.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h index 0da3c0cf3af..ecc18b6bdcf 100644 --- a/src/madness/world/cloud.h +++ b/src/madness/world/cloud.h @@ -361,7 +361,8 @@ class Cloud { cloudtimer(World& world, std::atomic &readtime) : world(world), wall0(wall_time()), rtime(readtime) {} ~cloudtimer() { - if (world.rank()==0) rtime += long((wall_time() - wall0) * 1000l); + long deltatime=long((wall_time() - wall0) * 1000l); + rtime += deltatime; } }; From e1843c8f5242e7d1475d2029a233c551f14fdea9 Mon Sep 17 00:00:00 2001 From: "Robert J. Harrison" Date: Wed, 7 Aug 2024 11:07:45 -0500 Subject: [PATCH 03/11] remove compiler type warning --- src/madness/world/worldgop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index 490a457a115..34817a755c5 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -788,7 +788,7 @@ namespace madness { auto buf0 = std::unique_ptr(new T[nelem_per_maxmsg]); auto buf1 = std::unique_ptr(new T[nelem_per_maxmsg]); - auto reduce_impl = [&,this](T* buf, int nelem) { + auto reduce_impl = [&,this](T* buf, size_t nelem) { MADNESS_ASSERT(nelem <= nelem_per_maxmsg); SafeMPI::Request req0, req1; Tag gsum_tag = world_.mpi.unique_tag(); From 5dc6de51505861bd8262502b4504f51c1eb4557f Mon Sep 17 00:00:00 2001 From: "Robert J. Harrison" Date: Wed, 7 Aug 2024 11:08:15 -0500 Subject: [PATCH 04/11] testing optimized parallel container serialization --- src/madness/world/test_dc.cc | 156 +++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 7 deletions(-) diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc index 5197b006dac..930079dd8a5 100644 --- a/src/madness/world/test_dc.cc +++ b/src/madness/world/test_dc.cc @@ -29,10 +29,18 @@ fax: 865-572-0680 */ +//#define MAD_ARCHIVE_DEBUG_ENABLE + +#include + #include #include +#include #include +#include +#include + using namespace madness; using namespace std; @@ -240,16 +248,150 @@ void test_local(World& world) { } +namespace madness { + namespace archive { + /// Write container to parallel archive + template + struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { + static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { + using localarchiveT = VectorOutputArchive; + const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!) + typedef WorldContainer dcT; + using const_iterator = typename dcT::const_iterator; + + const size_t default_size = 100*1024*1024; + + World* world = ar.get_world(); + world->gop.fence(); + + std::vector v; + v.reserve(default_size); + size_t count = 0; + + class op : public TaskInterface { + const size_t ntasks; + const size_t taskid; + const dcT& t; + std::vector& vtotal; + size_t& total_count; + Mutex& mutex; + + public: + op(size_t ntasks, size_t taskid, const dcT& t, std::vector& vtotal, size_t& total_count, Mutex& mutex) + : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {} + void run(World& world) { + std::vector v; + v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks)); + VectorOutputArchive var(v); + const_iterator it=t.begin(); + size_t count = 0; + size_t n = 0; + while (it!=t.end()) { + if ((n%ntasks) == taskid) { + var & *it; + ++count; + } + ++it; + n++; + } + + if (count) { + mutex.lock(); + vtotal.insert(vtotal.end(), v.begin(), v.end()); + total_count += count; + mutex.unlock(); + } + } + }; + + Mutex mutex; + size_t ntasks = std::max(size_t(1), ThreadPool::size()); + for (size_t taskid=0; taskidtaskq.add(new op(ntasks, taskid, t, v, count, mutex)); + world->gop.fence(); + + // Gather all buffers to process 0 + // first gather all of the sizes and counts to a vector in process 0 + int size = v.size(); + std::vector sizes(world->size()); + MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm()); + world->gop.sum(count); // just need total number of elements + + // build the cumulative sum of sizes + std::vector offsets(world->size()); + offsets[0] = 0; + for (int i=1; isize(); ++i) offsets[i] = offsets[i-1] + sizes[i-1]; + int total_size = offsets.back() + sizes.back(); + + // gather the vector of data v from each process to process 0 + unsigned char* all_data=0; + if (world->rank() == 0) { + all_data = new unsigned char[total_size]; + } + MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); + + if (world->rank() == 0) { + auto& localar = ar.local_archive(); + localar & magic & 1; // 1 client + // localar & t; + ArchivePrePostImpl::preamble_store(localar); + localar & -magic & count; + localar.store(all_data, total_size); + ArchivePrePostImpl::postamble_store(localar); + + delete[] all_data; + } + world->gop.fence(); + } + }; + } +} + +void test_florian(World& world) { + WorldContainer c(world); + + Key key1(1); + Node node1(1); + + if (world.rank() == 0) { + for (int i=0; i<100; ++i) { + c.replace(Key(i),Node(i)); + } + } + world.gop.fence(); + + std::vector v; + { + archive::VectorOutputArchive var(v); + archive::ParallelOutputArchive ar(world,var); + ar & c; + } + + WorldContainer c2(world); + { + archive::VectorInputArchive var2(v); + archive::ParallelInputArchive ar2(world,var2); + ar2 & c2; + } + + for (int i=0; i<100; ++i) { + MADNESS_CHECK(c2.find(Key(i)).get()->second.get() == i); + } + + world.gop.fence(); + print("test_florian passed"); +} + int main(int argc, char** argv) { - initialize(argc, argv); - World world(SafeMPI::COMM_WORLD); try { - test0(world); - test1(world); - test1(world); - test1(world); - test_local(world); + World& world = initialize(argc, argv); + // test0(world); + // test1(world); + // test1(world); + // test1(world); + // test_local(world); + test_florian(world); } catch (const SafeMPI::Exception& e) { error("caught an MPI exception"); From 85f305201bdf6c6f0b9c258e506b934c78d43d55 Mon Sep 17 00:00:00 2001 From: fbischoff Date: Wed, 7 Aug 2024 12:21:53 -0400 Subject: [PATCH 05/11] moving serialization to VectorOutputArchive into worlddc.h --- src/madness/world/test_dc.cc | 99 ------------------------------------ src/madness/world/worlddc.h | 95 ++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 99 deletions(-) diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc index 930079dd8a5..d95154c08ad 100644 --- a/src/madness/world/test_dc.cc +++ b/src/madness/world/test_dc.cc @@ -248,105 +248,6 @@ void test_local(World& world) { } -namespace madness { - namespace archive { - /// Write container to parallel archive - template - struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { - static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { - using localarchiveT = VectorOutputArchive; - const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!) - typedef WorldContainer dcT; - using const_iterator = typename dcT::const_iterator; - - const size_t default_size = 100*1024*1024; - - World* world = ar.get_world(); - world->gop.fence(); - - std::vector v; - v.reserve(default_size); - size_t count = 0; - - class op : public TaskInterface { - const size_t ntasks; - const size_t taskid; - const dcT& t; - std::vector& vtotal; - size_t& total_count; - Mutex& mutex; - - public: - op(size_t ntasks, size_t taskid, const dcT& t, std::vector& vtotal, size_t& total_count, Mutex& mutex) - : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {} - void run(World& world) { - std::vector v; - v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks)); - VectorOutputArchive var(v); - const_iterator it=t.begin(); - size_t count = 0; - size_t n = 0; - while (it!=t.end()) { - if ((n%ntasks) == taskid) { - var & *it; - ++count; - } - ++it; - n++; - } - - if (count) { - mutex.lock(); - vtotal.insert(vtotal.end(), v.begin(), v.end()); - total_count += count; - mutex.unlock(); - } - } - }; - - Mutex mutex; - size_t ntasks = std::max(size_t(1), ThreadPool::size()); - for (size_t taskid=0; taskidtaskq.add(new op(ntasks, taskid, t, v, count, mutex)); - world->gop.fence(); - - // Gather all buffers to process 0 - // first gather all of the sizes and counts to a vector in process 0 - int size = v.size(); - std::vector sizes(world->size()); - MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm()); - world->gop.sum(count); // just need total number of elements - - // build the cumulative sum of sizes - std::vector offsets(world->size()); - offsets[0] = 0; - for (int i=1; isize(); ++i) offsets[i] = offsets[i-1] + sizes[i-1]; - int total_size = offsets.back() + sizes.back(); - - // gather the vector of data v from each process to process 0 - unsigned char* all_data=0; - if (world->rank() == 0) { - all_data = new unsigned char[total_size]; - } - MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); - - if (world->rank() == 0) { - auto& localar = ar.local_archive(); - localar & magic & 1; // 1 client - // localar & t; - ArchivePrePostImpl::preamble_store(localar); - localar & -magic & count; - localar.store(all_data, total_size); - ArchivePrePostImpl::postamble_store(localar); - - delete[] all_data; - } - world->gop.fence(); - } - }; - } -} - void test_florian(World& world) { WorldContainer c(world); diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h index d375c7b9de3..2c5aa4523fc 100644 --- a/src/madness/world/worlddc.h +++ b/src/madness/world/worlddc.h @@ -1680,6 +1680,101 @@ namespace madness { } }; + /// Write container to parallel archive + template + struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { + static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { + using localarchiveT = VectorOutputArchive; + const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!) + typedef WorldContainer dcT; + using const_iterator = typename dcT::const_iterator; + + const size_t default_size = 100*1024*1024; + + World* world = ar.get_world(); + world->gop.fence(); + + std::vector v; + v.reserve(default_size); + size_t count = 0; + + class op : public TaskInterface { + const size_t ntasks; + const size_t taskid; + const dcT& t; + std::vector& vtotal; + size_t& total_count; + Mutex& mutex; + + public: + op(size_t ntasks, size_t taskid, const dcT& t, std::vector& vtotal, size_t& total_count, Mutex& mutex) + : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {} + void run(World& world) { + std::vector v; + v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks)); + VectorOutputArchive var(v); + const_iterator it=t.begin(); + size_t count = 0; + size_t n = 0; + while (it!=t.end()) { + if ((n%ntasks) == taskid) { + var & *it; + ++count; + } + ++it; + n++; + } + + if (count) { + mutex.lock(); + vtotal.insert(vtotal.end(), v.begin(), v.end()); + total_count += count; + mutex.unlock(); + } + } + }; + + Mutex mutex; + size_t ntasks = std::max(size_t(1), ThreadPool::size()); + for (size_t taskid=0; taskidtaskq.add(new op(ntasks, taskid, t, v, count, mutex)); + world->gop.fence(); + + // Gather all buffers to process 0 + // first gather all of the sizes and counts to a vector in process 0 + int size = v.size(); + std::vector sizes(world->size()); + MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm()); + world->gop.sum(count); // just need total number of elements + + // build the cumulative sum of sizes + std::vector offsets(world->size()); + offsets[0] = 0; + for (int i=1; isize(); ++i) offsets[i] = offsets[i-1] + sizes[i-1]; + int total_size = offsets.back() + sizes.back(); + + // gather the vector of data v from each process to process 0 + unsigned char* all_data=0; + if (world->rank() == 0) { + all_data = new unsigned char[total_size]; + } + MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); + + if (world->rank() == 0) { + auto& localar = ar.local_archive(); + localar & magic & 1; // 1 client + // localar & t; + ArchivePrePostImpl::preamble_store(localar); + localar & -magic & count; + localar.store(all_data, total_size); + ArchivePrePostImpl::postamble_store(localar); + + delete[] all_data; + } + world->gop.fence(); + } + }; + template struct ArchiveLoadImpl< ParallelInputArchive, WorldContainer > { /// Read container from parallel archive From b9b87d09a6109846d644a564753f78b6bb9b0f70 Mon Sep 17 00:00:00 2001 From: fbischoff Date: Wed, 7 Aug 2024 14:23:29 -0400 Subject: [PATCH 06/11] moving serialization to VectorOutputArchive into worlddc.h --- src/madness/world/parallel_dc_archive.h | 17 ++- src/madness/world/worlddc.h | 136 ++++++++++++------------ 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/src/madness/world/parallel_dc_archive.h b/src/madness/world/parallel_dc_archive.h index 7ceb9210267..d178d26cba3 100644 --- a/src/madness/world/parallel_dc_archive.h +++ b/src/madness/world/parallel_dc_archive.h @@ -36,7 +36,10 @@ namespace madness { { close(); } - + + VectorOutputArchive& get_archive() { + return ar; + } template inline typename std::enable_if< madness::is_trivially_serializable::value, void >::type @@ -97,7 +100,17 @@ namespace madness { void close() {} }; - + + + template + struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { + static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { + ParallelOutputArchive par(*(ar.get_world()), ar.local_archive().get_archive()); + par & t; + + } + }; + } diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h index 2c5aa4523fc..91e57e52c9f 100644 --- a/src/madness/world/worlddc.h +++ b/src/madness/world/worlddc.h @@ -1613,74 +1613,8 @@ namespace madness { } namespace archive { - /// Write container to parallel archive with optional fence - /// \ingroup worlddc - /// Each node (process) is served by a designated IO node. - /// The IO node has a binary local file archive to which is - /// first written a cookie and the number of servers. The IO - /// node then loops thru all of its clients and in turn tells - /// each to write its data over an MPI stream, which is copied - /// directly to the output file. The stream contents are then - /// cookie, no. of clients, foreach client (usual sequential archive). - /// - /// If ar.dofence() is true (default) fence is invoked before and - /// after the IO. The fence is optional but it is of course - /// necessary to be sure that all updates have completed - /// before doing IO, and that all IO has completed before - /// subsequent modifications. Also, there is always at least - /// some synchronization between a client and its IO server. - template - struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { - static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { - const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!) - typedef WorldContainer dcT; - // typedef typename dcT::const_iterator iterator; // unused? - typedef typename dcT::pairT pairT; - World* world = ar.get_world(); - Tag tag = world->mpi.unique_tag(); - ProcessID me = world->rank(); - if (ar.dofence()) world->gop.fence(); - if (ar.is_io_node()) { - auto& localar = ar.local_archive(); - localar & magic & ar.num_io_clients(); - for (ProcessID p=0; psize(); ++p) { - if (p == me) { - localar & t; - } - else if (ar.io_node(p) == me) { - world->mpi.Send(int(1),p,tag); // Tell client to start sending - archive::MPIInputArchive source(*world, p); - long cookie = 0l; - unsigned long count = 0ul; - - ArchivePrePostImpl::preamble_store(localar); - - source & cookie & count; - localar & cookie & count; - while (count--) { - pairT datum; - source & datum; - localar & datum; - } - - ArchivePrePostImpl::postamble_store(localar); - } - } - } - else { - ProcessID p = ar.my_io_node(); - int flag; - world->mpi.Recv(flag,p,tag); - MPIOutputArchive dest(*world, p); - dest & t; - dest.flush(); - } - if (ar.dofence()) world->gop.fence(); - } - }; - - /// Write container to parallel archive + /// Write container to parallel archive template struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { @@ -1775,6 +1709,74 @@ namespace madness { } }; + + /// Write container to parallel archive with optional fence + + /// \ingroup worlddc + /// Each node (process) is served by a designated IO node. + /// The IO node has a binary local file archive to which is + /// first written a cookie and the number of servers. The IO + /// node then loops thru all of its clients and in turn tells + /// each to write its data over an MPI stream, which is copied + /// directly to the output file. The stream contents are then + /// cookie, no. of clients, foreach client (usual sequential archive). + /// + /// If ar.dofence() is true (default) fence is invoked before and + /// after the IO. The fence is optional but it is of course + /// necessary to be sure that all updates have completed + /// before doing IO, and that all IO has completed before + /// subsequent modifications. Also, there is always at least + /// some synchronization between a client and its IO server. + template + struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { + static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { + const long magic = -5881828; // Sitar Indian restaurant in Knoxville (negative to indicate parallel!) + typedef WorldContainer dcT; + // typedef typename dcT::const_iterator iterator; // unused? + typedef typename dcT::pairT pairT; + World* world = ar.get_world(); + Tag tag = world->mpi.unique_tag(); + ProcessID me = world->rank(); + if (ar.dofence()) world->gop.fence(); + if (ar.is_io_node()) { + auto& localar = ar.local_archive(); + localar & magic & ar.num_io_clients(); + for (ProcessID p=0; psize(); ++p) { + if (p == me) { + localar & t; + } + else if (ar.io_node(p) == me) { + world->mpi.Send(int(1),p,tag); // Tell client to start sending + archive::MPIInputArchive source(*world, p); + long cookie = 0l; + unsigned long count = 0ul; + + ArchivePrePostImpl::preamble_store(localar); + + source & cookie & count; + localar & cookie & count; + while (count--) { + pairT datum; + source & datum; + localar & datum; + } + + ArchivePrePostImpl::postamble_store(localar); + } + } + } + else { + ProcessID p = ar.my_io_node(); + int flag; + world->mpi.Recv(flag,p,tag); + MPIOutputArchive dest(*world, p); + dest & t; + dest.flush(); + } + if (ar.dofence()) world->gop.fence(); + } + }; + template struct ArchiveLoadImpl< ParallelInputArchive, WorldContainer > { /// Read container from parallel archive From 41020aab73b17c3753af50a6c060986b109550d6 Mon Sep 17 00:00:00 2001 From: fbischoff Date: Wed, 7 Aug 2024 14:48:43 -0400 Subject: [PATCH 07/11] looking for the lost time in WorldContainer serialization --- src/madness/world/cloud.h | 6 ++++++ src/madness/world/worlddc.h | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/madness/world/cloud.h b/src/madness/world/cloud.h index ecc18b6bdcf..ae05897c9f8 100644 --- a/src/madness/world/cloud.h +++ b/src/madness/world/cloud.h @@ -206,9 +206,11 @@ class Cloud { void print_timings(World &universe) const { double rtime = double(reading_time); double wtime = double(writing_time); + double wtime1 = double(writing_time1); double ptime = double(replication_time); universe.gop.max(rtime); universe.gop.max(wtime); + universe.gop.max(wtime1); universe.gop.max(ptime); long creads = long(cache_reads); long cstores = long(cache_stores); @@ -218,6 +220,7 @@ class Cloud { auto precision = std::cout.precision(); std::cout << std::fixed << std::setprecision(1); print("cloud storing wall time", wtime * 0.001); + print("cloud storing wall time inner loop", wtime1 * 0.001); print("cloud replication wall time", ptime * 0.001); print("cloud reading wall time", rtime * 0.001, std::defaultfloat); std::cout << std::setprecision(precision) << std::scientific; @@ -234,6 +237,7 @@ class Cloud { void clear_timings() { reading_time=0l; writing_time=0l; + writing_time1=0l; replication_time=0l; cache_stores=0l; cache_reads=0l; @@ -332,6 +336,7 @@ class Cloud { mutable std::atomic reading_time=0l; // in ms mutable std::atomic writing_time=0l; // in ms + mutable std::atomic writing_time1=0l; // in ms mutable std::atomic replication_time=0l; // in ms mutable std::atomic cache_reads=0l; mutable std::atomic cache_stores=0l; @@ -423,6 +428,7 @@ class Cloud { if (is_already_present) { if (world.rank()==0) cache_stores++; } else { + cloudtimer t(world,writing_time1); madness::archive::ContainerRecordOutputArchive ar(world, container, record); madness::archive::ParallelOutputArchive par(world, ar); par & source; diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h index 91e57e52c9f..125fae64efa 100644 --- a/src/madness/world/worlddc.h +++ b/src/madness/world/worlddc.h @@ -1615,6 +1615,10 @@ namespace madness { namespace archive { /// Write container to parallel archive + + /// specialization for parallel serialization of a WorldContainer: + /// all threads on each process serialize some values into a buffer, which gets concatenated + /// and finally serialized to localarchive (aka VectorOutputArchive). template struct ArchiveStoreImpl< ParallelOutputArchive, WorldContainer > { static void store(const ParallelOutputArchive& ar, const WorldContainer& t) { @@ -1650,6 +1654,7 @@ namespace madness { const_iterator it=t.begin(); size_t count = 0; size_t n = 0; + /// threads serialize round-robin over the container while (it!=t.end()) { if ((n%ntasks) == taskid) { var & *it; @@ -1659,6 +1664,7 @@ namespace madness { n++; } + // concatenate the buffers from each thread if (count) { mutex.lock(); vtotal.insert(vtotal.end(), v.begin(), v.end()); From fe29acfaed63ae388692a0fd32ef895b07e95261 Mon Sep 17 00:00:00 2001 From: fbischoff Date: Wed, 7 Aug 2024 16:22:03 -0400 Subject: [PATCH 08/11] looking for the lost time in WorldContainer serialization --- src/madness/world/test_dc.cc | 43 +++++++++++++++++++++++------- src/madness/world/vector_archive.h | 1 + src/madness/world/worlddc.h | 17 +++++++++--- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/madness/world/test_dc.cc b/src/madness/world/test_dc.cc index d95154c08ad..bf03c72ee6f 100644 --- a/src/madness/world/test_dc.cc +++ b/src/madness/world/test_dc.cc @@ -89,6 +89,26 @@ struct Node { ~Node() {} }; +struct LargeNode { + std::vector k; + + LargeNode() : k() {} + + LargeNode(int val) { + k=std::vector(10000,val); + } + + int get() const { + return k[0]; + } + + template + void serialize(const Archive& ar) { + ar & k; + } + + ~LargeNode() {} +}; ostream& operator<<(ostream&s, const Node& node) { s << "Node(" << node.k << ")"; return s; @@ -249,34 +269,37 @@ void test_local(World& world) { } void test_florian(World& world) { - WorldContainer c(world); - - Key key1(1); - Node node1(1); + WorldContainer c(world); + long nlarge=200000; if (world.rank() == 0) { - for (int i=0; i<100; ++i) { - c.replace(Key(i),Node(i)); + for (int i=0; i v; { archive::VectorOutputArchive var(v); archive::ParallelOutputArchive ar(world,var); ar & c; } + double wall1=wall_time(); + printf("ending at time %8.4f after %8.4fs\n",wall1,wall1-wall0); - WorldContainer c2(world); + WorldContainer c2(world); { archive::VectorInputArchive var2(v); archive::ParallelInputArchive ar2(world,var2); ar2 & c2; } - for (int i=0; i<100; ++i) { - MADNESS_CHECK(c2.find(Key(i)).get()->second.get() == i); + if (world.rank()==0) { + for (int i=0; isecond.get() == i); + } } world.gop.fence(); diff --git a/src/madness/world/vector_archive.h b/src/madness/world/vector_archive.h index 8dfa56179d5..129c35e44e9 100644 --- a/src/madness/world/vector_archive.h +++ b/src/madness/world/vector_archive.h @@ -53,6 +53,7 @@ namespace madness { /// Wraps an archive around an STL \c vector for output. class VectorOutputArchive : public BaseOutputArchive { + public: mutable std::vector* v; ///< The STL vector being wrapped. public: diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h index 125fae64efa..f10ff355fcf 100644 --- a/src/madness/world/worlddc.h +++ b/src/madness/world/worlddc.h @@ -1627,7 +1627,8 @@ namespace madness { typedef WorldContainer dcT; using const_iterator = typename dcT::const_iterator; - const size_t default_size = 100*1024*1024; + // const size_t default_size = 100*1024*1024; + const size_t default_size = 8ul<<30; World* world = ar.get_world(); world->gop.fence(); @@ -1649,8 +1650,8 @@ namespace madness { : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {} void run(World& world) { std::vector v; - v.reserve(std::max(size_t(1024*1024),vtotal.capacity()/ntasks)); - VectorOutputArchive var(v); + std::size_t hint_size=std::max(size_t(1024*1024),vtotal.capacity()/ntasks); + VectorOutputArchive var(v,hint_size+taskid); const_iterator it=t.begin(); size_t count = 0; size_t n = 0; @@ -1663,7 +1664,7 @@ namespace madness { ++it; n++; } - + print("count, n, ntasks, taskid, size",count,n,ntasks,taskid,var.v->capacity()); // concatenate the buffers from each thread if (count) { mutex.lock(); @@ -1674,12 +1675,16 @@ namespace madness { } }; + world->gop.fence(); + double wall0=wall_time(); Mutex mutex; size_t ntasks = std::max(size_t(1), ThreadPool::size()); for (size_t taskid=0; taskidtaskq.add(new op(ntasks, taskid, t, v, count, mutex)); world->gop.fence(); + double wall1=wall_time(); + if (world->rank()==0) printf("time in the taskq: %8.4fs\n",wall1-wall0); // Gather all buffers to process 0 // first gather all of the sizes and counts to a vector in process 0 int size = v.size(); @@ -1687,12 +1692,14 @@ namespace madness { MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm()); world->gop.sum(count); // just need total number of elements + print("time 3",wall_time()); // build the cumulative sum of sizes std::vector offsets(world->size()); offsets[0] = 0; for (int i=1; isize(); ++i) offsets[i] = offsets[i-1] + sizes[i-1]; int total_size = offsets.back() + sizes.back(); + print("time 4",wall_time()); // gather the vector of data v from each process to process 0 unsigned char* all_data=0; if (world->rank() == 0) { @@ -1700,6 +1707,7 @@ namespace madness { } MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); + print("time 5",wall_time()); if (world->rank() == 0) { auto& localar = ar.local_archive(); localar & magic & 1; // 1 client @@ -1712,6 +1720,7 @@ namespace madness { delete[] all_data; } world->gop.fence(); + print("time 6",wall_time()); } }; From c80f0e1dd044126e0c994aadd8b24849b5103b74 Mon Sep 17 00:00:00 2001 From: "Robert J. Harrison" Date: Wed, 7 Aug 2024 16:13:27 -0500 Subject: [PATCH 09/11] bug in serialize --- src/madness/world/worlddc.h | 62 +++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/src/madness/world/worlddc.h b/src/madness/world/worlddc.h index f10ff355fcf..671e2964071 100644 --- a/src/madness/world/worlddc.h +++ b/src/madness/world/worlddc.h @@ -1633,45 +1633,39 @@ namespace madness { World* world = ar.get_world(); world->gop.fence(); - std::vector v; - v.reserve(default_size); - size_t count = 0; - - class op : public TaskInterface { + class op_serialize : public TaskInterface { const size_t ntasks; const size_t taskid; const dcT& t; - std::vector& vtotal; - size_t& total_count; - Mutex& mutex; + std::vector& v; public: - op(size_t ntasks, size_t taskid, const dcT& t, std::vector& vtotal, size_t& total_count, Mutex& mutex) - : ntasks(ntasks), taskid(taskid), t(t), vtotal(vtotal), total_count(total_count), mutex(mutex) {} + op_serialize(size_t ntasks, size_t taskid, const dcT& t, std::vector& v) + : ntasks(ntasks), taskid(taskid), t(t), v(v) {} void run(World& world) { - std::vector v; - std::size_t hint_size=std::max(size_t(1024*1024),vtotal.capacity()/ntasks); - VectorOutputArchive var(v,hint_size+taskid); + std::size_t hint_size=(1ul<<30)/ntasks; + VectorOutputArchive var(v,hint_size); const_iterator it=t.begin(); - size_t count = 0; size_t n = 0; /// threads serialize round-robin over the container while (it!=t.end()) { if ((n%ntasks) == taskid) { var & *it; - ++count; } ++it; n++; } - print("count, n, ntasks, taskid, size",count,n,ntasks,taskid,var.v->capacity()); - // concatenate the buffers from each thread - if (count) { - mutex.lock(); - vtotal.insert(vtotal.end(), v.begin(), v.end()); - total_count += count; - mutex.unlock(); - } + } + }; + + class op_concat : public TaskInterface { + unsigned char* all_data; + const std::vector& v; + public: + op_concat(unsigned char* all_data, const std::vector& v) + : all_data(all_data), v(v) {} + void run(World& world) { + memcpy(all_data, v.data(), v.size()); } }; @@ -1679,15 +1673,29 @@ namespace madness { double wall0=wall_time(); Mutex mutex; size_t ntasks = std::max(size_t(1), ThreadPool::size()); + + std::vector> v(ntasks); for (size_t taskid=0; taskidtaskq.add(new op(ntasks, taskid, t, v, count, mutex)); + world->taskq.add(new op_serialize(ntasks, taskid, t, v[taskid])); world->gop.fence(); + // total size of all vectors + size_t total_size = 0; + for (size_t taskid=0; taskid vtotal(total_size); + + size_t offset = 0; + for (size_t taskid=0; taskidtaskq.add(new op_concat(&vtotal[offset], v[taskid])); + offset += v[taskid].size(); + } + v.clear(); double wall1=wall_time(); if (world->rank()==0) printf("time in the taskq: %8.4fs\n",wall1-wall0); // Gather all buffers to process 0 // first gather all of the sizes and counts to a vector in process 0 - int size = v.size(); + int size = vtotal.size(); + int count = t.size(); std::vector sizes(world->size()); MPI_Gather(&size, 1, MPI_INT, sizes.data(), 1, MPI_INT, 0, world->mpi.comm().Get_mpi_comm()); world->gop.sum(count); // just need total number of elements @@ -1697,7 +1705,7 @@ namespace madness { std::vector offsets(world->size()); offsets[0] = 0; for (int i=1; isize(); ++i) offsets[i] = offsets[i-1] + sizes[i-1]; - int total_size = offsets.back() + sizes.back(); + MADNESS_CHECK(offsets.back() + sizes.back() == total_size); print("time 4",wall_time()); // gather the vector of data v from each process to process 0 @@ -1705,7 +1713,7 @@ namespace madness { if (world->rank() == 0) { all_data = new unsigned char[total_size]; } - MPI_Gatherv(v.data(), v.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); + MPI_Gatherv(vtotal.data(), vtotal.size(), MPI_BYTE, all_data, sizes.data(), offsets.data(), MPI_BYTE, 0, world->mpi.comm().Get_mpi_comm()); print("time 5",wall_time()); if (world->rank() == 0) { From 4f0df9b68b43e4a9683054a0864aa9efe44163c7 Mon Sep 17 00:00:00 2001 From: "Robert J. Harrison" Date: Thu, 15 Aug 2024 10:24:01 -0400 Subject: [PATCH 10/11] merged gnuplot from bspline branch --- src/madness/misc/CMakeLists.txt | 8 +- src/madness/misc/gnuplot.h | 198 +++++++++++++++++++++++++++++++ src/madness/misc/test_gnuplot.cc | 7 ++ 3 files changed, 209 insertions(+), 4 deletions(-) create mode 100644 src/madness/misc/gnuplot.h create mode 100644 src/madness/misc/test_gnuplot.cc diff --git a/src/madness/misc/CMakeLists.txt b/src/madness/misc/CMakeLists.txt index 790a0d1e09a..f825dcfa6a3 100644 --- a/src/madness/misc/CMakeLists.txt +++ b/src/madness/misc/CMakeLists.txt @@ -1,8 +1,8 @@ # src/madness/misc -set(MADMISC_HEADERS misc.h ran.h phandler.h interpolation_1d.h cfft.h info.h) +set(MADMISC_HEADERS misc.h ran.h phandler.h interpolation_1d.h cfft.h info.h gnuplot.h) set(MADMISC_SOURCES - checksum_file.cc position_stream.cc gprofexit.cc ran.cc cfft.cc info.cc unique_filename.cc) + checksum_file.cc position_stream.cc gprofexit.cc ran.cc cfft.cc info.cc) # retrieve git metadata include(GetGitMetadata) vgkit_cmake_git_metadata() @@ -20,8 +20,8 @@ add_mad_library(misc MADMISC_SOURCES MADMISC_HEADERS "world" "madness/misc/") if(BUILD_TESTING) # The list of unit test source files - set(MISC_TEST_SOURCES interp3.cc) + set(MISC_TEST_SOURCES interp3.cc test_gnuplot.cc) add_unittests(misc "${MISC_TEST_SOURCES}" "MADmisc;MADgtest" "unittests;short") -endif() \ No newline at end of file +endif() diff --git a/src/madness/misc/gnuplot.h b/src/madness/misc/gnuplot.h new file mode 100644 index 00000000000..465d298cdb7 --- /dev/null +++ b/src/madness/misc/gnuplot.h @@ -0,0 +1,198 @@ +#ifndef MADNESS_GNUPLOT_H__INCUDED +#define MADNESS_GNUPLOT_H__INCUDED + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace madness { + class Gnuplot { + FILE *f; // pipe connection to gnuplot process + pid_t pid; // pid of gnuplot process + FILE *ftee; // filestream for data tee'd from gnuplot process + + // base case for unpacking datablock value + template + void dbvalue(size_t i, const T& t) { + char buf[256]; + snprintf(buf,sizeof(buf),"%16.8e",double(t[i])); + (*this)(buf); // newline + } + + // recursion case for unpacking datablock value + template + void dbvalue(size_t i, const T& t, Ts... values) { + char buf[256]; + snprintf(buf,sizeof(buf),"%16.8e ",double(t[i])); // space + (*this)(buf,false); + dbvalue(i,values...); + } + + // base case for unpacking plot value + template + void doplot(const char* name, const T& value0) { + char buf[256]; + snprintf(buf, sizeof(buf), "%s using 1:%d", name, n); + (*this)(buf); + } + + // recursion case for unpacking plot value + template + void doplot(const char* name, const T& value0, Ts... values) { + char buf[256]; + snprintf(buf, sizeof(buf), "%s using 1:%d, ", name, n); + (*this)(buf,false); + + doplot(name, values...); + } + + public: + Gnuplot& operator=(Gnuplot&&) = delete; + Gnuplot& operator=(const Gnuplot&) = delete; + Gnuplot(const Gnuplot&) = delete; + Gnuplot(const std::string& cmd = "", const std::string& teefile = "") : f(0), ftee(0) { + int p[2]; + if (pipe (p)) { + throw "Pipe failed."; + } + pid = fork (); + if (pid == 0) { // Child process. + close(p[1]); + dup2(p[0],STDIN_FILENO); + close(p[0]); + if (execlp ("gnuplot", "gnuplot", "-persist", NULL) == -1) { + //if (execlp ("cat", "cat", "-", NULL) == -1) { + fprintf(stderr,"Gnuplot: execlp failed for gnuplot ... plotting disabled\n"); + exit(1); + } + } + else if (pid < (pid_t) 0) { // Failure + throw "Fork failed."; + } + else { // Parent process + close (p[0]); + f = fdopen (p[1], "w"); + } + if (teefile.size() > 0) { + ftee = fopen(teefile.c_str(),"w"); + if (!ftee) { + fprintf(stderr,"Gnuplot: fopen failed for tee file %s ... tee of plotting disabled\n",teefile.c_str()); + } + } + + if (cmd.size() > 0) (*this)(cmd); + } + + // outputs string to gnuplot process + void operator()(const char* cmd, bool EOL=true) { + + if (f) { + if (!fprintf(f,"%s",cmd)) { + fprintf(stderr,"Gnuplot: failed writing to gnuplot pipe ... plotting disabled\n"); + fclose(f); + f = NULL; + } + } + if (ftee) fprintf(ftee,"%s",cmd); + + const int n = strlen(cmd); + if (EOL && ((n==0) || (cmd[n-1] != '\n') ) ) { + if (f) { + if (!fprintf(f,"\n")) { + fprintf(stderr,"Gnuplot: failed writing newline to gnuplot pipe ... plotting disabled\n"); + fclose(f); + f = NULL; + } + } + if (ftee) fprintf(ftee,"\n"); + } + if (f) fflush(f); + } + + // outputs string to gnuplot process + void operator()(const std::string& cmd, bool EOL=true) { + (*this)(cmd.c_str(), EOL); + } + + // Define a gnuplot data block with given name assuming 1-d indexing via [] with explicit size + template + void db(const std::string& name, size_t size, const T& x, Ts... values) { + (*this)("$",false); + (*this)(name,false); + (*this)(" << EOD"); + for (size_t i = 0; i + void db(const std::string& name, const T& x, Ts... values) { + db(name,(size_t) x.size(),x,values...); // have to force x.size() to be size_t since Tensor::size() returns long + } + + // Plots data in 2 or more vectors by generating the following gnuplot commands: + // $data << EOD + // + // EOD + // plot $data using 1:2, $data using 1:3, ... + template + void plot(const T& x, Ts... values) { + db("data", x, values...); + (*this)("plot ",false); + doplot<2,Ts...>("$data", values...); // note we peeled off the x values + } + + ~Gnuplot() { + if (f) { + fclose(f); + waitpid(pid,0,0); + } + if (ftee) fclose(ftee); + } + + static void test() { + std::vector x = {1.0,2.0,3.0}; + std::vector y = {-1.0,-2.0,3.0}; + std::vector z = {10.0,11.0,12.0}; + { + Gnuplot g("set style data lp; set grid"); + g.plot(x,y,z); + } + { + Gnuplot g; + //g("set term png"); + //g("set output \"test.png\""); + g.db("xyz",x,y,z); + g("plot $xyz using 1:2 with linespoints"); + } + { + // use x11 to work around bug (https://sourceforge.net/p/gnuplot/bugs/2634/) ... wxt temporarily needs GDK_BACKEND=x11 + Gnuplot g("set term x11; set xrange [-10:10]; set yrange [0:1]; set grid; set style data l", "test3.gnuplot"); + size_t npts = 100; + std::vector x(npts), y(npts); + for (size_t i = 0; i + +int main (void) +{ + madness::Gnuplot::test(); + return 0; +} From 21f1e5aeb6cefc513361f50e6a41700f4165442e Mon Sep 17 00:00:00 2001 From: "Robert J. Harrison" Date: Thu, 15 Aug 2024 10:44:50 -0400 Subject: [PATCH 11/11] beginning of test directory for periodic stuff --- src/examples/periodic/test.cc | 403 ++++++++++++++++++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 src/examples/periodic/test.cc diff --git a/src/examples/periodic/test.cc b/src/examples/periodic/test.cc new file mode 100644 index 00000000000..88854f1baf2 --- /dev/null +++ b/src/examples/periodic/test.cc @@ -0,0 +1,403 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include + +/***************** + +Use FFT to comupute the electrostatic potential due to some test +charge densities. Note that FFT produces a result with + + a) mean value that is zero because it projects out the constant (k=0) mode. + + b) an opposing electric field to cancel that arising from the periodic + sum of the dipole moment potential + +Test cases --- select by number +1) Gaussian spheropole +2) Gaussian dipole +3) Gaussian quadrupole +4) Cosine function + +Compile with "g++ -O3 -Wall test.cc -I.. -lfftw3 -llapacke" + + *****************/ + +const int test_case = 1; // 1, 2, 3, 4 + +const double pi = 3.14159265358979323846; +const double L = 1.0; // must be integer for cosine test to work, and > 1 for gaussian tests to work +const double xshift = 0.0; // shift from origin of charge distributions to test the periodicity in [-0.5*L,0.5*L] +const double yshift = 0.0; +const double zshift = 0.0; + +// Will be assigned by main program based on test_case selection +double (*f)(double, double, double) = nullptr; +double (*exact)(double, double, double) = nullptr; + +// Evaluate erf(a*r)/r with correct limit at origin +double erfaroverr(double a, double r) { + if (a*r*r/3.0 < 1e-8) { + return 2*a/std::sqrt(pi); + } + else { + return std::erf(a*r)/r; + } +} + +// Solve M[m,n] c[n] = f[m] in least squares sense +// M is the fitting matrix corresponding to m observations and n parameters +// f is the vector of m observations +// return the vector c of n fitted parameters +std::vector solve(size_t m, size_t n, const std::vector& M, const std::vector& f) { + std::vector c = f; + std::vector M1 = M; + lapack_int mm = m, nn = n, nrhs = 1, lda = n, ldb = 1; + LAPACKE_dgels(LAPACK_ROW_MAJOR,'N',mm,nn,nrhs,M1.data(),lda,c.data(),ldb); + + return std::vector(c.data(), c.data()+n); +} + +// print the matrix M[m,n] in row-major order +template +void print(size_t m, size_t n, const std::vector& M) { + for (size_t i=0; i fit(size_t m, size_t n, const std::vector N, const std::vector& f) { + //print(1,n,N); + //print(m,1,f); + + std::vector M; + for (size_t i=0; i tabulate(double(*f)(double, double, double), std::vector x, std::vector y, std::vector z) { + const size_t nx = x.size(); + const size_t ny = y.size(); + const size_t nz = z.size(); + std::vector F(nx * ny * nz); + for (size_t i = 0; i < nx; i++) { + for (size_t j = 0; j < ny; j++) { + for (size_t k = 0; k < nz; k++) { + F[i*ny*nz + j*nz + k] = f(x[i], y[j], z[k]); + } + } + } + return F; +} + +// Periodic sum of functions +double periodic_sum(double x, double y, double z, double(*f)(double, double, double), const int N = 50) { + // N number of lattice points summed in each direction + double sum = 0.0; + for (int X=-N; X<=N; X++) { + for (int Y=-N; Y<=N; Y++) { + for (int Z=-N; Z<=N; Z++) { + sum += f(x+X*L,y+Y*L,z+Z*L); + } + } + } + return sum; +} + +// Periodic sum of functions returning vector of values of partial sums in order of increasing cube size +std::vector periodic_sum_partial(double x, double y, double z, double(*f)(double, double, double), const int N) { + std::vector sums; + double sum = f(x,y,z); + sums.push_back(sum); + int count = 1; + // Dumb loop structure since attempt to be smart failed! + for (int n = 1; n <= N; n++) { + for (int X=-n; X<=n; X++) { + for (int Y=-n; Y<=n; Y++) { + for (int Z=-n; Z<=n; Z++) { + if (std::abs(X) == n || std::abs(Y) == n || std::abs(Z) == n) { + sum += f(x+X*L,y+Y*L,z+Z*L); + count++; + } + } + } + } + sums.push_back(sum); + } + return sums; +} + +// Periodic sum of functions accelerated by fitting t to a polynomial in 1/N +double periodic_sum_accelerated(double x, double y, double z, double(*f)(double, double, double), const size_t N=9, const size_t p=7) { + std::vector sums = periodic_sum_partial(x,y,z,f,N); + // Extract the last p terms of the series in v + std::vector v(&sums[N-p], &sums[N]); + std::vector n(p); + for (size_t j=0; j c = fit(p, p, n, v); + return c[0]; +} + +double distancesq(double x, double y, double z, double x0, double y0, double z0) { + double dx = x - x0; + double dy = y - y0; + double dz = z - z0; + return dx*dx + dy*dy + dz*dz; +} + +// Gaussian spheropole test function +const double a = 100.0; +const double b = 200.0; +double f_spheropole(double x, double y, double z) { + const double afac = std::pow(a/pi, 1.5); + const double bfac = std::pow(b/pi, 1.5); + const double rsq = distancesq(x,y,z,xshift,yshift,zshift); + return afac*exp(-a*rsq) - bfac*exp(-b*rsq); +} + +// No need for periodic summation since the potential is zero exterior to the charge density +double exact_spheropole(double x, double y, double z) { + const double mean = pi*(1/b - 1/a)/(L*L*L); // the mean of the potential over the domain + const double rsq = distancesq(x,y,z,xshift,yshift,zshift); + const double r = std::sqrt(rsq); + //return std::erf(std::sqrt(a)*r)/r - std::erf(std::sqrt(b)*r)/r - mean; + return erfaroverr(std::sqrt(a),r) - erfaroverr(std::sqrt(b),r) - mean; +}; + +// Gaussian dipole test function +const double offset = 0.1; +double f_dipole(double x, double y, double z) { + const double bfac = std::pow(b/pi, 1.5); + const double r1sq = distancesq(x,y,z,xshift,yshift,zshift-offset); + const double r2sq = distancesq(x,y,z,xshift,yshift,zshift+offset); + return bfac*(std::exp(-b*r1sq) - std::exp(-b*r2sq)); +} + +// Potential due to single Gaussian dipole +double exact_dipoleX(double x, double y, double z) { + const double r1sq = distancesq(x,y,z,xshift,yshift,zshift-offset); + const double r2sq = distancesq(x,y,z,xshift,yshift,zshift+offset); + const double r1 = std::sqrt(r1sq); + const double r2 = std::sqrt(r2sq); + //return std::erf(std::sqrt(b)*r1)/r1 - std::erf(std::sqrt(b)*r2)/r2; + double sb = std::sqrt(b); + return erfaroverr(sb,r1) - erfaroverr(sb,r2); +}; + +// Potential due to opposing electric field generated by FT to satisty the periodic boundary conditions and continuity +double opposing_field_potential(double x, double y, double z) { + // center of charge is at (xshift,yshift,zshift) + const double mu = 2*offset; + return mu*4*pi/(3*L*L*L) * (z-zshift); +} + +// Periodic sum of dipole potentials +double exact_dipole(double x, double y, double z) { + // const double mean = 0.0; // the mean of the potential over the domain(zero due to dipole symmetry) + return periodic_sum_accelerated(x, y, z, exact_dipoleX) + opposing_field_potential(x,y,z); +} + +// Gaussian quadrupole test function in yz plane +double f_quadrupole(double x, double y, double z) { + const double bfac = std::pow(b/pi, 1.5); + const double r1sq = distancesq(x, y, z, xshift, yshift, zshift-offset); + const double r2sq = distancesq(x, y, z, xshift, yshift, zshift+offset); + const double r3sq = distancesq(x, y, z, xshift, yshift-offset, zshift); + const double r4sq = distancesq(x, y, z, xshift, yshift+offset, zshift); + return bfac*(std::exp(-b*r1sq) + std::exp(-b*r2sq) - std::exp(-b*r3sq) - std::exp(-b*r4sq) ); +} + +// Potential due to single Gaussian quadrupole +double exact_quadrupoleX(double x, double y, double z) { + const double r1sq = distancesq(x, y, z, xshift, yshift, zshift-offset); + const double r2sq = distancesq(x, y, z, xshift, yshift, zshift+offset); + const double r3sq = distancesq(x, y, z, xshift, yshift-offset, zshift); + const double r4sq = distancesq(x, y, z, xshift, yshift+offset, zshift); + const double r1 = std::sqrt(r1sq); + const double r2 = std::sqrt(r2sq); + const double r3 = std::sqrt(r3sq); + const double r4 = std::sqrt(r4sq); + //return std::erf(std::sqrt(b)*r1)/r1 + std::erf(std::sqrt(b)*r2)/r2 - std::erf(std::sqrt(b)*r3)/r3 - std::erf(std::sqrt(b)*r4)/r4; + double sb = std::sqrt(b); + return erfaroverr(sb,r1) + erfaroverr(sb,r2) - erfaroverr(sb,r3) - erfaroverr(sb,r4); +}; + +// Periodic sum of quadrupole potentials +double exact_quadrupole(double x, double y, double z) { + // const double mean = 0.0; // the mean of the potential over the domain(zero due to dipole symmetry) + //return periodic_sum(x, y, z, exact_quadrupoleX); + return periodic_sum_accelerated(x, y, z, exact_quadrupoleX); +} + + +// Cosine test function +const double wx = 1; +const double wy = 2; +const double wz = 3; +double f_cosine(double x, double y, double z) { + return std::cos(wx*2*pi*(x-xshift)/L)*std::cos(wy*2*pi*(y-yshift)/L)*std::cos(wz*2*pi*(z-zshift)/L); +} + +double exact_cosine(double x, double y, double z) { + return f_cosine(x,y,z) / (pi*(wx*wx + wy*wy + wz*wz)/(L*L)); +} + +// Return a vector with n equally spaced values between a and b, optionally including the right endpoint +std::vector linspace(double a, double b, size_t n, bool include_right_endpoint = true) { + double h = (b - a) / (include_right_endpoint ? (n - 1) : n); + std::vector v(n); + for (size_t i = 0; i < n; i++) { + v[i] = a + i*h; + } + return v; +} + +int main() { + std::cout.precision(15); + + if (test_case == 1) { + f = f_spheropole; + exact = exact_spheropole; + } else if (test_case == 2) { + f = f_dipole; + exact = exact_dipole; + } else if (test_case == 3) { + f = f_quadrupole; + exact = exact_quadrupole; + } else if (test_case == 4) { + f = f_cosine; + exact = exact_cosine; + } else { + std::cerr << "Invalid test case number" << std::endl; + return 1; + } + + const size_t n = 50*L; // number of lattice points in each direction + const size_t nh = n/2+1; // for real to complex transform last dimension + const size_t mid = n/2; // for mapping hermitian symmetry in transform (and testing and plotting) + + const double lo = -0.5*L; + const double hi = 0.5*L; + const double h = (hi - lo) / (n - 1); + std::vector x = linspace(lo,hi,n,false); // exclude right endpoint + std::vector y = x; // Presently assuming cubic domain with equal spacing in x, y, and z + std::vector z = x; + + std::vector F = tabulate(f, x, y, z); + { + madness::Gnuplot g("set style data l"); + // Extract the middle z column? + std::vector col(&F[mid*n*n + mid*n], &F[mid*n*n + mid*n + n]); + g.plot(z, col); + } + + // sum the values in F to get the total charge + double norm = std::reduce(F.begin(), F.end(), 0.0)*h*h*h; + std::cout << "norm = " << norm << std::endl; + + // Perform a 3D Fourier transform of F and store the result in G + std::vector G(n*n*nh); + { + fftw_plan plan = fftw_plan_dft_r2c_3d(n, n, n, F.data() , G.data(), FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + } + + // Navigate the way FFTW stores the Hermitian symmetric data + auto fudge = [=](size_t j) {return j 1.0e-20) { + double kfac = kscale/ksq; + size_t j = jx*n*nh + jy*nh + jz; + //if (std::abs(G[j][0]) > 1e-6) { + //std::cout << jx << " (" << fudge(jx) << ") " << jy << " (" << fudge(jy) << ") " << jz << " " << ksq << " " << G[j][0] << " " << G[j][1] << std::endl; + //} + G[j][0] *= kfac; + G[j][1] *= kfac; + } + } + } + } + + // Do the reverse transform into FF + std::vector FF(n*n*n); + { + fftw_plan plan = fftw_plan_dft_c2r_3d(n, n, n, G.data() , FF.data(), FFTW_ESTIMATE); + fftw_execute(plan); + fftw_destroy_plan(plan); + const double scale = 1.0/(n*n*n); // FFTW transforms are not normalized + for (double& u : FF) u *= scale; + } + + // Check the mean potential --- expected to be zero + double FFmean = std::reduce(FF.begin(), FF.end(), 0.0)*h*h*h / (L*L*L); + std::cout << "FFmean = " << FFmean << std::endl; + + size_t j = mid; //mid+3; // test point (avoid origin) -- don't need to avoid origin now we are treating erf(0)/0 correctly + { + madness::Gnuplot g("set style data l"); + std::vector col(&FF[j*n*n + j*n], &FF[j*n*n + j*n + n]); + std::vector col2(n); + std::vector col3(n); + for (size_t i = 0; i < n; i++) { + double v = exact(x[j],y[j],z[i]); + std::cout << i << " " << z[i] << " " << col[i] << " " << v << " " << col[i]-v << std::endl; + col2[i] = v; + col3[i] = col[i] - v; + } + + //g.plot(z, col, col2); + g.plot(z, col3); + + } + + // std::vector sums = periodic_sum_partial(x[j],y[j],z[0],exact_quadrupoleX,50); + // //std::vector sums = periodic_sum_partial(x[j],y[j],z[0],exact_dipoleX,50); + // for (size_t i = 0; i < sums.size(); i++) { + // std::cout << i << " " << sums[i] << " " << exact(x[j],y[j],z[0]) << std::endl; + // } + // std::cout << exact(x[j],y[j],z[0]) << " " << FF[j*n*n + j*n + 0] << " !! " << periodic_sum_accelerated(x[j],y[j],z[0],exact_quadrupoleX) << std::endl; + + // size_t p = 7; // no. of terms in the polynomial fit --- 7 is best for quadrupole + // for (size_t i=p+1; i f(&sums[i], &sums[i+p]); + // std::vector N(p); + // for (size_t j=0; j c = fit(p, p, N, f); + // std::cout << i << " " << c[0] << " " << sums[i]-FF[j*n*n + j*n + 0] << " " << c[0]-FF[j*n*n + j*n + 0] << std::endl; + // } + + return 0; +}