From 4e795fc7491d8dd405f3f71ab907da64011e0408 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 13 Jul 2023 18:49:05 -0700 Subject: [PATCH] PODVector Updates Remove the deprecated and unused PolymorphicAllocator. It has been replaced by PolymorphicArenaAllocator. Restrict PODVector's Allocator to std::allocator and AMReX's various Arena-based allocators. This simplifies the implementation of PODVector, because std::allocator is stateless and the Arena-based allocators are simple even when polymorphic. Fix a few issues in PODVector when used with a PolymorphicArenaAllocator. For example, the copy assignment operator should copy the Allocator, and the copy constructor should consider the possibility that the other PODVector has a different type of Arena. Add placeholders for growing and shrinking memory allocations in place; these will be implemented in a follow-up PR. Update PODVector's growth strategy. Hopefully this helps reduce memory consumption. * Always try to grow in-place. * For assign, operator=, resize & reserve, allocate exactly the specified size, with no extra capacity. * For push_back & emplace_back, grow the capacity by a factor that is 1.5 by default. * For insert, the capacity grows either by a factor that is 1.5 by default or to the new size, whichever is greater. --- Src/AmrCore/AMReX_TagBox.cpp | 2 +- Src/Base/AMReX_Arena.H | 21 + Src/Base/AMReX_GpuAllocators.H | 250 ++++--- Src/Base/AMReX_GpuContainers.H | 13 - Src/Base/AMReX_PODVector.H | 899 ++++++++++++------------- Src/Particle/AMReX_NeighborParticles.H | 1 - Src/Particle/AMReX_ParticleContainer.H | 1 - 7 files changed, 626 insertions(+), 561 deletions(-) diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 440e448b0d7..f80daf81265 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -492,7 +492,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const Gpu::dtoh_memcpy(hv_ntags.data(), dv_ntags.data(), ntotblocks*sizeof(int)); Gpu::PinnedVector hv_tags_offset(ntotblocks+1); - hv_tags_offset[0] = 0; + if (! hv_tags_offset.empty()) { hv_tags_offset[0] = 0; } std::partial_sum(hv_ntags.begin(), hv_ntags.end(), hv_tags_offset.begin()+1); int ntotaltags = hv_tags_offset.back(); diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index a28fa492d01..c3dad6ac2b4 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -7,6 +7,7 @@ #include #include #include +#include namespace amrex { @@ -100,6 +101,26 @@ public: * \return a pointer to the allocated memory */ [[nodiscard]] virtual void* alloc (std::size_t sz) = 0; + + /** + * Try to allocate in-place by extending the capacity of the given pointer. 
+ */ + [[nodiscard]] virtual std::pair + alloc_in_place (void* /*pt*/, std::size_t /*szmin*/, std::size_t szmax) + { + auto* p = alloc(szmax); + return std::make_pair(p, szmax); + } + + /** + * Try to shrink in-place + */ + [[nodiscard]] virtual void* + shrink_in_place (void* /*pt*/, std::size_t sz) + { + return alloc(sz); + } + /** * \brief A pure virtual function for deleting the arena pointed to by pt */ diff --git a/Src/Base/AMReX_GpuAllocators.H b/Src/Base/AMReX_GpuAllocators.H index 5738324a52e..01e050334ac 100644 --- a/Src/Base/AMReX_GpuAllocators.H +++ b/Src/Base/AMReX_GpuAllocators.H @@ -25,201 +25,267 @@ namespace amrex { template struct IsPolymorphicArenaAllocator : std::false_type {}; - struct ArenaAllocatorTraits { - typedef std::true_type propagate_on_container_copy_assignment; - typedef std::true_type propagate_on_container_move_assignment; - typedef std::true_type propagate_on_container_swap; - typedef std::true_type is_always_equal; + struct ArenaAllocatorBase {}; + + template + struct IsArenaAllocator : std::false_type {}; + // + template + struct IsArenaAllocator + ::value>> + : std::true_type {}; + + template + struct FatPtr + { + T* m_ptr = nullptr; + std::size_t m_size = 0; + [[nodiscard]] constexpr T* ptr () const noexcept { return m_ptr; } + [[nodiscard]] constexpr std::size_t size () const noexcept { return m_size; } }; + template + [[nodiscard]] FatPtr + allocateInPlace (T* p, std::size_t nmin, std::size_t nmax, Arena* ar) + { + auto pn = ar->alloc_in_place(p, nmin*sizeof(T), nmax*sizeof(T)); + return FatPtr{(T*)pn.first, pn.second/sizeof(T)}; + } + template class ArenaAllocator - : public ArenaAllocatorTraits + : public ArenaAllocatorBase { public : using value_type = T; - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate (std::size_t n) + { + return (T*) arena()->alloc(n * sizeof(T)); + } + + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) + { + return allocateInPlace(p, nmin, nmax, arena()); + } + + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) { - value_type* result = nullptr; - result = (value_type*) The_Arena()->alloc(n * sizeof(T)); - return result; + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + void deallocate (T* ptr, std::size_t) { - if (ptr != nullptr) { The_Arena()->free(ptr); } + if (ptr != nullptr) { arena()->free(ptr); } + } + + [[nodiscard]] Arena* arena () const noexcept { + return The_Arena(); } }; template class DeviceArenaAllocator - : public ArenaAllocatorTraits + : public ArenaAllocatorBase { public : using value_type = T; - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate(std::size_t n) { - value_type* result = nullptr; - result = (value_type*) The_Device_Arena()->alloc(n * sizeof(T)); - return result; + return (T*) arena()->alloc(n * sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) { - if (ptr != nullptr) { The_Device_Arena()->free(ptr); } + return allocateInPlace(p, nmin, nmax, arena()); + } + + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) + { + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); + } + + void deallocate(T* ptr, std::size_t) + { + if (ptr != nullptr) { arena()->free(ptr); } + } + + [[nodiscard]] Arena* arena () const noexcept { + return The_Device_Arena(); } }; template class PinnedArenaAllocator - : public ArenaAllocatorTraits + : public 
ArenaAllocatorBase { public : using value_type = T; - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate(std::size_t n) { - value_type* result = nullptr; - result = (value_type*) The_Pinned_Arena()->alloc(n * sizeof(T)); - return result; + return (T*) arena()->alloc(n * sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) { - if (ptr != nullptr) { The_Pinned_Arena()->free(ptr); } + return allocateInPlace(p, nmin, nmax, arena()); + } + + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) + { + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); + } + + void deallocate(T* ptr, std::size_t) + { + if (ptr != nullptr) { arena()->free(ptr); } + } + + [[nodiscard]] Arena* arena () const noexcept { + return The_Pinned_Arena(); } }; template class ManagedArenaAllocator - : public ArenaAllocatorTraits + : public ArenaAllocatorBase { public : using value_type = T; - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate(std::size_t n) + { + return (T*) arena()->alloc(n * sizeof(T)); + } + + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) { - value_type* result = nullptr; - result = (value_type*) The_Managed_Arena()->alloc(n * sizeof(T)); - return result; + return allocateInPlace(p, nmin, nmax, arena()); } - inline void deallocate(value_type* ptr, std::size_t) + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) { - if (ptr != nullptr) { The_Managed_Arena()->free(ptr); } + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); + } + + void deallocate(T* ptr, std::size_t) + { + if (ptr != nullptr) { arena()->free(ptr); } + } + + [[nodiscard]] Arena* arena () const noexcept { + return The_Managed_Arena(); } }; template class AsyncArenaAllocator - : public ArenaAllocatorTraits + : public ArenaAllocatorBase { public : using value_type = T; - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate(std::size_t n) { - value_type* result = nullptr; - result = (value_type*) The_Async_Arena()->alloc(n * sizeof(T)); - return result; + return (T*) arena()->alloc(n * sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) { - if (ptr != nullptr) { The_Async_Arena()->free(ptr); } + return allocateInPlace(p, nmin, nmax, arena()); } - }; - - template - class PolymorphicArenaAllocator - : public ArenaAllocatorTraits - { - public : - using value_type = T; - - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) { - value_type* result = nullptr; - result = (value_type*) arena()->alloc(n * sizeof(T)); - return result; + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + void deallocate(T* ptr, std::size_t) { if (ptr != nullptr) { arena()->free(ptr); } } [[nodiscard]] Arena* arena () const noexcept { - return (m_arena) ? 
m_arena : The_Arena(); + return The_Async_Arena(); } - - Arena* m_arena = nullptr; }; template - class PolymorphicAllocator + class PolymorphicArenaAllocator + : public ArenaAllocatorBase { public : using value_type = T; - PolymorphicAllocator () : m_use_gpu_aware_mpi(ParallelDescriptor::UseGpuAwareMpi()) {} + constexpr PolymorphicArenaAllocator () = default; + constexpr PolymorphicArenaAllocator (Arena* a_arena) + : m_arena(a_arena) {} - inline value_type* allocate(std::size_t n) + [[nodiscard]] T* allocate(std::size_t n) { - value_type* result = nullptr; - if (m_use_gpu_aware_mpi) - { - result = (value_type*) The_Arena()->alloc(n * sizeof(T)); - } - else - { - result = (value_type*) The_Pinned_Arena()->alloc(n * sizeof(T)); - } - return result; + return (T*) arena()->alloc(n * sizeof(T)); } - inline void deallocate(value_type* ptr, std::size_t) + [[nodiscard]] FatPtr + allocate_in_place (T* p, std::size_t nmin, std::size_t nmax) { - if (ptr != nullptr) - { - if (m_use_gpu_aware_mpi) - { - The_Arena()->free(ptr); - } - else - { - The_Pinned_Arena()->free(ptr); - } - } + return allocateInPlace(p, nmin, nmax, arena()); } - bool m_use_gpu_aware_mpi; - - template - friend bool - operator== (PolymorphicAllocator const& a, PolymorphicAllocator const& b) noexcept + [[nodiscard]] T* + shrink_in_place (T* p, std::size_t n) { - return a.m_use_gpu_aware_mpi == b.m_use_gpu_aware_mpi; + return (T*) arena()->shrink_in_place(p,n*sizeof(T)); } - template - friend bool - operator!= (PolymorphicAllocator const& a, PolymorphicAllocator const& b) noexcept + void deallocate(T* ptr, std::size_t) { - return a.m_use_gpu_aware_mpi != b.m_use_gpu_aware_mpi; + if (ptr != nullptr) { arena()->free(ptr); } + } + + [[nodiscard]] Arena* arena () const noexcept { + return (m_arena) ? m_arena : The_Arena(); } + void arena (Arena* a_arena) noexcept { m_arena = a_arena; } + + Arena* m_arena = nullptr; }; + template ::value && + IsArenaAllocator::value, int> = 0> + bool operator== (A1 const& a1, A2 const& a2) + { + return a1.arena() == a2.arena(); + } + + template ::value && + IsArenaAllocator::value, int> = 0> + bool operator!= (A1 const& a1, A2 const& a2) + { + return a1.arena() != a2.arena(); + } + #ifdef AMREX_USE_GPU template struct RunOnGpu > : std::true_type {}; diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index a5ba76a477f..cb7b97acaf3 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -61,16 +61,6 @@ namespace amrex::Gpu { template using HostVector = PinnedVector; - /** - * \brief The behavior of PolymorphicVector changes depending on - * the amrex.use_gpu_aware_mpi runtime flag. If the flag is true, - * this vector will use device memory. If it is false, this Vector - * will use pinned memory. - * - */ - template - using PolymorphicVector = PODVector >; - /** * \brief This is identical to ManagedVector. The ManagedDeviceVector * form is deprecated and will be removed in a future release. 
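
Illustration (not part of the patch): with ArenaAllocatorTraits gone, allocator equality is now decided by the free operator==/operator!= defined in AMReX_GpuAllocators.H above, which compare the underlying Arena pointers and ignore the value_type. A minimal sketch of the intended semantics; the function name is hypothetical, and it assumes amrex::Initialize has been called and that The_Pinned_Arena() is distinct from The_Arena():

    // Sketch for illustration; not part of this patch.
    #include <AMReX_GpuAllocators.H>
    #include <AMReX_BLassert.H>

    void allocator_equality_demo ()   // hypothetical name
    {
        amrex::PolymorphicArenaAllocator<int>   a; // resolves to The_Arena()
        amrex::PolymorphicArenaAllocator<float> b; // a different value_type is fine
        AMREX_ALWAYS_ASSERT(a == b);               // same Arena => equal

        b.arena(amrex::The_Pinned_Arena());        // rebind b to pinned memory
        AMREX_ALWAYS_ASSERT(a != b);               // different Arenas => unequal
    }

This arena-based equality is what PODVector's move assignment (below) uses to decide whether it can steal the other vector's buffer or must fall back to a copy.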
@@ -101,9 +91,6 @@ namespace amrex::Gpu { template using AsyncVector = PODVector; - - template - using PolymorphicVector = PODVector; #endif struct HostToDevice {}; diff --git a/Src/Base/AMReX_PODVector.H b/Src/Base/AMReX_PODVector.H index 358021cb5f7..a4a06e12607 100644 --- a/Src/Base/AMReX_PODVector.H +++ b/Src/Base/AMReX_PODVector.H @@ -17,204 +17,179 @@ namespace amrex { namespace detail { - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - uninitializedFillNImpl (U* data, Size count, const Value& value, T& /*allocator*/) - { - amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept { - data[i] = value; - }); - Gpu::Device::streamSynchronize(); + template class Allocator> + FatPtr allocate_in_place ([[maybe_unused]] T* p, [[maybe_unused]] Size nmin, Size nmax, + Allocator& allocator) + { + if constexpr (IsArenaAllocator>::value) { + return allocator.allocate_in_place(p, nmin, nmax); + } else { + T* pnew = allocator.allocate(nmax); + return {pnew, nmax}; + } } - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - uninitializedFillNImpl (U* data, Size count, const Value& value, T& /*allocator*/) + template class Allocator> + T* shrink_in_place ([[maybe_unused]] T* p, Size n, Allocator& allocator) { - std::uninitialized_fill_n(data, count, value); + if constexpr (IsArenaAllocator>::value) { + return allocator.shrink_in_place(p, n); + } else { + return allocator.allocate(n); + } } - template - typename std::enable_if::value>::type - uninitializedFillNImpl (U* data, Size count, const Value& value, T& allocator) + template class Allocator> + void uninitializedFillNImpl (T* data, Size count, const T& value, + [[maybe_unused]] Allocator const& allocator) { #ifdef AMREX_USE_GPU - if (allocator.arena()->isManaged() || allocator.arena()->isDevice()) { + if constexpr (RunOnGpu>::value) + { amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept { - data[i] = value; - }); - Gpu::Device::streamSynchronize(); - } else -#endif + data[i] = value; + }); + Gpu::streamSynchronize(); + return; + } + else if constexpr (IsPolymorphicArenaAllocator>::value) { - amrex::ignore_unused(allocator); - std::uninitialized_fill_n(data, count, value); + if (allocator.arena()->isManaged() || + allocator.arena()->isDevice()) + { + amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept + { + data[i] = value; + }); + Gpu::streamSynchronize(); + return; + } } +#endif + std::uninitialized_fill_n(data, count, value); } - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - fillValuesImpl (U* dst, const U* src, Size count, T& /*allocator*/) - { - amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept { - dst[i] = src[i]; - }); - Gpu::Device::streamSynchronize(); - } - - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - fillValuesImpl (U* dst, const U* src, Size count, T& /*allocator*/) + template class Allocator> + void initFromListImpl (T* data, std::initializer_list const& list, + [[maybe_unused]] Allocator const & allocator) { - for (Size i = 0; i < count; ++i) { dst[i] = src[i];} + auto count = list.size() * sizeof(T); +#ifdef AMREX_USE_GPU + if constexpr (RunOnGpu>::value) + { + Gpu::htod_memcpy_async(data, std::data(list), count); + Gpu::streamSynchronize(); + return; + } + else if constexpr (IsPolymorphicArenaAllocator>::value) + { + if (allocator.arena()->isManaged() || + allocator.arena()->isDevice()) + { + 
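+                // The arena points at device or managed memory at run time,
+                // so copy the host-side initializer list host-to-device.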
Gpu::htod_memcpy_async(data, std::data(list), count); + Gpu::streamSynchronize(); + return; + } + } +#endif + std::memcpy(data, std::data(list), count); } - template - typename std::enable_if::value>::type - fillValuesImpl (U* dst, const U* src, Size count, T& allocator) + template class Allocator> + void fillValuesImpl (T* dst, T const* src, Size count, + [[maybe_unused]] Allocator const& allocator) { #ifdef AMREX_USE_GPU - if (allocator.arena()->isManaged() || allocator.arena()->isDevice()) { + if constexpr (RunOnGpu>::value) + { amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept { - dst[i] = src[i]; - }); + dst[i] = src[i]; + }); Gpu::Device::streamSynchronize(); - } else -#endif + return; + } + else if constexpr (IsPolymorphicArenaAllocator>::value) { - amrex::ignore_unused(allocator); - for (Size i = 0; i < count; ++i) { dst[i] = src[i];} + if (allocator.arena()->isManaged() || + allocator.arena()->isDevice()) + { + amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (Size i) noexcept + { + dst[i] = src[i]; + }); + Gpu::streamSynchronize(); + return; + } } - } - - template - std::enable_if_t::value && !IsPolymorphicArenaAllocator::value, bool> - memCopyImpl (void* dst, const void* src, std::size_t count, T& /*allocator*/) - { -#ifdef AMREX_USE_GPU - Gpu::dtod_memcpy_async(dst, src, count); - return true; -#else - std::memcpy(dst, src, count); - return false; #endif + for (Size i = 0; i < count; ++i) { dst[i] = src[i]; } } - template - std::enable_if_t::value && !IsPolymorphicArenaAllocator::value, bool> - memCopyImpl (void* dst, const void* src, std::size_t count, T& /*allocator*/) - { - std::memcpy(dst, src, count); - return false; - } - - template - std::enable_if_t::value, bool> - memCopyImpl (void* dst, const void* src, std::size_t count, T& allocator) + template + void memCopyImpl (void* dst, const void* src, std::size_t count, + [[maybe_unused]] Allocator const& dst_allocator, + [[maybe_unused]] Allocator const& src_allocator, + [[maybe_unused]] bool sync = true) { #ifdef AMREX_USE_GPU - if (allocator.arena()->isManaged() || allocator.arena()->isDevice()) { + if constexpr (RunOnGpu::value) + { Gpu::dtod_memcpy_async(dst, src, count); - return true; - } else -#endif + if (sync) { Gpu::streamSynchronize(); } + return; + } + else if constexpr (IsPolymorphicArenaAllocator::value) { - amrex::ignore_unused(allocator); - std::memcpy(dst, src, count); - return false; + bool dst_on_device = dst_allocator.arena()->isManaged() || + dst_allocator.arena()->isDevice(); + bool src_on_device = src_allocator.arena()->isManaged() || + src_allocator.arena()->isDevice(); + if (dst_on_device || src_on_device) + { + if (dst_on_device && src_on_device) { + Gpu::dtod_memcpy_async(dst, src, count); + } else if (dst_on_device) { + Gpu::htod_memcpy_async(dst, src, count); + } else { + Gpu::dtoh_memcpy_async(dst, src, count); + } + if (sync) { Gpu::streamSynchronize(); } + return; + } } - } - - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - memMoveImpl (void* dst, const void* src, std::size_t count, T& allocator) - { - if (count == 0) { return; } -#ifdef AMREX_USE_GPU - AMREX_ASSERT(count % sizeof(typename T::value_type) == 0); - - auto N = count / sizeof(typename T::value_type); - auto tmp = allocator.allocate(N); - - Gpu::dtod_memcpy_async(tmp, src, count); - Gpu::dtod_memcpy_async(dst, tmp, count); - - Gpu::Device::streamSynchronize(); - - allocator.deallocate(tmp, N); -#else - amrex::ignore_unused(allocator); - std::memmove(dst, src, count); #endif + 
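+        // Neither side is in device memory (or this is a host-only build),
+        // so a plain memcpy on the host suffices.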
std::memcpy(dst, src, count); } - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - memMoveImpl (void* dst, const void* src, std::size_t count, T& /*allocator*/) - { - std::memmove(dst, src, count); - } - - template - typename std::enable_if::value>::type - memMoveImpl (void* dst, const void* src, std::size_t count, T& allocator) + template + void memMoveImpl (void* dst, const void* src, std::size_t count, + [[maybe_unused]] Allocator const& allocator) { #ifdef AMREX_USE_GPU - if (allocator.arena()->isManaged() || allocator.arena()->isDevice()) { - if (count == 0) { return; } - - AMREX_ASSERT(count % sizeof(typename T::value_type) == 0); - - auto N = count / sizeof(typename T::value_type); - auto tmp = allocator.allocate(N); - + if constexpr (RunOnGpu::value) + { + auto* tmp = The_Arena()->alloc(count); Gpu::dtod_memcpy_async(tmp, src, count); Gpu::dtod_memcpy_async(dst, tmp, count); - - Gpu::Device::streamSynchronize(); - - allocator.deallocate(tmp, N); - } else -#endif - { - amrex::ignore_unused(allocator); - std::memmove(dst, src, count); - } - } - - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - initFromListImpl (U* data, std::initializer_list list, T& /*allocator*/) - { -#ifdef AMREX_USE_GPU - Gpu::htod_memcpy_async(data, &(*list.begin()), list.size() * sizeof(U)); - Gpu::streamSynchronize(); -#else - std::memcpy(data, &(*list.begin()), list.size() * sizeof(U)); -#endif - } - - template - typename std::enable_if::value && !IsPolymorphicArenaAllocator::value>::type - initFromListImpl (U* data, std::initializer_list list, T& /*allocator*/) - { - for (const auto& value : list) { *(data++) = value; } - } - - template - typename std::enable_if::value>::type - initFromListImpl (U* data, std::initializer_list list, T& allocator) - { -#ifdef AMREX_USE_GPU - if (allocator.arena()->isManaged() || allocator.arena()->isDevice()) { - Gpu::htod_memcpy_async(data, &(*list.begin()), list.size() * sizeof(U)); Gpu::streamSynchronize(); - } else -#endif + The_Arena()->free(tmp); + return; + } + else if constexpr (IsPolymorphicArenaAllocator::value) { - amrex::ignore_unused(allocator); - for (const auto& value : list) { *(data++) = value; } + if (allocator.arena()->isManaged() || + allocator.arena()->isDevice()) + { + auto* tmp = The_Arena()->alloc(count); + Gpu::dtod_memcpy_async(tmp, src, count); + Gpu::dtod_memcpy_async(dst, tmp, count); + Gpu::streamSynchronize(); + The_Arena()->free(tmp); + return; + } } +#endif + std::memmove(dst, src, count); } } @@ -239,76 +214,78 @@ namespace amrex static_assert(std::is_trivially_copyable(), "PODVector can only hold trivially copyable types"); // static_assert(std::is_trivially_default_constructible(), "PODVector can only hold trivial dc types"); - using Allocator::allocate; using Allocator::deallocate; public: - typedef T value_type; - typedef Allocator allocator_type; - typedef std::size_t size_t; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - typedef T& reference; - typedef T* pointer; - typedef T* iterator; - typedef std::reverse_iterator reverse_iterator; - - typedef const T& const_reference; - typedef const T* const_pointer; - typedef const T* const_iterator; - typedef std::reverse_iterator const_reverse_iterator; + using value_type = T; + using allocator_type = Allocator; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + using reference = T&; + using pointer = T*; + using iterator = T*; + using reverse_iterator = 
std::reverse_iterator; + + using const_reference = const T&; + using const_pointer = const T*; + using const_iterator = const T* ; + using const_reverse_iterator = std::reverse_iterator; private: - pointer m_data; + pointer m_data = nullptr; size_type m_size{0}, m_capacity{0}; public: - explicit PODVector () noexcept : - Allocator(), m_data(nullptr) - {} + constexpr PODVector () noexcept = default; - explicit PODVector (const allocator_type& a_allocator) noexcept - : Allocator(a_allocator), m_data(nullptr) + constexpr explicit PODVector (const allocator_type& a_allocator) noexcept + : Allocator(a_allocator) {} - explicit PODVector (size_type a_size) noexcept - : m_data(nullptr), m_size(a_size) + explicit PODVector (size_type a_size) + : m_size(a_size), m_capacity(a_size) { if (a_size != 0) { - AllocateBuffer(GetNewCapacity(a_size)); + m_data = allocate(m_size); } } PODVector (size_type a_size, const value_type& a_value, - const allocator_type& a_allocator = Allocator()) noexcept - : Allocator(a_allocator), m_data(nullptr), m_size(a_size) + const allocator_type& a_allocator = Allocator()) + : Allocator(a_allocator), m_size(a_size), m_capacity(a_size) { if (a_size != 0) { - AllocateBuffer(GetNewCapacity(a_size)); - detail::uninitializedFillNImpl(m_data, a_size, a_value, *this); + m_data = allocate(m_size); + detail::uninitializedFillNImpl(m_data, a_size, a_value, + (Allocator const&)(*this)); } } PODVector (std::initializer_list a_initializer_list, - const allocator_type& a_allocator = Allocator()) noexcept - : Allocator(a_allocator), m_data(nullptr), m_size(a_initializer_list.size()) + const allocator_type& a_allocator = Allocator()) + : Allocator(a_allocator), + m_size (a_initializer_list.size()), + m_capacity(a_initializer_list.size()) { if (a_initializer_list.size() != 0) { - AllocateBuffer(GetNewCapacity(m_size)); - detail::initFromListImpl(m_data, a_initializer_list, *this); + m_data = allocate(m_size); + detail::initFromListImpl(m_data, a_initializer_list, + (Allocator const&)(*this)); } } - PODVector (const PODVector& a_vector) noexcept - : Allocator(a_vector), m_data(nullptr), m_size(a_vector.size()) + PODVector (const PODVector& a_vector) + : Allocator(a_vector), + m_size (a_vector.size()), + m_capacity(a_vector.size()) { - using namespace detail; if (a_vector.size() != 0) { - AllocateBuffer(a_vector.capacity()); - auto r = memCopyImpl(m_data, a_vector.m_data, a_vector.size() * sizeof(T), *this); - if (r) { Gpu::streamSynchronize(); } + m_data = allocate(m_size); + detail::memCopyImpl(m_data, a_vector.m_data, a_vector.nBytes(), + (Allocator const&)(*this), + (Allocator const&)a_vector); } } @@ -323,191 +300,229 @@ namespace amrex a_vector.m_capacity = 0; } - PODVector (PODVector&& a_vector, const allocator_type& a_allocator) noexcept - : Allocator(a_allocator), - m_data(a_vector.m_data), - m_size(a_vector.m_size), - m_capacity(a_vector.m_capacity) + ~PODVector () { - a_vector.m_data = nullptr; - a_vector.m_size = 0; - a_vector.m_capacity = 0; + // let's not worry about other allocators + static_assert(std::is_same>::value || + IsArenaAllocator::value); + if (m_data != nullptr) { + deallocate(m_data, capacity()); + } } - ~PODVector () noexcept { if (m_data != nullptr) { deallocate(m_data, capacity()); }} - - PODVector& operator= (const PODVector& a_vector) noexcept + PODVector& operator= (const PODVector& a_vector) { - CopyAssignmentDoIt(a_vector, typename std::allocator_traits:: - propagate_on_container_copy_assignment()); + if (this == &a_vector) { return *this; } + + if 
((Allocator const&)(*this) != (Allocator const&)a_vector) { + if (m_data != nullptr) { + deallocate(m_data, m_capacity); + m_data = nullptr; + m_size = 0; + m_capacity = 0; + } + (Allocator&)(*this) = (Allocator const&)a_vector; + } + + const auto other_size = a_vector.size(); + if ( other_size > m_capacity ) { + clear(); + reserve(other_size); + } + + m_size = other_size; + if (m_size > 0) { + detail::memCopyImpl(m_data, a_vector.m_data, nBytes(), + (Allocator const&)(*this), + (Allocator const&)a_vector); + } return *this; } PODVector& operator= (PODVector&& a_vector) noexcept - (std::allocator_traits::propagate_on_container_move_assignment::value) { - MoveAssignmentDoIt(std::move(a_vector), typename std::allocator_traits:: - propagate_on_container_move_assignment()); - return *this; - } + if (this == &a_vector) { return *this; } - iterator erase (const_iterator a_pos) noexcept - { - --m_size; - detail::memMoveImpl(const_cast(a_pos), a_pos+1, (end() - a_pos)*sizeof(T), *this); - return const_cast(a_pos); - } + if (static_cast(a_vector) == + static_cast(*this)) + { + if (m_data != nullptr) { + deallocate(m_data, m_capacity); + } - iterator erase (const_iterator a_first, const_iterator a_last) noexcept - { - size_type num_to_erase = a_last - a_first; - if (num_to_erase == 0) return const_cast(a_first); - m_size -= num_to_erase; - detail::memMoveImpl(const_cast(a_first), a_first+num_to_erase, (end() - a_first)*sizeof(T), *this); - return const_cast(a_first); - } + m_data = a_vector.m_data; + m_size = a_vector.m_size; + m_capacity = a_vector.m_capacity; - iterator insert (const_iterator a_pos, const T& a_item) noexcept - { - if(m_size == m_capacity) - { - size_t insert_index = std::distance(m_data, const_cast(a_pos)); - AllocateBufferForInsert(GetNewCapacity(1), insert_index, 1); - a_pos = m_data; - std::advance(a_pos, insert_index); + a_vector.m_data = nullptr; + a_vector.m_size = 0; + a_vector.m_capacity = 0; } else { - detail::memMoveImpl(const_cast(a_pos)+1, a_pos, (end() - a_pos) * sizeof(T), *this); - ++m_size; + // if the allocators are not the same we give up and copy + *this = a_vector; } - *const_cast(a_pos) = a_item; + return *this; + } - return const_cast(a_pos); + iterator erase (const_iterator a_pos) + { + auto* pos = const_cast(a_pos); + --m_size; + detail::memMoveImpl(pos, a_pos+1, (end() - pos)*sizeof(T), + (Allocator const&)(*this)); + return pos; } - iterator insert (const_iterator a_pos, size_type a_count, const T& a_value) noexcept + iterator erase (const_iterator a_first, const_iterator a_last) { - if (a_count == 0) return const_cast(a_pos); - if( capacity() < size() + a_count) - { - size_t insert_index = std::distance(m_data, const_cast(a_pos)); - AllocateBufferForInsert(GetNewCapacity(a_count), insert_index, a_count); - a_pos = m_data; - std::advance(a_pos, insert_index); - } - else - { - detail::memMoveImpl(const_cast(a_pos)+a_count, a_pos, (end() - a_pos) * sizeof(T), *this); - m_size += a_count; + size_type num_to_erase = a_last - a_first; + auto* first = const_cast(a_first); + if (num_to_erase > 0) { + m_size -= num_to_erase; + detail::memMoveImpl(first, a_last, (end() - first)*sizeof(T), + (Allocator const&)(*this)); } + return first; + } - detail::uninitializedFillNImpl(const_cast(a_pos), a_count, a_value, *this); - - return const_cast(a_pos); + iterator insert (const_iterator a_pos, const T& a_item) + { + return insert(a_pos, 1, a_item); } - iterator insert (const_iterator a_pos, T&& a_item) noexcept + iterator insert (const_iterator a_pos, size_type a_count, 
const T& a_value) { - if(m_size == m_capacity) - { - size_t insert_index = std::distance(m_data, const_cast(a_pos)); - AllocateBufferForInsert(GetNewCapacity(1), insert_index, 1); - a_pos = m_data; - std::advance(a_pos, insert_index); - } - else - { - detail::memMoveImpl(const_cast(a_pos)+1, a_pos, (end() - a_pos) * sizeof(T), *this); - ++m_size; + auto* pos = const_cast(a_pos); + if (a_count > 0) { + if (m_capacity < m_size + a_count) + { + std::size_t insert_index = std::distance(m_data, pos); + AllocateBufferForInsert(insert_index, a_count); + pos = m_data + insert_index; + } + else + { + detail::memMoveImpl(pos+a_count, a_pos, (end() - pos) * sizeof(T), + (Allocator const&)(*this)); + m_size += a_count; + } + detail::uninitializedFillNImpl(pos, a_count, a_value, + (Allocator const&)(*this)); } + return pos; + } - detail::uninitializedFillNImpl(const_cast(a_pos), 1, a_item, *this); - return const_cast(a_pos); + iterator insert (const_iterator a_pos, T&& a_item) + { + // This is *POD* vector afterall + return insert(a_pos, 1, a_item); } - iterator insert (const_iterator a_pos, std::initializer_list a_initializer_list) noexcept + iterator insert (const_iterator a_pos, + std::initializer_list a_initializer_list) { + auto* pos = const_cast(a_pos); size_type count = a_initializer_list.size(); - if( capacity() < size() + count) - { - size_t insert_index = std::distance(m_data, const_cast(a_pos)); - AllocateBufferForInsert(GetNewCapacity(count), insert_index, count); - a_pos = m_data; - std::advance(a_pos, insert_index); - } - else - { - detail::memMoveImpl(const_cast(a_pos)+count, a_pos, (end() - a_pos) * sizeof(T), *this); - m_size += count; + if (count > 0) { + if (m_capacity < m_size + count) + { + std::size_t insert_index = std::distance(m_data, pos); + AllocateBufferForInsert(insert_index, count); + pos = m_data + insert_index; + } + else + { + detail::memMoveImpl(pos+count, a_pos, (end() - pos) * sizeof(T), + (Allocator const&)(*this)); + m_size += count; + } + detail::initFromListImpl(pos, a_initializer_list, + (Allocator const&)(*this)); } - detail::initFromListImpl(const_cast(a_pos), a_initializer_list, *this); - return const_cast(a_pos); + return pos; } template ::difference_type> - iterator insert (const_iterator a_pos, InputIt a_first, InputIt a_last) noexcept + iterator insert (const_iterator a_pos, InputIt a_first, InputIt a_last) { + auto* pos = const_cast(a_pos); size_type count = std::distance(a_first, a_last); - if (count == 0) return const_cast(a_pos); - if( capacity() < size() + count) - { - size_t insert_index = std::distance(m_data, const_cast(a_pos)); - AllocateBufferForInsert(GetNewCapacity(count), insert_index, count); - a_pos = m_data; - std::advance(a_pos, insert_index); + if (count > 0) { + if (m_capacity < m_size + count) + { + std::size_t insert_index = std::distance(m_data, pos); + AllocateBufferForInsert(insert_index, count); + pos = m_data + insert_index; + } + else + { + detail::memMoveImpl(pos+count, a_pos, (end() - pos) * sizeof(T), + (Allocator const&)(*this)); + m_size += count; + } + // Unfortunately we don't know whether InputIt points + // GPU or CPU memory. We will assume it's the same as + // the vector. 
+ detail::fillValuesImpl(pos, a_first, count, + (Allocator const&)(*this)); } - else - { - detail::memMoveImpl(const_cast(a_pos)+count, a_pos, (end() - a_pos) * sizeof(T), *this); - m_size += count; - } - auto* dst = const_cast(a_pos); - detail::fillValuesImpl(dst, a_first, count, *this); - return const_cast(a_pos); + return pos; } - void assign (size_type a_count, const T& a_value) noexcept + void assign (size_type a_count, const T& a_value) { - if ( a_count > capacity() ) AllocateBuffer(GetNewCapacity(a_count)); + if ( a_count > m_capacity ) { + clear(); + reserve(a_count); + } m_size = a_count; - detail::uninitializedFillNImpl(m_data, a_count, a_value, *this); + detail::uninitializedFillNImpl(m_data, a_count, a_value, + (Allocator const&)(*this)); } - void assign (std::initializer_list a_initializer_list) noexcept + void assign (std::initializer_list a_initializer_list) { - if(a_initializer_list.size() > capacity()) - AllocateBuffer(GetNewCapacity(a_initializer_list.size())); + if (a_initializer_list.size() > m_capacity) { + clear(); + reserve(a_initializer_list.size()); + } m_size = a_initializer_list.size(); - detail::initFromListImpl(const_cast(m_data), a_initializer_list, *this); + detail::initFromListImpl(m_data, a_initializer_list, + (Allocator const&)(*this)); } template ::difference_type> - void assign (InputIt a_first, InputIt a_last) noexcept + void assign (InputIt a_first, InputIt a_last) { - size_t count = std::distance(a_first, a_last); - if (count > capacity()) AllocateBuffer(GetNewCapacity(count)); - m_size = count; auto dst = const_cast(m_data); - detail::fillValuesImpl(dst, a_first, count, *this); + std::size_t count = std::distance(a_first, a_last); + if (count > m_capacity) { + clear(); + reserve(count); + } + m_size = count; + detail::fillValuesImpl(m_data, a_first, count, + (Allocator const&)(*this)); } - // don't have the emplace methods, but not sure how often we use those. [[nodiscard]] allocator_type get_allocator () const noexcept { return *this; } - void push_back (const T& a_value) noexcept + void push_back (const T& a_value) { - if (m_size == m_capacity) AllocateBuffer(GetNewCapacity(1)); - detail::uninitializedFillNImpl(m_data+m_size, 1, a_value, *this); + if (m_size == m_capacity) { + auto new_capacity = GetNewCapacityForPush(); + AllocateBufferForPush(new_capacity); + } + detail::uninitializedFillNImpl(m_data+m_size, 1, a_value, + (Allocator const&)(*this)); ++m_size; } - void push_back (T&& a_value) noexcept - { - if (m_size == m_capacity) AllocateBuffer(GetNewCapacity(1)); - detail::uninitializedFillNImpl(m_data+m_size, 1, a_value, *this); - ++m_size; - } + // Because T is trivial, there is no need for push_back(T&&) + + // Don't have the emplace methods, but not sure how often we use those. 
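
Illustration (not part of the patch): the growth behavior the new strategy produces for push_back, using the default std::allocator backend. The function name is hypothetical; the capacity values assume the default growth factor of 1.5, for which GetNewCapacityForPush (further below) computes (capacity*3+1)/2:

    // Sketch for illustration; not part of this patch.
    #include <AMReX_PODVector.H>

    void growth_demo ()   // hypothetical name
    {
        amrex::PODVector<int> v;
        v.reserve(100);  // reserve allocates exactly 100 elements, no slack
        v.resize(100);   // size == capacity == 100, no reallocation
        v.push_back(0);  // vector is full: capacity -> (100*3+1)/2 == 150
        while (v.size() < 151) { v.push_back(0); }
        // the overflow at size 150 grew the capacity to (150*3+1)/2 == 225
    }

With an Arena-backed allocator the same requests first go through alloc_in_place, so a growing vector can often extend its buffer without copying.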
void pop_back () noexcept { --m_size; } @@ -515,8 +530,6 @@ namespace amrex [[nodiscard]] size_type size () const noexcept { return m_size; } - [[nodiscard]] size_type max_size () const noexcept { return Allocator::max_size(); } - [[nodiscard]] size_type capacity () const noexcept { return m_capacity; } [[nodiscard]] bool empty () const noexcept { return m_size == 0; } @@ -565,42 +578,53 @@ namespace amrex [[nodiscard]] const_reverse_iterator crend () const noexcept { return const_reverse_iterator(begin()); } - void resize (size_type a_new_size) noexcept + void resize (size_type a_new_size) { - if (capacity() < a_new_size) AllocateBuffer(GetNewCapacity(a_new_size - capacity())); + if (m_capacity < a_new_size) { + reserve(a_new_size); + } m_size = a_new_size; } - void resize (size_type a_new_size, const T& a_val) noexcept + void resize (size_type a_new_size, const T& a_val) { - size_type old_size = size(); + size_type old_size = m_size; resize(a_new_size); if (old_size < a_new_size) { - detail::uninitializedFillNImpl(m_data + old_size, - size() - old_size, - a_val, *this); + detail::uninitializedFillNImpl(m_data + old_size, + m_size - old_size, a_val, + (Allocator const&)(*this)); } } - void reserve (size_type a_capacity) noexcept + void reserve (size_type a_capacity) { - if(capacity() < a_capacity) AllocateBuffer(a_capacity); + if (m_capacity < a_capacity) { + auto fp = detail::allocate_in_place(m_data, a_capacity, a_capacity, + (Allocator&)(*this)); + UpdateDataPtr(fp); + } } - void shrink_to_fit () noexcept - { - const size_type current_size = size(); - if ( current_size == 0 ) - { - deallocate(m_data, capacity()); - m_data = nullptr; - m_size = 0; - m_capacity = 0; - } - else if ( current_size < capacity() ) - { - AllocateBuffer(current_size); + void shrink_to_fit () + { + if (m_data != nullptr) { + if (m_size == 0) { + deallocate(m_data, m_capacity); + m_data = nullptr; + m_capacity = 0; + } else if (m_size < m_capacity) { + auto* new_data = detail::shrink_in_place(m_data, m_size, + (Allocator&)(*this)); + if (new_data != m_data) { + memCopyImpl(new_data, m_data, nBytes(), + (Allocator const&)(*this), + (Allocator const&)(*this)); + deallocate(m_data, m_capacity); + } + m_capacity = m_size; + } } } @@ -614,122 +638,91 @@ namespace amrex private: - // this is where we would change the growth strategy - [[nodiscard]] size_type GetNewCapacity (size_type a_num_to_be_added) const noexcept + [[nodiscard]] size_type nBytes () const noexcept { - size_type new_capacity = capacity(); - - if (capacity() == 0) - { - new_capacity = std::max(64 / sizeof(T), size_type(1)); - } - - while (new_capacity < (capacity() + a_num_to_be_added)) - { - new_capacity = static_cast( - VectorGrowthStrategy::GetGrowthFactor() * static_cast(new_capacity + 1)); - } - - return new_capacity; - } - - // this is where we would play games with the allocator - void AllocateBuffer (size_type a_capacity) noexcept - { - pointer new_data = allocate(a_capacity); - if (m_data) { - auto r = detail::memCopyImpl(new_data, m_data, size() * sizeof(T), *this); - if (r) { amrex::Gpu::streamSynchronize(); } - } - deallocate(m_data, capacity()); - m_data = new_data; - m_capacity = a_capacity; - } - - // need to have this version too - void AllocateBufferForInsert (size_type a_capacity, size_type a_index, size_type a_count) noexcept - { - using namespace detail; - pointer new_data = allocate(a_capacity); - if (m_data) - { - memCopyImpl(new_data, m_data, a_index * sizeof(T), *this); - auto r = memCopyImpl(new_data + a_index + a_count, 
m_data + a_index, - (size() - a_index)*sizeof(T), *this); - if (r) { amrex::Gpu::streamSynchronize(); } - } - deallocate(m_data, capacity()); - m_data = new_data; - m_size = size() + a_count; - m_capacity = a_capacity; + return m_size*sizeof(T); } - PODVector& CopyAssignmentDoIt (const PODVector& a_vector, std::true_type) noexcept + // this is where we would change the growth strategy for push_back + [[nodiscard]] size_type GetNewCapacityForPush () const noexcept { - const size_t other_size = a_vector.size(); - if ( other_size > capacity() ) { AllocateBuffer(other_size); } - m_size = other_size; - auto r = detail::memCopyImpl(m_data, a_vector.m_data, size() * sizeof(T), *this); - if (r) { Gpu::streamSynchronize(); } - return *this; - } - - PODVector& CopyAssignmentDoIt (const PODVector& a_vector, std::false_type) noexcept - { - if (static_cast(a_vector) == static_cast(*this)) - { - return CopyAssignmentDoIt(a_vector, std::true_type()); - } - else - { - const size_t other_size = a_vector.size(); - if ( other_size > capacity() ) { AllocateBuffer(other_size); } - m_size = other_size; - auto r = detail::memCopyImpl(m_data, a_vector.m_data, size() * sizeof(T), *this); - if (r) { Gpu::streamSynchronize(); } - Allocator::operator=(static_cast(a_vector)); - return *this; + if (m_capacity == 0) { + return std::max(64/sizeof(T), size_type(1)); + } else { + Real const gf = VectorGrowthStrategy::GetGrowthFactor(); + if (amrex::almostEqual(gf, Real(1.5))) { + return (m_capacity*3+1)/2; + } else { + return size_type(gf*Real(m_capacity+1)); + } } } - PODVector& MoveAssignmentDoIt (PODVector&& a_vector, std::false_type) noexcept + void UpdateDataPtr (FatPtr const& fp) { - if(static_cast(a_vector) == static_cast(*this)) - { + auto* new_data = fp.ptr(); + auto new_capacity = fp.size(); + if (m_data != nullptr && m_data != new_data) { + if (m_size > 0) { + detail::memCopyImpl(new_data, m_data, nBytes(), + (Allocator const&)(*this), + (Allocator const&)(*this)); + } deallocate(m_data, capacity()); - - m_data = a_vector.m_data; - m_size = a_vector.m_size; - m_capacity = a_vector.m_capacity; - - a_vector.m_data = nullptr; - a_vector.m_size = 0; - a_vector.m_capacity = 0; } - else - { - // if the allocators are not the same we give up and copy - CopyAssignmentDoIt(a_vector, std::false_type()); + m_data = new_data; + m_capacity = new_capacity; + } + + // This is where we play games with the allocator. This function + // updates m_data and m_capacity, but not m_size. + void AllocateBufferForPush (size_type target_capacity) + { + auto fp = detail::allocate_in_place(m_data, m_size+1, target_capacity, + (Allocator&)(*this)); + UpdateDataPtr(fp); + } + + // This is where we play games with the allocator and the growth + // strategy for insert. This function updates m_data, m_size and + // m_capacity. 
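+        // If the buffer could be extended in-place, only the elements behind
+        // the insertion point need to be shifted to open the gap. Otherwise
+        // the old elements are copied into the new buffer in two pieces,
+        // before and after the gap, and the old buffer is freed.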
+ void AllocateBufferForInsert (size_type a_index, size_type a_count) + { + size_type new_size = m_size + a_count; + size_type new_capacity = std::max(new_size, GetNewCapacityForPush()); + auto fp = detail::allocate_in_place(m_data, new_size, new_capacity, + (Allocator&)(*this)); + auto* new_data = fp.ptr(); + new_capacity = fp.size(); + + if (m_data != nullptr) { + if (m_data == new_data) { + if (m_size > a_index) { + detail::memMoveImpl(m_data+a_index+a_count, m_data+a_index, + (m_size-a_index)*sizeof(T), + (Allocator const&)(*this)); + } + } else { + if (m_size > 0) { + if (a_index > 0) { + detail::memCopyImpl(new_data, m_data, a_index*sizeof(T), + (Allocator const&)(*this), + (Allocator const&)(*this), false); + } + if (m_size > a_index) { + detail::memCopyImpl(new_data+a_index+a_count, m_data+a_index, + (m_size-a_index)*sizeof(T), + (Allocator const&)(*this), + (Allocator const&)(*this), false); + } + Gpu::streamSynchronize(); + } + deallocate(m_data, m_capacity); + } } - - return *this; - } - - PODVector& MoveAssignmentDoIt (PODVector&& a_vector, std::true_type) noexcept - { - deallocate(m_data, capacity()); - - m_data = a_vector.m_data; - m_size = a_vector.m_size; - m_capacity = a_vector.m_capacity; - - a_vector.m_data = nullptr; - a_vector.m_size = 0; - a_vector.m_capacity = 0; - - Allocator::operator=(std::move(static_cast(a_vector))); - - return *this; + m_data = new_data; + m_size = new_size; + m_capacity = new_capacity; } }; } diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H index 685bc3e794b..1e0872a032e 100644 --- a/Src/Particle/AMReX_NeighborParticles.H +++ b/Src/Particle/AMReX_NeighborParticles.H @@ -195,7 +195,6 @@ public: using ParticleVector = typename ParticleContainer::ParticleVector; using ParticleTile = typename ParticleContainer::ParticleTileType; using IntVector = typename ParticleContainer::IntVector; - using SendBuffer = typename ParticleContainer::SendBuffer; NeighborParticleContainer (ParGDBBase* gdb, int ncells); diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H index 3e6a6113fc8..9c75cb909b4 100644 --- a/Src/Particle/AMReX_ParticleContainer.H +++ b/Src/Particle/AMReX_ParticleContainer.H @@ -189,7 +189,6 @@ public: using IntVector = typename SoA::IntVector; using ParticleVector = typename AoS::ParticleVector; using CharVector = Gpu::DeviceVector; - using SendBuffer = Gpu::PolymorphicVector; using ParIterType = ParIter_impl; using ParConstIterType = ParConstIter_impl;
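
Illustration (not part of the patch): the allocator-propagation fixes described in the commit message, as seen from user code. A minimal sketch; the function name is hypothetical, and it assumes amrex::Initialize has been called and that The_Pinned_Arena() is distinct from The_Arena():

    // Sketch for illustration; not part of this patch.
    #include <AMReX_PODVector.H>
    #include <AMReX_GpuAllocators.H>
    #include <AMReX_BLassert.H>
    #include <utility>

    void polymorphic_vector_demo ()   // hypothetical name
    {
        using A = amrex::PolymorphicArenaAllocator<int>;
        using V = amrex::PODVector<int,A>;

        V a(A{amrex::The_Pinned_Arena()});
        a.resize(8);

        V b;               // arena defaults to The_Arena()
        b = a;             // copy assignment now copies the Allocator too, so
                           // b's new buffer comes from the pinned arena
        AMREX_ALWAYS_ASSERT(b.get_allocator().arena() == amrex::The_Pinned_Arena());

        V c(A{amrex::The_Pinned_Arena()});
        c = std::move(b);  // equal allocators: c steals b's buffer
        V d;               // unequal allocators: the move assignment below
        d = std::move(c);  // gives up and copies instead
    }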