diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 1fcc59478..3eaadf9e4 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -825,26 +825,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
         cnrn_target_delete(nrb->_nrb_index, nrb->_size);
     }
 #endif
-    // Reallocate host buffers using ecalloc_align (as in phase2.cpp) and
-    // free_memory (as in nrn_setup.cpp)
-    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
-        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
-        static_assert(std::is_trivial<T>::value,
-                      "Only trivially constructible and copiable types are supported.");
-        static_assert(std::is_same<T*&, decltype(ptr)>::value, "ptr should be reference-to-pointer");
-        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
-        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
-        free_memory(ptr);
-        ptr = new_data;
-    };
-    nrb->_size *= 2;
-    realloc(nrb->_pnt_index);
-    realloc(nrb->_weight_index);
-    realloc(nrb->_nrb_t);
-    realloc(nrb->_nrb_flag);
-    realloc(nrb->_displ, 1);
-    realloc(nrb->_nrb_index);
+    nrb->grow();
 #ifdef CORENEURON_ENABLE_GPU
     if (nt->compute_gpu) {
         // update device copy
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 703e853d8..9ef76b5a8 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -764,15 +764,7 @@ void nrn_cleanup() {
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
             if (nrb) {
-                if (nrb->_size) {
-                    free_memory(nrb->_pnt_index);
-                    free_memory(nrb->_weight_index);
-                    free_memory(nrb->_nrb_t);
-                    free_memory(nrb->_nrb_flag);
-                    free_memory(nrb->_displ);
-                    free_memory(nrb->_nrb_index);
-                }
-                free_memory(nrb);
+                delete nrb;
                 ml->_net_receive_buffer = nullptr;
             }
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index ad5748ad7..2092985e2 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -479,20 +479,13 @@ void Phase2::set_net_send_buffer(Memb_list** ml_list, const std::vector<int>& pnt_offset) {
         // Does this thread have this type.
         Memb_list* ml = ml_list[type];
         if (ml) {  // needs a NetReceiveBuffer
-            NetReceiveBuffer_t* nrb =
-                (NetReceiveBuffer_t*) ecalloc_align(1, sizeof(NetReceiveBuffer_t));
+            NetReceiveBuffer_t* nrb = new NetReceiveBuffer_t();
             assert(!ml->_net_receive_buffer);
             ml->_net_receive_buffer = nrb;
             nrb->_pnt_offset = pnt_offset[type];

             // begin with a size equal to the number of instances, or at least 8
-            nrb->_size = std::max(8, ml->nodecount);
-            nrb->_pnt_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_displ = (int*) ecalloc_align(nrb->_size + 1, sizeof(int));
-            nrb->_nrb_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_weight_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_nrb_t = (double*) ecalloc_align(nrb->_size, sizeof(double));
-            nrb->_nrb_flag = (double*) ecalloc_align(nrb->_size, sizeof(double));
+            nrb->initialize(std::max(8, ml->nodecount));
         }
     }
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index 9427423df..c5f0c3409 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -38,51 +38,74 @@ struct Point_process {
     short _tid; /* NrnThread id */
 };

-struct NetReceiveBuffer_t {
-    int* _displ;     /* _displ_cnt + 1 of these */
-    int* _nrb_index; /* _cnt of these (order of increasing _pnt_index) */
-
-    int* _pnt_index;
-    int* _weight_index;
-    double* _nrb_t;
-    double* _nrb_flag;
-    int _cnt;
-    int _displ_cnt; /* number of unique _pnt_index */
-    int _size;      /* capacity */
-    int _pnt_offset;
-    size_t size_of_object() {
-        size_t nbytes = 0;
+struct NetReceiveBuffer_t: public UnifiedMemManaged<> {
+    unified_uniq_ptr<int[]> _displ;     /* _displ_cnt + 1 of these */
+    unified_uniq_ptr<int[]> _nrb_index; /* _cnt of these (order of increasing _pnt_index) */
+
+    unified_uniq_ptr<int[]> _pnt_index;
+    unified_uniq_ptr<int[]> _weight_index;
+    unified_uniq_ptr<double[]> _nrb_t;
+    unified_uniq_ptr<double[]> _nrb_flag;
+    int _cnt = 0;
+    int _displ_cnt = 0; /* number of unique _pnt_index */
+
+    std::size_t _size = 0; /* capacity */
+    int _pnt_offset = 0;
+    std::size_t size_of_object() {
+        std::size_t nbytes = 0;
         nbytes += _size * sizeof(int) * 3;
         nbytes += (_size + 1) * sizeof(int);
         nbytes += _size * sizeof(double) * 2;
         return nbytes;
     }
+
+    void initialize(std::size_t size) {
+        _size = size;
+        _pnt_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        auto displ_size = _size + 1;
+        _displ = allocate_unique<int[]>(allocator<int>{}, displ_size);
+        _nrb_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _weight_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _nrb_t = allocate_unique<double[]>(allocator<double>{}, _size);
+        _nrb_flag = allocate_unique<double[]>(allocator<double>{}, _size);
+    }
+
+    void grow() {
+        std::size_t new_size = _size * 2;
+        grow_buf(_pnt_index, _size, new_size);
+        grow_buf(_weight_index, _size, new_size);
+        grow_buf(_nrb_t, _size, new_size);
+        grow_buf(_nrb_flag, _size, new_size);
+        grow_buf(_displ, _size + 1, new_size + 1);
+        grow_buf(_nrb_index, _size, new_size);
+        _size = new_size;
+    }
 };

-struct NetSendBuffer_t: MemoryManaged {
-    int* _sendtype;  // net_send, net_event, net_move
-    int* _vdata_index;
-    int* _pnt_index;
-    int* _weight_index;
-    double* _nsb_t;
-    double* _nsb_flag;
-    int _cnt;
-    int _size;       /* capacity */
-    int reallocated; /* if buffer resized/reallocated, needs to be copy to cpu */
+struct NetSendBuffer_t: public UnifiedMemManaged<> {
+    unified_uniq_ptr<int[]> _sendtype;  // net_send, net_event, net_move
+    unified_uniq_ptr<int[]> _vdata_index;
+    unified_uniq_ptr<int[]> _pnt_index;
+    unified_uniq_ptr<int[]> _weight_index;
+    unified_uniq_ptr<double[]> _nsb_t;
+    unified_uniq_ptr<double[]> _nsb_flag;
+    int _cnt = 0;
+    std::size_t _size = 0; /* capacity */
+    int reallocated = 0;   /* if buffer resized/reallocated, needs to be copy to cpu */

     NetSendBuffer_t(int size)
         : _size(size) {
         _cnt = 0;

-        _sendtype = (int*) ecalloc_align(_size, sizeof(int));
-        _vdata_index = (int*) ecalloc_align(_size, sizeof(int));
-        _pnt_index = (int*) ecalloc_align(_size, sizeof(int));
-        _weight_index = (int*) ecalloc_align(_size, sizeof(int));
+        _sendtype = allocate_unique<int[]>(allocator<int>{}, _size);
+        _vdata_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _pnt_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _weight_index = allocate_unique<int[]>(allocator<int>{}, _size);
         // when == 1, NetReceiveBuffer_t is newly allocated (i.e. we need to free previous copy
         // and recopy new data
         reallocated = 1;
-        _nsb_t = (double*) ecalloc_align(_size, sizeof(double));
-        _nsb_flag = (double*) ecalloc_align(_size, sizeof(double));
+        _nsb_t = allocate_unique<double[]>(allocator<double>{}, _size);
+        _nsb_flag = allocate_unique<double[]>(allocator<double>{}, _size);
     }

     size_t size_of_object() {
@@ -92,39 +115,15 @@ struct NetSendBuffer_t: MemoryManaged {
         return nbytes;
     }

-    ~NetSendBuffer_t() {
-        free_memory(_sendtype);
-        free_memory(_vdata_index);
-        free_memory(_pnt_index);
-        free_memory(_weight_index);
-        free_memory(_nsb_t);
-        free_memory(_nsb_flag);
-    }
-
     void grow() {
-#ifdef CORENEURON_ENABLE_GPU
-        int cannot_reallocate_on_device = 0;
-        assert(cannot_reallocate_on_device);
-#else
-        int new_size = _size * 2;
-        grow_buf(&_sendtype, _size, new_size);
-        grow_buf(&_vdata_index, _size, new_size);
-        grow_buf(&_pnt_index, _size, new_size);
-        grow_buf(&_weight_index, _size, new_size);
-        grow_buf(&_nsb_t, _size, new_size);
-        grow_buf(&_nsb_flag, _size, new_size);
+        std::size_t new_size = _size * 2;
+        grow_buf(_sendtype, _size, new_size);
+        grow_buf(_vdata_index, _size, new_size);
+        grow_buf(_pnt_index, _size, new_size);
+        grow_buf(_weight_index, _size, new_size);
+        grow_buf(_nsb_t, _size, new_size);
+        grow_buf(_nsb_flag, _size, new_size);
         _size = new_size;
-#endif
-    }
-
-  private:
-    template <typename T>
-    void grow_buf(T** buf, int size, int new_size) {
-        T* new_buf = nullptr;
-        new_buf = (T*) ecalloc_align(new_size, sizeof(T));
-        memcpy(new_buf, *buf, size * sizeof(T));
-        free(*buf);
-        *buf = new_buf;
     }
 };
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index c95fedcf2..66eec5927 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -50,11 +50,11 @@ InterleaveInfo::InterleaveInfo(const InterleaveInfo& info) {
     nwarp = info.nwarp;
     nstride = info.nstride;

-    copy_align_array(stridedispl, info.stridedispl, nwarp + 1);
-    copy_align_array(stride, info.stride, nstride);
-    copy_align_array(firstnode, info.firstnode, nwarp + 1);
-    copy_align_array(lastnode, info.lastnode, nwarp + 1);
-    copy_align_array(cellsize, info.cellsize, nwarp);
+    copy_array(stridedispl, info.stridedispl, nwarp + 1);
+    copy_array(stride, info.stride, nstride);
+    copy_array(firstnode, info.firstnode, nwarp + 1);
+    copy_array(lastnode, info.lastnode, nwarp + 1);
+    copy_array(cellsize, info.cellsize, nwarp);

     copy_array(nnode, info.nnode, nwarp);
     copy_array(ncycle, info.ncycle, nwarp);
@@ -74,25 +74,6 @@ InterleaveInfo& InterleaveInfo::operator=(const InterleaveInfo& info) {
     return *this;
 }

-InterleaveInfo::~InterleaveInfo() {
-    if (stride) {
-        free_memory(stride);
-        free_memory(firstnode);
-        free_memory(lastnode);
-        free_memory(cellsize);
-    }
-    if (stridedispl) {
-        free_memory(stridedispl);
-    }
-    if (idle) {
-        delete[] nnode;
-        delete[] ncycle;
-        delete[] idle;
-        delete[] cache_access;
-        delete[] child_race;
-    }
-}
-
 void create_interleave_info() {
     destroy_interleave_info();
     interleave_info = new InterleaveInfo[nrn_nthread];
@@ -299,8 +280,13 @@ int* interleave_order(int ith, int ncell, int nnode, int* parent) {
         }
     }

-    int nwarp = 0, nstride = 0, *stride = nullptr, *firstnode = nullptr;
-    int *lastnode = nullptr, *cellsize = nullptr, *stridedispl = nullptr;
+    int nwarp = 0;
+    int nstride = 0;
+    int* stride = nullptr;
+    int* firstnode = nullptr;
+    int* lastnode = nullptr;
+    int* cellsize = nullptr;
+    int* stridedispl = nullptr;

     int* order = node_order(
         ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, cellsize, stridedispl);
diff --git a/coreneuron/permute/cellorder.hpp b/coreneuron/permute/cellorder.hpp
index fe2f2f84e..df20f9194 100644
--- a/coreneuron/permute/cellorder.hpp
+++ b/coreneuron/permute/cellorder.hpp
@@ -47,12 +47,11 @@ class InterleaveInfo;  // forward declaration
  */
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream);

-class InterleaveInfo: public MemoryManaged {
+class InterleaveInfo: public UnifiedMemManaged<> {
   public:
     InterleaveInfo() = default;
     InterleaveInfo(const InterleaveInfo&);
     InterleaveInfo& operator=(const InterleaveInfo&);
-    ~InterleaveInfo();
     int nwarp = 0;  // used only by interleave2
     int nstride = 0;
     int* stridedispl = nullptr;  // interleave2: nwarp+1
@@ -106,17 +105,10 @@ int* node_order(int ncell,
                 int*& cellsize,
                 int*& stridedispl);

-// copy src array to dest with new allocation
-template <typename T>
-void copy_array(T*& dest, T* src, size_t n) {
-    dest = new T[n];
-    std::copy(src, src + n, dest);
-}
-
 // copy src array to dest with NRN_SOA_BYTE_ALIGN ecalloc_align allocation
 template <typename T>
-void copy_align_array(T*& dest, T* src, size_t n) {
-    dest = static_cast<T*>(ecalloc_align(n, sizeof(T)));
+void copy_array(T*& dest, T* src, size_t n) {
+    dest = static_cast<T*>(allocate_unified(n * sizeof(T)));
     std::copy(src, src + n, dest);
 }
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index a6ac50be0..f8a8e6c50 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -36,6 +36,8 @@ struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */
     int* dependencies; /* list of mechanism types that this mechanism depends on*/
     int ndependencies; /* for scheduling we need to know the dependency count */
 };
+
+
 NrnThreadMembList* create_tml(NrnThread& nt,
                               int mech_id,
                               Memb_func& memb_func,
@@ -72,7 +74,7 @@ struct PreSynHelper {
     int flag_;
 };

-struct NrnThread: public MemoryManaged {
+struct NrnThread: UnifiedMemManaged<> {
     double _t = 0;
     double _dt = -1e9;
     double cj = 0.0;
diff --git a/coreneuron/utils/memory.cpp b/coreneuron/utils/memory.cpp
index 8f45487dc..3d0271c1b 100644
--- a/coreneuron/utils/memory.cpp
+++ b/coreneuron/utils/memory.cpp
@@ -13,6 +13,7 @@
 #endif

 #include <cassert>
+#include <cstdlib>

 namespace coreneuron {
 bool gpu_enabled() {
@@ -23,38 +24,52 @@ bool gpu_enabled() {
 #endif
 }

-void* allocate_unified(std::size_t num_bytes) {
+void* allocate_host(size_t num_bytes, std::size_t alignment) {
+    size_t fill = 0;
+    void* pointer = nullptr;
+    if (alignment > 0) {
+        if (num_bytes % alignment != 0) {
+            size_t multiple = num_bytes / alignment;
+            fill = alignment * (multiple + 1) - num_bytes;
+        }
+        nrn_assert((pointer = std::aligned_alloc(alignment, num_bytes + fill)) != nullptr);
+    } else {
+        nrn_assert((pointer = std::malloc(num_bytes)) != nullptr);
+    }
+    return pointer;
+}
+
+void deallocate_host(void* pointer, std::size_t num_bytes) {
+    free(pointer);
+}
+
+void* allocate_unified(std::size_t num_bytes, std::size_t alignment) {
 #ifdef CORENEURON_ENABLE_GPU
     // The build supports GPU execution, check if --gpu was passed to actually
     // enable it. We should not call CUDA APIs in GPU builds if --gpu was not passed.
     if (corenrn_param.gpu) {
+        void* pointer = nullptr;
         // Allocate managed/unified memory.
-        void* ptr{nullptr};
-        auto const code = cudaMallocManaged(&ptr, num_bytes);
+        auto const code = cudaMallocManaged(&pointer, num_bytes);
         assert(code == cudaSuccess);
-        return ptr;
+        return pointer;
     }
 #endif
     // Either the build does not have GPU support or --gpu was not passed.
-    // Allocate using standard operator new.
-    // When we have C++17 support then propagate `alignment` here.
-    return ::operator new(num_bytes);
+    // Allocate using host allocator.
+    return allocate_host(num_bytes, alignment);
 }

-void deallocate_unified(void* ptr, std::size_t num_bytes) {
+void deallocate_unified(void* pointer, std::size_t num_bytes) {
     // See comments in allocate_unified to understand the different branches.
 #ifdef CORENEURON_ENABLE_GPU
     if (corenrn_param.gpu) {
         // Deallocate managed/unified memory.
-        auto const code = cudaFree(ptr);
+        auto const code = cudaFree(pointer);
         assert(code == cudaSuccess);
         return;
     }
 #endif
-#ifdef __cpp_sized_deallocation
-    ::operator delete(ptr, num_bytes);
-#else
-    ::operator delete(ptr);
-#endif
+    deallocate_host(pointer, num_bytes);
 }
 }  // namespace coreneuron
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index fc1e73d2e..9aa563f6c 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -30,45 +30,93 @@ namespace coreneuron {
  */
 bool gpu_enabled();

+/** @brief Allocate aligned host memory (std::aligned_alloc / std::malloc).
+ */
+void* allocate_host(size_t num_bytes, size_t alignment = NRN_SOA_BYTE_ALIGN);
+
+/** @brief Deallocate memory allocated by `allocate_host`.
+ */
+void deallocate_host(void* pointer, std::size_t num_bytes);
+
 /** @brief Allocate unified memory in GPU builds iff GPU enabled, otherwise new
  */
-void* allocate_unified(std::size_t num_bytes);
+void* allocate_unified(std::size_t num_bytes, std::size_t alignment = NRN_SOA_BYTE_ALIGN);

 /** @brief Deallocate memory allocated by `allocate_unified`.
  */
-void deallocate_unified(void* ptr, std::size_t num_bytes);
+void deallocate_unified(void* pointer, std::size_t num_bytes);

 /** @brief C++ allocator that uses [de]allocate_unified.
  */
-template <typename T>
-struct unified_allocator {
+
+template <typename T, std::size_t alignment = NRN_SOA_BYTE_ALIGN>
+struct host_allocator {
     using value_type = T;

-    unified_allocator() = default;
+    host_allocator() = default;

     template <typename U>
-    unified_allocator(unified_allocator<U> const&) noexcept {}
+    host_allocator(host_allocator<U, alignment> const&) noexcept {}

     value_type* allocate(std::size_t n) {
-        return static_cast<value_type*>(allocate_unified(n * sizeof(value_type)));
+        return static_cast<value_type*>(allocate_host(n * sizeof(value_type), alignment));
     }

     void deallocate(value_type* p, std::size_t n) noexcept {
-        deallocate_unified(p, n * sizeof(value_type));
+        deallocate_host(p, n * sizeof(value_type));
     }
 };

 template <class T, class U>
-bool operator==(unified_allocator<T> const&, unified_allocator<U> const&) noexcept {
+bool operator==(host_allocator<T> const&, host_allocator<U> const&) noexcept {
     return true;
 }

 template <class T, class U>
-bool operator!=(unified_allocator<T> const& x, unified_allocator<U> const& y) noexcept {
+bool operator!=(host_allocator<T> const& x, host_allocator<U> const& y) noexcept {
+    return !(x == y);
+}
+
+
+template <typename T, bool force = false>
+struct unified_allocator {
+    using value_type = T;
+
+    unified_allocator() = default;
+
+    template <typename U>
+    unified_allocator(unified_allocator<U, force> const&) noexcept {}
+
+    value_type* allocate(std::size_t n) {
+        void* ptr = nullptr;
+        // TODO: implement here the "always unified" semantics
+        if constexpr (force) {
+            ptr = allocate_unified(n * sizeof(value_type), 0);
+        } else {
+            ptr = allocate_unified(n * sizeof(value_type), 0);
+        }
+        return static_cast<value_type*>(ptr);
+    }
+
+    void deallocate(value_type* p, std::size_t n) noexcept {
+        deallocate_unified(p, n * sizeof(value_type));
+    }
+};
+
+template <class T, bool forceT, class U, bool forceU>
+bool operator==(unified_allocator<T, forceT> const&, unified_allocator<U, forceU> const&) noexcept {
+    return true;
+}
+
+template <class T, bool forceT, class U, bool forceU>
+bool operator!=(unified_allocator<T, forceT> const& x,
+                unified_allocator<U, forceU> const& y) noexcept {
     return !(x == y);
 }

 /** @brief Allocator-aware deleter for use with std::unique_ptr.
+ *
+ * The deleter was extended to also support unique_ptr<T[]> types.
  *
  * This is copied from https://stackoverflow.com/a/23132307. See also
  * http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0316r0.html,
@@ -76,22 +124,32 @@ bool operator!=(unified_allocator<T> const& x, unified_allocator<U> const& y) noexcept {
  * boost::allocate_unique<...>.
  * Hopefully std::allocate_unique will be included in C++23.
  */
+
 template <class Alloc>
 struct alloc_deleter {
     alloc_deleter() = default;  // OL210813 addition

     alloc_deleter(const Alloc& a)
-        : a(a) {}
+        : a(a)
+        , size(1) {}
+
+    alloc_deleter(const Alloc& a, std::size_t N)
+        : a(a)
+        , size(N) {}

     using pointer = typename std::allocator_traits<Alloc>::pointer;
     void operator()(pointer p) const {
         Alloc aa(a);
-        std::allocator_traits<Alloc>::destroy(aa, std::addressof(*p));
-        std::allocator_traits<Alloc>::deallocate(aa, p, 1);
+        auto pp = &p[size - 1];
+        do {
+            std::allocator_traits<Alloc>::destroy(aa, std::addressof(*pp--));
+        } while (pp >= p);
+        std::allocator_traits<Alloc>::deallocate(aa, p, size);
     }

   private:
     Alloc a;
+    std::size_t size;
 };

 template <typename T, typename Alloc, typename... Args>
@@ -111,11 +169,138 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) {
         throw;
     }
 }
+
+
+template <typename T, typename Alloc>
+auto allocate_unique(const Alloc& alloc, std::size_t N) {
+    using AT = std::allocator_traits<Alloc>;
+    using ElemType = typename std::remove_extent<T>::type;
+    static_assert(std::is_same<ElemType, typename AT::value_type>{}(),
+                  "Allocator has the wrong value_type");
+
+    Alloc a(alloc);
+    auto p = AT::allocate(a, N);
+    auto ep = reinterpret_cast<ElemType*>(p);
+    try {
+        for (std::size_t i = 0; i < N; ++i) {
+            AT::construct(a, std::addressof(*ep++));
+        }
+        using D = alloc_deleter<Alloc>;
+        return std::unique_ptr<T, D>(p, D(a, N));
+    } catch (...) {
+        AT::deallocate(a, p, N);
+        throw;
+    }
+}
+
+
+class [[deprecated]] MemoryManaged {};
+
+class HostMemManaged {
+  public:
+    template <typename T>
+    using allocator = host_allocator<T>;
+
+    template <typename T>
+    using host_uniq_ptr =
+        std::unique_ptr<T, alloc_deleter<allocator<typename std::remove_extent<T>::type>>>;
+
+    static void* operator new(std::size_t count) {
+        return allocate_host(count);
+    }
+
+    static void* operator new[](std::size_t count) {
+        return allocate_host(count);
+    }
+
+    static void operator delete(void* ptr, std::size_t sz) {
+        deallocate_host(ptr, sz);
+    }
+
+    static void operator delete[](void* ptr, std::size_t sz) {
+        deallocate_host(ptr, sz);
+    }
+
+  protected:
+    template <typename T>
+    void grow_buf(host_uniq_ptr<T[]>& buf, std::size_t size, std::size_t new_size) {
+        auto new_buf = allocate_unique<T[]>(allocator<T>{}, new_size);
+        std::copy(buf.get(), buf.get() + size, new_buf.get());
+        buf.swap(new_buf);
+    }
+
+    /**
+     * Initialize new dest buffer from src
+     *
+     */
+    template <typename T>
+    void initialize_from_other(host_uniq_ptr<T[]>& dest, host_uniq_ptr<T[]>& src, std::size_t size) {
+        dest = allocate_unique<T[]>(allocator<T>{}, size);
+        std::copy(src.get(), src.get() + size, dest.get());
+    }
+};
+
+template <bool force = false>
+class UnifiedMemManaged {
+  public:
+    template <typename T>
+    using allocator = unified_allocator<T, force>;
+
+    template <typename T>
+    using unified_uniq_ptr =
+        std::unique_ptr<T, alloc_deleter<allocator<typename std::remove_extent<T>::type>>>;
+
+    static void* operator new(std::size_t count) {
+        return allocate_unified(count);
+    }
+
+    static void* operator new[](std::size_t count) {
+        return allocate_unified(count);
+    }
+
+    static void operator delete(void* ptr, std::size_t sz) {
+        deallocate_unified(ptr, sz);
+    }
+
+    static void operator delete[](void* ptr, std::size_t sz) {
+        deallocate_unified(ptr, sz);
+    }
+
+  protected:
+    template <typename T>
+    void grow_buf(unified_uniq_ptr<T[]>& buf, std::size_t size, std::size_t new_size) {
+#ifdef CORENEURON_ENABLE_GPU
+        if (force || corenrn_param.gpu) {
+            int cannot_reallocate_on_device = 0;
+            assert(cannot_reallocate_on_device);
+        }
+#endif
+        auto new_buf = allocate_unique<T[]>(allocator<T>{}, new_size);
+        std::copy(buf.get(), buf.get() + size, new_buf.get());
+        buf.swap(new_buf);
+    }
+
+    /**
+     * Initialize new dest buffer from src
+     *
+     */
+    template <typename T>
+    void initialize_from_other(unified_uniq_ptr<T[]>& dest,
+                               const unified_uniq_ptr<T[]>& src,
+                               std::size_t size) {
+        dest = allocate_unique<T[]>(allocator<T>{}, size);
+        std::copy(src.get(), src.get() + size, dest.get());
+    }
+};
+
+
 }  // namespace coreneuron

 /// for gpu builds with unified memory support
 #ifdef CORENEURON_UNIFIED_MEMORY
+////////// to be removed

 #include <cuda_runtime_api.h>

 // TODO : error handling for CUDA routines
@@ -131,7 +316,7 @@ inline void calloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/
 inline void free_memory(void* pointer) {
     cudaFree(pointer);
 }
-
+////////////////////
 /**
  * A base class providing overloaded new and delete operators for CUDA allocation
  *
@@ -139,12 +324,12 @@ inline void free_memory(void* pointer) {
  * Classes that should be allocated on the GPU should inherit from this class. Additionally they
 * may need to implement a special copy-construtor. This is documented here:
 * \link: https://devblogs.nvidia.com/unified-memory-in-cuda-6/
 */
-class MemoryManaged {
+/*class MemoryManaged {
   public:
     void* operator new(size_t len) {
         void* ptr;
         cudaMallocManaged(&ptr, len);
         cudaDeviceSynchronize();
         return ptr;
     }
@@ -164,17 +349,16 @@ class MemoryManaged {
         cudaDeviceSynchronize();
         cudaFree(ptr);
     }
-};
+};*/

 /// for cpu builds use posix memalign
 #else

-class MemoryManaged {
+/*class MemoryManaged {
     // does nothing by default
-};
-
-#include <cstring>
+};*/

+//////// to be removed
 inline void alloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
     size_t fill = 0;
     if (num_bytes % alignment != 0) {
@@ -192,6 +376,7 @@ inline void calloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
 inline void free_memory(void* pointer) {
     free(pointer);
 }
+/////////////////

 #endif
@@ -224,7 +409,7 @@ inline bool is_aligned(void* pointer, std::size_t alignment) {
  */
 inline void* emalloc_align(size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
     void* memptr;
-    alloc_memory(memptr, size, alignment);
+    memptr = allocate_unified(size, alignment);
     nrn_assert(is_aligned(memptr, alignment));
     return memptr;
 }
@@ -239,7 +424,9 @@ inline void* ecalloc_align(size_t n, size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
     if (n == 0) {
         return nullptr;
     }
-    calloc_memory(p, n * size, alignment);
+    p = allocate_unified(n * size, alignment);
+    // TODO: Maybe allocate_unified should do this when asked (and use cuda API when available)
+    memset(p, 0, n * size);
     nrn_assert(is_aligned(p, alignment));
     return p;
 }
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 14e2b15df..ed7b19367 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -70,7 +70,7 @@ std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes
 using random123_allocator = boost::fast_pool_allocator<nrnran123_State>;
 #else
-using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
+using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State, true>;
 #endif
 /* Global data structure per process. Using a unique_ptr here causes [minor]
  * problems because its destructor can be called very late during application
diff --git a/tests/unit/interleave_info/check_constructors.cpp b/tests/unit/interleave_info/check_constructors.cpp
index 00353072c..2c6bf3cae 100644
--- a/tests/unit/interleave_info/check_constructors.cpp
+++ b/tests/unit/interleave_info/check_constructors.cpp
@@ -25,10 +25,10 @@ BOOST_AUTO_TEST_CASE(interleave_info_test) {
     info1.nstride = nstride;

     // to avoid same values, different sub-array is used to initialize different members
-    copy_align_array(info1.stridedispl, data1, nwarp + 1);
-    copy_align_array(info1.stride, data1 + 1, nstride);
-    copy_align_array(info1.firstnode, data1 + 1, nwarp + 1);
-    copy_align_array(info1.lastnode, data1 + 1, nwarp + 1);
+    copy_array(info1.stridedispl, data1, nwarp + 1);
+    copy_array(info1.stride, data1 + 1, nstride);
+    copy_array(info1.firstnode, data1 + 1, nwarp + 1);
+    copy_array(info1.lastnode, data1 + 1, nwarp + 1);

     // check if copy_array works
     BOOST_CHECK_NE(info1.firstnode, info1.lastnode);
@@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE(interleave_info_test) {
         info1.lastnode,
         info1.lastnode + nwarp + 1);

-    copy_align_array(info1.cellsize, data1 + 4, nwarp);
+    copy_array(info1.cellsize, data1 + 4, nwarp);
     copy_array(info1.nnode, data2, nwarp);
     copy_array(info1.ncycle, data2 + 1, nwarp);
     copy_array(info1.idle, data2 + 2, nwarp);
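Reviewer note (not part of the patch): the sketch below illustrates how a buffer-owning class is expected to use the new UnifiedMemManaged<> helpers introduced above (allocate_unique for array allocation, unified_uniq_ptr for ownership, grow_buf for resizing), mirroring NetReceiveBuffer_t::initialize()/grow(). The struct and field names are hypothetical, and the example assumes the patched coreneuron/utils/memory.h is available on the include path.

// Minimal usage sketch under the assumptions stated above.
#include <cstddef>
#include <numeric>

#include "coreneuron/utils/memory.h"

namespace coreneuron {

struct ExampleBuffer: public UnifiedMemManaged<> {
    unified_uniq_ptr<int[]> index;  // released automatically via alloc_deleter
    std::size_t capacity = 0;

    void initialize(std::size_t n) {
        capacity = n;
        // allocator<int> routes through allocate_unified(); the returned
        // unique_ptr remembers n so the deleter can destroy every element.
        index = allocate_unique<int[]>(allocator<int>{}, capacity);
    }

    void grow() {
        std::size_t const new_capacity = capacity * 2;
        // grow_buf allocates a new buffer, copies the old contents and swaps;
        // in GPU runs it asserts instead, since reallocating on the device is unsupported.
        grow_buf(index, capacity, new_capacity);
        capacity = new_capacity;
    }
};

}  // namespace coreneuron

int main() {
    // operator new comes from UnifiedMemManaged<>, so the object itself lands
    // in CUDA managed memory in GPU builds when --gpu is enabled.
    auto* buf = new coreneuron::ExampleBuffer();
    buf->initialize(8);
    std::iota(buf->index.get(), buf->index.get() + 8, 0);
    buf->grow();  // capacity becomes 16, the first 8 values are preserved
    delete buf;
}

The design point this illustrates: ownership and growth logic now live once in the base class, so NetReceiveBuffer_t, NetSendBuffer_t and similar types no longer need hand-written destructors or per-class grow_buf helpers.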