diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 1fcc59478..3eaadf9e4 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -825,26 +825,7 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
         cnrn_target_delete(nrb->_nrb_index, nrb->_size);
     }
 #endif
-    // Reallocate host buffers using ecalloc_align (as in phase2.cpp) and
-    // free_memory (as in nrn_setup.cpp)
-    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
-        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
-        static_assert(std::is_trivial<T>::value,
-                      "Only trivially constructible and copiable types are supported.");
-        static_assert(std::is_same<T*&, decltype(ptr)>::value, "ptr should be reference-to-pointer");
-        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
-        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
-        free_memory(ptr);
-        ptr = new_data;
-    };
-    nrb->_size *= 2;
-    realloc(nrb->_pnt_index);
-    realloc(nrb->_weight_index);
-    realloc(nrb->_nrb_t);
-    realloc(nrb->_nrb_flag);
-    realloc(nrb->_displ, 1);
-    realloc(nrb->_nrb_index);
+    nrb->grow();
 #ifdef CORENEURON_ENABLE_GPU
     if (nt->compute_gpu) {
         // update device copy
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index 703e853d8..9ef76b5a8 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -764,15 +764,7 @@ void nrn_cleanup() {
             NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
             if (nrb) {
-                if (nrb->_size) {
-                    free_memory(nrb->_pnt_index);
-                    free_memory(nrb->_weight_index);
-                    free_memory(nrb->_nrb_t);
-                    free_memory(nrb->_nrb_flag);
-                    free_memory(nrb->_displ);
-                    free_memory(nrb->_nrb_index);
-                }
-                free_memory(nrb);
+                delete nrb;
                 ml->_net_receive_buffer = nullptr;
             }
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index ad5748ad7..2092985e2 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -479,20 +479,13 @@ void Phase2::set_net_send_buffer(Memb_list** ml_list, const std::vector<int>& pnt_offset) {
         // Does this thread have this type.
         Memb_list* ml = ml_list[type];
         if (ml) {  // needs a NetReceiveBuffer
-            NetReceiveBuffer_t* nrb =
-                (NetReceiveBuffer_t*) ecalloc_align(1, sizeof(NetReceiveBuffer_t));
+            NetReceiveBuffer_t* nrb = new NetReceiveBuffer_t();
             assert(!ml->_net_receive_buffer);
             ml->_net_receive_buffer = nrb;
             nrb->_pnt_offset = pnt_offset[type];

             // begin with a size equal to the number of instances, or at least 8
-            nrb->_size = std::max(8, ml->nodecount);
-            nrb->_pnt_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_displ = (int*) ecalloc_align(nrb->_size + 1, sizeof(int));
-            nrb->_nrb_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_weight_index = (int*) ecalloc_align(nrb->_size, sizeof(int));
-            nrb->_nrb_t = (double*) ecalloc_align(nrb->_size, sizeof(double));
-            nrb->_nrb_flag = (double*) ecalloc_align(nrb->_size, sizeof(double));
+            nrb->initialize(std::max(8, ml->nodecount));
         }
     }
diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp
index 9427423df..c5f0c3409 100644
--- a/coreneuron/mechanism/mechanism.hpp
+++ b/coreneuron/mechanism/mechanism.hpp
@@ -38,51 +38,74 @@ struct Point_process {
     short _tid; /* NrnThread id */
 };

-struct NetReceiveBuffer_t {
-    int* _displ;     /* _displ_cnt + 1 of these */
-    int* _nrb_index; /* _cnt of these (order of increasing _pnt_index) */
-
-    int* _pnt_index;
-    int* _weight_index;
-    double* _nrb_t;
-    double* _nrb_flag;
-    int _cnt;
-    int _displ_cnt; /* number of unique _pnt_index */
-    int _size;      /* capacity */
-    int _pnt_offset;
-    size_t size_of_object() {
-        size_t nbytes = 0;
+struct NetReceiveBuffer_t: public UnifiedMemManaged<> {
+    unified_uniq_ptr<int[]> _displ;     /* _displ_cnt + 1 of these */
+    unified_uniq_ptr<int[]> _nrb_index; /* _cnt of these (order of increasing _pnt_index) */
+
+    unified_uniq_ptr<int[]> _pnt_index;
+    unified_uniq_ptr<int[]> _weight_index;
+    unified_uniq_ptr<double[]> _nrb_t;
+    unified_uniq_ptr<double[]> _nrb_flag;
+    int _cnt = 0;
+    int _displ_cnt = 0; /* number of unique _pnt_index */
+
+    std::size_t _size = 0; /* capacity */
+    int _pnt_offset = 0;
+    std::size_t size_of_object() {
+        std::size_t nbytes = 0;
         nbytes += _size * sizeof(int) * 3;
         nbytes += (_size + 1) * sizeof(int);
         nbytes += _size * sizeof(double) * 2;
         return nbytes;
     }
+
+    void initialize(std::size_t size) {
+        _size = size;
+        _pnt_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        auto displ_size = _size + 1;
+        _displ = allocate_unique<int[]>(allocator<int>{}, displ_size);
+        _nrb_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _weight_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _nrb_t = allocate_unique<double[]>(allocator<double>{}, _size);
+        _nrb_flag = allocate_unique<double[]>(allocator<double>{}, _size);
+    }
+
+    void grow() {
+        std::size_t new_size = _size * 2;
+        grow_buf(_pnt_index, _size, new_size);
+        grow_buf(_weight_index, _size, new_size);
+        grow_buf(_nrb_t, _size, new_size);
+        grow_buf(_nrb_flag, _size, new_size);
+        grow_buf(_displ, _size + 1, new_size + 1);
+        grow_buf(_nrb_index, _size, new_size);
+        _size = new_size;
+    }
 };

-struct NetSendBuffer_t: MemoryManaged {
-    int* _sendtype;  // net_send, net_event, net_move
-    int* _vdata_index;
-    int* _pnt_index;
-    int* _weight_index;
-    double* _nsb_t;
-    double* _nsb_flag;
-    int _cnt;
-    int _size;       /* capacity */
-    int reallocated; /* if buffer resized/reallocated, needs to be copy to cpu */
+struct NetSendBuffer_t: public UnifiedMemManaged<> {
+    unified_uniq_ptr<int[]> _sendtype;  // net_send, net_event, net_move
+    unified_uniq_ptr<int[]> _vdata_index;
+    unified_uniq_ptr<int[]> _pnt_index;
+    unified_uniq_ptr<int[]> _weight_index;
+    unified_uniq_ptr<double[]> _nsb_t;
+    unified_uniq_ptr<double[]> _nsb_flag;
+    int _cnt = 0;
+    std::size_t _size = 0; /* capacity */
+    int reallocated = 0;   /* if buffer resized/reallocated, needs to be copy to cpu */

     NetSendBuffer_t(int size)
         : _size(size) {
         _cnt = 0;

-        _sendtype = (int*) ecalloc_align(_size, sizeof(int));
-        _vdata_index = (int*) ecalloc_align(_size, sizeof(int));
-        _pnt_index = (int*) ecalloc_align(_size, sizeof(int));
-        _weight_index = (int*) ecalloc_align(_size, sizeof(int));
+        _sendtype = allocate_unique<int[]>(allocator<int>{}, _size);
+        _vdata_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _pnt_index = allocate_unique<int[]>(allocator<int>{}, _size);
+        _weight_index = allocate_unique<int[]>(allocator<int>{}, _size);
         // when == 1, NetReceiveBuffer_t is newly allocated (i.e. we need to free previous copy
         // and recopy new data
         reallocated = 1;
-        _nsb_t = (double*) ecalloc_align(_size, sizeof(double));
-        _nsb_flag = (double*) ecalloc_align(_size, sizeof(double));
+        _nsb_t = allocate_unique<double[]>(allocator<double>{}, _size);
+        _nsb_flag = allocate_unique<double[]>(allocator<double>{}, _size);
     }

     size_t size_of_object() {
@@ -92,39 +115,15 @@ struct NetSendBuffer_t: MemoryManaged {
         return nbytes;
     }

-    ~NetSendBuffer_t() {
-        free_memory(_sendtype);
-        free_memory(_vdata_index);
-        free_memory(_pnt_index);
-        free_memory(_weight_index);
-        free_memory(_nsb_t);
-        free_memory(_nsb_flag);
-    }
-
     void grow() {
-#ifdef CORENEURON_ENABLE_GPU
-        int cannot_reallocate_on_device = 0;
-        assert(cannot_reallocate_on_device);
-#else
-        int new_size = _size * 2;
-        grow_buf(&_sendtype, _size, new_size);
-        grow_buf(&_vdata_index, _size, new_size);
-        grow_buf(&_pnt_index, _size, new_size);
-        grow_buf(&_weight_index, _size, new_size);
-        grow_buf(&_nsb_t, _size, new_size);
-        grow_buf(&_nsb_flag, _size, new_size);
+        std::size_t new_size = _size * 2;
+        grow_buf(_sendtype, _size, new_size);
+        grow_buf(_vdata_index, _size, new_size);
+        grow_buf(_pnt_index, _size, new_size);
+        grow_buf(_weight_index, _size, new_size);
+        grow_buf(_nsb_t, _size, new_size);
+        grow_buf(_nsb_flag, _size, new_size);
         _size = new_size;
-#endif
-    }
-
-  private:
-    template <typename T>
-    void grow_buf(T** buf, int size, int new_size) {
-        T* new_buf = nullptr;
-        new_buf = (T*) ecalloc_align(new_size, sizeof(T));
-        memcpy(new_buf, *buf, size * sizeof(T));
-        free(*buf);
-        *buf = new_buf;
     }
 };
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index c95fedcf2..66eec5927 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -50,11 +50,11 @@ InterleaveInfo::InterleaveInfo(const InterleaveInfo& info) {
     nwarp = info.nwarp;
     nstride = info.nstride;

-    copy_align_array(stridedispl, info.stridedispl, nwarp + 1);
-    copy_align_array(stride, info.stride, nstride);
-    copy_align_array(firstnode, info.firstnode, nwarp + 1);
-    copy_align_array(lastnode, info.lastnode, nwarp + 1);
-    copy_align_array(cellsize, info.cellsize, nwarp);
+    copy_array(stridedispl, info.stridedispl, nwarp + 1);
+    copy_array(stride, info.stride, nstride);
+    copy_array(firstnode, info.firstnode, nwarp + 1);
+    copy_array(lastnode, info.lastnode, nwarp + 1);
+    copy_array(cellsize, info.cellsize, nwarp);

     copy_array(nnode, info.nnode, nwarp);
     copy_array(ncycle, info.ncycle, nwarp);
@@ -74,25 +74,6 @@ InterleaveInfo& InterleaveInfo::operator=(const InterleaveInfo& info) {
     return *this;
 }

-InterleaveInfo::~InterleaveInfo() {
-    if (stride) {
-        free_memory(stride);
-        free_memory(firstnode);
-        free_memory(lastnode);
-        free_memory(cellsize);
-    }
-    if (stridedispl) {
-        free_memory(stridedispl);
-    }
-    if (idle) {
-        delete[] nnode;
-        delete[] ncycle;
-        delete[] idle;
-        delete[] cache_access;
-        delete[] child_race;
-    }
-}
-
 void create_interleave_info() {
     destroy_interleave_info();
     interleave_info = new InterleaveInfo[nrn_nthread];
@@ -299,8 +280,13 @@ int* interleave_order(int ith, int ncell, int nnode, int* parent) {
         }
     }

-    int nwarp = 0, nstride = 0, *stride = nullptr, *firstnode = nullptr;
-    int *lastnode = nullptr, *cellsize = nullptr, *stridedispl = nullptr;
+    int nwarp = 0;
+    int nstride = 0;
+    int* stride = nullptr;
+    int* firstnode = nullptr;
+    int* lastnode = nullptr;
+    int* cellsize = nullptr;
+    int* stridedispl = nullptr;

     int* order = node_order(
         ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, cellsize, stridedispl);
diff --git a/coreneuron/permute/cellorder.hpp b/coreneuron/permute/cellorder.hpp
index fe2f2f84e..df20f9194 100644
--- a/coreneuron/permute/cellorder.hpp
+++ b/coreneuron/permute/cellorder.hpp
@@ -47,12 +47,11 @@ class InterleaveInfo;  // forward declaration
  */
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream);

-class InterleaveInfo: public MemoryManaged {
+class InterleaveInfo: public UnifiedMemManaged<> {
   public:
     InterleaveInfo() = default;
     InterleaveInfo(const InterleaveInfo&);
     InterleaveInfo& operator=(const InterleaveInfo&);
-    ~InterleaveInfo();
     int nwarp = 0;  // used only by interleave2
     int nstride = 0;
     int* stridedispl = nullptr;  // interleave2: nwarp+1
@@ -106,17 +105,10 @@ int* node_order(int ncell,
                 int*& cellsize,
                 int*& stridedispl);

-// copy src array to dest with new allocation
-template <typename T>
-void copy_array(T*& dest, T* src, size_t n) {
-    dest = new T[n];
-    std::copy(src, src + n, dest);
-}
-
 // copy src array to dest with NRN_SOA_BYTE_ALIGN ecalloc_align allocation
 template <typename T>
-void copy_align_array(T*& dest, T* src, size_t n) {
-    dest = static_cast<T*>(ecalloc_align(n, sizeof(T)));
+void copy_array(T*& dest, T* src, size_t n) {
+    dest = static_cast<T*>(allocate_unified(n * sizeof(T)));
     std::copy(src, src + n, dest);
 }
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index a6ac50be0..f8a8e6c50 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -36,6 +36,8 @@ struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */
     int* dependencies; /* list of mechanism types that this mechanism depends on*/
     int ndependencies; /* for scheduling we need to know the dependency count */
 };
+
+
 NrnThreadMembList* create_tml(NrnThread& nt,
                               int mech_id,
                               Memb_func& memb_func,
@@ -72,7 +74,7 @@ struct PreSynHelper {
     int flag_;
 };

-struct NrnThread: public MemoryManaged {
+struct NrnThread: UnifiedMemManaged<> {
     double _t = 0;
     double _dt = -1e9;
     double cj = 0.0;
diff --git a/coreneuron/utils/memory.cpp b/coreneuron/utils/memory.cpp
index 8f45487dc..3d0271c1b 100644
--- a/coreneuron/utils/memory.cpp
+++ b/coreneuron/utils/memory.cpp
@@ -13,6 +13,7 @@
 #endif

 #include <cassert>
+#include <cstdlib>

 namespace coreneuron {
 bool gpu_enabled() {
@@ -23,38 +24,52 @@ bool gpu_enabled() {
 #endif
 }

-void* allocate_unified(std::size_t num_bytes) {
+void* allocate_host(size_t num_bytes, std::size_t alignment) {
+    size_t fill = 0;
+    void* pointer = nullptr;
+    if (alignment > 0) {
+        if (num_bytes % alignment != 0) {
+            size_t multiple = num_bytes / alignment;
+            fill = alignment * (multiple + 1) - num_bytes;
+        }
+        nrn_assert((pointer = std::aligned_alloc(alignment, num_bytes + fill)) != nullptr);
+    } else {
+        nrn_assert((pointer = std::malloc(num_bytes)) != nullptr);
+    }
+    return pointer;
+}
+
+void deallocate_host(void* pointer, std::size_t num_bytes) {
+    free(pointer);
+}
+
+void* allocate_unified(std::size_t num_bytes, std::size_t alignment) {
 #ifdef CORENEURON_ENABLE_GPU
     // The build supports GPU execution, check if --gpu was passed to actually
     // enable it. We should not call CUDA APIs in GPU builds if --gpu was not passed.
     if (corenrn_param.gpu) {
+        void* pointer = nullptr;
         // Allocate managed/unified memory.
-        void* ptr{nullptr};
-        auto const code = cudaMallocManaged(&ptr, num_bytes);
+        auto const code = cudaMallocManaged(&pointer, num_bytes);
         assert(code == cudaSuccess);
-        return ptr;
+        return pointer;
     }
 #endif
     // Either the build does not have GPU support or --gpu was not passed.
-    // Allocate using standard operator new.
-    // When we have C++17 support then propagate `alignment` here.
-    return ::operator new(num_bytes);
+    // Allocate using host allocator.
+    return allocate_host(num_bytes, alignment);
 }

-void deallocate_unified(void* ptr, std::size_t num_bytes) {
+void deallocate_unified(void* pointer, std::size_t num_bytes) {
     // See comments in allocate_unified to understand the different branches.
 #ifdef CORENEURON_ENABLE_GPU
     if (corenrn_param.gpu) {
         // Deallocate managed/unified memory.
-        auto const code = cudaFree(ptr);
+        auto const code = cudaFree(pointer);
         assert(code == cudaSuccess);
         return;
     }
 #endif
-#ifdef __cpp_sized_deallocation
-    ::operator delete(ptr, num_bytes);
-#else
-    ::operator delete(ptr);
-#endif
+    deallocate_host(pointer, num_bytes);
 }
 }  // namespace coreneuron
diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h
index fc1e73d2e..9aa563f6c 100644
--- a/coreneuron/utils/memory.h
+++ b/coreneuron/utils/memory.h
@@ -30,45 +30,93 @@ namespace coreneuron {
  */
 bool gpu_enabled();

+/** @brief Allocate aligned host memory (std::aligned_alloc / std::malloc).
+ */
+void* allocate_host(size_t num_bytes, size_t alignment = NRN_SOA_BYTE_ALIGN);
+
+/** @brief Deallocate memory allocated by `allocate_host`.
+ */
+void deallocate_host(void* pointer, std::size_t num_bytes);
+
 /** @brief Allocate unified memory in GPU builds iff GPU enabled, otherwise new
  */
-void* allocate_unified(std::size_t num_bytes);
+void* allocate_unified(std::size_t num_bytes, std::size_t alignment = NRN_SOA_BYTE_ALIGN);

 /** @brief Deallocate memory allocated by `allocate_unified`.
  */
-void deallocate_unified(void* ptr, std::size_t num_bytes);
+void deallocate_unified(void* pointer, std::size_t num_bytes);

 /** @brief C++ allocator that uses [de]allocate_unified.
  */
-template <typename T>
-struct unified_allocator {
+
+template <typename T, std::size_t alignment = NRN_SOA_BYTE_ALIGN>
+struct host_allocator {
     using value_type = T;

-    unified_allocator() = default;
+    host_allocator() = default;

     template <typename U>
-    unified_allocator(unified_allocator<U> const&) noexcept {}
+    host_allocator(host_allocator<U, alignment> const&) noexcept {}

     value_type* allocate(std::size_t n) {
-        return static_cast<value_type*>(allocate_unified(n * sizeof(value_type)));
+        return static_cast<value_type*>(allocate_host(n * sizeof(value_type), alignment));
     }

     void deallocate(value_type* p, std::size_t n) noexcept {
-        deallocate_unified(p, n * sizeof(value_type));
+        deallocate_host(p, n * sizeof(value_type));
     }
 };

 template <class T, class U>
-bool operator==(unified_allocator<T> const&, unified_allocator<U> const&) noexcept {
+bool operator==(host_allocator<T> const&, host_allocator<U> const&) noexcept {
     return true;
 }

 template <class T, class U>
-bool operator!=(unified_allocator<T> const& x, unified_allocator<U> const& y) noexcept {
+bool operator!=(host_allocator<T> const& x, host_allocator<U> const& y) noexcept {
+    return !(x == y);
+}
+
+
+template <typename T, bool force = false>
+struct unified_allocator {
+    using value_type = T;
+
+    unified_allocator() = default;
+
+    template <typename U>
+    unified_allocator(unified_allocator<U, force> const&) noexcept {}
+
+    value_type* allocate(std::size_t n) {
+        void* ptr = nullptr;
+        // TODO: implement here the "always unified" semantics
+        if constexpr (force) {
+            ptr = allocate_unified(n * sizeof(value_type), 0);
+        } else {
+            ptr = allocate_unified(n * sizeof(value_type), 0);
+        }
+        return static_cast<value_type*>(ptr);
+    }
+
+    void deallocate(value_type* p, std::size_t n) noexcept {
+        deallocate_unified(p, n * sizeof(value_type));
+    }
+};
+
+template <class T, bool forceT, class U, bool forceU>
+bool operator==(unified_allocator<T, forceT> const&, unified_allocator<U, forceU> const&) noexcept {
+    return true;
+}
+
+template <class T, bool forceT, class U, bool forceU>
+bool operator!=(unified_allocator<T, forceT> const& x,
+                unified_allocator<U, forceU> const& y) noexcept {
     return !(x == y);
 }

 /** @brief Allocator-aware deleter for use with std::unique_ptr.
+ *
+ * The deleter was extended to also support unique_ptr<T[]> types.
  *
  * This is copied from https://stackoverflow.com/a/23132307. See also
  * http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0316r0.html,
@@ -76,22 +124,32 @@ bool operator!=(unified_allocator<T> const& x, unified_allocator<U> const& y) noexcept {
  * boost::allocate_unique<...>.
  * Hopefully std::allocate_unique will be included in C++23.
  */
+
 template <class Alloc>
 struct alloc_deleter {
     alloc_deleter() = default;  // OL210813 addition

     alloc_deleter(const Alloc& a)
-        : a(a) {}
+        : a(a)
+        , size(1) {}
+
+    alloc_deleter(const Alloc& a, std::size_t N)
+        : a(a)
+        , size(N) {}

     using pointer = typename std::allocator_traits<Alloc>::pointer;
     void operator()(pointer p) const {
         Alloc aa(a);
-        std::allocator_traits<Alloc>::destroy(aa, std::addressof(*p));
-        std::allocator_traits<Alloc>::deallocate(aa, p, 1);
+        auto pp = &p[size - 1];
+        do {
+            std::allocator_traits<Alloc>::destroy(aa, std::addressof(*pp--));
+        } while (pp >= p);
+        std::allocator_traits<Alloc>::deallocate(aa, p, size);
     }

   private:
     Alloc a;
+    std::size_t size;
 };

 template <typename T, typename Alloc, typename... Args>
@@ -111,11 +169,138 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) {
         throw;
     }
 }
+
+
+template <typename T, typename Alloc>
+auto allocate_unique(const Alloc& alloc, std::size_t N) {
+    using AT = std::allocator_traits<Alloc>;
+    using ElemType = typename std::remove_extent<T>::type;
+    static_assert(std::is_same<ElemType, typename AT::value_type>{}(),
+                  "Allocator has the wrong value_type");
+
+    Alloc a(alloc);
+    auto p = AT::allocate(a, N);
+    auto ep = reinterpret_cast<ElemType*>(p);
+    try {
+        for (std::size_t i = 0; i < N; ++i) {
+            AT::construct(a, std::addressof(*ep++));
+        }
+        using D = alloc_deleter<Alloc>;
+        return std::unique_ptr<T, D>(p, D(a, N));
+    } catch (...) {
+        AT::deallocate(a, p, N);
+        throw;
+    }
+}
+
+
+class [[deprecated]] MemoryManaged {};
+
+class HostMemManaged {
+  public:
+    template <typename T>
+    using allocator = host_allocator<T>;
+
+    template <typename T>
+    using host_uniq_ptr =
+        std::unique_ptr<T, alloc_deleter<allocator<typename std::remove_extent<T>::type>>>;
+
+    static void* operator new(std::size_t count) {
+        return allocate_host(count);
+    }
+
+    static void* operator new[](std::size_t count) {
+        return allocate_host(count);
+    }
+
+    static void operator delete(void* ptr, std::size_t sz) {
+        deallocate_host(ptr, sz);
+    }
+
+    static void operator delete[](void* ptr, std::size_t sz) {
+        deallocate_host(ptr, sz);
+    }
+
+  protected:
+    template <typename T>
+    void grow_buf(host_uniq_ptr<T[]>& buf, std::size_t size, std::size_t new_size) {
+        auto new_buf = allocate_unique<T[]>(allocator<T>{}, new_size);
+        std::copy(buf.get(), buf.get() + size, new_buf.get());
+        buf.swap(new_buf);
+    }
+
+    /**
+     * Initialize new dest buffer from src
+     *
+     */
+    template <typename T>
+    void initialize_from_other(host_uniq_ptr<T[]>& dest, host_uniq_ptr<T[]>& src, std::size_t size) {
+        dest = allocate_unique<T[]>(allocator<T>{}, size);
+        std::copy(src.get(), src.get() + size, dest.get());
+    }
+};
+
+template <bool force = false>
+class UnifiedMemManaged {
+  public:
+    template <typename T>
+    using allocator = unified_allocator<T, force>;
+
+    template <typename T>
+    using unified_uniq_ptr =
+        std::unique_ptr<T, alloc_deleter<allocator<typename std::remove_extent<T>::type>>>;
+
+    static void* operator new(std::size_t count) {
+        return allocate_unified(count);
+    }
+
+    static void* operator new[](std::size_t count) {
+        return allocate_unified(count);
+    }
+
+    static void operator delete(void* ptr, std::size_t sz) {
+        deallocate_unified(ptr, sz);
+    }
+
+    static void operator delete[](void* ptr, std::size_t sz) {
+        deallocate_unified(ptr, sz);
+    }
+
+  protected:
+    template <typename T>
+    void grow_buf(unified_uniq_ptr<T[]>& buf, std::size_t size, std::size_t new_size) {
+#ifdef CORENEURON_ENABLE_GPU
+        if (force || corenrn_param.gpu) {
+            int cannot_reallocate_on_device = 0;
+            assert(cannot_reallocate_on_device);
+        }
+#endif
+        auto new_buf = allocate_unique<T[]>(allocator<T>{}, new_size);
+        std::copy(buf.get(), buf.get() + size, new_buf.get());
+        buf.swap(new_buf);
+    }
+
+    /**
+     * Initialize new dest buffer from src
+     *
+     */
+    template <typename T>
+    void initialize_from_other(unified_uniq_ptr<T[]>& dest,
+                               const unified_uniq_ptr<T[]>& src,
+                               std::size_t size) {
+        dest = allocate_unique<T[]>(allocator<T>{}, size);
+        std::copy(src.get(), src.get() + size, dest.get());
+    }
+};
+
+
 }  // namespace coreneuron

 /// for gpu builds with unified memory support
 #ifdef CORENEURON_UNIFIED_MEMORY
+////////// to be removed

 #include <cuda_runtime_api.h>

 // TODO : error handling for CUDA routines
@@ -131,7 +316,7 @@ inline void calloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/
 inline void free_memory(void* pointer) {
     cudaFree(pointer);
 }
-
+////////////////////
 /**
  * A base class providing overloaded new and delete operators for CUDA allocation
  *
@@ -139,12 +324,12 @@ inline void free_memory(void* pointer) {
  * Classes that should be allocated on the GPU should inherit from this class. Additionally they
 * may need to implement a special copy-construtor. This is documented here:
 * \link: https://devblogs.nvidia.com/unified-memory-in-cuda-6/
 */
-class MemoryManaged {
+/*class MemoryManaged {
   public:
     void* operator new(size_t len) {
         void* ptr;
         cudaMallocManaged(&ptr, len);
         cudaDeviceSynchronize();
         return ptr;
     }
@@ -164,17 +349,16 @@ class MemoryManaged {
         cudaDeviceSynchronize();
         cudaFree(ptr);
     }
-};
+};*/

 /// for cpu builds use posix memalign
 #else

-class MemoryManaged {
+/*class MemoryManaged {
     // does nothing by default
-};
-
-#include <cstring>
+};*/

+//////// to be removed
 inline void alloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
     size_t fill = 0;
     if (num_bytes % alignment != 0) {
@@ -192,6 +376,7 @@ inline void calloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
 inline void free_memory(void* pointer) {
     free(pointer);
 }
+/////////////////

 #endif
@@ -224,7 +409,7 @@ inline bool is_aligned(void* pointer, std::size_t alignment) {
  */
 inline void* emalloc_align(size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
     void* memptr;
-    alloc_memory(memptr, size, alignment);
+    memptr = allocate_unified(size, alignment);
     nrn_assert(is_aligned(memptr, alignment));
     return memptr;
 }
@@ -239,7 +424,9 @@ inline void* ecalloc_align(size_t n, size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
     if (n == 0) {
         return nullptr;
     }
-    calloc_memory(p, n * size, alignment);
+    p = allocate_unified(n * size, alignment);
+    // TODO: Maybe allocate_unified should do this when asked (and use cuda API when available)
+    memset(p, 0, n * size);
     nrn_assert(is_aligned(p, alignment));
     return p;
 }
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
index 14e2b15df..ed7b19367 100644
--- a/coreneuron/utils/randoms/nrnran123.cpp
+++ b/coreneuron/utils/randoms/nrnran123.cpp
@@ -70,7 +70,7 @@ std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes
 using random123_allocator = boost::fast_pool_allocator<nrnran123_State>;
 #else
-using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
+using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State, true>;
 #endif
 /* Global data structure per process. Using a unique_ptr here causes [minor]
  * problems because its destructor can be called very late during application
diff --git a/tests/unit/interleave_info/check_constructors.cpp b/tests/unit/interleave_info/check_constructors.cpp
index 00353072c..2c6bf3cae 100644
--- a/tests/unit/interleave_info/check_constructors.cpp
+++ b/tests/unit/interleave_info/check_constructors.cpp
@@ -25,10 +25,10 @@ BOOST_AUTO_TEST_CASE(interleave_info_test) {
     info1.nstride = nstride;

     // to avoid same values, different sub-array is used to initialize different members
-    copy_align_array(info1.stridedispl, data1, nwarp + 1);
-    copy_align_array(info1.stride, data1 + 1, nstride);
-    copy_align_array(info1.firstnode, data1 + 1, nwarp + 1);
-    copy_align_array(info1.lastnode, data1 + 1, nwarp + 1);
+    copy_array(info1.stridedispl, data1, nwarp + 1);
+    copy_array(info1.stride, data1 + 1, nstride);
+    copy_array(info1.firstnode, data1 + 1, nwarp + 1);
+    copy_array(info1.lastnode, data1 + 1, nwarp + 1);

     // check if copy_array works
     BOOST_CHECK_NE(info1.firstnode, info1.lastnode);
@@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE(interleave_info_test) {
         info1.lastnode,
         info1.lastnode + nwarp + 1);

-    copy_align_array(info1.cellsize, data1 + 4, nwarp);
+    copy_array(info1.cellsize, data1 + 4, nwarp);
     copy_array(info1.nnode, data2, nwarp);
     copy_array(info1.ncycle, data2 + 1, nwarp);
     copy_array(info1.idle, data2 + 2, nwarp);
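Reviewer note (not part of the patch): the sketch below illustrates how a buffer-owning class is expected to use the new UnifiedMemManaged<> helpers introduced above (allocate_unique for array allocation, unified_uniq_ptr for ownership, grow_buf for resizing), mirroring NetReceiveBuffer_t::initialize()/grow(). The struct and field names are hypothetical, and the example assumes the patched coreneuron/utils/memory.h is available on the include path.

// Minimal usage sketch under the assumptions stated above.
#include <cstddef>
#include <numeric>

#include "coreneuron/utils/memory.h"

namespace coreneuron {

struct ExampleBuffer: public UnifiedMemManaged<> {
    unified_uniq_ptr<int[]> index;  // released automatically via alloc_deleter
    std::size_t capacity = 0;

    void initialize(std::size_t n) {
        capacity = n;
        // allocator<int> routes through allocate_unified(); the returned
        // unique_ptr remembers n so the deleter can destroy every element.
        index = allocate_unique<int[]>(allocator<int>{}, capacity);
    }

    void grow() {
        std::size_t const new_capacity = capacity * 2;
        // grow_buf allocates a new buffer, copies the old contents and swaps;
        // in GPU runs it asserts instead, since reallocating on the device is unsupported.
        grow_buf(index, capacity, new_capacity);
        capacity = new_capacity;
    }
};

}  // namespace coreneuron

int main() {
    // operator new comes from UnifiedMemManaged<>, so the object itself lands
    // in CUDA managed memory in GPU builds when --gpu is enabled.
    auto* buf = new coreneuron::ExampleBuffer();
    buf->initialize(8);
    std::iota(buf->index.get(), buf->index.get() + 8, 0);
    buf->grow();  // capacity becomes 16, the first 8 values are preserved
    delete buf;
}

The design point this illustrates: ownership and growth logic now live once in the base class, so NetReceiveBuffer_t, NetSendBuffer_t and similar types no longer need hand-written destructors or per-class grow_buf helpers.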