diff --git a/galaxy.kdevelop b/galaxy.kdevelop index 505587a..7046804 100644 --- a/galaxy.kdevelop +++ b/galaxy.kdevelop @@ -22,7 +22,7 @@ src/galfast.x - optimized + debug /home/mjuric/projects/galaxy/debug/src/galfast.x diff --git a/src/common/cuda_rng.h b/src/common/cuda_rng.h index b0009b9..9955bae 100644 --- a/src/common/cuda_rng.h +++ b/src/common/cuda_rng.h @@ -78,7 +78,7 @@ namespace prngs this->nstreams = nstreams; if(on_gpu) { - cuxErrCheck( cudaMalloc((void**)&gstate, sizeof(uint32_t)*nstreams*statewidth) ); + gstate = cuxNew(nstreams*statewidth); cuxErrCheck( cudaMemcpy(gstate, states, sizeof(uint32_t)*nstreams*statewidth, cudaMemcpyHostToDevice) ); } else { gstate = new uint32_t[nstreams*statewidth]; diff --git a/src/common/cux.cpp b/src/common/cux.cpp index bd5b946..2def228 100644 --- a/src/common/cux.cpp +++ b/src/common/cux.cpp @@ -152,43 +152,69 @@ void cuxSmartPtr_impl_t::gc() } } -void *cuxSmartPtr_impl_t::syncTo(bool device) +unsigned cuxGetFreeMem(unsigned *totalptr = NULL) +{ +#if !CUDA_DEVEMU + // Memory info + unsigned free = 0, total = 0; + cuxErrCheck( (cudaError)cuMemGetInfo(&free, &total) ); + if(totalptr) *totalptr = total; + return free; +#else + if(totalptr) *totalptr = 0; + return 0; +#endif +} + +void cuxMallocErrCheck_impl(cudaError err, size_t msize, const char *fun, const char *file, const int line) { - if(onDevice != device) + VERIFY(err == cudaSuccess) { - std::swap(slave, m_data.ptr); + MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err); + MLOG(verb1) << "CUDA ERROR: Attempted to allocate " << msize / (1<<20) << "MB, " << cuxGetFreeMem() / (1<<20) << "MB free."; + MLOG(verb1) << "CUDA ERROR: In " << fun << " (" << file << ":" << line << ")\n"; } +} - // Allocate m_data.ptr if needed. - if(!m_data.ptr) +#define GC_AND_RETRY_IF_FAIL(x, msize) \ + { \ + cudaError err = (x); \ + if(err == cudaErrorMemoryAllocation) \ + { \ + global_gc(); \ + err = (x); \ + } \ + cuxMallocErrCheck(err, msize); \ + } + +void *cuxSmartPtr_impl_t::syncTo(bool device) +{ + if(onDevice == device && m_data.ptr) { return m_data.ptr; } + + // Allocate slave (future master) + if(!slave) { - if(device) + if(device) // syncing to GPU device { - //cuxErrCheck( cudaMalloc((void**)&m_data.ptr, memsize()) ); - cudaError err = cudaMalloc((void**)&m_data.ptr, memsize()); - if(err == cudaErrorMemoryAllocation) - { - global_gc(); - err = cudaMalloc((void**)&m_data.ptr, memsize()); - } - cuxErrCheck(err); + GC_AND_RETRY_IF_FAIL( cudaMalloc((void**)&slave, memsize()), memsize() ); } - else + else // syncing to host { - m_data.ptr = new char[memsize()]; + slave = new char[memsize()]; //memset(m_data.ptr, 0xff, memsize()); // debugging } } - // copy slave -> m_data.ptr (if there's something to copy) - if(onDevice != device && slave) + std::swap(m_data.ptr, slave); + onDevice = device; + + if(slave) { - cudaMemcpyKind dir = device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost; + // copy slave -> m_data.ptr (if there's something to copy) + cudaMemcpyKind dir = onDevice ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost; cuxErrCheck( cudaMemcpy(m_data.ptr, slave, memsize(), dir) ); } - onDevice = device; - // assume the sync dirtied up the textures cleanCudaArray = false; @@ -197,17 +223,6 @@ void *cuxSmartPtr_impl_t::syncTo(bool device) return m_data.ptr; } -#define GC_AND_RETRY_IF_FAIL(x) \ - { \ - cudaError err = (x); \ - if(err == cudaErrorMemoryAllocation) \ - { \ - global_gc(); \ - err = (x); \ - } \ - cuxErrCheck(err); \ - } - cudaArray *cuxSmartPtr_impl_t::getCUDAArray(cudaChannelFormatDesc &channelDesc) { ASSERT(channelDesc.x + channelDesc.y + channelDesc.z + channelDesc.w == m_elementSize*8); @@ -228,12 +243,12 @@ cudaArray *cuxSmartPtr_impl_t::getCUDAArray(cudaChannelFormatDesc &channelDesc) if(ex.depth > 1) { // 3D arrays - GC_AND_RETRY_IF_FAIL( cudaMalloc3DArray(&cuArray, &channelDesc, ex) ); + GC_AND_RETRY_IF_FAIL( cudaMalloc3DArray(&cuArray, &channelDesc, ex), memsize() ); } else { // 2D and 1D arrays - GC_AND_RETRY_IF_FAIL( cudaMallocArray(&cuArray, &channelDesc, ex.width, ex.height) ); + GC_AND_RETRY_IF_FAIL( cudaMallocArray(&cuArray, &channelDesc, ex.width, ex.height), memsize() ); } } @@ -654,21 +669,20 @@ const char *cpuinfo() void abort_on_cuda_error(cudaError err) { - if(err == cudaSuccess) { return; } - - MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err); - //abort(); - exit(-100); + VERIFY(err == cudaSuccess) + { + MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err); + } } void cuxErrCheck_impl(cudaError err, const char *fun, const char *file, const int line) { - if(err != cudaSuccess) + VERIFY(err == cudaSuccess) { + MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err); MLOG(verb1) << "CUDA ERROR: In " << fun << " (" << file << ":" << line << ")\n"; - abort_on_cuda_error(err); -// throw cuxException(err); } +// throw cuxException(err); } #if HAVE_CUDA @@ -717,8 +731,8 @@ bool cux_init() #if !CUDA_DEVEMU // ensure a CUDA context is created and fetch the active // device id - void *tmp; - cuxErrCheck( cudaMalloc(&tmp, 1024) ); + char *tmp; + tmp = cuxNew(1024); cuxErrCheck( cudaFree(tmp) ); cuxErrCheck( cudaGetDevice(&dev) ); #endif diff --git a/src/common/cux.h b/src/common/cux.h index c9dd33f..fb44873 100644 --- a/src/common/cux.h +++ b/src/common/cux.h @@ -83,6 +83,9 @@ void cuxErrCheck_impl(cudaError err, const char *fun, const char *file, const in #define cuxErrCheck(expr) \ cuxErrCheck_impl(expr, __PRETTY_FUNCTION__, __FILE__, __LINE__) +void cuxMallocErrCheck_impl(cudaError err, size_t msize, const char *fun, const char *file, const int line); +#define cuxMallocErrCheck(expr, msize) \ + cuxMallocErrCheck_impl(expr, msize, __PRETTY_FUNCTION__, __FILE__, __LINE__) /** Compute the size, in bytes, of an aligned (strided) 1/2/3D array. @@ -110,7 +113,7 @@ template size_t size = arrayMemSize(nx, ny, nz, align); T *devptr; - cuxErrCheck( cudaMalloc((void**)&devptr, size) ); + cuxMallocErrCheck( cudaMalloc((void**)&devptr, size), size ); return devptr; }