diff --git a/galaxy.kdevelop b/galaxy.kdevelop
index 505587a..7046804 100644
--- a/galaxy.kdevelop
+++ b/galaxy.kdevelop
@@ -22,7 +22,7 @@
   <kdevautoproject>
     <general>
       <activetarget>src/galfast.x</activetarget>
-      <useconfiguration>optimized</useconfiguration>
+      <useconfiguration>debug</useconfiguration>
     </general>
     <run>
       <mainprogram>/home/mjuric/projects/galaxy/debug/src/galfast.x</mainprogram>
diff --git a/src/common/cuda_rng.h b/src/common/cuda_rng.h
index b0009b9..9955bae 100644
--- a/src/common/cuda_rng.h
+++ b/src/common/cuda_rng.h
@@ -78,7 +78,7 @@ namespace prngs
 			this->nstreams = nstreams;
 			if(on_gpu)
 			{
-				cuxErrCheck( cudaMalloc((void**)&gstate, sizeof(uint32_t)*nstreams*statewidth) );
+				gstate = cuxNew<uint32_t>(nstreams*statewidth);
 				cuxErrCheck( cudaMemcpy(gstate, states, sizeof(uint32_t)*nstreams*statewidth, cudaMemcpyHostToDevice) );
 			} else {
 				gstate = new uint32_t[nstreams*statewidth];
diff --git a/src/common/cux.cpp b/src/common/cux.cpp
index bd5b946..2def228 100644
--- a/src/common/cux.cpp
+++ b/src/common/cux.cpp
@@ -152,43 +152,69 @@ void cuxSmartPtr_impl_t::gc()
 	}
 }
 
-void *cuxSmartPtr_impl_t::syncTo(bool device)
+unsigned cuxGetFreeMem(unsigned *totalptr = NULL)
+{
+#if !CUDA_DEVEMU
+	// Memory info
+	unsigned free = 0, total = 0;
+	cuxErrCheck( (cudaError)cuMemGetInfo(&free, &total) );
+	if(totalptr) *totalptr = total;
+	return free;
+#else
+	if(totalptr) *totalptr = 0;
+	return 0;
+#endif
+}
+
+void cuxMallocErrCheck_impl(cudaError err, size_t msize, const char *fun, const char *file, const int line)
 {
-	if(onDevice != device)
+	VERIFY(err == cudaSuccess)
 	{
-		std::swap(slave, m_data.ptr);
+		MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err);
+		MLOG(verb1) << "CUDA ERROR: Attempted to allocate " << msize / (1<<20) << "MB, " << cuxGetFreeMem() / (1<<20) << "MB free.";
+		MLOG(verb1) << "CUDA ERROR: In " << fun << " (" << file << ":" << line << ")\n";
 	}
+}
 
-	// Allocate m_data.ptr if needed.
-	if(!m_data.ptr)
+#define GC_AND_RETRY_IF_FAIL(x, msize) \
+	{ \
+		cudaError err = (x); \
+		if(err == cudaErrorMemoryAllocation) \
+		{ \
+			global_gc(); \
+			err = (x); \
+		} \
+		cuxMallocErrCheck(err, msize); \
+	}
+
+void *cuxSmartPtr_impl_t::syncTo(bool device)
+{
+	if(onDevice == device && m_data.ptr) { return m_data.ptr; }
+
+	// Allocate slave (future master)
+	if(!slave)
 	{
-		if(device)
+		if(device)	// syncing to GPU device
 		{
-			//cuxErrCheck( cudaMalloc((void**)&m_data.ptr, memsize()) );
-			cudaError err = cudaMalloc((void**)&m_data.ptr, memsize());
-			if(err == cudaErrorMemoryAllocation)
-			{
-				global_gc();
-				err = cudaMalloc((void**)&m_data.ptr, memsize());
-			}
-			cuxErrCheck(err);
+			GC_AND_RETRY_IF_FAIL( cudaMalloc((void**)&slave, memsize()), memsize() );
 		}
-		else
+		else		// syncing to host
 		{
-			m_data.ptr = new char[memsize()];
+			slave = new char[memsize()];
 			//memset(m_data.ptr, 0xff, memsize());	// debugging
 		}
 	}
 
-	// copy slave -> m_data.ptr (if there's something to copy)
-	if(onDevice != device && slave)
+	std::swap(m_data.ptr, slave);
+	onDevice = device;
+
+	if(slave)
 	{
-		cudaMemcpyKind dir = device ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
+		// copy slave -> m_data.ptr (if there's something to copy)
+		cudaMemcpyKind dir = onDevice ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;
 		cuxErrCheck( cudaMemcpy(m_data.ptr, slave, memsize(), dir) );
 	}
 
-	onDevice = device;
-
 	// assume the sync dirtied up the textures
 	cleanCudaArray = false;
 
@@ -197,17 +223,6 @@ void *cuxSmartPtr_impl_t::syncTo(bool device)
 	return m_data.ptr;
 }
 
-#define GC_AND_RETRY_IF_FAIL(x) \
-	{ \
-		cudaError err = (x); \
-		if(err == cudaErrorMemoryAllocation) \
-		{ \
-			global_gc(); \
-			err = (x); \
-		} \
-		cuxErrCheck(err); \
-	}
-
 cudaArray *cuxSmartPtr_impl_t::getCUDAArray(cudaChannelFormatDesc &channelDesc)
 {
 	ASSERT(channelDesc.x + channelDesc.y + channelDesc.z + channelDesc.w == m_elementSize*8);
@@ -228,12 +243,12 @@ cudaArray *cuxSmartPtr_impl_t::getCUDAArray(cudaChannelFormatDesc &channelDesc)
 			if(ex.depth > 1)
 			{
 				// 3D arrays
-				GC_AND_RETRY_IF_FAIL( cudaMalloc3DArray(&cuArray, &channelDesc, ex) );
+				GC_AND_RETRY_IF_FAIL( cudaMalloc3DArray(&cuArray, &channelDesc, ex), memsize() );
 			}
 			else
 			{
 				// 2D and 1D arrays
-				GC_AND_RETRY_IF_FAIL( cudaMallocArray(&cuArray, &channelDesc, ex.width, ex.height) );
+				GC_AND_RETRY_IF_FAIL( cudaMallocArray(&cuArray, &channelDesc, ex.width, ex.height), memsize() );
 			}
 		}
 
@@ -654,21 +669,20 @@ const char *cpuinfo()
 
 void abort_on_cuda_error(cudaError err)
 {
-	if(err == cudaSuccess) { return; }
-
-	MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err);
-	//abort();
-	exit(-100);
+	VERIFY(err == cudaSuccess)
+	{
+		MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err);
+	}
 }
 
 void cuxErrCheck_impl(cudaError err, const char *fun, const char *file, const int line)
 {
-	if(err != cudaSuccess)
+	VERIFY(err == cudaSuccess)
 	{
+		MLOG(verb1) << "CUDA ERROR: " << cudaGetErrorString(err);
 		MLOG(verb1) << "CUDA ERROR: In " << fun << " (" << file << ":" << line << ")\n";
-		abort_on_cuda_error(err);
-//		throw cuxException(err);
 	}
+//		throw cuxException(err);
 }
 
 #if HAVE_CUDA
@@ -717,8 +731,8 @@ bool cux_init()
 #if !CUDA_DEVEMU
 	// ensure a CUDA context is created and fetch the active
 	// device id
-	void *tmp;
-	cuxErrCheck( cudaMalloc(&tmp, 1024) );
+	char *tmp;
+	tmp = cuxNew<char>(1024);
 	cuxErrCheck( cudaFree(tmp) );
 	cuxErrCheck( cudaGetDevice(&dev) );
 #endif
diff --git a/src/common/cux.h b/src/common/cux.h
index c9dd33f..fb44873 100644
--- a/src/common/cux.h
+++ b/src/common/cux.h
@@ -83,6 +83,9 @@ void cuxErrCheck_impl(cudaError err, const char *fun, const char *file, const in
 #define cuxErrCheck(expr) \
 	cuxErrCheck_impl(expr, __PRETTY_FUNCTION__, __FILE__, __LINE__)
 
+void cuxMallocErrCheck_impl(cudaError err, size_t msize, const char *fun, const char *file, const int line);
+#define cuxMallocErrCheck(expr, msize) \
+	cuxMallocErrCheck_impl(expr, msize, __PRETTY_FUNCTION__, __FILE__, __LINE__)
 
 /**
 	Compute the size, in bytes, of an aligned (strided) 1/2/3D array.
@@ -110,7 +113,7 @@ template<typename T>
 		size_t size = arrayMemSize<T>(nx, ny, nz, align);
 
 		T *devptr;
-		cuxErrCheck( cudaMalloc((void**)&devptr, size) );
+		cuxMallocErrCheck( cudaMalloc((void**)&devptr, size), size );
 		return devptr;
 	}