Skip to content

Commit

Permalink
Bugfix GPU MEX code (bad sizeof() argument)
Browse files Browse the repository at this point in the history
This commit changes a few instances where sizeof(ptr) was used instead
of sizeof(*ptr). I am somewhat surprised that this did not cause any
segfaults or errors during testing — though in hindsight, on a 64-bit
build sizeof(ptr) is 8 bytes, which is at least as large as the intended
element size (4-byte int, or 8-byte numeric_t), so the affected
allocations, copies, and shared-memory size estimates were over-sized
rather than under-sized.
  • Loading branch information
kqshan committed Feb 7, 2019
1 parent 1bab0e3 commit f85a5e3
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
12 changes: 6 additions & 6 deletions @MoDT/mex_src/sumFramesGpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -291,11 +291,11 @@ void computeFrameSums(int D, int N, int K, int T,

// Copy the fsLim indices to the GPU
int *d_fsLim;
cudaStat = cudaMalloc((void**)&d_fsLim, T*2*sizeof(d_fsLim));
cudaStat = cudaMalloc((void**)&d_fsLim, T*2*sizeof(*d_fsLim));
std::unique_ptr<int,CudaDeleter> cleanup_fsLim(d_fsLim);
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory");
cudaStat = cudaMemcpyAsync(d_fsLim, fsLim0.data(), T*2*sizeof(d_fsLim),
cudaStat = cudaMemcpyAsync(d_fsLim, fsLim0.data(), T*2*sizeof(*d_fsLim),
cudaMemcpyHostToDevice);
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "cuBLAS error copying to GPU");
Expand All @@ -311,8 +311,8 @@ void computeFrameSums(int D, int N, int K, int T,
int maxThreads = prop.maxThreadsPerMultiProcessor;
int maxK_thread = (maxThreads/2) / (D+1);
int maxMem = prop.sharedMemPerMultiprocessor;
int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(d_fsLim))
/ (READ_BUFF_SZ*sizeof(d_Y)) - D;
int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(*d_fsLim))
/ (READ_BUFF_SZ*sizeof(numeric_t)) - D;
int maxK = std::min(maxK_thread, maxK_mem);
// If we can't do all of the clusters at once, try to spread them evenly
int K_eff, grid_x;
Expand All @@ -326,8 +326,8 @@ void computeFrameSums(int D, int N, int K, int T,
// This determines the threads per block and the memory usage
int nWarps = (D*K_eff + K_eff + 31)/32;
dim3 threadsPerBlock(32, nWarps);
int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(d_Y) +
maxT_eff*sizeof(d_fsLim);
int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(numeric_t) +
maxT_eff*sizeof(*d_fsLim);
// Figure out how many blocks in the grid
int blocksPerMP = std::min(maxThreads/(64*nWarps), maxMem/memPerBlock);
int grid_y = 2 * prop.multiProcessorCount * blocksPerMP;
Expand Down
4 changes: 2 additions & 2 deletions @MoDT/mex_src/weightCovGpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -378,13 +378,13 @@ void computeWeightedCov(int D, int N,
if (use_blas)
optimizeBatches(D, N, 1, nBatches, batchSize);
else
optimizeBatches(D, N, READ_BUFF_BYTES/sizeof(d_A), nBatches, batchSize);
optimizeBatches(D, N, READ_BUFF_BYTES/sizeof(*d_A), nBatches, batchSize);
// Allocate memory for batches, if necessary
numeric_t *d_CBatched;
std::unique_ptr<numeric_t,CudaDeleter> cleanup_CBatched;
if (nBatches > 1) {
cudaStat = cudaMalloc((void**)&d_CBatched,
static_cast<ptrdiff_t>(D)*D*nBatches*sizeof(d_A) );
static_cast<ptrdiff_t>(D)*D*nBatches*sizeof(*d_A) );
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory for batch output");
cleanup_CBatched = std::unique_ptr<numeric_t,CudaDeleter>{d_CBatched};
Expand Down

0 comments on commit f85a5e3

Please sign in to comment.