Skip to content

Commit

Permalink
Bugfix GPU MEX code (bad sizeof() argument)
Browse files Browse the repository at this point in the history
This commit changes a few instances where sizeof(ptr) was used instead
of sizeof(*ptr). I am somewhat surprised that this did not cause any
segfaults or errors during testing — though in hindsight, on a 64-bit
build sizeof(ptr) is 8 bytes, which is at least as large as the intended
element size (4-byte int, or 8-byte numeric_t), so the affected
allocations, copies, and shared-memory size estimates were over-sized
rather than under-sized.
  • Loading branch information
kqshan committed Feb 7, 2019
1 parent 1bab0e3 commit f85a5e3
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
12 changes: 6 additions & 6 deletions @MoDT/mex_src/sumFramesGpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -291,11 +291,11 @@ void computeFrameSums(int D, int N, int K, int T,

// Copy the fsLim indices to the GPU
int *d_fsLim;
cudaStat = cudaMalloc((void**)&d_fsLim, T*2*sizeof(d_fsLim));
cudaStat = cudaMalloc((void**)&d_fsLim, T*2*sizeof(*d_fsLim));
std::unique_ptr<int,CudaDeleter> cleanup_fsLim(d_fsLim);
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory");
cudaStat = cudaMemcpyAsync(d_fsLim, fsLim0.data(), T*2*sizeof(d_fsLim),
cudaStat = cudaMemcpyAsync(d_fsLim, fsLim0.data(), T*2*sizeof(*d_fsLim),
cudaMemcpyHostToDevice);
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "cuBLAS error copying to GPU");
Expand All @@ -311,8 +311,8 @@ void computeFrameSums(int D, int N, int K, int T,
int maxThreads = prop.maxThreadsPerMultiProcessor;
int maxK_thread = (maxThreads/2) / (D+1);
int maxMem = prop.sharedMemPerMultiprocessor;
int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(d_fsLim))
/ (READ_BUFF_SZ*sizeof(d_Y)) - D;
int maxK_mem = ((maxMem/2) - maxT_eff*sizeof(*d_fsLim))
/ (READ_BUFF_SZ*sizeof(numeric_t)) - D;
int maxK = std::min(maxK_thread, maxK_mem);
// If we can't do all of the clusters at once, try to spread them evenly
int K_eff, grid_x;
Expand All @@ -326,8 +326,8 @@ void computeFrameSums(int D, int N, int K, int T,
// This determines the threads per block and the memory usage
int nWarps = (D*K_eff + K_eff + 31)/32;
dim3 threadsPerBlock(32, nWarps);
int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(d_Y) +
maxT_eff*sizeof(d_fsLim);
int memPerBlock = (D+K_eff)*READ_BUFF_SZ*sizeof(numeric_t) +
maxT_eff*sizeof(*d_fsLim);
// Figure out how many blocks in the grid
int blocksPerMP = std::min(maxThreads/(64*nWarps), maxMem/memPerBlock);
int grid_y = 2 * prop.multiProcessorCount * blocksPerMP;
Expand Down
4 changes: 2 additions & 2 deletions @MoDT/mex_src/weightCovGpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -378,13 +378,13 @@ void computeWeightedCov(int D, int N,
if (use_blas)
optimizeBatches(D, N, 1, nBatches, batchSize);
else
optimizeBatches(D, N, READ_BUFF_BYTES/sizeof(d_A), nBatches, batchSize);
optimizeBatches(D, N, READ_BUFF_BYTES/sizeof(*d_A), nBatches, batchSize);
// Allocate memory for batches, if necessary
numeric_t *d_CBatched;
std::unique_ptr<numeric_t,CudaDeleter> cleanup_CBatched;
if (nBatches > 1) {
cudaStat = cudaMalloc((void**)&d_CBatched,
static_cast<ptrdiff_t>(D)*D*nBatches*sizeof(d_A) );
static_cast<ptrdiff_t>(D)*D*nBatches*sizeof(*d_A) );
if (cudaStat != cudaSuccess)
mexErrMsgIdAndTxt(cudaErrId, "Failed to allocate CUDA memory for batch output");
cleanup_CBatched = std::unique_ptr<numeric_t,CudaDeleter>{d_CBatched};
Expand Down

0 comments on commit f85a5e3

Please sign in to comment.