Skip to content

Commit

Permalink
FIX: GPU memory allocation
Browse files Browse the repository at this point in the history
  • Loading branch information
ilwoolyu committed Apr 25, 2020
1 parent d936c59 commit 182f720
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ project(HSD)

set(HSD_VERSION_MAJOR 1)
set(HSD_VERSION_MINOR 2)
set(HSD_VERSION_PATCH 9)
set(HSD_VERSION_PATCH 10)
set(HSD_VERSION
${HSD_VERSION_MAJOR}.${HSD_VERSION_MINOR}.${HSD_VERSION_PATCH})

Expand Down
3 changes: 2 additions & 1 deletion src/HSD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,12 +503,13 @@ void HSD::init(const char **sphere, const char **property, const float *weight,

// Find the maximum buffer size for gradients
int nVertex = 0;
int nFace = m_ico_mesh->nFace();
int nFace = 0;
m_nMaxVertex = m_nQuerySamples;
for (int subj = 0; subj < m_nSubj; subj++)
{
m_nMaxVertex = (m_nMaxVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): m_nMaxVertex;
nVertex = (nVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): nVertex;
nFace = (nFace < m_spharm[subj].sphere->nFace()) ? m_spharm[subj].sphere->nFace(): nFace;
}

#ifdef _USE_CUDA_BLAS
Expand Down
10 changes: 5 additions & 5 deletions src/cuda/grad.cu
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ void Gradient::ATB(double *h_A, int nr_rows_A, int nr_cols_A, double *h_B, int n
_ATB(d_A, nr_rows_A, nr_cols_A, d_B, nr_cols_B, d_C, handle);
cublasDestroy(handle);

cudaMemcpyAsync(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost);

cudaFree(d_A);
cudaFree(d_B);
Expand Down Expand Up @@ -429,10 +429,9 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const
cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);

gradient_properties_kernel<<<nblocks,blocksize,0,stream[sid]>>>(dp_vertex0, nVertex, dp_face, nFace, dp_feature, dp_propertySamples, nSamples, dp_variance, dp_property, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_m_bar, dp_u1, dp_u2, dp_fid, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx);
//dEdx_kernel<<<nblocks2,blocksize,0,stream>>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
//dEdx_kernel<<<nblocks2,blocksize,0,stream[sid]>>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
_ATB(dp_gradient_raw, nSamples, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]);
for (int i = 0; i < 3; i++)
cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]);
Expand All @@ -443,6 +442,7 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const

if (hessian)
{
cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
_ATDA(dp_gradient_raw, dp_gradient_diag, nSamples, size * 3, dp_M_new, dp_gradient_work, sid);
_PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]);
cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]);
Expand Down Expand Up @@ -591,10 +591,9 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver
cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);

gradient_displacement_kernel<<<nblocks,blocksize,0,stream[sid]>>>(dp_vertex0, dp_vertex1, nVertex, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_u1, dp_u2, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx);
//dEdx_kernel<<<nblocks2,blocksize,0,stream>>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
//dEdx_kernel<<<nblocks2,blocksize,0,stream[sid]>>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
_ATB(dp_gradient_raw, nVertex, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]);
for (int i = 0; i < 3; i++)
cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]);
Expand All @@ -605,6 +604,7 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver

if (hessian)
{
cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
_ATDA(dp_gradient_raw, dp_gradient_diag, nVertex, size * 3, dp_M_new, dp_gradient_work, sid);
_PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]);
cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]);
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/grad.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class Gradient
static void _DA(double *d_D, double *d_A, int nr_rows, int nr_cols, double *d_B, cudaStream_t stream = 0);
static void _SA(double scalar, double *d_A, int nr_rows, int nr_cols, cudaStream_t stream = 0);
static void _PA(double *d_A, double *d_B, int nr_rows, int nr_cols, cudaStream_t stream = 0);
void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid);
void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid = 0);
void allocMemory(int nVertex, int nFace, int nSamples, int degree);
void freeMemory(void);

Expand Down
8 changes: 4 additions & 4 deletions wrapper/PARSE_ARGS.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ std::vector<std::string> listOutput;
std::vector<std::string> listLandmark;
std::vector<std::string> listCoeff;
std::vector<std::string> listSurf;
float weightMap = 0.1;
float weightMap = 1;
std::vector<float> listWeight;
float idprior = 625;
int degree = 15;
Expand All @@ -37,9 +37,9 @@ void PARSE_ARGS(int argc, char **argv)
std::string desc("Hierarchical Spherical Deformation for Cortical Surface Registration "
HSD_VERSION "\n"
"Author: Ilwoo Lyu\n"
"Please refer to the following paper for details:\n"
"[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, MICCAI 2018.\n"
"[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis 2019.\n"
"Please refer to the following papers for details:\n"
"[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, Medical Image Computing and Computer Assisted Intervention (MICCAI) 2018, LNCS11070, 853-861, 2018.\n"
"[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis, 57, 72-88, 2019.\n"
);

CLI::App app(desc);
Expand Down

0 comments on commit 182f720

Please sign in to comment.