From 182f7200b458a85b84081ee7258472d0ba5a1502 Mon Sep 17 00:00:00 2001 From: ilwoolyu Date: Fri, 24 Apr 2020 23:20:04 -0500 Subject: [PATCH] FIX: GPU memory allocation --- CMakeLists.txt | 2 +- src/HSD.cpp | 3 ++- src/cuda/grad.cu | 10 +++++----- src/cuda/grad.h | 2 +- wrapper/PARSE_ARGS.h | 8 ++++---- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b9d9517..a52a027 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(HSD) set(HSD_VERSION_MAJOR 1) set(HSD_VERSION_MINOR 2) -set(HSD_VERSION_PATCH 9) +set(HSD_VERSION_PATCH 10) set(HSD_VERSION ${HSD_VERSION_MAJOR}.${HSD_VERSION_MINOR}.${HSD_VERSION_PATCH}) diff --git a/src/HSD.cpp b/src/HSD.cpp index 029c71f..83cd58b 100644 --- a/src/HSD.cpp +++ b/src/HSD.cpp @@ -503,12 +503,13 @@ void HSD::init(const char **sphere, const char **property, const float *weight, // Find the maximum buffer size for gradients int nVertex = 0; - int nFace = m_ico_mesh->nFace(); + int nFace = 0; m_nMaxVertex = m_nQuerySamples; for (int subj = 0; subj < m_nSubj; subj++) { m_nMaxVertex = (m_nMaxVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): m_nMaxVertex; nVertex = (nVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): nVertex; + nFace = (nFace < m_spharm[subj].sphere->nFace()) ? m_spharm[subj].sphere->nFace(): nFace; } #ifdef _USE_CUDA_BLAS diff --git a/src/cuda/grad.cu b/src/cuda/grad.cu index 771ee8a..3f2ba28 100644 --- a/src/cuda/grad.cu +++ b/src/cuda/grad.cu @@ -290,7 +290,7 @@ void Gradient::ATB(double *h_A, int nr_rows_A, int nr_cols_A, double *h_B, int n _ATB(d_A, nr_rows_A, nr_cols_A, d_B, nr_cols_B, d_C, handle); cublasDestroy(handle); - cudaMemcpyAsync(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_B); @@ -429,10 +429,9 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]); cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]); cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); - cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); gradient_properties_kernel<<>>(dp_vertex0, nVertex, dp_face, nFace, dp_feature, dp_propertySamples, nSamples, dp_variance, dp_property, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_m_bar, dp_u1, dp_u2, dp_fid, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx); - //dEdx_kernel<<>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx); + //dEdx_kernel<<>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx); _ATB(dp_gradient_raw, nSamples, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]); for (int i = 0; i < 3; i++) cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]); @@ -443,6 +442,7 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const if (hessian) { + cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); _ATDA(dp_gradient_raw, dp_gradient_diag, nSamples, size * 3, dp_M_new, dp_gradient_work, sid); _PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]); cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]); @@ -591,10 +591,9 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]); cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]); cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); - cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); gradient_displacement_kernel<<>>(dp_vertex0, dp_vertex1, nVertex, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_u1, dp_u2, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx); - //dEdx_kernel<<>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx); + //dEdx_kernel<<>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx); _ATB(dp_gradient_raw, nVertex, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]); for (int i = 0; i < 3; i++) cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]); @@ -605,6 +604,7 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver if (hessian) { + cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]); _ATDA(dp_gradient_raw, dp_gradient_diag, nVertex, size * 3, dp_M_new, dp_gradient_work, sid); _PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]); cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]); diff --git a/src/cuda/grad.h b/src/cuda/grad.h index 177a4d2..66a7b20 100644 --- a/src/cuda/grad.h +++ b/src/cuda/grad.h @@ -16,7 +16,7 @@ class Gradient static void _DA(double *d_D, double *d_A, int nr_rows, int nr_cols, double *d_B, cudaStream_t stream = 0); static void _SA(double scalar, double *d_A, int nr_rows, int nr_cols, cudaStream_t stream = 0); static void _PA(double *d_A, double *d_B, int nr_rows, int nr_cols, cudaStream_t stream = 0); - void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid); + void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid = 0); void allocMemory(int nVertex, int nFace, int nSamples, int degree); void freeMemory(void); diff --git a/wrapper/PARSE_ARGS.h b/wrapper/PARSE_ARGS.h index 10240ac..11689e4 100644 --- a/wrapper/PARSE_ARGS.h +++ b/wrapper/PARSE_ARGS.h @@ -15,7 +15,7 @@ std::vector listOutput; std::vector listLandmark; std::vector listCoeff; std::vector listSurf; -float weightMap = 0.1; +float weightMap = 1; std::vector listWeight; float idprior = 625; int degree = 15; @@ -37,9 +37,9 @@ void PARSE_ARGS(int argc, char **argv) std::string desc("Hierarchical Spherical Deformation for Cortical Surface Registration " HSD_VERSION "\n" "Author: Ilwoo Lyu\n" - "Please refer to the following paper for details:\n" - "[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, MICCAI 2018.\n" - "[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis 2019.\n" + "Please refer to the following papers for details:\n" + "[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, Medical Image Computing and Computer Assisted Intervention (MICCAI) 2018, LNCS11070, 853-861, 2018.\n" + "[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis, 57, 72-88, 2019.\n" ); CLI::App app(desc);