From 182f7200b458a85b84081ee7258472d0ba5a1502 Mon Sep 17 00:00:00 2001
From: ilwoolyu <ilwoolyu@gmail.com>
Date: Fri, 24 Apr 2020 23:20:04 -0500
Subject: [PATCH] FIX: GPU memory allocation

---
 CMakeLists.txt       |  2 +-
 src/HSD.cpp          |  3 ++-
 src/cuda/grad.cu     | 10 +++++-----
 src/cuda/grad.h      |  2 +-
 wrapper/PARSE_ARGS.h |  8 ++++----
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9d9517..a52a027 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(HSD)
 
 set(HSD_VERSION_MAJOR 1)
 set(HSD_VERSION_MINOR 2)
-set(HSD_VERSION_PATCH 9)
+set(HSD_VERSION_PATCH 10)
 set(HSD_VERSION
 ${HSD_VERSION_MAJOR}.${HSD_VERSION_MINOR}.${HSD_VERSION_PATCH})
 
diff --git a/src/HSD.cpp b/src/HSD.cpp
index 029c71f..83cd58b 100644
--- a/src/HSD.cpp
+++ b/src/HSD.cpp
@@ -503,12 +503,13 @@ void HSD::init(const char **sphere, const char **property, const float *weight,
 
 	// Find the maximum buffer size for gradients
 	int nVertex = 0;
-	int nFace = m_ico_mesh->nFace();
+	int nFace = 0;
 	m_nMaxVertex = m_nQuerySamples;
 	for (int subj = 0; subj < m_nSubj; subj++)
 	{
 		m_nMaxVertex = (m_nMaxVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): m_nMaxVertex;
 		nVertex = (nVertex < m_spharm[subj].sphere->nVertex()) ? m_spharm[subj].sphere->nVertex(): nVertex;
+		nFace = (nFace < m_spharm[subj].sphere->nFace()) ? m_spharm[subj].sphere->nFace(): nFace;
 	}
 
 #ifdef _USE_CUDA_BLAS
diff --git a/src/cuda/grad.cu b/src/cuda/grad.cu
index 771ee8a..3f2ba28 100644
--- a/src/cuda/grad.cu
+++ b/src/cuda/grad.cu
@@ -290,7 +290,7 @@ void Gradient::ATB(double *h_A, int nr_rows_A, int nr_cols_A, double *h_B, int n
 	_ATB(d_A, nr_rows_A, nr_cols_A, d_B, nr_cols_B, d_C, handle);
 	cublasDestroy(handle);
 
-	cudaMemcpyAsync(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_C, d_C, nr_rows_A * nr_cols_B * sizeof(double), cudaMemcpyDeviceToHost);
 
 	cudaFree(d_A);
 	cudaFree(d_B);
@@ -429,10 +429,9 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const
 	cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
 	cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
 	cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
-	cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
 
 	gradient_properties_kernel<<<nblocks,blocksize,0,stream[sid]>>>(dp_vertex0, nVertex, dp_face, nFace, dp_feature, dp_propertySamples, nSamples, dp_variance, dp_property, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_m_bar, dp_u1, dp_u2, dp_fid, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx);
-	//dEdx_kernel<<<nblocks2,blocksize,0,stream>>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
+	//dEdx_kernel<<<nblocks2,blocksize,0,stream[sid]>>>(nSamples, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
 	_ATB(dp_gradient_raw, nSamples, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]);
 	for (int i = 0; i < 3; i++)
 		cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]);
@@ -443,6 +442,7 @@ void Gradient::updateGradientProperties(const float *vertex, int nVertex, const
 
 	if (hessian)
 	{
+		cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
 		_ATDA(dp_gradient_raw, dp_gradient_diag, nSamples, size * 3, dp_M_new, dp_gradient_work, sid);
 		_PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]);
 		cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]);
@@ -591,10 +591,9 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver
 	cudaMemcpyAsync(dp_u1, u1, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
 	cudaMemcpyAsync(dp_u2, u2, 3 * sizeof(float), cudaMemcpyHostToDevice, stream[sid]);
 	cudaMemcpyAsync(dp_gradient, gradient, (degree + 1) * (degree + 1) * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
-	cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
 
 	gradient_displacement_kernel<<<nblocks,blocksize,0,stream[sid]>>>(dp_vertex0, dp_vertex1, nVertex, dp_pole, dp_Y, dp_coeff, degree, deg_beg, deg_end, normalization, dp_u1, dp_u2, dp_gradient_new, dp_gradient_raw, dp_gradient_diag, dp_dEdx);
-	//dEdx_kernel<<<nblocks2,blocksize,0,stream>>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
+	//dEdx_kernel<<<nblocks2,blocksize,0,stream[sid]>>>(nVertex, degree, deg_beg, deg_end, dp_gradient_new, dp_gradient_raw, dp_dEdx);
 	_ATB(dp_gradient_raw, nVertex, size * 3, dp_dEdx, 1, dp_gradient_work, handle[sid], stream[sid]);
 	for (int i = 0; i < 3; i++)
 		cudaMemcpyAsync(&dp_gradient_new[n0 + (degree + 1) * (degree + 1) * i], &dp_gradient_work[i * size], size * sizeof(double), cudaMemcpyDeviceToDevice, stream[sid]);
@@ -605,6 +604,7 @@ void Gradient::updateGradientDsiplacement(const float *vertex0, const float *ver
 
 	if (hessian)
 	{
+		cudaMemcpyAsync(dp_M, M, size * 3 * size * 3 * sizeof(double), cudaMemcpyHostToDevice, stream[sid]);
 		_ATDA(dp_gradient_raw, dp_gradient_diag, nVertex, size * 3, dp_M_new, dp_gradient_work, sid);
 		_PA(dp_M, dp_M_new, size * 3, size * 3, stream[sid]);
 		cudaMemcpyAsync(M, dp_M, size * 3 * size * 3 * sizeof(double), cudaMemcpyDeviceToHost, stream[sid]);
diff --git a/src/cuda/grad.h b/src/cuda/grad.h
index 177a4d2..66a7b20 100644
--- a/src/cuda/grad.h
+++ b/src/cuda/grad.h
@@ -16,7 +16,7 @@ class Gradient
 	static void _DA(double *d_D, double *d_A, int nr_rows, int nr_cols, double *d_B, cudaStream_t stream = 0);
 	static void _SA(double scalar, double *d_A, int nr_rows, int nr_cols, cudaStream_t stream = 0);
 	static void _PA(double *d_A, double *d_B, int nr_rows, int nr_cols, cudaStream_t stream = 0);
-	void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid);
+	void _ATDA(double *d_A, double *d_D, int nr_rows_A, int nr_cols_A, double *d_B, double *d_C, int sid = 0);
 	void allocMemory(int nVertex, int nFace, int nSamples, int degree);
 	void freeMemory(void);
 
diff --git a/wrapper/PARSE_ARGS.h b/wrapper/PARSE_ARGS.h
index 10240ac..11689e4 100644
--- a/wrapper/PARSE_ARGS.h
+++ b/wrapper/PARSE_ARGS.h
@@ -15,7 +15,7 @@ std::vector<std::string> listOutput;
 std::vector<std::string> listLandmark;
 std::vector<std::string> listCoeff;
 std::vector<std::string> listSurf;
-float weightMap = 0.1;
+float weightMap = 1;
 std::vector<float> listWeight;
 float idprior = 625;
 int degree = 15;
@@ -37,9 +37,9 @@ void PARSE_ARGS(int argc, char **argv)
     std::string desc("Hierarchical Spherical Deformation for Cortical Surface Registration "
 					 HSD_VERSION "\n"
 					 "Author: Ilwoo Lyu\n"
-					 "Please refer to the following paper for details:\n"
-					 "[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, MICCAI 2018.\n"
-					 "[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis 2019.\n"
+					 "Please refer to the following papers for details:\n"
+					 "[1] Lyu et al., Hierarchical Spherical Deformation for Shape Correspondence, Medical Image Computing and Computer Assisted Intervention (MICCAI) 2018, LNCS11070, 853-861, 2018.\n"
+					 "[2] Lyu et al., Hierarchical Spherical Deformation for Cortical Surface Registration, Medical Image Analysis, 57, 72-88, 2019.\n"
 					 );
 
     CLI::App app(desc);