From 6c43901bd077b155b70e025897f6ffd72545157f Mon Sep 17 00:00:00 2001 From: Adrian Przybylski Date: Mon, 26 Mar 2018 17:06:15 +0200 Subject: [PATCH] update to CUDA interface & memory management --- Gpufit/cuda_kernels.cu | 8 +- Gpufit/cuda_kernels.cuh | 2 + Gpufit/examples/CUDA_Interface_Example.cu | 64 ++++++++++++--- Gpufit/gpu_data.cu | 96 +++++++++++++---------- Gpufit/gpu_data.cuh | 13 +-- Gpufit/gpufit.cpp | 31 ++++---- Gpufit/gpufit.h | 15 ++-- Gpufit/info.cpp | 23 ++++-- Gpufit/info.h | 3 - Gpufit/interface.cpp | 24 +----- Gpufit/interface.h | 1 - Gpufit/lm_fit.cpp | 21 +++-- Gpufit/lm_fit_cuda.cu | 12 ++- 13 files changed, 186 insertions(+), 127 deletions(-) diff --git a/Gpufit/cuda_kernels.cu b/Gpufit/cuda_kernels.cu index a9f2f341..bdb4ea24 100644 --- a/Gpufit/cuda_kernels.cu +++ b/Gpufit/cuda_kernels.cu @@ -198,6 +198,7 @@ __device__ void sum_up_floats(volatile float* shared_array, int const size) __global__ void cuda_sum_chi_square_subtotals( float * chi_squares, + float const * subtotals, int const n_blocks_per_fit, int const n_fits, int const * finished) @@ -208,10 +209,11 @@ __global__ void cuda_sum_chi_square_subtotals( return; float * chi_square = chi_squares + index; + float const * subtotal = subtotals + index; double sum = 0.0; for (int i = 0; i < n_blocks_per_fit; i++) - sum += chi_square[i * n_fits]; + sum += subtotal[i * n_fits]; chi_square[0] = sum; } @@ -462,6 +464,7 @@ __global__ void cuda_calculate_chi_squares( __global__ void cuda_sum_gradient_subtotals( float * gradients, + float const * subtotals, int const n_blocks_per_fit, int const n_fits, int const n_parameters, @@ -475,10 +478,11 @@ __global__ void cuda_sum_gradient_subtotals( return; float * gradient = gradients + index; + float const * subtotal = subtotals + index; double sum = 0.0; for (int i = 0; i < n_blocks_per_fit; i++) - sum += gradient[i * n_fits * n_parameters]; + sum += subtotal[i * n_fits * n_parameters]; gradient[0] = sum; } diff --git a/Gpufit/cuda_kernels.cuh b/Gpufit/cuda_kernels.cuh index 7fe2860c..519d0b2f 100644 --- a/Gpufit/cuda_kernels.cuh +++ b/Gpufit/cuda_kernels.cuh @@ -7,6 +7,7 @@ void configure_model(ModelID const model_id, int & n_parameters, int & n_dimensi extern __global__ void cuda_sum_chi_square_subtotals( float * chi_squares, + float const * subtotals, int const n_blocks_per_fit, int const n_fits, int const * finished); @@ -34,6 +35,7 @@ extern __global__ void cuda_calculate_chi_squares( extern __global__ void cuda_sum_gradient_subtotals( float * gradients, + float const * subtotals, int const n_blocks_per_fit, int const n_fits, int const n_parameters, diff --git a/Gpufit/examples/CUDA_Interface_Example.cu b/Gpufit/examples/CUDA_Interface_Example.cu index e27c0af7..a9eeb27c 100644 --- a/Gpufit/examples/CUDA_Interface_Example.cu +++ b/Gpufit/examples/CUDA_Interface_Example.cu @@ -73,7 +73,7 @@ void gauss_fit_2d_example() // number of fits, fit points and parameters - size_t const n_fits = 10; + size_t const n_fits = 100000; size_t const size_x = 50; size_t const n_points_per_fit = size_x * size_x; size_t const n_model_parameters = 5; @@ -185,6 +185,21 @@ void gauss_fit_2d_example() &gpu_user_info, data.size() * sizeof(float))); + int * gpu_states; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_states, + output_states.size() * sizeof(int))); + + float * gpu_chi_squares; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_chi_squares, + output_chi_square.size() * sizeof(float))); + + int * gpu_n_iterations; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_n_iterations, + output_number_iterations.size() * sizeof(int))); + // call to gpufit (C interface) std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now(); int const status = gpufit_cuda_interface @@ -194,28 +209,53 @@ void gauss_fit_2d_example() gpu_data, gpu_weights, model_id, - gpu_initial_parameters, tolerance, max_number_iterations, parameters_to_fit.data(), estimator_id, data.size() * sizeof(float), gpu_user_info, - output_parameters.data(), - output_states.data(), - output_chi_square.data(), - output_number_iterations.data() + gpu_initial_parameters, + gpu_states, + gpu_chi_squares, + gpu_n_iterations ); std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now(); + // check status + if (status != ReturnState::OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } + + CUDA_CHECK_STATUS(cudaMemcpy( + output_parameters.data(), + gpu_initial_parameters, + output_parameters.size() * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_STATUS(cudaMemcpy( + output_states.data(), + gpu_states, + output_states.size() * sizeof(int), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_STATUS(cudaMemcpy( + output_chi_square.data(), + gpu_chi_squares, + output_chi_square.size() * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_STATUS(cudaMemcpy( + output_number_iterations.data(), + gpu_n_iterations, + output_number_iterations.size() * sizeof(int), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_STATUS(cudaFree(gpu_data)); + CUDA_CHECK_STATUS(cudaFree(gpu_weights)); + CUDA_CHECK_STATUS(cudaFree(gpu_user_info)); CUDA_CHECK_STATUS(cudaFree(gpu_initial_parameters)); - - // check status - if (status != ReturnState::OK) - { - throw std::runtime_error(gpufit_get_last_error()); - } + CUDA_CHECK_STATUS(cudaFree(gpu_states)); + CUDA_CHECK_STATUS(cudaFree(gpu_chi_squares)); + CUDA_CHECK_STATUS(cudaFree(gpu_n_iterations)); // print execution time std::cout << "execution time " diff --git a/Gpufit/gpu_data.cu b/Gpufit/gpu_data.cu index 1c41b3be..44ffb199 100644 --- a/Gpufit/gpu_data.cu +++ b/Gpufit/gpu_data.cu @@ -11,33 +11,49 @@ GPUData::GPUData(Info const & info) : (info_.data_location_ == HOST) ? info_.max_chunk_size_*info_.n_points_ : 0), weights_( - (info_.use_weights_ && info_.weight_location_ == HOST) + (info_.use_weights_ && info_.data_location_ == HOST) ? info_.n_points_ * info_.max_chunk_size_ : 0 ), parameters_( - (info_.parameter_location_ == HOST) + (info_.data_location_ == HOST) ? info_.max_chunk_size_*info_.n_parameters_ : 0 ), - user_info_((info_.user_info_location_ == HOST) + user_info_( + (info_.data_location_ == HOST) ? info_.user_info_size_ : 0), prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ), parameters_to_fit_indices_( info_.n_parameters_to_fit_ ), - chi_squares_( info_.max_chunk_size_ * info_.n_blocks_per_fit_), + chi_squares_( + (info_.data_location_ == HOST) + ? info_.max_chunk_size_ : 0), + prev_chi_squares_( info_.max_chunk_size_ ), - gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_blocks_per_fit_), + gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_), hessians_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_ ), deltas_(info_.max_chunk_size_ * info_.n_parameters_to_fit_), scaling_vectors_(info_.max_chunk_size_ * info_.n_parameters_to_fit_), + subtotals_( + (info_.n_blocks_per_fit_ > 1) + ? info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_blocks_per_fit_ : 0), + values_( info_.max_chunk_size_ * info_.n_points_ ), derivatives_( info_.max_chunk_size_ * info_.n_points_ * info_.n_parameters_ ), lambdas_( info_.max_chunk_size_ ), - states_( info_.max_chunk_size_ ), + + states_( + (info_.data_location_ == HOST) + ? info_.max_chunk_size_ : 0), + finished_( info_.max_chunk_size_ ), iteration_failed_(info_.max_chunk_size_), all_finished_( 1 ), - n_iterations_( info_.max_chunk_size_ ), + + n_iterations_( + (info_.data_location_ == HOST) + ? info_.max_chunk_size_ : 0), + solution_info_(info_.max_chunk_size_) #ifdef ARCH_64 @@ -68,57 +84,53 @@ void GPUData::init float const * const data, float const * const weights, float const * const initial_parameters, - std::vector const & parameters_to_fit_indices) + std::vector const & parameters_to_fit_indices, + int * states, + float * chi_squares, + int * n_iterations) { chunk_size_ = chunk_size; chunk_index_ = chunk_index; - set(prev_chi_squares_, 0.f, chunk_size_); - set(states_, 0, chunk_size_); - set(finished_, 0, chunk_size_); - set(scaling_vectors_, 0.f, chunk_size_ * info_.n_parameters_to_fit_); - if (info_.data_location_ == HOST) { write( data_, - &data[chunk_index_*info_.max_chunk_size_*info_.n_points_], - chunk_size_*info_.n_points_); - } - else if (info_.data_location_ == DEVICE) - { - data_.assign(data + chunk_index_*info_.max_chunk_size_*info_.n_points_); - } - - if (info_.use_weights_) - { - if (info_.weight_location_ == HOST) - { - write(weights_, &weights[chunk_index_*info_.max_chunk_size_*info_.n_points_], - chunk_size_*info_.n_points_); - } - else if (info_.weight_location_ == DEVICE) - { - weights_.assign(weights + chunk_index_*info_.max_chunk_size_*info_.n_points_); - } - } - - if (info_.parameter_location_ == HOST) - { + data + chunk_index_*info_.max_chunk_size_*info_.n_points_, + chunk_size_ * info_.n_points_); write( parameters_, - &initial_parameters[chunk_index_*info_.max_chunk_size_*info_.n_parameters_], + initial_parameters + chunk_index_*info_.max_chunk_size_*info_.n_parameters_, chunk_size_ * info_.n_parameters_); + if (info_.use_weights_) + write( + weights_, + weights + chunk_index_*info_.max_chunk_size_*info_.n_points_, + chunk_size_ * info_.n_points_); } - else if (info_.parameter_location_ == DEVICE) + else if (info_.data_location_ == DEVICE) { + data_.assign( + data + chunk_index_*info_.max_chunk_size_*info_.n_points_); parameters_.assign( - initial_parameters - + chunk_index_*info_.max_chunk_size_*info_.n_parameters_); + initial_parameters + chunk_index_*info_.max_chunk_size_*info_.n_parameters_); + if (info_.use_weights_) + weights_.assign( + weights + chunk_index_*info_.max_chunk_size_*info_.n_points_); + states_.assign( + states + chunk_index_ * info_.max_chunk_size_); + chi_squares_.assign( + chi_squares + chunk_index_ * info_.max_chunk_size_); + n_iterations_.assign( + n_iterations + chunk_index_ * info_.max_chunk_size_); } write(parameters_to_fit_indices_, parameters_to_fit_indices); + set(prev_chi_squares_, 0.f, chunk_size_); + set(finished_, 0, chunk_size_); + set(scaling_vectors_, 0.f, chunk_size_ * info_.n_parameters_to_fit_); + set(states_, 0, chunk_size_); set(lambdas_, 0.001f, chunk_size_); } @@ -126,11 +138,11 @@ void GPUData::init_user_info(char const * const user_info) { if (info_.user_info_size_ > 0) { - if (info_.user_info_location_ == HOST) + if (info_.data_location_ == HOST) { write(user_info_, user_info, info_.user_info_size_); } - else if (info_.user_info_location_ == DEVICE) + else if (info_.data_location_ == DEVICE) { user_info_.assign(user_info); } diff --git a/Gpufit/gpu_data.cuh b/Gpufit/gpu_data.cuh index ccabe723..ae3deb3c 100644 --- a/Gpufit/gpu_data.cuh +++ b/Gpufit/gpu_data.cuh @@ -11,7 +11,7 @@ template< typename Type > struct Device_Array { - explicit Device_Array(std::size_t const size) + explicit Device_Array(std::size_t const size) : allocated_size_(size) { std::size_t const maximum_size = std::numeric_limits< std::size_t >::max(); std::size_t const type_size = sizeof(Type); @@ -33,7 +33,7 @@ struct Device_Array } } - ~Device_Array() { if (data_location_ == HOST) cudaFree(data_); } + ~Device_Array() { if (allocated_size_ > 0) cudaFree(data_); } operator Type * () { return static_cast(data_); } operator Type const * () const { return static_cast(data_); } @@ -46,7 +46,6 @@ struct Device_Array void assign(Type const * data) { data_ = const_cast(data); - data_location_ = DEVICE; } Type * copy(std::size_t const size, Type * const to) const @@ -68,7 +67,7 @@ struct Device_Array private: void * data_; - DataLocation data_location_; + std::size_t allocated_size_; }; class GPUData @@ -84,7 +83,10 @@ public: float const * data, float const * weights, float const * initial_parameters, - std::vector const & parameters_to_fit_indices + std::vector const & parameters_to_fit_indices, + int * states, + float * chi_squares, + int * n_iterations ); void init_user_info(char const * user_info); @@ -123,6 +125,7 @@ public: Device_Array< float > hessians_; Device_Array< float > deltas_; Device_Array< float > scaling_vectors_; + Device_Array< float > subtotals_; Device_Array< float > values_; Device_Array< float > derivatives_; diff --git a/Gpufit/gpufit.cpp b/Gpufit/gpufit.cpp index 104a47fb..33fce06b 100644 --- a/Gpufit/gpufit.cpp +++ b/Gpufit/gpufit.cpp @@ -65,39 +65,38 @@ int gpufit_cuda_interface ( size_t n_fits, size_t n_points, - float * data, - float * weights, + float * gpu_data, + float * gpu_weights, int model_id, - float * initial_parameters, float tolerance, int max_n_iterations, int * parameters_to_fit, int estimator_id, size_t user_info_size, - char * user_info, - float * output_parameters, - int * output_states, - float * output_chi_squares, - int * output_n_iterations + char * gpu_user_info, + float * gpu_fit_parameters, + int * gpu_output_states, + float * gpu_output_chi_squares, + int * gpu_output_n_iterations ) try { FitInterface fi( - data, - weights, + gpu_data, + gpu_weights, n_fits, static_cast(n_points), tolerance, max_n_iterations, static_cast(estimator_id), - initial_parameters, + gpu_fit_parameters, parameters_to_fit, - user_info, + gpu_user_info, user_info_size, - output_parameters, - output_states, - output_chi_squares, - output_n_iterations, + gpu_fit_parameters, + gpu_output_states, + gpu_output_chi_squares, + gpu_output_n_iterations, DEVICE); fi.fit(static_cast(model_id)); diff --git a/Gpufit/gpufit.h b/Gpufit/gpufit.h index 5a9d8ef0..a61cbe73 100644 --- a/Gpufit/gpufit.h +++ b/Gpufit/gpufit.h @@ -41,20 +41,19 @@ VISIBLE int gpufit_cuda_interface ( size_t n_fits, size_t n_points, - float * data, - float * weights, + float * gpu_data, + float * gpu_weights, int model_id, - float * initial_parameters, float tolerance, int max_n_iterations, int * parameters_to_fit, int estimator_id, size_t user_info_size, - char * user_info, - float * output_parameters, - int * output_states, - float * output_chi_squares, - int * output_n_iterations + char * gpu_user_info, + float * gpu_fit_parameters, + int * gpu_output_states, + float * gpu_output_chi_squares, + int * gpu_output_n_iterations ); VISIBLE char const * gpufit_get_last_error() ; diff --git a/Gpufit/info.cpp b/Gpufit/info.cpp index 5c01bc3b..07912d21 100644 --- a/Gpufit/info.cpp +++ b/Gpufit/info.cpp @@ -72,8 +72,7 @@ void Info::set_max_chunk_size() = sizeof(float) *(1 * n_points_ // values + 1 * n_parameters_ // prev_parameters - + 1 * n_blocks_per_fit_ // chi_square - + 1 * n_parameters_to_fit_ * n_blocks_per_fit_ // gradient + + 1 * n_parameters_to_fit_ // gradient + 1 * n_parameters_to_fit_ * n_parameters_to_fit_ // hessian + 2 * n_parameters_to_fit_ // delta, scaling_vector + 1 * n_points_*n_parameters_ // derivatives @@ -81,14 +80,24 @@ void Info::set_max_chunk_size() + sizeof(int) *(1 * n_parameters_to_fit_ // indices of fitted parameters - + 5); // state, finished, iteration failed flag, - // number of iterations, solution info + + 3); // finished, iteration failed flag, + // solution info + if (n_blocks_per_fit_ > 1) + { + one_fit_memory + += sizeof(float) + * n_parameters_to_fit_ * n_blocks_per_fit_; // subtotals + } + if (data_location_ == HOST) + { one_fit_memory += sizeof(float) * n_points_; // data - if (parameter_location_ == HOST) one_fit_memory += sizeof(float) * n_parameters_; // parameters - if (use_weights_ && weight_location_ == HOST) - one_fit_memory += sizeof(float) * n_points_; // weights + one_fit_memory += sizeof(float); // chi-square + one_fit_memory += sizeof(int) * 2; // state, number of iterations + if (use_weights_) + one_fit_memory += sizeof(float) * n_points_; // weights + } #ifdef ARCH_64 one_fit_memory diff --git a/Gpufit/info.h b/Gpufit/info.h index 09d69cd8..7b1f78ac 100644 --- a/Gpufit/info.h +++ b/Gpufit/info.h @@ -45,9 +45,6 @@ class Info int warp_size_; DataLocation data_location_; - DataLocation weight_location_; - DataLocation parameter_location_; - DataLocation user_info_location_; private: std::size_t max_blocks_; diff --git a/Gpufit/interface.cpp b/Gpufit/interface.cpp index ca7e8e98..f2b091d4 100644 --- a/Gpufit/interface.cpp +++ b/Gpufit/interface.cpp @@ -68,33 +68,12 @@ void FitInterface::configure_info(Info & info, ModelID const model_id) info.user_info_size_ = user_info_size_; info.n_parameters_ = n_parameters_; info.use_weights_ = weights_ ? true : false; + info.data_location_ = data_location_; info.set_number_of_parameters_to_fit(parameters_to_fit_); info.configure(); } -void FitInterface::identify_input_locations(Info & info) -{ - if (data_location_ == HOST) - { - info.data_location_ = HOST; - info.parameter_location_ = HOST; - if (weights_) - info.weight_location_ = HOST; - if (user_info_size_ > 0) - info.user_info_location_ = HOST; - } - else if (data_location_ == DEVICE) - { - info.data_location_ = DEVICE; - info.parameter_location_ = DEVICE; - if (weights_) - info.weight_location_ = DEVICE; - if (user_info_size_ > 0) - info.user_info_location_ = DEVICE; - } -} - void FitInterface::fit(ModelID const model_id) { int n_dimensions = 0; @@ -103,7 +82,6 @@ void FitInterface::fit(ModelID const model_id) check_sizes(); Info info; - identify_input_locations(info); configure_info(info, model_id); LMFit lmfit diff --git a/Gpufit/interface.h b/Gpufit/interface.h index 5e66e49c..01fbcb9c 100644 --- a/Gpufit/interface.h +++ b/Gpufit/interface.h @@ -34,7 +34,6 @@ class FitInterface private: void check_sizes(); void configure_info(Info & info, ModelID const model_id); - void identify_input_locations(Info & info); public: diff --git a/Gpufit/lm_fit.cpp b/Gpufit/lm_fit.cpp index c396ab9d..838faa16 100644 --- a/Gpufit/lm_fit.cpp +++ b/Gpufit/lm_fit.cpp @@ -47,11 +47,17 @@ void LMFit::set_parameters_to_fit_indices() void LMFit::get_results(GPUData const & gpu_data, int const n_fits) { - output_parameters_ - = gpu_data.parameters_.copy( n_fits*info_.n_parameters_, output_parameters_ ) ; - output_states_ = gpu_data.states_.copy( n_fits, output_states_ ) ; - output_chi_squares_ = gpu_data.chi_squares_.copy( n_fits, output_chi_squares_ ) ; - output_n_iterations_ = gpu_data.n_iterations_.copy( n_fits, output_n_iterations_ ) ; + if (info_.data_location_ == HOST) + { + output_parameters_ + = gpu_data.parameters_.copy(n_fits*info_.n_parameters_, output_parameters_); + output_states_ + = gpu_data.states_.copy(n_fits, output_states_); + output_chi_squares_ + = gpu_data.chi_squares_.copy(n_fits, output_chi_squares_); + output_n_iterations_ + = gpu_data.n_iterations_.copy(n_fits, output_n_iterations_); + } } void LMFit::run(float const tolerance) @@ -74,7 +80,10 @@ void LMFit::run(float const tolerance) data_, weights_, initial_parameters_, - parameters_to_fit_indices_); + parameters_to_fit_indices_, + output_states_, + output_chi_squares_, + output_n_iterations_); LMFitCUDA lmfit_cuda( tolerance, diff --git a/Gpufit/lm_fit_cuda.cu b/Gpufit/lm_fit_cuda.cu index 7b1f8520..848f97e7 100644 --- a/Gpufit/lm_fit_cuda.cu +++ b/Gpufit/lm_fit_cuda.cu @@ -178,8 +178,11 @@ void LMFitCUDA::calc_chi_squares() int const shared_size = sizeof(float) * threads.x; + float * chi_squares = + info_.n_blocks_per_fit_ > 1 ? gpu_data_.subtotals_ : gpu_data_.chi_squares_; + cuda_calculate_chi_squares <<< blocks, threads, shared_size >>>( - gpu_data_.chi_squares_, + chi_squares, gpu_data_.states_, gpu_data_.data_, gpu_data_.values_, @@ -200,6 +203,7 @@ void LMFitCUDA::calc_chi_squares() { cuda_sum_chi_square_subtotals <<< blocks, threads >>> ( gpu_data_.chi_squares_, + gpu_data_.subtotals_, info_.n_blocks_per_fit_, n_fits_, gpu_data_.finished_); @@ -225,8 +229,11 @@ void LMFitCUDA::calc_gradients() int const shared_size = sizeof(float) * threads.x; + float * gradients + = info_.n_blocks_per_fit_ > 1 ? gpu_data_.subtotals_ : gpu_data_.gradients_; + cuda_calculate_gradients <<< blocks, threads, shared_size >>>( - gpu_data_.gradients_, + gradients, gpu_data_.data_, gpu_data_.values_, gpu_data_.derivatives_, @@ -252,6 +259,7 @@ void LMFitCUDA::calc_gradients() cuda_sum_gradient_subtotals <<< blocks, threads >>> ( gpu_data_.gradients_, + gpu_data_.subtotals_, info_.n_blocks_per_fit_, n_fits_, info_.n_parameters_to_fit_,