diff --git a/Gpufit/Gpufit.def b/Gpufit/Gpufit.def index 0e3b9db..f10c68e 100644 --- a/Gpufit/Gpufit.def +++ b/Gpufit/Gpufit.def @@ -4,4 +4,5 @@ EXPORTS gpufit_get_last_error @2 gpufit_get_cuda_version @3 gpufit_cuda_available @4 - gpufit_portable_interface @5 \ No newline at end of file + gpufit_portable_interface @5 + gpufit_cuda_interface @6 \ No newline at end of file diff --git a/Gpufit/constants.h b/Gpufit/constants.h index 9e85fcc..62964ee 100644 --- a/Gpufit/constants.h +++ b/Gpufit/constants.h @@ -23,4 +23,6 @@ enum FitState { CONVERGED = 0, MAX_ITERATION = 1, SINGULAR_HESSIAN = 2, NEG_CURV // return state enum ReturnState { OK = 0, ERROR = -1 }; +enum DataLocation { HOST = 0, DEVICE = 1 }; + #endif diff --git a/Gpufit/examples/CMakeLists.txt b/Gpufit/examples/CMakeLists.txt index bb4902f..d92178c 100644 --- a/Gpufit/examples/CMakeLists.txt +++ b/Gpufit/examples/CMakeLists.txt @@ -7,8 +7,18 @@ function( add_example module name ) set_property( TARGET ${name} PROPERTY FOLDER GpufitExamples ) endfunction() +function( add_cuda_example module name ) + cuda_add_executable( ${name} ${name}.cu ) + target_link_libraries( ${name} ${module} ) + set_property( TARGET ${name} + PROPERTY RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" ) + set_property( TARGET ${name} PROPERTY FOLDER GpufitExamples ) +endfunction() + # Examples add_example( Gpufit Simple_Example ) add_example( Gpufit Linear_Regression_Example ) add_example( Gpufit Gauss_Fit_2D_Example ) + +add_cuda_example( Gpufit CUDA_Interface_Example ) diff --git a/Gpufit/examples/CUDA_Interface_Example.cu b/Gpufit/examples/CUDA_Interface_Example.cu new file mode 100644 index 0000000..e27c0af --- /dev/null +++ b/Gpufit/examples/CUDA_Interface_Example.cu @@ -0,0 +1,321 @@ +#include "../gpufit.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CUDA_CHECK_STATUS( cuda_function_call ) \ + if (cudaError_t const status = cuda_function_call) \ + { \ + throw std::runtime_error( cudaGetErrorString( status ) ) ; \ + } + +void generate_gauss_2d( + std::vector const & x_coordinates, + std::vector const & y_coordinates, + std::vector const & gauss_params, + std::vector & output_values) +{ + // Generates a Gaussian 2D function at a set of X and Y coordinates. The Gaussian is defined by + // an array of five parameters. + + // x_coordinates: Vector of X coordinates. + // y_coordinates: Vector of Y coordinates. + // gauss_params: Vector of function parameters. + // output_values: Output vector containing the values of the Gaussian function at the + // corresponding X, Y coordinates. + + // gauss_params[0]: Amplitude + // gauss_params[1]: Center X position + // guass_params[2]: Center Y position + // gauss_params[3]: Gaussian width (standard deviation) + // gauss_params[4]: Baseline offset + + // This code assumes that x_coordinates.size == y_coordinates.size == output_values.size + + for (size_t i = 0; i < x_coordinates.size(); i++) + { + + float arg = -( (x_coordinates[i] - gauss_params[1]) * (x_coordinates[i] - gauss_params[1]) + + (y_coordinates[i] - gauss_params[2]) * (y_coordinates[i] - gauss_params[2]) ) + / (2.f * gauss_params[3] * gauss_params[3]); + + output_values[i] = gauss_params[0] * exp(arg) + gauss_params[4]; + + } +} + +void gauss_fit_2d_example() +{ + /* + This example generates test data in form of 10000 two dimensional Gaussian + peaks with the size of 5x5 data points per peak. It is noised by Poisson + distributed noise. The initial guesses were randomized, within a specified + range of the true value. The GAUSS_2D model is fitted to the test data sets + using the MLE estimator. + + The console output shows + - the execution time, + - the ratio of converged fits including ratios of not converged fits for + different reasons, + - the values of the true parameters and the mean values of the fitted + parameters including their standard deviation, + - the mean chi square value + - and the mean number of iterations needed. + + True parameters and noise and number of fits is the same as for the Matlab/Python 2D Gaussian examples. + */ + + + // number of fits, fit points and parameters + size_t const n_fits = 10; + size_t const size_x = 50; + size_t const n_points_per_fit = size_x * size_x; + size_t const n_model_parameters = 5; + + // true parameters (amplitude, center x position, center y position, width, offset) + std::vector< float > true_parameters{ 10.f, 14.5f, 14.5f, 3.f, 10.f}; + + std::cout << "generate example data" << std::endl; + + // initialize random number generator + std::mt19937 rng; + rng.seed(0); + std::uniform_real_distribution< float> uniform_dist(0, 1); + + // initial parameters (randomized) + std::vector< float > initial_parameters(n_fits * n_model_parameters); + for (size_t i = 0; i < n_fits; i++) + { + for (size_t j = 0; j < n_model_parameters; j++) + { + if (j == 1 || j == 2) + { + initial_parameters[i * n_model_parameters + j] + = true_parameters[j] + true_parameters[3] + * (-0.2f + 0.4f * uniform_dist(rng)); + } + else + { + initial_parameters[i * n_model_parameters + j] + = true_parameters[j] * (0.8f + 0.4f * uniform_dist(rng)); + } + } + } + + // generate x and y values + std::vector< float > x(n_points_per_fit); + std::vector< float > y(n_points_per_fit); + for (size_t i = 0; i < size_x; i++) + { + for (size_t j = 0; j < size_x; j++) { + x[i * size_x + j] = static_cast(j); + y[i * size_x + j] = static_cast(i); + } + } + + // generate test data with Poisson noise + std::vector< float > temp(n_points_per_fit); + generate_gauss_2d(x, y, true_parameters, temp); + + std::vector< float > data(n_fits * n_points_per_fit); + for (size_t i = 0; i < n_fits; i++) + { + for (size_t j = 0; j < n_points_per_fit; j++) + { + std::poisson_distribution< int > poisson_dist(temp[j]); + data[i * n_points_per_fit + j] = static_cast(poisson_dist(rng)); + } + } + + // tolerance + float const tolerance = 0.001f; + + // maximum number of iterations + int const max_number_iterations = 20; + + // estimator ID + int const estimator_id = MLE; + + // model ID + int const model_id = GAUSS_2D; + + // parameters to fit (all of them) + std::vector< int > parameters_to_fit(n_model_parameters, 1); + + // output parameters + std::vector< float > output_parameters(n_fits * n_model_parameters); + std::vector< int > output_states(n_fits); + std::vector< float > output_chi_square(n_fits); + std::vector< int > output_number_iterations(n_fits); + + float * gpu_data; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_data, + data.size() * sizeof(float))); + CUDA_CHECK_STATUS(cudaMemcpy( + gpu_data, + data.data(), + data.size() * sizeof(float), + cudaMemcpyHostToDevice)); + + float * gpu_initial_parameters; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_initial_parameters, + initial_parameters.size() * sizeof(float))); + CUDA_CHECK_STATUS(cudaMemcpy( + gpu_initial_parameters, + initial_parameters.data(), + initial_parameters.size() * sizeof(float), + cudaMemcpyHostToDevice)); + + float * gpu_weights; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_weights, + data.size() * sizeof(float))); + cudaMemset(gpu_weights, 1, data.size() * sizeof(float)); + + char * gpu_user_info; + CUDA_CHECK_STATUS(cudaMalloc( + &gpu_user_info, + data.size() * sizeof(float))); + + // call to gpufit (C interface) + std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now(); + int const status = gpufit_cuda_interface + ( + n_fits, + n_points_per_fit, + gpu_data, + gpu_weights, + model_id, + gpu_initial_parameters, + tolerance, + max_number_iterations, + parameters_to_fit.data(), + estimator_id, + data.size() * sizeof(float), + gpu_user_info, + output_parameters.data(), + output_states.data(), + output_chi_square.data(), + output_number_iterations.data() + ); + std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now(); + + CUDA_CHECK_STATUS(cudaFree(gpu_data)); + CUDA_CHECK_STATUS(cudaFree(gpu_initial_parameters)); + + // check status + if (status != ReturnState::OK) + { + throw std::runtime_error(gpufit_get_last_error()); + } + + // print execution time + std::cout << "execution time " + << std::chrono::duration_cast(time_1 - time_0).count() << " ms" << std::endl; + + // get fit states + std::vector< int > output_states_histogram(5, 0); + for (std::vector< int >::iterator it = output_states.begin(); it != output_states.end(); ++it) + { + output_states_histogram[*it]++; + } + + std::cout << "ratio converged " << (float)output_states_histogram[0] / n_fits << "\n"; + std::cout << "ratio max iteration exceeded " << (float)output_states_histogram[1] / n_fits << "\n"; + std::cout << "ratio singular hessian " << (float)output_states_histogram[2] / n_fits << "\n"; + std::cout << "ratio neg curvature MLE " << (float)output_states_histogram[3] / n_fits << "\n"; + std::cout << "ratio gpu not read " << (float)output_states_histogram[4] / n_fits << "\n"; + + // compute mean of fitted parameters for converged fits + std::vector< float > output_parameters_mean(n_model_parameters, 0); + for (size_t i = 0; i != n_fits; i++) + { + if (output_states[i] == FitState::CONVERGED) + { + for (size_t j = 0; j < n_model_parameters; j++) + { + output_parameters_mean[j] += output_parameters[i * n_model_parameters + j]; + } + } + } + // normalize + for (size_t j = 0; j < n_model_parameters; j++) + { + output_parameters_mean[j] /= output_states_histogram[0]; + } + + // compute std of fitted parameters for converged fits + std::vector< float > output_parameters_std(n_model_parameters, 0); + for (size_t i = 0; i != n_fits; i++) + { + if (output_states[i] == FitState::CONVERGED) + { + for (size_t j = 0; j < n_model_parameters; j++) + { + output_parameters_std[j] + += (output_parameters[i * n_model_parameters + j] - output_parameters_mean[j]) + * (output_parameters[i * n_model_parameters + j] - output_parameters_mean[j]); + } + } + } + // normalize and take square root + for (size_t j = 0; j < n_model_parameters; j++) + { + output_parameters_std[j] = sqrt(output_parameters_std[j] / output_states_histogram[0]); + } + + // print true value, fitted mean and std for every parameter + for (size_t j = 0; j < n_model_parameters; j++) + { + std::cout + << "parameter " << j + << " true " << true_parameters[j] + << " fitted mean " << output_parameters_mean[j] + << " std " << output_parameters_std[j] << std::endl; + } + + // compute mean chi-square for those converged + float output_chi_square_mean = 0; + for (size_t i = 0; i != n_fits; i++) + { + if (output_states[i] == FitState::CONVERGED) + { + output_chi_square_mean += output_chi_square[i]; + } + } + output_chi_square_mean /= static_cast(output_states_histogram[0]); + std::cout << "mean chi square " << output_chi_square_mean << std::endl; + + // compute mean number of iterations for those converged + float output_number_iterations_mean = 0; + for (size_t i = 0; i != n_fits; i++) + { + if (output_states[i] == FitState::CONVERGED) + { + output_number_iterations_mean += static_cast(output_number_iterations[i]); + } + } + // normalize + output_number_iterations_mean /= static_cast(output_states_histogram[0]); + std::cout << "mean number of iterations " << output_number_iterations_mean << std::endl; + +} + +int main(int argc, char *argv[]) +{ + gauss_fit_2d_example(); + + std::cout << std::endl << "Example completed!" << std::endl; + std::cout << "Press ENTER to exit" << std::endl; + std::getchar(); + + return 0; +} diff --git a/Gpufit/gpu_data.cu b/Gpufit/gpu_data.cu index ed5b774..1c41b3b 100644 --- a/Gpufit/gpu_data.cu +++ b/Gpufit/gpu_data.cu @@ -7,12 +7,20 @@ GPUData::GPUData(Info const & info) : chunk_size_(0), info_(info), - data_( info_.max_chunk_size_*info_.n_points_ ), - weights_( info_.use_weights_ ? info_.n_points_ * info_.max_chunk_size_ : 0 ), - parameters_( info_.max_chunk_size_*info_.n_parameters_ ), + data_( + (info_.data_location_ == HOST) + ? info_.max_chunk_size_*info_.n_points_ : 0), + weights_( + (info_.use_weights_ && info_.weight_location_ == HOST) + ? info_.n_points_ * info_.max_chunk_size_ : 0 ), + parameters_( + (info_.parameter_location_ == HOST) + ? info_.max_chunk_size_*info_.n_parameters_ : 0 ), + user_info_((info_.user_info_location_ == HOST) + ? info_.user_info_size_ : 0), + prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ), parameters_to_fit_indices_( info_.n_parameters_to_fit_ ), - user_info_( info_.user_info_size_ ), chi_squares_( info_.max_chunk_size_ * info_.n_blocks_per_fit_), prev_chi_squares_( info_.max_chunk_size_ ), @@ -70,19 +78,44 @@ void GPUData::init set(finished_, 0, chunk_size_); set(scaling_vectors_, 0.f, chunk_size_ * info_.n_parameters_to_fit_); - write( - data_, - &data[chunk_index_*info_.max_chunk_size_*info_.n_points_], - chunk_size_*info_.n_points_); - - if (info_.use_weights_) - write(weights_, &weights[chunk_index_*info_.max_chunk_size_*info_.n_points_], + if (info_.data_location_ == HOST) + { + write( + data_, + &data[chunk_index_*info_.max_chunk_size_*info_.n_points_], chunk_size_*info_.n_points_); - - write( - parameters_, - &initial_parameters[chunk_index_*info_.max_chunk_size_*info_.n_parameters_], - chunk_size_ * info_.n_parameters_); + } + else if (info_.data_location_ == DEVICE) + { + data_.assign(data + chunk_index_*info_.max_chunk_size_*info_.n_points_); + } + + if (info_.use_weights_) + { + if (info_.weight_location_ == HOST) + { + write(weights_, &weights[chunk_index_*info_.max_chunk_size_*info_.n_points_], + chunk_size_*info_.n_points_); + } + else if (info_.weight_location_ == DEVICE) + { + weights_.assign(weights + chunk_index_*info_.max_chunk_size_*info_.n_points_); + } + } + + if (info_.parameter_location_ == HOST) + { + write( + parameters_, + &initial_parameters[chunk_index_*info_.max_chunk_size_*info_.n_parameters_], + chunk_size_ * info_.n_parameters_); + } + else if (info_.parameter_location_ == DEVICE) + { + parameters_.assign( + initial_parameters + + chunk_index_*info_.max_chunk_size_*info_.n_parameters_); + } write(parameters_to_fit_indices_, parameters_to_fit_indices); @@ -92,7 +125,16 @@ void GPUData::init void GPUData::init_user_info(char const * const user_info) { if (info_.user_info_size_ > 0) - write(user_info_, user_info, info_.user_info_size_); + { + if (info_.user_info_location_ == HOST) + { + write(user_info_, user_info, info_.user_info_size_); + } + else if (info_.user_info_location_ == DEVICE) + { + user_info_.assign(user_info); + } + } } void GPUData::read(bool * dst, int const * src) diff --git a/Gpufit/gpu_data.cuh b/Gpufit/gpu_data.cuh index f8a5f11..ccabe72 100644 --- a/Gpufit/gpu_data.cuh +++ b/Gpufit/gpu_data.cuh @@ -33,7 +33,7 @@ struct Device_Array } } - ~Device_Array() { cudaFree(data_); } + ~Device_Array() { if (data_location_ == HOST) cudaFree(data_); } operator Type * () { return static_cast(data_); } operator Type const * () const { return static_cast(data_); } @@ -43,6 +43,12 @@ struct Device_Array return static_cast(data_); } + void assign(Type const * data) + { + data_ = const_cast(data); + data_location_ = DEVICE; + } + Type * copy(std::size_t const size, Type * const to) const { // TODO check size parameter @@ -62,6 +68,7 @@ struct Device_Array private: void * data_; + DataLocation data_location_; }; class GPUData diff --git a/Gpufit/gpufit.cpp b/Gpufit/gpufit.cpp index 2605f03..104a47f 100644 --- a/Gpufit/gpufit.cpp +++ b/Gpufit/gpufit.cpp @@ -41,7 +41,8 @@ try output_parameters, output_states, output_chi_squares, - output_n_iterations); + output_n_iterations, + HOST); fi.fit(static_cast(model_id)); @@ -60,6 +61,62 @@ catch( ... ) return ReturnState::ERROR; } +int gpufit_cuda_interface +( + size_t n_fits, + size_t n_points, + float * data, + float * weights, + int model_id, + float * initial_parameters, + float tolerance, + int max_n_iterations, + int * parameters_to_fit, + int estimator_id, + size_t user_info_size, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations +) +try +{ + FitInterface fi( + data, + weights, + n_fits, + static_cast(n_points), + tolerance, + max_n_iterations, + static_cast(estimator_id), + initial_parameters, + parameters_to_fit, + user_info, + user_info_size, + output_parameters, + output_states, + output_chi_squares, + output_n_iterations, + DEVICE); + + fi.fit(static_cast(model_id)); + + return ReturnState::OK; +} +catch (std::exception & exception) +{ + last_error = exception.what(); + + return ReturnState::ERROR; +} +catch (...) +{ + last_error = "unknown error"; + + return ReturnState::ERROR; +} + char const * gpufit_get_last_error() { return last_error.c_str() ; diff --git a/Gpufit/gpufit.h b/Gpufit/gpufit.h index a61f2ab..5a9d8ef 100644 --- a/Gpufit/gpufit.h +++ b/Gpufit/gpufit.h @@ -37,6 +37,26 @@ VISIBLE int gpufit int * output_n_iterations ) ; +VISIBLE int gpufit_cuda_interface +( + size_t n_fits, + size_t n_points, + float * data, + float * weights, + int model_id, + float * initial_parameters, + float tolerance, + int max_n_iterations, + int * parameters_to_fit, + int estimator_id, + size_t user_info_size, + char * user_info, + float * output_parameters, + int * output_states, + float * output_chi_squares, + int * output_n_iterations +); + VISIBLE char const * gpufit_get_last_error() ; // returns 1 if cuda is available and 0 otherwise diff --git a/Gpufit/info.cpp b/Gpufit/info.cpp index dc5a1b8..5c01bc3 100644 --- a/Gpufit/info.cpp +++ b/Gpufit/info.cpp @@ -70,8 +70,8 @@ void Info::set_max_chunk_size() { int one_fit_memory = sizeof(float) - *(2 * n_points_ // data, values - + 2 * n_parameters_ // parameters, prev_parameters + *(1 * n_points_ // values + + 1 * n_parameters_ // prev_parameters + 1 * n_blocks_per_fit_ // chi_square + 1 * n_parameters_to_fit_ * n_blocks_per_fit_ // gradient + 1 * n_parameters_to_fit_ * n_parameters_to_fit_ // hessian @@ -83,6 +83,13 @@ void Info::set_max_chunk_size() *(1 * n_parameters_to_fit_ // indices of fitted parameters + 5); // state, finished, iteration failed flag, // number of iterations, solution info + if (data_location_ == HOST) + one_fit_memory += sizeof(float) * n_points_; // data + if (parameter_location_ == HOST) + one_fit_memory += sizeof(float) * n_parameters_; // parameters + if (use_weights_ && weight_location_ == HOST) + one_fit_memory += sizeof(float) * n_points_; // weights + #ifdef ARCH_64 one_fit_memory += sizeof(float) @@ -91,11 +98,7 @@ void Info::set_max_chunk_size() + sizeof(int) * (1 * n_parameters_to_fit_); // pivot vector #endif // ARCH_64 - - - if (use_weights_) - one_fit_memory += sizeof(float) * n_points_; - + std::size_t tmp_chunk_size = available_gpu_memory_ / one_fit_memory; if (tmp_chunk_size == 0) diff --git a/Gpufit/info.h b/Gpufit/info.h index 3bfcef9..09d69cd 100644 --- a/Gpufit/info.h +++ b/Gpufit/info.h @@ -44,6 +44,11 @@ class Info int max_threads_; int warp_size_; + DataLocation data_location_; + DataLocation weight_location_; + DataLocation parameter_location_; + DataLocation user_info_location_; + private: std::size_t max_blocks_; std::size_t available_gpu_memory_; diff --git a/Gpufit/interface.cpp b/Gpufit/interface.cpp index 0277067..ca7e8e9 100644 --- a/Gpufit/interface.cpp +++ b/Gpufit/interface.cpp @@ -18,7 +18,8 @@ FitInterface::FitInterface float * output_parameters, int * output_states, float * output_chi_squares, - int * output_n_iterations + int * output_n_iterations, + DataLocation data_location ) : data_( data ), weights_( weights ), @@ -35,7 +36,8 @@ FitInterface::FitInterface output_states_(output_states), output_chi_squares_(output_chi_squares), output_n_iterations_(output_n_iterations), - n_parameters_(0) + n_parameters_(0), + data_location_(data_location) {} FitInterface::~FitInterface() @@ -71,6 +73,28 @@ void FitInterface::configure_info(Info & info, ModelID const model_id) info.configure(); } +void FitInterface::identify_input_locations(Info & info) +{ + if (data_location_ == HOST) + { + info.data_location_ = HOST; + info.parameter_location_ = HOST; + if (weights_) + info.weight_location_ = HOST; + if (user_info_size_ > 0) + info.user_info_location_ = HOST; + } + else if (data_location_ == DEVICE) + { + info.data_location_ = DEVICE; + info.parameter_location_ = DEVICE; + if (weights_) + info.weight_location_ = DEVICE; + if (user_info_size_ > 0) + info.user_info_location_ = DEVICE; + } +} + void FitInterface::fit(ModelID const model_id) { int n_dimensions = 0; @@ -79,6 +103,7 @@ void FitInterface::fit(ModelID const model_id) check_sizes(); Info info; + identify_input_locations(info); configure_info(info, model_id); LMFit lmfit diff --git a/Gpufit/interface.h b/Gpufit/interface.h index 03a73f4..5e66e49 100644 --- a/Gpufit/interface.h +++ b/Gpufit/interface.h @@ -24,7 +24,8 @@ class FitInterface float * output_parameters, int * output_states, float * output_chi_squares, - int * output_n_iterations + int * output_n_iterations, + DataLocation data_location ) ; virtual ~FitInterface(); @@ -33,6 +34,7 @@ class FitInterface private: void check_sizes(); void configure_info(Info & info, ModelID const model_id); + void identify_input_locations(Info & info); public: @@ -52,6 +54,8 @@ class FitInterface EstimatorID estimator_id_; std::size_t const user_info_size_; + DataLocation data_location_; + //output float * output_parameters_; int * output_states_;