update to CUDA interface & memory management

adrianjp88 committed Mar 26, 2018
1 parent 433ce73 commit 6c43901
Showing 13 changed files with 186 additions and 127 deletions.
8 changes: 6 additions & 2 deletions Gpufit/cuda_kernels.cu
@@ -198,6 +198,7 @@ __device__ void sum_up_floats(volatile float* shared_array, int const size)

 __global__ void cuda_sum_chi_square_subtotals(
     float * chi_squares,
+    float const * subtotals,
     int const n_blocks_per_fit,
     int const n_fits,
     int const * finished)
@@ -208,10 +209,11 @@ __global__ void cuda_sum_chi_square_subtotals(
         return;
 
     float * chi_square = chi_squares + index;
+    float const * subtotal = subtotals + index;
 
     double sum = 0.0;
     for (int i = 0; i < n_blocks_per_fit; i++)
-        sum += chi_square[i * n_fits];
+        sum += subtotal[i * n_fits];
 
     chi_square[0] = sum;
 }
@@ -462,6 +464,7 @@ __global__ void cuda_calculate_chi_squares(

 __global__ void cuda_sum_gradient_subtotals(
     float * gradients,
+    float const * subtotals,
     int const n_blocks_per_fit,
     int const n_fits,
     int const n_parameters,
@@ -475,10 +478,11 @@ __global__ void cuda_sum_gradient_subtotals(
         return;
 
     float * gradient = gradients + index;
+    float const * subtotal = subtotals + index;
 
     double sum = 0.0;
     for (int i = 0; i < n_blocks_per_fit; i++)
-        sum += gradient[i * n_fits * n_parameters];
+        sum += subtotal[i * n_fits * n_parameters];
 
     gradient[0] = sum;
 }
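Both kernels now read the per-block partial sums from a dedicated, read-only subtotals buffer instead of reducing the output array in place. For reference, a minimal host-side sketch of the same strided reduction, assuming the block-major layout implied by the kernel indexing above (block i of a given fit sits at offset i * n_fits):

// Host-side reference for the reduction done by
// cuda_sum_chi_square_subtotals; for illustration only, the layout
// subtotals[i * n_fits + fit] is inferred from the kernel indexing.
#include <vector>

void sum_chi_square_subtotals_host(
    std::vector<float> & chi_squares,     // one result per fit
    std::vector<float> const & subtotals, // n_blocks_per_fit * n_fits values
    int const n_blocks_per_fit,
    int const n_fits)
{
    for (int fit = 0; fit < n_fits; fit++)
    {
        double sum = 0.0; // accumulate in double precision, as the kernel does
        for (int i = 0; i < n_blocks_per_fit; i++)
            sum += subtotals[i * n_fits + fit];
        chi_squares[fit] = static_cast<float>(sum);
    }
}

cuda_sum_gradient_subtotals works the same way, with a per-block stride of n_fits * n_parameters.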
2 changes: 2 additions & 0 deletions Gpufit/cuda_kernels.cuh
@@ -7,6 +7,7 @@ void configure_model(ModelID const model_id, int & n_parameters, int & n_dimensi

 extern __global__ void cuda_sum_chi_square_subtotals(
     float * chi_squares,
+    float const * subtotals,
     int const n_blocks_per_fit,
     int const n_fits,
     int const * finished);
@@ -34,6 +35,7 @@ extern __global__ void cuda_calculate_chi_squares(

 extern __global__ void cuda_sum_gradient_subtotals(
     float * gradients,
+    float const * subtotals,
     int const n_blocks_per_fit,
     int const n_fits,
     int const n_parameters,
64 changes: 52 additions & 12 deletions Gpufit/examples/CUDA_Interface_Example.cu
@@ -73,7 +73,7 @@ void gauss_fit_2d_example()


     // number of fits, fit points and parameters
-    size_t const n_fits = 10;
+    size_t const n_fits = 100000;
     size_t const size_x = 50;
     size_t const n_points_per_fit = size_x * size_x;
     size_t const n_model_parameters = 5;
@@ -185,6 +185,21 @@ void gauss_fit_2d_example()
         &gpu_user_info,
         data.size() * sizeof(float)));
 
+    int * gpu_states;
+    CUDA_CHECK_STATUS(cudaMalloc(
+        &gpu_states,
+        output_states.size() * sizeof(int)));
+
+    float * gpu_chi_squares;
+    CUDA_CHECK_STATUS(cudaMalloc(
+        &gpu_chi_squares,
+        output_chi_square.size() * sizeof(float)));
+
+    int * gpu_n_iterations;
+    CUDA_CHECK_STATUS(cudaMalloc(
+        &gpu_n_iterations,
+        output_number_iterations.size() * sizeof(int)));
+
     // call to gpufit (C interface)
     std::chrono::high_resolution_clock::time_point time_0 = std::chrono::high_resolution_clock::now();
     int const status = gpufit_cuda_interface
@@ -194,28 +209,53 @@
         gpu_data,
         gpu_weights,
         model_id,
-        gpu_initial_parameters,
         tolerance,
         max_number_iterations,
         parameters_to_fit.data(),
         estimator_id,
         data.size() * sizeof(float),
         gpu_user_info,
-        output_parameters.data(),
-        output_states.data(),
-        output_chi_square.data(),
-        output_number_iterations.data()
+        gpu_initial_parameters,
+        gpu_states,
+        gpu_chi_squares,
+        gpu_n_iterations
     );
     std::chrono::high_resolution_clock::time_point time_1 = std::chrono::high_resolution_clock::now();
 
+    // check status
+    if (status != ReturnState::OK)
+    {
+        throw std::runtime_error(gpufit_get_last_error());
+    }
+
+    CUDA_CHECK_STATUS(cudaMemcpy(
+        output_parameters.data(),
+        gpu_initial_parameters,
+        output_parameters.size() * sizeof(float),
+        cudaMemcpyDeviceToHost));
+    CUDA_CHECK_STATUS(cudaMemcpy(
+        output_states.data(),
+        gpu_states,
+        output_states.size() * sizeof(int),
+        cudaMemcpyDeviceToHost));
+    CUDA_CHECK_STATUS(cudaMemcpy(
+        output_chi_square.data(),
+        gpu_chi_squares,
+        output_chi_square.size() * sizeof(float),
+        cudaMemcpyDeviceToHost));
+    CUDA_CHECK_STATUS(cudaMemcpy(
+        output_number_iterations.data(),
+        gpu_n_iterations,
+        output_number_iterations.size() * sizeof(int),
+        cudaMemcpyDeviceToHost));
+
     CUDA_CHECK_STATUS(cudaFree(gpu_data));
     CUDA_CHECK_STATUS(cudaFree(gpu_weights));
     CUDA_CHECK_STATUS(cudaFree(gpu_user_info));
     CUDA_CHECK_STATUS(cudaFree(gpu_initial_parameters));
-
-    // check status
-    if (status != ReturnState::OK)
-    {
-        throw std::runtime_error(gpufit_get_last_error());
-    }
+    CUDA_CHECK_STATUS(cudaFree(gpu_states));
+    CUDA_CHECK_STATUS(cudaFree(gpu_chi_squares));
+    CUDA_CHECK_STATUS(cudaFree(gpu_n_iterations));
 
     // print execution time
     std::cout << "execution time "
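With the reworked interface the caller owns every device buffer: the outputs (states, chi-squares, iteration counts) are now allocated with cudaMalloc up front, gpufit_cuda_interface writes all results directly to device memory (the fitted parameters overwrite gpu_initial_parameters in place), and the host copies them back with cudaMemcpy before freeing. The example wraps each runtime call in CUDA_CHECK_STATUS; a hypothetical sketch of such a helper follows (the real definition lives in the example utilities, not in this diff, and may differ in name and detail):

// Hypothetical CUDA_CHECK_STATUS-style helper: runs a CUDA runtime
// call and throws a descriptive exception on any error.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

#define CUDA_CHECK_STATUS(cuda_function_call)                      \
    do                                                             \
    {                                                              \
        cudaError_t const cuda_status = (cuda_function_call);      \
        if (cuda_status != cudaSuccess)                            \
        {                                                          \
            throw std::runtime_error(                              \
                std::string("CUDA error: ")                        \
                + cudaGetErrorString(cuda_status));                \
        }                                                          \
    } while (0)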
96 changes: 54 additions & 42 deletions Gpufit/gpu_data.cu
@@ -11,33 +11,49 @@ GPUData::GPUData(Info const & info) :
         (info_.data_location_ == HOST)
         ? info_.max_chunk_size_*info_.n_points_ : 0),
     weights_(
-        (info_.use_weights_ && info_.weight_location_ == HOST)
+        (info_.use_weights_ && info_.data_location_ == HOST)
         ? info_.n_points_ * info_.max_chunk_size_ : 0 ),
     parameters_(
-        (info_.parameter_location_ == HOST)
+        (info_.data_location_ == HOST)
         ? info_.max_chunk_size_*info_.n_parameters_ : 0 ),
-    user_info_((info_.user_info_location_ == HOST)
+    user_info_(
+        (info_.data_location_ == HOST)
         ? info_.user_info_size_ : 0),
 
     prev_parameters_( info_.max_chunk_size_*info_.n_parameters_ ),
     parameters_to_fit_indices_( info_.n_parameters_to_fit_ ),
 
-    chi_squares_( info_.max_chunk_size_ * info_.n_blocks_per_fit_),
+    chi_squares_(
+        (info_.data_location_ == HOST)
+        ? info_.max_chunk_size_ : 0),
 
     prev_chi_squares_( info_.max_chunk_size_ ),
-    gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_blocks_per_fit_),
+    gradients_( info_.max_chunk_size_ * info_.n_parameters_to_fit_),
     hessians_( info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_parameters_to_fit_ ),
     deltas_(info_.max_chunk_size_ * info_.n_parameters_to_fit_),
     scaling_vectors_(info_.max_chunk_size_ * info_.n_parameters_to_fit_),
 
+    subtotals_(
+        (info_.n_blocks_per_fit_ > 1)
+        ? info_.max_chunk_size_ * info_.n_parameters_to_fit_ * info_.n_blocks_per_fit_ : 0),
+
     values_( info_.max_chunk_size_ * info_.n_points_ ),
     derivatives_( info_.max_chunk_size_ * info_.n_points_ * info_.n_parameters_ ),
 
     lambdas_( info_.max_chunk_size_ ),
-    states_( info_.max_chunk_size_ ),
+
+    states_(
+        (info_.data_location_ == HOST)
+        ? info_.max_chunk_size_ : 0),
+
     finished_( info_.max_chunk_size_ ),
     iteration_failed_(info_.max_chunk_size_),
     all_finished_( 1 ),
-    n_iterations_( info_.max_chunk_size_ ),
+
+    n_iterations_(
+        (info_.data_location_ == HOST)
+        ? info_.max_chunk_size_ : 0),
+
     solution_info_(info_.max_chunk_size_)
 
 #ifdef ARCH_64
@@ -68,69 +84,65 @@ void GPUData::init
     float const * const data,
     float const * const weights,
     float const * const initial_parameters,
-    std::vector<int> const & parameters_to_fit_indices)
+    std::vector<int> const & parameters_to_fit_indices,
+    int * states,
+    float * chi_squares,
+    int * n_iterations)
 {
     chunk_size_ = chunk_size;
     chunk_index_ = chunk_index;
 
-    set(prev_chi_squares_, 0.f, chunk_size_);
-    set(states_, 0, chunk_size_);
-    set(finished_, 0, chunk_size_);
-    set(scaling_vectors_, 0.f, chunk_size_ * info_.n_parameters_to_fit_);
-
     if (info_.data_location_ == HOST)
     {
         write(
             data_,
-            &data[chunk_index_*info_.max_chunk_size_*info_.n_points_],
-            chunk_size_*info_.n_points_);
-    }
-    else if (info_.data_location_ == DEVICE)
-    {
-        data_.assign(data + chunk_index_*info_.max_chunk_size_*info_.n_points_);
-    }
-
-    if (info_.use_weights_)
-    {
-        if (info_.weight_location_ == HOST)
-        {
-            write(weights_, &weights[chunk_index_*info_.max_chunk_size_*info_.n_points_],
-                chunk_size_*info_.n_points_);
-        }
-        else if (info_.weight_location_ == DEVICE)
-        {
-            weights_.assign(weights + chunk_index_*info_.max_chunk_size_*info_.n_points_);
-        }
-    }
-
-    if (info_.parameter_location_ == HOST)
-    {
+            data + chunk_index_*info_.max_chunk_size_*info_.n_points_,
+            chunk_size_ * info_.n_points_);
         write(
             parameters_,
-            &initial_parameters[chunk_index_*info_.max_chunk_size_*info_.n_parameters_],
+            initial_parameters + chunk_index_*info_.max_chunk_size_*info_.n_parameters_,
             chunk_size_ * info_.n_parameters_);
+        if (info_.use_weights_)
+            write(
+                weights_,
+                weights + chunk_index_*info_.max_chunk_size_*info_.n_points_,
+                chunk_size_ * info_.n_points_);
     }
-    else if (info_.parameter_location_ == DEVICE)
+    else if (info_.data_location_ == DEVICE)
     {
+        data_.assign(
+            data + chunk_index_*info_.max_chunk_size_*info_.n_points_);
         parameters_.assign(
-            initial_parameters
-            + chunk_index_*info_.max_chunk_size_*info_.n_parameters_);
+            initial_parameters + chunk_index_*info_.max_chunk_size_*info_.n_parameters_);
+        if (info_.use_weights_)
+            weights_.assign(
+                weights + chunk_index_*info_.max_chunk_size_*info_.n_points_);
+        states_.assign(
+            states + chunk_index_ * info_.max_chunk_size_);
+        chi_squares_.assign(
+            chi_squares + chunk_index_ * info_.max_chunk_size_);
+        n_iterations_.assign(
+            n_iterations + chunk_index_ * info_.max_chunk_size_);
     }
 
     write(parameters_to_fit_indices_, parameters_to_fit_indices);
 
+    set(prev_chi_squares_, 0.f, chunk_size_);
+    set(finished_, 0, chunk_size_);
+    set(scaling_vectors_, 0.f, chunk_size_ * info_.n_parameters_to_fit_);
+    set(states_, 0, chunk_size_);
     set(lambdas_, 0.001f, chunk_size_);
 }
 
 void GPUData::init_user_info(char const * const user_info)
 {
     if (info_.user_info_size_ > 0)
     {
-        if (info_.user_info_location_ == HOST)
+        if (info_.data_location_ == HOST)
         {
             write(user_info_, user_info, info_.user_info_size_);
         }
-        else if (info_.user_info_location_ == DEVICE)
+        else if (info_.data_location_ == DEVICE)
         {
             user_info_.assign(user_info);
         }
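Note the ordering change in GPUData::init: the set() calls that zero prev_chi_squares_, finished_, scaling_vectors_, and states_ moved below the HOST/DEVICE branch. In the DEVICE path, states_ (like chi_squares_ and n_iterations_) is constructed with size 0 and only points at the caller's buffer once assign() has run, so clearing it any earlier would touch a null or stale pointer. A self-contained toy of that alias-then-initialize pattern (hypothetical, CPU-only; the real buffers live in CUDA device memory):

// Toy model of the DEVICE path in GPUData::init: allocate nothing,
// alias the caller's buffer, and only then zero-initialize it.
#include <cassert>
#include <cstddef>
#include <vector>

struct Array
{
    explicit Array(std::size_t size) : owned_(size) { ptr_ = owned_.data(); }
    void assign(int * external) { ptr_ = external; } // non-owning alias
    int * ptr_ = nullptr;
private:
    std::vector<int> owned_; // owned storage (empty when size is 0)
};

int main()
{
    std::vector<int> caller_states(10, -1); // caller-provided output buffer

    Array states(0);                     // size 0: owns nothing
    states.assign(caller_states.data()); // alias the caller's memory

    // safe only now that the alias is established
    for (std::size_t i = 0; i < caller_states.size(); i++)
        states.ptr_[i] = 0;

    assert(caller_states[0] == 0); // results land in the caller's buffer
    return 0;
}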
13 changes: 8 additions & 5 deletions Gpufit/gpu_data.cuh
@@ -11,7 +11,7 @@
 template< typename Type >
 struct Device_Array
 {
-    explicit Device_Array(std::size_t const size)
+    explicit Device_Array(std::size_t const size) : allocated_size_(size)
     {
         std::size_t const maximum_size = std::numeric_limits< std::size_t >::max();
         std::size_t const type_size = sizeof(Type);
@@ -33,7 +33,7 @@ struct Device_Array
         }
     }
 
-    ~Device_Array() { if (data_location_ == HOST) cudaFree(data_); }
+    ~Device_Array() { if (allocated_size_ > 0) cudaFree(data_); }
 
     operator Type * () { return static_cast<Type *>(data_); }
     operator Type const * () const { return static_cast<Type *>(data_); }
@@ -46,7 +46,6 @@
     void assign(Type const * data)
     {
         data_ = const_cast<Type *>(data);
-        data_location_ = DEVICE;
     }
 
     Type * copy(std::size_t const size, Type * const to) const
@@ -68,7 +67,7 @@

 private:
     void * data_;
-    DataLocation data_location_;
+    std::size_t allocated_size_;
 };
 
 class GPUData
@@ -84,7 +83,10 @@ public:
         float const * data,
         float const * weights,
         float const * initial_parameters,
-        std::vector<int> const & parameters_to_fit_indices
+        std::vector<int> const & parameters_to_fit_indices,
+        int * states,
+        float * chi_squares,
+        int * n_iterations
     );
     void init_user_info(char const * user_info);

@@ -123,6 +125,7 @@ public:
     Device_Array< float > hessians_;
     Device_Array< float > deltas_;
     Device_Array< float > scaling_vectors_;
+    Device_Array< float > subtotals_;
 
     Device_Array< float > values_;
     Device_Array< float > derivatives_;
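Ownership in Device_Array is now tracked by size rather than by location: an array constructed with size > 0 owns its allocation and frees it in the destructor, while one constructed with size 0 owns nothing and can be turned into a non-owning view with assign(). The scheme relies on that convention, since assign() leaves allocated_size_ untouched; calling it on an owning array would leak the original allocation and later free the caller's pointer. A condensed sketch of the rule (hypothetical class name; copy() and the constructor's overflow check omitted):

// Size-based ownership, boiled down from the Device_Array changes.
#include <cstddef>
#include <cuda_runtime.h>

template< typename Type >
struct Owning_Or_Viewing_Array
{
    explicit Owning_Or_Viewing_Array(std::size_t const size)
        : data_(nullptr), allocated_size_(size)
    {
        if (size > 0)
            cudaMalloc(&data_, size * sizeof(Type)); // owning allocation
        // error handling omitted for brevity
    }

    // frees only memory this object allocated itself
    ~Owning_Or_Viewing_Array() { if (allocated_size_ > 0) cudaFree(data_); }

    // become a non-owning view; intended only for size-0 arrays
    void assign(Type const * data) { data_ = const_cast<Type *>(data); }

    operator Type * () { return static_cast<Type *>(data_); }

private:
    void * data_;
    std::size_t allocated_size_;
};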
(diffs for the remaining 8 changed files not shown)
