more thrust exception handling (#357)
chrischoy committed May 27, 2021
1 parent f8039ec · commit c29157d
Showing 8 changed files with 107 additions and 92 deletions.
src/broadcast_kernel.cu (10 changes: 5 additions & 5 deletions)
@@ -241,11 +241,11 @@ void BroadcastBackwardKernelGPU(
// cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

// Sort COO first
thrust::sort_by_key(thrust::device, //
d_out_map, // key begin
d_out_map + nnz, // key end
d_in_map // value begin
);
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
d_out_map, // key begin
d_out_map + nnz, // key end
d_in_map // value begin
));

cusparseSpMMAlg_t mm_alg;
#if defined(CUDART_VERSION) && (CUDART_VERSION < 10010)
src/coordinate_map_gpu.cu (8 changes: 4 additions & 4 deletions)
@@ -973,7 +973,7 @@ CoordinateFieldMapGPU<coordinate_field_type, coordinate_int_type,
m_coordinate_size);

CUDA_CHECK(cudaStreamSynchronize(0));
kernel_map.decompose();
THRUST_CHECK(kernel_map.decompose());
LOG_DEBUG("origin map decomposed");

return kernel_map;
@@ -1660,7 +1660,7 @@ CoordinateMapGPU<coordinate_type, TemplatedAllocator>::kernel_map(
CUDA_CHECK(cudaStreamSynchronize(0));
LOG_DEBUG("Preallocated kernel map done");

kernel_map.decompose();
THRUST_CHECK(kernel_map.decompose());
base_type::m_byte_allocator.deallocate(
reinterpret_cast<char *>(d_p_count_per_thread),
num_threads * sizeof(index_type));
@@ -1730,7 +1730,7 @@ CoordinateMapGPU<coordinate_type, TemplatedAllocator>::kernel_map(
CUDA_CHECK(cudaMemcpy(kernel_map.out_maps.data(), d_p_valid_out_index,
valid_size * sizeof(index_type),
cudaMemcpyDeviceToDevice));
kernel_map.decompose();
THRUST_CHECK(kernel_map.decompose());

base_type::m_byte_allocator.deallocate(
reinterpret_cast<char *>(d_p_valid_in_index),
@@ -1961,7 +1961,7 @@ CoordinateMapGPU<coordinate_type, TemplatedAllocator>::origin_map(
m_coordinate_size);

CUDA_CHECK(cudaStreamSynchronize(0));
kernel_map.decompose();
THRUST_CHECK(kernel_map.decompose());
LOG_DEBUG("origin map decomposed");

return kernel_map;
src/gpu.cuh (7 changes: 7 additions & 0 deletions)
@@ -155,6 +155,13 @@ namespace minkowski {
<< __FILE__ << ":" << __LINE__); \
}

#define THRUST_CATCH \
catch (thrust::system_error e) { \
throw std::runtime_error(Formatter() \
<< "Thrust error: " << e.what() << " at " \
<< __FILE__ << ":" << __LINE__); \
}

// CUDA: library error reporting.
const char *cublasGetErrorString(cublasStatus_t error);
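
Note: THRUST_CHECK, which this commit applies throughout, is not redefined here; only the tail of an existing macro is visible in the context lines above this hunk. Judging from that context and from how the macro is invoked, it presumably wraps the given expression in a try/catch that converts thrust::system_error into std::runtime_error via the repository's Formatter helper. A minimal sketch, assuming that shape (the actual definition in src/gpu.cuh may differ):

// Sketch only: assumed definition of THRUST_CHECK, mirroring the
// THRUST_CATCH macro added above. Needs <thrust/system_error.h> and the
// Formatter stream helper already used in this header.
#define THRUST_CHECK(call)                                                  \
  try {                                                                     \
    call;                                                                   \
  } catch (thrust::system_error e) {                                        \
    throw std::runtime_error(Formatter()                                    \
                             << "Thrust error: " << e.what() << " at "      \
                             << __FILE__ << ":" << __LINE__);               \
  }

The two macros split the work: calls whose result is discarded (thrust::sort_by_key, thrust::sequence, thrust::copy) are wrapped directly with THRUST_CHECK(...), while calls whose return value is needed afterwards, such as thrust::reduce_by_key, are written as an explicit try { ... } THRUST_CATCH; block so that results like num_unique_keys stay in scope after the try.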

src/kernel_map.cuh (52 changes: 28 additions & 24 deletions)
@@ -314,15 +314,15 @@ public:
LOG_DEBUG("Decomposing", kernels.end() - kernels.begin(), "elements");
// the memory space must be initialized first!
// sort
thrust::sort_by_key(thrust::device, //
kernels.begin(), // key begin
kernels.end(), // key end
thrust::make_zip_iterator( // value begin
thrust::make_tuple( //
in_maps.begin(), //
out_maps.begin() //
) //
));
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
kernels.begin(), // key begin
kernels.end(), // key end
thrust::make_zip_iterator( // value begin
thrust::make_tuple( //
in_maps.begin(), //
out_maps.begin() //
) //
)));

#ifdef DEBUG
size_type map_size =
@@ -357,21 +357,25 @@ public:
gpu_storage<index_type, byte_allocator_type> out_key_min(m_capacity);
gpu_storage<index_type, byte_allocator_type> out_key_size(m_capacity);

auto end = thrust::reduce_by_key(
thrust::device, // policy
kernels.begin(), // key begin
kernels.end(), // key end
thrust::make_zip_iterator(
thrust::make_tuple(min_begin, size_begin)), // value begin
out_key.begin(), // key out begin
thrust::make_zip_iterator(thrust::make_tuple(
out_key_min.begin(), out_key_size.begin())), // value out begin
thrust::equal_to<index_type>(), // key equal binary predicate
detail::min_size_functor<index_type>() // value binary operator
);

size_type num_unique_keys = end.first - out_key.begin();
LOG_DEBUG(num_unique_keys, "unique kernel map keys found");
size_type num_unique_keys;

try {
auto end = thrust::reduce_by_key(
thrust::device, // policy
kernels.begin(), // key begin
kernels.end(), // key end
thrust::make_zip_iterator(
thrust::make_tuple(min_begin, size_begin)), // value begin
out_key.begin(), // key out begin
thrust::make_zip_iterator(thrust::make_tuple(
out_key_min.begin(), out_key_size.begin())), // value out begin
thrust::equal_to<index_type>(), // key equal binary predicate
detail::min_size_functor<index_type>() // value binary operator
);
num_unique_keys = end.first - out_key.begin();
LOG_DEBUG(num_unique_keys, "unique kernel map keys found");
}
THRUST_CATCH;

auto const cpu_out_keys = out_key.to_vector(num_unique_keys);
auto const cpu_out_offset = out_key_min.to_vector(num_unique_keys);
src/pooling_avg_kernel.cu (30 changes: 16 additions & 14 deletions)
@@ -214,10 +214,10 @@ void NonzeroAvgPoolingForwardKernelGPU(
CUDA_CHECK(cudaMemcpy(sorted_col_ptr, kernel_map.in_maps.begin(),
sparse_nnzs * sizeof(Itype), cudaMemcpyDeviceToDevice));

thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + sparse_nnzs, // key end
sorted_col_ptr);
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + sparse_nnzs, // key end
sorted_col_ptr));

// +---------+ +---+
// | spm | | i |
@@ -280,16 +280,18 @@ void NonzeroAvgPoolingForwardKernelGPU(
(Dtype *)allocator.allocate(sparse_nnzs * sizeof(Dtype));

// reduce by key
auto end = thrust::reduce_by_key(thrust::device, // policy
sorted_row_ptr, // key begin
sorted_row_ptr + sparse_nnzs, // key end
d_ones, // value begin
unique_row_ptr, // key out begin
reduced_val_ptr // value out begin
);

int num_unique_keys = end.first - unique_row_ptr;
LOG_DEBUG("Num unique keys:", num_unique_keys);
int num_unique_keys;
try {
auto end = thrust::reduce_by_key(thrust::device, // policy
sorted_row_ptr, // key begin
sorted_row_ptr + sparse_nnzs, // key end
d_ones, // value begin
unique_row_ptr, // key out begin
reduced_val_ptr // value out begin
);
num_unique_keys = end.first - unique_row_ptr;
LOG_DEBUG("Num unique keys:", num_unique_keys);
} THRUST_CATCH;

#ifdef DEBUG
Itype *p_unique_row = (Itype *)std::malloc(num_unique_keys * sizeof(Itype));
src/pooling_max_kernel.cu (33 changes: 18 additions & 15 deletions)
@@ -147,31 +147,34 @@ void max_pool_forward_pointer_kernel_gpu(
MapItype *d_reduced_out_map = d_scr + 2 * nmap + 2; // reduced output maps

// create number of in_feat per out, and starting index
thrust::sequence(thrust::device, d_index, d_index + nmap);
THRUST_CHECK(thrust::sequence(thrust::device, d_index, d_index + nmap));

////////////////////////////////
// Reduction
////////////////////////////////
// sort d_out_map and d_in_map with the d_out_map so that in_feat are
// placed adjacent according to out_map
if (!is_sorted)
thrust::sort_by_key(thrust::device, d_out_map, d_out_map + nmap, d_in_map);
THRUST_CHECK(thrust::sort_by_key(thrust::device, d_out_map,
d_out_map + nmap, d_in_map));

thrust::equal_to<MapItype> equal_pred;
thrust::minimum<MapItype> min_op;

auto reduction_pair =
thrust::reduce_by_key(thrust::device, // execution policy
d_out_map, // key begin
d_out_map + nmap, // key end
d_index, // val begin
d_reduced_out_map, // key out begin
d_in_map_min, // val out begin
equal_pred, // binary pred
min_op); // binary op
CUDA_CHECK(cudaStreamSynchronize(0));

size_t num_unique_out_map = reduction_pair.first - d_reduced_out_map;
size_t num_unique_out_map;

try {
auto reduction_pair =
thrust::reduce_by_key(thrust::device, // execution policy
d_out_map, // key begin
d_out_map + nmap, // key end
d_index, // val begin
d_reduced_out_map, // key out begin
d_in_map_min, // val out begin
equal_pred, // binary pred
min_op); // binary op
CUDA_CHECK(cudaStreamSynchronize(0));
num_unique_out_map = reduction_pair.first - d_reduced_out_map;
} THRUST_CATCH;

#ifdef DEBUG
std::cout << "num_unique_out_map: " << num_unique_out_map << "\n";
src/spmm.cu (55 changes: 27 additions & 28 deletions)
@@ -235,15 +235,15 @@ torch::Tensor coo_spmm(torch::Tensor const &rows, torch::Tensor const &cols,
CUDA_CHECK(cudaMemcpy(sorted_val_ptr, values_ptr, nnz * sizeof(scalar_t),
cudaMemcpyDeviceToDevice));

thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
thrust::make_zip_iterator( // value begin
thrust::make_tuple( //
sorted_col_ptr, //
sorted_val_ptr //
) //
));
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
thrust::make_zip_iterator( // value begin
thrust::make_tuple( //
sorted_col_ptr, //
sorted_val_ptr //
) //
)));
LOG_DEBUG("sorted row", cudaDeviceSynchronize());
} else {
sorted_row_ptr = row_indices_ptr;
@@ -481,10 +481,10 @@ coo_spmm_average(torch::Tensor const &rows, torch::Tensor const &cols,
CUDA_CHECK(cudaMemcpy(sorted_col_ptr, col_indices_ptr,
nnz * sizeof(th_int_type), cudaMemcpyDeviceToDevice));

thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
sorted_col_ptr);
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
sorted_col_ptr));

/////////////////////////////////////////////////////////////////////////
// Create vals
@@ -496,21 +496,20 @@
(scalar_t *)c10::cuda::CUDACachingAllocator::raw_alloc(
nnz * sizeof(scalar_t));
torch::Tensor ones = at::ones({nnz}, mat2.options());

// reduce by key
auto end = thrust::reduce_by_key(
thrust::device, // policy
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
reinterpret_cast<scalar_t *>(ones.data_ptr()), // value begin
unique_row_ptr, // key out begin
reduced_val_ptr // value out begin
);

int num_unique_keys = end.first - unique_row_ptr;
LOG_DEBUG("Num unique keys:", num_unique_keys);

// Create values
int num_unique_keys;
try {
// reduce by key
auto end = thrust::reduce_by_key(
thrust::device, // policy
sorted_row_ptr, // key begin
sorted_row_ptr + nnz, // key end
reinterpret_cast<scalar_t *>(ones.data_ptr()), // value begin
unique_row_ptr, // key out begin
reduced_val_ptr // value out begin
);
num_unique_keys = end.first - unique_row_ptr;
LOG_DEBUG("Num unique keys:", num_unique_keys);
} THRUST_CATCH;

// Copy the results to the correct output
inverse_val<th_int_type, scalar_t>
tests/cpp/coordinate_map_gpu_test.cu (4 changes: 2 additions & 2 deletions)
@@ -178,8 +178,8 @@ coordinate_map_batch_find_test(const torch::Tensor &coordinates,
std::vector<index_type> cpu_firsts(NR);
std::vector<index_type> cpu_seconds(NR);

thrust::copy(firsts.cbegin(), firsts.cend(), cpu_firsts.begin());
thrust::copy(seconds.cbegin(), seconds.cend(), cpu_seconds.begin());
THRUST_CHECK(thrust::copy(firsts.cbegin(), firsts.cend(), cpu_firsts.begin()));
THRUST_CHECK(thrust::copy(seconds.cbegin(), seconds.cend(), cpu_seconds.begin()));
return std::make_pair(cpu_firsts, cpu_seconds);
}

