add hipGraph tests

jeffdaily committed Jan 10, 2024
1 parent 3097100 commit f3b46e4
Showing 2 changed files with 101 additions and 18 deletions.
7 changes: 7 additions & 0 deletions cmake/onnxruntime_unittests.cmake
@@ -1277,6 +1277,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
   if (onnxruntime_USE_CUDA)
     list(APPEND onnxruntime_shared_lib_test_LIBS cudart)
   endif()
+  if (onnxruntime_USE_ROCM)
+    list(APPEND onnxruntime_shared_lib_test_LIBS hip::host)
+  endif()
   if (onnxruntime_USE_TENSORRT)
     list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
   endif()
@@ -1294,6 +1297,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
     target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
     target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu)
   endif()
+  if (onnxruntime_USE_ROCM)
+    target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include)
+    target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__)
+  endif()
   if (CMAKE_SYSTEM_NAME STREQUAL "Android")
     target_sources(onnxruntime_shared_lib_test PRIVATE
       "${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc"
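The cmake change above links the shared-library tests against hip::host and defines __HIP_PLATFORM_AMD__, which is what lets a plain host C++ compiler (rather than hipcc) consume <hip/hip_runtime.h> for host-side runtime calls such as the hipMemcpy used in the tests below. A minimal host-only sketch of what this enables, assuming a standard ROCm install; the file name and build line are illustrative, not part of this commit:

// host_hip_check.cc -- hypothetical standalone file, built with a plain C++
// compiler, e.g.:
//   g++ host_hip_check.cc -D__HIP_PLATFORM_AMD__ -I${ROCM_HOME}/include \
//       -L${ROCM_HOME}/lib -lamdhip64
#define __HIP_PLATFORM_AMD__  // normally supplied via target_compile_definitions
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  hipError_t err = hipGetDeviceCount(&count);  // host-only HIP runtime call
  std::printf("%s: %d device(s)\n", hipGetErrorString(err), count);
  return err == hipSuccess ? 0 : 1;
}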
112 changes: 94 additions & 18 deletions onnxruntime/test/shared_lib/test_inference.cc
@@ -42,6 +42,10 @@
 #include <cuda_runtime.h>
 #endif
 
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
 // Once we use C++17 this could be replaced with std::size
 template <typename T, size_t N>
 constexpr size_t countof(T (&)[N]) { return N; }
@@ -1757,6 +1761,27 @@ TEST(CApiTest, get_allocator_cuda) {
 }
 #endif
 
+#ifdef USE_ROCM
+TEST(CApiTest, get_allocator_rocm) {
+  Ort::SessionOptions session_options;
+  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0));
+  Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options);
+
+  Ort::MemoryInfo info_rocm("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
+  Ort::Allocator rocm_allocator(session, info_rocm);
+
+  auto allocator_info = rocm_allocator.GetInfo();
+  ASSERT_TRUE(info_rocm == allocator_info);
+  void* p = rocm_allocator.Alloc(1024);
+  ASSERT_NE(p, nullptr);
+  rocm_allocator.Free(p);
+
+  auto mem_allocation = rocm_allocator.GetAllocation(1024);
+  ASSERT_NE(nullptr, mem_allocation.get());
+  ASSERT_EQ(1024U, mem_allocation.size());
+}
+#endif
+
 TEST(CApiTest, io_binding) {
   Ort::SessionOptions session_options;
   Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 1));
@@ -1932,7 +1957,7 @@ TEST(CApiTest, io_binding_cuda) {
 }
 #endif
 
-#if defined(USE_CUDA) || defined(USE_TENSORRT)
+#if defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM)
 TEST(CApiTest, basic_cuda_graph) {
   const auto& api = Ort::GetApi();
   Ort::SessionOptions session_options;
@@ -1950,7 +1975,7 @@ TEST(CApiTest, basic_cuda_graph) {
   ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2(
                   static_cast<OrtSessionOptions*>(session_options),
                   rel_trt_options.get()) == nullptr);
-#else
+#elif defined(USE_CUDA)
   // Enable cuda graph in cuda provider option.
   OrtCUDAProviderOptionsV2* cuda_options = nullptr;
   ASSERT_TRUE(api.CreateCUDAProviderOptions(&cuda_options) == nullptr);
@@ -1963,34 +1988,55 @@ TEST(CApiTest, basic_cuda_graph) {
   ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
                   static_cast<OrtSessionOptions*>(session_options),
                   rel_cuda_options.get()) == nullptr);
+#elif defined(USE_ROCM)
+  // Enable hip graph in rocm provider option.
+  OrtROCMProviderOptions* rocm_options = nullptr;
+  ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr);
+  std::unique_ptr<OrtROCMProviderOptions, decltype(api.ReleaseROCMProviderOptions)>
+      rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions);
+  std::vector<const char*> keys{"enable_hip_graph"};
+  std::vector<const char*> values{"1"};
+  ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr);
+
+  ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM(
+                  static_cast<OrtSessionOptions*>(session_options),
+                  rel_rocm_options.get()) == nullptr);
 #endif
 
   Ort::Session session(*ort_env, MODEL_URI, session_options);
-  Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
+#if defined(USE_ROCM)
+// local hipify
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+  Ort::MemoryInfo info_mem("Hip", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
+#else
+  Ort::MemoryInfo info_mem("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault);
+#endif
 
-  Ort::Allocator cuda_allocator(session, info_cuda);
-  auto allocator_info = cuda_allocator.GetInfo();
-  ASSERT_TRUE(info_cuda == allocator_info);
+  Ort::Allocator allocator(session, info_mem);
+  auto allocator_info = allocator.GetInfo();
+  ASSERT_TRUE(info_mem == allocator_info);
 
   const std::array<int64_t, 2> x_shape = {3, 2};
   std::array<float, 3 * 2> x_values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  auto input_data = cuda_allocator.GetAllocation(x_values.size() * sizeof(float));
+  auto input_data = allocator.GetAllocation(x_values.size() * sizeof(float));
 
   ASSERT_NE(input_data.get(), nullptr);
-  cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
+  (void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
 
   // Create an OrtValue tensor backed by data on CUDA memory
-  Ort::Value bound_x = Ort::Value::CreateTensor(info_cuda, reinterpret_cast<float*>(input_data.get()), x_values.size(),
+  Ort::Value bound_x = Ort::Value::CreateTensor(info_mem, reinterpret_cast<float*>(input_data.get()), x_values.size(),
                                                 x_shape.data(), x_shape.size());
 
   const std::array<int64_t, 2> expected_y_shape = {3, 2};
   std::array<float, 3 * 2> expected_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f};
-  auto output_data = cuda_allocator.GetAllocation(expected_y.size() * sizeof(float));
+  auto output_data = allocator.GetAllocation(expected_y.size() * sizeof(float));
 
   ASSERT_NE(output_data.get(), nullptr);
 
   // Create an OrtValue tensor backed by data on CUDA memory
-  Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast<float*>(output_data.get()),
+  Ort::Value bound_y = Ort::Value::CreateTensor(info_mem, reinterpret_cast<float*>(output_data.get()),
                                                 expected_y.size(), expected_y_shape.data(), expected_y_shape.size());
 
   // Create IoBinding for inputs and outputs.
@@ -2003,31 +2049,37 @@ TEST(CApiTest, basic_cuda_graph) {
 
   // Check the values against the bound raw memory (needs copying from device to host first)
   std::array<float, 3 * 2> y_values;
-  cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
+  (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
   ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
 
   // Replay the captured CUDA graph
   session.Run(Ort::RunOptions(), binding);
-  cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
+  (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
   ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
 
   // Change the input and replay the CUDA graph again.
   x_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f};
-  cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
+  (void)cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice);
   binding.SynchronizeInputs();
 
   session.Run(Ort::RunOptions(), binding);
-  cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
+  (void)cudaMemcpy(y_values.data(), output_data.get(), sizeof(float) * y_values.size(), cudaMemcpyDeviceToHost);
   expected_y = {10.0f, 40.0f, 90.0f, 160.0f, 250.0f, 360.0f};
   ASSERT_THAT(y_values, ::testing::ContainerEq(expected_y));
 
   // Clean up
   binding.ClearBoundInputs();
   binding.ClearBoundOutputs();
+#if defined(USE_ROCM)
+#undef cudaMemcpy
+#undef cudaMemcpyHostToDevice
+#undef cudaMemcpyDeviceToHost
+#endif
 }
 
-#ifndef REDUCED_OPS_BUILD
 // The following test uses some ops not supported in the reduced ops build
+#ifndef REDUCED_OPS_BUILD
+#if defined(USE_CUDA) || defined(USE_TENSORRT)
 TEST(CApiTest, cuda_graph_with_shape_nodes) {
   const auto& api = Ort::GetApi();
 
@@ -2048,10 +2100,34 @@ TEST(CApiTest, cuda_graph_with_shape_nodes) {
   // Successful loading of the ONNX model with shape nodes with cuda graph feature enabled
   Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options);
 }
-#endif
+#endif  // defined(USE_CUDA) || defined(USE_TENSORRT)
+
+#if defined(USE_ROCM)
+TEST(CApiTest, hip_graph_with_shape_nodes) {
+  const auto& api = Ort::GetApi();
+
+  // Enable hip graph in rocm provider option.
+  OrtROCMProviderOptions* rocm_options = nullptr;
+  ASSERT_TRUE(api.CreateROCMProviderOptions(&rocm_options) == nullptr);
+  std::unique_ptr<OrtROCMProviderOptions, decltype(api.ReleaseROCMProviderOptions)>
+      rel_rocm_options(rocm_options, api.ReleaseROCMProviderOptions);
+  std::vector<const char*> keys{"enable_hip_graph"};
+  std::vector<const char*> values{"1"};
+  ASSERT_TRUE(api.UpdateROCMProviderOptions(rel_rocm_options.get(), keys.data(), values.data(), 1) == nullptr);
+
+  Ort::SessionOptions session_options;
+  ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_ROCM(
+                  static_cast<OrtSessionOptions*>(session_options),
+                  rel_rocm_options.get()) == nullptr);
+
+  // Successful loading of the ONNX model with shape nodes with hip graph feature enabled
+  Ort::Session session(*ort_env, TSTR("testdata/cuda_graph_with_shape_nodes.onnx"), session_options);
+}
+#endif  // defined(USE_ROCM)
 
-#endif
+#endif  // REDUCED_OPS_BUILD
+
+#endif  // defined(USE_CUDA) || defined(USE_TENSORRT) || defined(USE_ROCM)

 TEST(CApiTest, create_tensor) {
   const char* s[] = {"abc", "kmp"};
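As background for what the enable_hip_graph provider option turns on: like CUDA Graphs, HIP can record the work submitted to a stream during one inference into a graph and then replay that graph on later runs, cutting per-launch CPU overhead; the repeated session.Run() calls in basic_cuda_graph above exercise exactly that replay path. A self-contained sketch of the underlying HIP runtime calls, assuming a ROCm toolchain; this is illustrative background, not code from this commit:

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  float* src = nullptr;
  float* dst = nullptr;
  (void)hipMalloc(&src, 6 * sizeof(float));
  (void)hipMalloc(&dst, 6 * sizeof(float));
  hipStream_t stream;
  (void)hipStreamCreate(&stream);

  // Capture: work issued on the stream is recorded into a graph, not executed.
  hipGraph_t graph;
  (void)hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal);
  (void)hipMemcpyAsync(dst, src, 6 * sizeof(float), hipMemcpyDeviceToDevice, stream);
  (void)hipStreamEndCapture(stream, &graph);

  // Instantiate once, then replay cheaply as many times as needed.
  hipGraphExec_t graph_exec;
  (void)hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);
  for (int i = 0; i < 3; ++i) {
    (void)hipGraphLaunch(graph_exec, stream);
  }
  (void)hipStreamSynchronize(stream);

  // Clean up.
  (void)hipGraphExecDestroy(graph_exec);
  (void)hipGraphDestroy(graph);
  (void)hipStreamDestroy(stream);
  (void)hipFree(src);
  (void)hipFree(dst);
  std::printf("captured and replayed a HIP graph\n");
  return 0;
}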
