From 07d78b65f901c5f3b2a804f0f0b5b4d51108817e Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 7 Mar 2024 00:32:43 -0800 Subject: [PATCH 1/2] Update CUDA-ExecutionProvider.md --- .../CUDA-ExecutionProvider.md | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/execution-providers/CUDA-ExecutionProvider.md b/docs/execution-providers/CUDA-ExecutionProvider.md index b2d6d68ef4784..d465f6bf1b973 100644 --- a/docs/execution-providers/CUDA-ExecutionProvider.md +++ b/docs/execution-providers/CUDA-ExecutionProvider.md @@ -338,8 +338,11 @@ shown below) if [N, C, 1, D] is preferred. While using the CUDA EP, ORT supports the usage of [CUDA Graphs](https://developer.nvidia.com/blog/cuda-10-features-revealed/) to remove CPU overhead associated with -launching CUDA kernels sequentially. To enable the usage of CUDA Graphs, use the provider option as shown in the samples -below. +launching CUDA kernels sequentially. To enable the usage of CUDA Graphs, use the provider options as shown in the samples +below. ORT supports multi-graph capture capability by passing the cuda graph annotation id to the run options. If the +cuda graph annotation id value is set to -1, cuda graph capture/replay is disabled in that run. User are not expected to +set the value to 0 as it is reserved for internal use. + Currently, there are some constraints with regards to using the CUDA Graphs feature: * Models with control-flow ops (i.e. `If`, `Loop` and `Scan` ops) are not supported. @@ -348,8 +351,7 @@ Currently, there are some constraints with regards to using the CUDA Graphs feat * The input/output types of models need to be tensors. -* Shapes of inputs/outputs cannot change across inference calls. Dynamic shape models are supported - the only - constraint is that the input/output shapes should be the same across all inference calls. +* Shapes of inputs/outputs cannot change across inference calls for the same graph annotation id. * By design, [CUDA Graphs](https://developer.nvidia.com/blog/cuda-10-features-revealed/) is designed to read from/write to the same CUDA virtual memory addresses during the graph replaying step as it does during the graph capturing step. @@ -385,22 +387,26 @@ captured and cached in the first `Run()`. session = onnxrt.InferenceSession("matmul_2.onnx", providers=providers) io_binding = session.io_binding() + # Pass gpu_graph_id to RunOptions through RunConfigs + ro = onnxrt.RunOptions() + ro.add_run_config_entry("gpu_graph_id", "1") + # Bind the input and output io_binding.bind_ortvalue_input('X', x_ortvalue) io_binding.bind_ortvalue_output('Y', y_ortvalue) # One regular run for the necessary memory allocation and cuda graph capturing - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) expected_y = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) # After capturing, CUDA graph replay happens from this Run onwards - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) np.testing.assert_allclose(expected_y, y_ortvalue.numpy(), rtol=1e-05, atol=1e-05) # Update input and then replay CUDA graph with the updated input x_ortvalue.update_inplace(np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32)) - session.run_with_iobinding(io_binding) + session.run_with_iobinding(io_binding, ro) ``` * C/C++ ```c++ @@ -429,6 +435,9 @@ captured and cached in the first `Run()`. Ort::SessionOptions session_options; api.SessionOptionsAppendExecutionProvider_CUDA_V2(static_cast(session_options), rel_cuda_options.get(); + // Pass gpu_graph_id to RunOptions through RunConfigs + Ort::RunOptions run_option; + run_option.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "1"); // Create IO bound inputs and outputs. Ort::Session session(*ort_env, ORT_TSTR("matmul_2.onnx"), session_options); @@ -459,15 +468,15 @@ captured and cached in the first `Run()`. binding.BindOutput("Y", bound_y); // One regular run for necessary memory allocation and graph capturing - session.Run(Ort::RunOptions(), binding); + session.Run(run_option, binding); // After capturing, CUDA graph replay happens from this Run onwards - session.Run(Ort::RunOptions(), binding); + session.Run(run_option, binding); // Update input and then replay CUDA graph with the updated input x_values = {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f}; cudaMemcpy(input_data.get(), x_values.data(), sizeof(float) * x_values.size(), cudaMemcpyHostToDevice); - session.Run(Ort::RunOptions(), binding); + session.Run(run_option, binding); ``` * C# (future) From 43ce9f49fd1331ad6afa623a70d6cb1ee061e297 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:32:48 -0800 Subject: [PATCH 2/2] review comments --- .../CUDA-ExecutionProvider.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/execution-providers/CUDA-ExecutionProvider.md b/docs/execution-providers/CUDA-ExecutionProvider.md index d465f6bf1b973..995a29277cfe8 100644 --- a/docs/execution-providers/CUDA-ExecutionProvider.md +++ b/docs/execution-providers/CUDA-ExecutionProvider.md @@ -339,9 +339,9 @@ shown below) if [N, C, 1, D] is preferred. While using the CUDA EP, ORT supports the usage of [CUDA Graphs](https://developer.nvidia.com/blog/cuda-10-features-revealed/) to remove CPU overhead associated with launching CUDA kernels sequentially. To enable the usage of CUDA Graphs, use the provider options as shown in the samples -below. ORT supports multi-graph capture capability by passing the cuda graph annotation id to the run options. If the -cuda graph annotation id value is set to -1, cuda graph capture/replay is disabled in that run. User are not expected to -set the value to 0 as it is reserved for internal use. +below. ORT supports multi-graph capture capability by passing the user specified gpu_graph_id to the run options. +gpu_graph_id is optional when the session uses one cuda graph. If not set, the default value is 0. If the gpu_graph_id is +set to -1, cuda graph capture/replay is disabled in that run. Currently, there are some constraints with regards to using the CUDA Graphs feature: @@ -351,7 +351,11 @@ Currently, there are some constraints with regards to using the CUDA Graphs feat * The input/output types of models need to be tensors. -* Shapes of inputs/outputs cannot change across inference calls for the same graph annotation id. +* Shapes and addresses of inputs/outputs cannot change across inference calls for the same graph annotation id. Input + tensors for replay shall be copied to the address of input tensors used in graph capture. + +* In multi-graph capture mode, the captured graphs will remain in the session's lifetime and the captured graph deletion + feature is not supported at the moment. * By design, [CUDA Graphs](https://developer.nvidia.com/blog/cuda-10-features-revealed/) is designed to read from/write to the same CUDA virtual memory addresses during the graph replaying step as it does during the graph capturing step. @@ -389,6 +393,7 @@ captured and cached in the first `Run()`. # Pass gpu_graph_id to RunOptions through RunConfigs ro = onnxrt.RunOptions() + # gpu_graph_id is optional if the session uses only one cuda graph ro.add_run_config_entry("gpu_graph_id", "1") # Bind the input and output @@ -437,7 +442,8 @@ captured and cached in the first `Run()`. // Pass gpu_graph_id to RunOptions through RunConfigs Ort::RunOptions run_option; - run_option.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "1"); + // gpu_graph_id is optional if the session uses only one cuda graph + run_option.AddConfigEntry("gpu_graph_id", "1"); // Create IO bound inputs and outputs. Ort::Session session(*ort_env, ORT_TSTR("matmul_2.onnx"), session_options);