diff --git a/CMakeLists.txt b/CMakeLists.txt index 2610719d..b81795e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -33,6 +33,16 @@ else() add_subdirectory(client_backend) +find_package(Git REQUIRED) + +execute_process(WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMAND "${GIT_EXECUTABLE}" log -n 1 --abbrev-commit --format=format:%h + RESULT_VARIABLE RETURN_CODE + OUTPUT_VARIABLE GIT_SHA) +if(NOT RETURN_CODE EQUAL "0") + set(GIT_SHA "unknown") +endif() + set( PERF_ANALYZER_SRCS command_line_parser.cc @@ -43,11 +53,23 @@ set( data_loader.cc concurrency_manager.cc request_rate_manager.cc + load_worker.cc + concurrency_worker.cc + request_rate_worker.cc custom_load_manager.cc + infer_context.cc inference_profiler.cc report_writer.cc mpi_utils.cc metrics_manager.cc + infer_data_manager_base.cc + infer_data_manager.cc + infer_data_manager_shm.cc + sequence_manager.cc + profile_data_collector.cc + profile_data_exporter.cc + periodic_concurrency_manager.cc + periodic_concurrency_worker.cc ) set( @@ -61,6 +83,11 @@ set( concurrency_manager.h request_rate_manager.h custom_load_manager.h + iworker.h + load_worker.h + request_rate_worker.h + concurrency_worker.h + infer_context.h inference_profiler.h report_writer.h mpi_utils.h @@ -68,6 +95,24 @@ set( constants.h metrics.h metrics_manager.h + infer_data_manager_factory.h + iinfer_data_manager.h + infer_data_manager.h + infer_data_manager_shm.h + infer_data_manager_base.h + infer_data.h + sequence_manager.h + sequence_status.h + ictx_id_tracker.h + concurrency_ctx_id_tracker.h + fifo_ctx_id_tracker.h + rand_ctx_id_tracker.h + request_record.h + profile_data_collector.h + profile_data_exporter.h + periodic_concurrency_manager.h + periodic_concurrency_worker.h + thread_config.h ) add_executable( @@ -85,6 +130,13 @@ target_link_libraries( ${CMAKE_DL_LIBS} ) +target_compile_definitions( + perf_analyzer + PRIVATE + PERF_ANALYZER_VERSION=${PERF_ANALYZER_VERSION} + GIT_SHA=${GIT_SHA} +) + # If gpu is enabled then compile with CUDA dependencies if(TRITON_ENABLE_GPU) target_compile_definitions( @@ -119,6 +171,13 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) ) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + target_compile_definitions( + client-backend-library + PUBLIC TRITON_ENABLE_PERF_ANALYZER_OPENAI=1 + ) +endif() + install( TARGETS perf_analyzer RUNTIME DESTINATION bin @@ -142,19 +201,43 @@ add_executable( ${PERF_ANALYZER_UNIT_TESTS_SRCS} ${PERF_ANALYZER_UNIT_TESTS_HDRS} mock_inference_profiler.h + mock_model_parser.h + test_utils.h + client_backend/mock_client_backend.h + mock_concurrency_worker.h + mock_data_loader.h + mock_infer_context.h + mock_infer_data_manager.h + mock_request_rate_worker.h + mock_sequence_manager.h + mock_profile_data_collector.h + mock_profile_data_exporter.h + test_dataloader.cc test_inference_profiler.cc test_command_line_parser.cc + test_idle_timer.cc + test_load_manager_base.h + test_load_manager.cc test_model_parser.cc test_metrics_manager.cc + test_perf_utils.cc test_report_writer.cc client_backend/triton/test_triton_client_backend.cc + test_request_rate_manager.cc + test_concurrency_manager.cc + test_custom_load_manager.cc + test_sequence_manager.cc + test_infer_context.cc + test_ctx_id_tracker.cc + 
test_profile_data_collector.cc + test_profile_data_exporter.cc $ ) # -Wno-write-strings is needed for the unit tests in order to statically create # input argv cases in the CommandLineParser unit test # -set_target_properties(perf_analyzer_unit_tests +set_target_properties(perf_analyzer_unit_tests PROPERTIES COMPILE_FLAGS "-Wno-write-strings") target_link_libraries( diff --git a/README.md b/README.md new file mode 100644 index 00000000..e910f466 --- /dev/null +++ b/README.md @@ -0,0 +1,171 @@ + + +# Triton Performance Analyzer + +Triton Performance Analyzer is a CLI tool which can help you optimize the +inference performance of models running on Triton Inference Server by measuring +changes in performance as you experiment with different optimization strategies. +
+ +# Features + +### Inference Load Modes + +- [Concurrency Mode](docs/inference_load_modes.md#concurrency-mode) simulates + load by maintaining a specific concurrency of outgoing requests to the + server + +- [Request Rate Mode](docs/inference_load_modes.md#request-rate-mode) simulates + load by sending consecutive requests at a specific rate to the server + +- [Custom Interval Mode](docs/inference_load_modes.md#custom-interval-mode) + simulates load by sending consecutive requests at specific intervals to the + server + +### Performance Measurement Modes + +- [Time Windows Mode](docs/measurements_metrics.md#time-windows) measures model + performance repeatedly over a specific time interval until performance has + stabilized + +- [Count Windows Mode](docs/measurements_metrics.md#count-windows) measures + model performance repeatedly over a specific number of requests until + performance has stabilized + +### Other Features + +- [Sequence Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models), + [Ensemble Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models), + and + [Decoupled Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) + can be profiled in addition to standard/stateless/coupled models + +- [Input Data](docs/input_data.md) to model inferences can be auto-generated or + specified, and model output can be verified + +- [TensorFlow Serving](docs/benchmarking.md#benchmarking-tensorflow-serving) and + [TorchServe](docs/benchmarking.md#benchmarking-torchserve) can be used as the + inference server in addition to the default Triton server + +
+ +# Quick Start + +The steps below will guide you on how to start using Perf Analyzer. + +### Step 1: Start Triton Container + +```bash +export RELEASE= # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02` + +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3 + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3 +``` + +### Step 2: Download `simple` Model + +```bash +# inside triton container +git clone --depth 1 https://github.com/triton-inference-server/server + +mkdir model_repository ; cp -r server/docs/examples/model_repository/simple model_repository +``` + +### Step 3: Start Triton Server + +```bash +# inside triton container +tritonserver --model-repository $(pwd)/model_repository &> server.log & + +# confirm server is ready, look for 'HTTP/1.1 200 OK' +curl -v localhost:8000/v2/health/ready + +# detach (CTRL-p CTRL-q) +``` + +### Step 4: Start Triton SDK Container + +```bash +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +### Step 5: Run Perf Analyzer + +```bash +# inside sdk container +perf_analyzer -m simple +``` + +See the full [quick start guide](docs/quick_start.md) for additional tips on +how to analyze output. + +
+ +# Documentation + +- [Installation](docs/install.md) +- [Perf Analyzer CLI](docs/cli.md) +- [Inference Load Modes](docs/inference_load_modes.md) +- [Input Data](docs/input_data.md) +- [Measurements & Metrics](docs/measurements_metrics.md) +- [Benchmarking](docs/benchmarking.md) + +
+ +# Contributing + +Contributions to Triton Perf Analyzer are more than welcome. To contribute +please review the [contribution +guidelines](https://github.com/triton-inference-server/server/blob/main/CONTRIBUTING.md), +then fork and create a pull request. + +
+ +# Reporting problems, asking questions + +We appreciate any feedback, questions, or bug reports regarding this +project. When help with code is needed, follow the process outlined in +the Stack Overflow guide on minimal reproducible examples +(https://stackoverflow.com/help/mcve). Ensure posted examples are: + +- minimal - use as little code as possible that still produces the + same problem + +- complete - provide all parts needed to reproduce the problem. Check + if you can strip external dependencies and still show the problem. The + less time we spend reproducing problems, the more time we have to + fix them + +- verifiable - test the code you're about to provide to make sure it + reproduces the problem. Remove all other problems that are not + related to your request/question. diff --git a/base_queue_ctx_id_tracker.h b/base_queue_ctx_id_tracker.h new file mode 100644 index 00000000..ba0f1781 --- /dev/null +++ b/base_queue_ctx_id_tracker.h @@ -0,0 +1,67 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
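The new `base_queue_ctx_id_tracker.h` below defines a queue-backed base class for context-ID trackers. As a rough usage sketch only — `SimpleFifoCtxIdTracker`, its `Reset()` helper, and the assumption that `ICtxIdTracker` declares no further pure-virtual methods are mine, not part of this patch — a derived tracker could seed the queue and hand IDs out FIFO-style:

```cpp
// Hypothetical sketch, not code from this patch: shows how a subclass of the
// BaseQueueCtxIdTracker defined below might be seeded and used. Assumes
// ICtxIdTracker has no additional pure-virtual methods beyond those the base
// class already overrides.
#include <cstddef>

#include "base_queue_ctx_id_tracker.h"

class SimpleFifoCtxIdTracker
    : public triton::perfanalyzer::BaseQueueCtxIdTracker {
 public:
  // Make context IDs 0..count-1 available (assumed seeding semantics).
  void Reset(size_t count)
  {
    Clear();
    for (size_t i = 0; i < count; i++) {
      Restore(i);
    }
  }
};

void
ExampleUsage()
{
  SimpleFifoCtxIdTracker tracker;
  tracker.Reset(4);           // IDs 0,1,2,3 become available
  size_t id = tracker.Get();  // take an ID for an in-flight context
  // ... run an inference context identified by `id` ...
  tracker.Restore(id);        // return the ID once the context is free
}
```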
+#pragma once + +#include <queue> + +#include "ictx_id_tracker.h" + +namespace triton { namespace perfanalyzer { + +// Base class for CtxIdTrackers that track available IDs via a queue +// +class BaseQueueCtxIdTracker : public ICtxIdTracker { + public: + BaseQueueCtxIdTracker() = default; + + void Restore(size_t id) override { free_ctx_ids_.push(id); } + + size_t Get() override + { + if (!IsAvailable()) { + throw std::runtime_error("free ctx id list is empty"); + } + + size_t ctx_id = free_ctx_ids_.front(); + free_ctx_ids_.pop(); + return ctx_id; + } + + bool IsAvailable() override { return free_ctx_ids_.size() > 0; } + + protected: + std::queue<size_t> free_ctx_ids_; + + // Erase all entries in the tracking queue + // + void Clear() + { + std::queue<size_t> empty; + std::swap(free_ctx_ids_, empty); + } +}; + +}}; // namespace triton::perfanalyzer diff --git a/client_backend/CMakeLists.txt b/client_backend/CMakeLists.txt index 23da6f32..2c780ee2 100644 --- a/client_backend/CMakeLists.txt +++ b/client_backend/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -43,6 +43,10 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) add_subdirectory(torchserve) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + add_subdirectory(openai) +endif() + set( CLIENT_BACKEND_SRCS client_backend.cc @@ -71,6 +75,12 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS) set(TS_TARGET_INCLUDE_DIRECTORY PRIVATE $) endif() +if(TRITON_ENABLE_PERF_ANALYZER_OPENAI) + set(OPENAI_LIBRARY $) + set(OPENAI_TARGET_LINK_LIBRARY PUBLIC $) + set(OPENAI_TARGET_INCLUDE_DIRECTORY PRIVATE $) +endif() + add_library( client-backend-library ${CLIENT_BACKEND_SRCS} @@ -80,6 +90,7 @@ add_library( ${CAPI_LIBRARY} ${TFS_LIBRARY} ${TS_LIBRARY} + ${OPENAI_LIBRARY} ) target_link_libraries( @@ -89,6 +100,7 @@ target_link_libraries( ${CAPI_TARGET_LINK_LIBRARY} ${TFS_TARGET_LINK_LIBRARY} ${TS_TARGET_LINK_LIBRARY} + ${OPENAI_TARGET_LINK_LIBRARY} ) target_include_directories( @@ -97,4 +109,5 @@ target_include_directories( ${CAPI_TARGET_INCLUDE_DIRECTORY} ${TFS_TARGET_INCLUDE_DIRECTORY} ${TS_TARGET_INCLUDE_DIRECTORY} + ${OPENAI_TARGET_INCLUDE_DIRECTORY} ) diff --git a/client_backend/client_backend.cc b/client_backend/client_backend.cc index f886678d..09af5e5e 100644 --- a/client_backend/client_backend.cc +++ b/client_backend/client_backend.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,6 +32,10 @@ #include "triton_c_api/triton_c_api_backend.h" #endif // TRITON_ENABLE_PERF_ANALYZER_C_API +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI +#include "openai/openai_client_backend.h" +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI + #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS #include "tensorflow_serving/tfserve_client_backend.h" #endif // TRITON_ENABLE_PERF_ANALYZER_TFS @@ -86,6 +90,9 @@ BackendKindToString(const BackendKind kind) case TRITON_C_API: return std::string("TRITON_C_API"); break; + case OPENAI: + return std::string("OPENAI"); + break; default: return std::string("UNKNOWN"); break; @@ -112,20 +119,22 @@ BackendToGrpcType(const GrpcCompressionAlgorithm compression_algorithm) // Error ClientBackendFactory::Create( - const BackendKind kind, const std::string& url, const ProtocolType protocol, - const SslOptionsBase& ssl_options, + const BackendKind kind, const std::string& url, const std::string& endpoint, + const ProtocolType protocol, const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const bool verbose, const std::string& metrics_url, + const std::string& model_repository_path, const bool verbose, + const std::string& metrics_url, const cb::TensorFormat input_tensor_format, + const cb::TensorFormat output_tensor_format, std::shared_ptr* factory) { factory->reset(new ClientBackendFactory( - kind, url, protocol, ssl_options, trace_options, compression_algorithm, - http_headers, triton_server_path, model_repository_path, memory_type, - verbose, metrics_url)); + kind, url, endpoint, protocol, ssl_options, trace_options, + compression_algorithm, http_headers, triton_server_path, + model_repository_path, verbose, metrics_url, input_tensor_format, + output_tensor_format)); return Error::Success; } @@ -134,25 +143,33 @@ ClientBackendFactory::CreateClientBackend( std::unique_ptr* client_backend) { RETURN_IF_CB_ERROR(ClientBackend::Create( - kind_, url_, protocol_, ssl_options_, trace_options_, + kind_, url_, endpoint_, protocol_, ssl_options_, trace_options_, compression_algorithm_, http_headers_, verbose_, triton_server_path, - model_repository_path_, memory_type_, metrics_url_, client_backend)); + model_repository_path_, metrics_url_, input_tensor_format_, + output_tensor_format_, client_backend)); return Error::Success; } +const BackendKind& +ClientBackendFactory::Kind() +{ + return kind_; +} + // // ClientBackend // Error ClientBackend::Create( - const BackendKind kind, const std::string& url, const ProtocolType protocol, - const SslOptionsBase& ssl_options, + const BackendKind kind, const std::string& url, const std::string& endpoint, + const ProtocolType protocol, const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const std::string& metrics_url, + const std::string& model_repository_path, const std::string& metrics_url, + const TensorFormat input_tensor_format, + const TensorFormat output_tensor_format, std::unique_ptr* client_backend) { std::unique_ptr local_backend; @@ -160,8 +177,15 @@ 
ClientBackend::Create( RETURN_IF_CB_ERROR(tritonremote::TritonClientBackend::Create( url, protocol, ssl_options, trace_options, BackendToGrpcType(compression_algorithm), http_headers, verbose, - metrics_url, &local_backend)); + metrics_url, input_tensor_format, output_tensor_format, + &local_backend)); + } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + else if (kind == OPENAI) { + RETURN_IF_CB_ERROR(openai::OpenAiClientBackend::Create( + url, endpoint, protocol, http_headers, verbose, &local_backend)); } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR(tfserving::TFServeClientBackend::Create( @@ -178,8 +202,7 @@ ClientBackend::Create( #ifdef TRITON_ENABLE_PERF_ANALYZER_C_API else if (kind == TRITON_C_API) { RETURN_IF_CB_ERROR(tritoncapi::TritonCApiClientBackend::Create( - triton_server_path, model_repository_path, memory_type, verbose, - &local_backend)); + triton_server_path, model_repository_path, verbose, &local_backend)); } #endif // TRITON_ENABLE_PERF_ANALYZER_C_API else { @@ -325,6 +348,26 @@ ClientBackend::RegisterCudaSharedMemory( pa::GENERIC_ERROR); } +Error +ClientBackend::RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size) +{ + return Error( + "client backend of kind " + BackendKindToString(kind_) + + " does not support RegisterCudaMemory API", + pa::GENERIC_ERROR); +} + +Error +ClientBackend::RegisterSystemMemory( + const std::string& name, void* memory_ptr, const size_t byte_size) +{ + return Error( + "client backend of kind " + BackendKindToString(kind_) + + " does not support RegisterCudaMemory API", + pa::GENERIC_ERROR); +} + // // Shared Memory Utilities // @@ -392,6 +435,12 @@ InferInput::Create( RETURN_IF_CB_ERROR(tritonremote::TritonInferInput::Create( infer_input, name, dims, datatype)); } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + else if (kind == OPENAI) { + RETURN_IF_CB_ERROR( + openai::OpenAiInferInput::Create(infer_input, name, dims, datatype)); + } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR(tfserving::TFServeInferInput::Create( @@ -456,6 +505,14 @@ InferInput::SetSharedMemory( pa::GENERIC_ERROR); } +Error +InferInput::RawData(const uint8_t** buf, size_t* byte_size) +{ + return Error( + "client backend of kind " + BackendKindToString(kind_) + + " does not support RawData() for InferInput", + pa::GENERIC_ERROR); +} InferInput::InferInput( const BackendKind kind, const std::string& name, @@ -470,12 +527,19 @@ InferInput::InferInput( Error InferRequestedOutput::Create( InferRequestedOutput** infer_output, const BackendKind kind, - const std::string& name, const size_t class_count) + const std::string& name, const std::string& datatype, + const size_t class_count) { if (kind == TRITON) { RETURN_IF_CB_ERROR(tritonremote::TritonInferRequestedOutput::Create( - infer_output, name, class_count)); + infer_output, name, class_count, datatype)); } +#ifdef TRITON_ENABLE_PERF_ANALYZER_OPENAI + else if (kind == OPENAI) { + RETURN_IF_CB_ERROR(openai::OpenAiInferRequestedOutput::Create( + infer_output, name, datatype)); + } +#endif // TRITON_ENABLE_PERF_ANALYZER_OPENAI #ifdef TRITON_ENABLE_PERF_ANALYZER_TFS else if (kind == TENSORFLOW_SERVING) { RETURN_IF_CB_ERROR( @@ -485,7 +549,7 @@ InferRequestedOutput::Create( #ifdef TRITON_ENABLE_PERF_ANALYZER_C_API else if (kind == TRITON_C_API) { RETURN_IF_CB_ERROR(tritoncapi::TritonCApiInferRequestedOutput::Create( - 
infer_output, name, class_count)); + infer_output, name, class_count, datatype)); } #endif // TRITON_ENABLE_PERF_ANALYZER_C_API else { @@ -509,8 +573,9 @@ InferRequestedOutput::SetSharedMemory( } InferRequestedOutput::InferRequestedOutput( - const BackendKind kind, const std::string& name) - : kind_(kind), name_(name) + const BackendKind kind, const std::string& name, + const std::string& datatype) + : kind_(kind), name_(name), datatype_(datatype) { } diff --git a/client_backend/client_backend.h b/client_backend/client_backend.h index 6e6ec08b..06f68c2e 100644 --- a/client_backend/client_backend.h +++ b/client_backend/client_backend.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -38,6 +38,7 @@ #include "../constants.h" #include "../metrics.h" +#include "../perf_analyzer_exception.h" #include "ipc.h" namespace pa = triton::perfanalyzer; @@ -70,6 +71,15 @@ namespace triton { namespace perfanalyzer { namespace clientbackend { } \ while (false) +#define THROW_IF_ERROR(S, MSG) \ + do { \ + triton::perfanalyzer::clientbackend::Error status__ = (S); \ + if (!status__.IsOk()) { \ + std::cerr << "error: " << (MSG) << ": " << status__ << std::endl; \ + throw PerfAnalyzerException(GENERIC_ERROR); \ + } \ + } while (false) + //============================================================================== /// Error status reported by backends /// @@ -88,7 +98,7 @@ class Error { explicit Error(const std::string& msg); /// Accessor for the message of this error. - /// \return The messsage for the error. Empty if no error. + /// \return The message for the error. Empty if no error. const std::string& Message() const { return msg_; } /// Accessor for the error code. @@ -125,14 +135,18 @@ enum BackendKind { TRITON = 0, TENSORFLOW_SERVING = 1, TORCHSERVE = 2, - TRITON_C_API = 3 + TRITON_C_API = 3, + OPENAI = 4 }; +std::string BackendKindToString(const BackendKind kind); + enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 }; enum GrpcCompressionAlgorithm { COMPRESS_NONE = 0, COMPRESS_DEFLATE = 1, COMPRESS_GZIP = 2 }; +enum class TensorFormat { BINARY, JSON, UNKNOWN }; typedef std::map Headers; using OnCompleteFn = std::function; @@ -181,6 +195,15 @@ struct ModelStatistics { uint64_t cache_miss_time_ns_; }; +/// +/// Structure to hold Request parameter data for Inference Request. +/// +struct RequestParameter { + std::string name; + std::string value; + std::string type; +}; + //============================================================================== /// Structure to hold options for Inference Request. /// @@ -188,7 +211,7 @@ struct InferOptions { explicit InferOptions(const std::string& model_name) : model_name_(model_name), model_version_(""), request_id_(""), sequence_id_(0), sequence_id_str_(""), sequence_start_(false), - sequence_end_(false) + sequence_end_(false), triton_enable_empty_final_response_(true) { } /// The name of the model to run inference. @@ -217,6 +240,11 @@ struct InferOptions { /// sequence. Default value is False. This argument is ignored if /// 'sequence_id' is 0. bool sequence_end_; + /// Whether to tell Triton to enable an empty final response. 
+ bool triton_enable_empty_final_response_; + + /// Additional parameters to pass to the model + std::unordered_map request_parameters_; }; struct SslOptionsBase { @@ -242,6 +270,7 @@ class ClientBackendFactory { /// Create a factory that can be used to construct Client Backends. /// \param kind The kind of client backend to create. /// \param url The inference server url and port. + /// \param endpoint The endpoint on the inference server to send requests to /// \param protocol The protocol type used. /// \param ssl_options The SSL options used with client backend. /// \param compression_algorithm The compression algorithm to be used @@ -254,48 +283,58 @@ class ClientBackendFactory { /// /opt/tritonserver) Must contain libtritonserver.so. /// \param model_repository_path Only for C api backend. Path to model /// repository which contains the desired model. - /// \param memory_type Only for C api backend. Type of memory used - /// (system is default) /// \param verbose Enables the verbose mode. /// \param metrics_url The inference server metrics url and port. + /// \param input_tensor_format The Triton inference request input tensor + /// format. + /// \param output_tensor_format The Triton inference response output tensor + /// format. /// \param factory Returns a new ClientBackend object. /// \return Error object indicating success or failure. static Error Create( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const bool verbose, const std::string& metrics_url, + const std::string& model_repository_path, const bool verbose, + const std::string& metrics_url, const TensorFormat input_tensor_format, + const TensorFormat output_tensor_format, std::shared_ptr* factory); + const BackendKind& Kind(); + /// Create a ClientBackend. /// \param backend Returns a new Client backend object. 
- Error CreateClientBackend(std::unique_ptr* backend); + virtual Error CreateClientBackend(std::unique_ptr* backend); private: ClientBackendFactory( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, const std::shared_ptr http_headers, const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const bool verbose, const std::string& metrics_url) - : kind_(kind), url_(url), protocol_(protocol), ssl_options_(ssl_options), - trace_options_(trace_options), + const std::string& model_repository_path, const bool verbose, + const std::string& metrics_url, const TensorFormat input_tensor_format, + const TensorFormat output_tensor_format) + : kind_(kind), url_(url), endpoint_(endpoint), protocol_(protocol), + ssl_options_(ssl_options), trace_options_(trace_options), compression_algorithm_(compression_algorithm), http_headers_(http_headers), triton_server_path(triton_server_path), - model_repository_path_(model_repository_path), - memory_type_(memory_type), verbose_(verbose), metrics_url_(metrics_url) + model_repository_path_(model_repository_path), verbose_(verbose), + metrics_url_(metrics_url), input_tensor_format_(input_tensor_format), + output_tensor_format_(output_tensor_format) { } const BackendKind kind_; const std::string url_; + const std::string endpoint_; const ProtocolType protocol_; const SslOptionsBase& ssl_options_; const std::map> trace_options_; @@ -303,9 +342,22 @@ class ClientBackendFactory { std::shared_ptr http_headers_; std::string triton_server_path; std::string model_repository_path_; - std::string memory_type_; const bool verbose_; const std::string metrics_url_{""}; + const TensorFormat input_tensor_format_{TensorFormat::UNKNOWN}; + const TensorFormat output_tensor_format_{TensorFormat::UNKNOWN}; + + +#ifndef DOCTEST_CONFIG_DISABLE + protected: + ClientBackendFactory() + : kind_(BackendKind()), url_(""), protocol_(ProtocolType()), + ssl_options_(SslOptionsBase()), + trace_options_(std::map>()), + compression_algorithm_(GrpcCompressionAlgorithm()), verbose_(false) + { + } +#endif }; // @@ -315,12 +367,14 @@ class ClientBackend { public: static Error Create( const BackendKind kind, const std::string& url, - const ProtocolType protocol, const SslOptionsBase& ssl_options, + const std::string& endpoint, const ProtocolType protocol, + const SslOptionsBase& ssl_options, const std::map> trace_options, const GrpcCompressionAlgorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, const std::string& library_directory, const std::string& model_repository, - const std::string& memory_type, const std::string& metrics_url, + const std::string& metrics_url, const TensorFormat input_tensor_format, + const TensorFormat output_tensor_format, std::unique_ptr* client_backend); /// Destructor for the client backend object @@ -390,13 +444,20 @@ class ClientBackend { const std::string& name, const cudaIpcMemHandle_t& handle, const size_t byte_size); + /// Registers cuda memory to the server. + virtual Error RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size); + + /// Registers a system memory location on the server. 
+ virtual Error RegisterSystemMemory( + const std::string& name, void* memory_ptr, const size_t byte_size); + // // Shared Memory Utilities // - // FIXME: These should probably move to a common area with shm_utils not tied - // specifically to inferenceserver. - // Create a shared memory region of the size 'byte_size' and return the unique - // identifier. + // FIXME: These should probably move to a common area with shm_utils not + // tied specifically to inferenceserver. Create a shared memory region of + // the size 'byte_size' and return the unique identifier. virtual Error CreateSharedMemoryRegion( std::string shm_key, size_t byte_size, int* shm_fd); @@ -416,7 +477,7 @@ class ClientBackend { // \return error Returns an error if unable to close shared memory descriptor. virtual Error CloseSharedMemory(int shm_fd); - // Destory the shared memory region with the given name. + // Destroy the shared memory region with the given name. // \return error Returns an error if unable to unlink shared memory region. virtual Error UnlinkSharedMemoryRegion(std::string shm_key); @@ -432,7 +493,7 @@ class ClientBackend { const BackendKind kind_{TRITON}; #ifndef DOCTEST_CONFIG_DISABLE - protected: + public: ClientBackend() = default; #endif }; @@ -499,6 +560,15 @@ class InferInput { virtual Error SetSharedMemory( const std::string& name, size_t byte_size, size_t offset = 0); + /// Get access to the buffer holding raw input. Note the buffer is owned by + /// InferInput instance. Users can copy out the data if required to extend + /// the lifetime. + /// \param buf Returns the pointer to the start of the buffer. + /// \param byte_size Returns the size of buffer in bytes. + /// \return Error object indicating success or failure of the + /// request. + virtual Error RawData(const uint8_t** buf, size_t* byte_size); + protected: InferInput( const BackendKind kind, const std::string& name, @@ -522,18 +592,24 @@ class InferRequestedOutput { /// \param infer_output Returns a new InferOutputGrpc object. /// \param kind The kind of the associated client backend. /// \param name The name of output being requested. + /// \param datatype The datatype of the output /// \param class_count The number of classifications to be requested. The /// default value is 0 which means the classification results are not /// requested. /// \return Error object indicating success or failure. static Error Create( InferRequestedOutput** infer_output, const BackendKind kind, - const std::string& name, const size_t class_count = 0); + const std::string& name, const std::string& datatype, + const size_t class_count = 0); /// Gets name of the associated output tensor. /// \return The name of the tensor. const std::string& Name() const { return name_; } + /// Gets datatype of the associated output tensor. + /// \return The datatype of the tensor + const std::string& Datatype() const { return datatype_; } + /// Set the output tensor data to be written to specified shared /// memory region. /// \param region_name The name of the shared memory region. 
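Because `InferRequestedOutput::Create` now takes the output datatype, existing call sites gain one argument. A minimal sketch of the updated call, assuming the `cb` namespace alias added at the bottom of this header; the tensor name, datatype string, and class count are illustrative values only, not taken from this patch:

```cpp
#include <memory>

#include "client_backend.h"

cb::Error
RequestOutputExample(std::unique_ptr<cb::InferRequestedOutput>* output)
{
  cb::InferRequestedOutput* raw_output = nullptr;
  // New signature: backend kind, tensor name, datatype, then the optional
  // classification count (0 = no classification results requested).
  cb::Error err = cb::InferRequestedOutput::Create(
      &raw_output, cb::TRITON, "output0", "FP32", 0 /* class_count */);
  if (!err.IsOk()) {
    return err;
  }
  output->reset(raw_output);
  return cb::Error::Success;
}
```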
@@ -546,9 +622,12 @@ class InferRequestedOutput { const size_t offset = 0); protected: - InferRequestedOutput(const BackendKind kind, const std::string& name); + InferRequestedOutput( + const BackendKind kind, const std::string& name, + const std::string& datatype = ""); const BackendKind kind_; const std::string name_; + const std::string datatype_; }; // @@ -575,6 +654,22 @@ class InferResult { virtual Error RawData( const std::string& output_name, const uint8_t** buf, size_t* byte_size) const = 0; + + /// Get final response bool for this response. + /// \return Error object indicating the success or failure. + virtual Error IsFinalResponse(bool* is_final_response) const + { + return Error("InferResult::IsFinalResponse() not implemented"); + }; + + /// Get null response bool for this response. + /// \return Error object indicating the success or failure. + virtual Error IsNullResponse(bool* is_null_response) const + { + return Error("InferResult::IsNullResponse() not implemented"); + }; }; }}} // namespace triton::perfanalyzer::clientbackend + +namespace cb = triton::perfanalyzer::clientbackend; diff --git a/client_backend/mock_client_backend.h b/client_backend/mock_client_backend.h new file mode 100644 index 00000000..483af914 --- /dev/null +++ b/client_backend/mock_client_backend.h @@ -0,0 +1,660 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
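The mock backend defined in the new header below is consumed by the unit tests through `MockClientBackendFactory`, which shares a `MockClientStats` object between the test and every backend it creates. A rough sketch of that wiring, assuming a test build with doctest enabled; the function body and variable names are illustrative and not lifted from the tests in this patch:

```cpp
#include <memory>

// Illustrative wiring only; assumes DOCTEST is enabled so the protected
// default constructor of ClientBackendFactory is available to the mock.
void
MockBackendWiringSketch()
{
  using namespace triton::perfanalyzer::clientbackend;

  auto stats = std::make_shared<MockClientStats>();
  stats->SetDelays({5});             // every mocked request takes ~5 ms
  stats->SetReturnStatuses({true});  // and reports success

  std::shared_ptr<ClientBackendFactory> factory =
      std::make_shared<MockClientBackendFactory>(stats);

  std::unique_ptr<ClientBackend> backend;
  factory->CreateClientBackend(&backend);  // yields a MockClientBackend

  // ... hand `factory` to the load manager / worker under test ...

  // Afterwards, the shared stats object records what the workers did.
  size_t total_sync_infers = stats->num_infer_calls;
  (void)total_sync_infers;
}
```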
+ +#pragma once + +#include +#include +#include +#include + +#include "../doctest.h" +#include "client_backend.h" +#include "gmock/gmock.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { + +// Holds information (either the raw data or a shared memory label) for an +// inference input +// +struct TestRecordedInput { + TestRecordedInput(int32_t data_in, size_t size_in) + : shared_memory_label(""), data(data_in), size(size_in) + { + } + + TestRecordedInput(std::string label_in, size_t size_in) + : shared_memory_label(label_in), data(0), size(size_in) + { + } + + std::string shared_memory_label; + int32_t data; + size_t size; +}; + +/// Mock class of an InferInput +/// +class MockInferInput : public InferInput { + public: + MockInferInput( + const BackendKind kind, const std::string& name, + const std::vector& dims, const std::string& datatype) + : InferInput(kind, name, datatype), dims_(dims) + { + } + + const std::vector& Shape() const override { return dims_; } + + Error Reset() override + { + recorded_inputs_.clear(); + return Error::Success; + } + + Error AppendRaw(const uint8_t* input, size_t input_byte_size) override + { + if (input) { + int32_t val = *reinterpret_cast(input); + recorded_inputs_.push_back(TestRecordedInput(val, input_byte_size)); + } + ++append_raw_calls_; + return Error::Success; + } + + Error SetSharedMemory( + const std::string& name, size_t byte_size, size_t offset = 0) + { + recorded_inputs_.push_back(TestRecordedInput(name, byte_size)); + ++set_shared_memory_calls_; + return Error::Success; + } + + const std::vector dims_{}; + std::vector recorded_inputs_{}; + std::atomic append_raw_calls_{0}; + std::atomic set_shared_memory_calls_{0}; +}; + +/// Mock class of an InferResult +/// +class MockInferResult : public InferResult { + public: + MockInferResult(const InferOptions& options) : req_id_(options.request_id_) {} + + Error Id(std::string* id) const override + { + *id = req_id_; + return Error::Success; + } + Error RequestStatus() const override { return Error::Success; } + Error RawData( + const std::string& output_name, const uint8_t** buf, + size_t* byte_size) const override + { + return Error::Success; + } + + Error IsFinalResponse(bool* is_final_response) const override + { + if (is_final_response == nullptr) { + return Error("is_final_response cannot be nullptr"); + } + *is_final_response = true; + return Error::Success; + } + + Error IsNullResponse(bool* is_null_response) const override + { + if (is_null_response == nullptr) { + return Error("is_null_response cannot be nullptr"); + } + *is_null_response = false; + return Error::Success; + } + + private: + std::string req_id_; +}; + +/// Class to track statistics of MockClientBackend +/// +class MockClientStats { + public: + enum class ReqType { SYNC, ASYNC, ASYNC_STREAM }; + + struct SeqStatus { + // Set of all unique sequence IDs observed in requests + // + std::set used_seq_ids; + + // Map of all "live" sequence IDs (sequences that have started and not + // ended) to their current length (how many requests have been sent to that + // sequence ID since it started) + // + std::map live_seq_ids_to_length; + + // Map of sequence ID to how many requests have been received for it. 
+ // + std::map seq_ids_to_count; + + // Map of sequence IDs to how many are "inflight" for that sequence ID + // (inflight means the request has been received, response has not been + // returned) + // + std::map seq_ids_to_inflight_count; + + // Maximum observed number of live sequences (sequences that have started + // and not ended) + // + uint32_t max_live_seq_count = 0; + + // Maximum observed number of inflight requests for a sequence + // + uint32_t max_inflight_seq_count = 0; + + std::vector seq_lengths; + + bool IsSeqLive(uint64_t seq_id) + { + return ( + live_seq_ids_to_length.find(seq_id) != live_seq_ids_to_length.end()); + } + void HandleSeqStart(uint64_t seq_id) + { + used_seq_ids.insert(seq_id); + live_seq_ids_to_length[seq_id] = 0; + if (live_seq_ids_to_length.size() > max_live_seq_count) { + max_live_seq_count = live_seq_ids_to_length.size(); + } + } + void HandleSeqEnd(uint64_t seq_id) + { + uint32_t len = live_seq_ids_to_length[seq_id]; + seq_lengths.push_back(len); + auto it = live_seq_ids_to_length.find(seq_id); + live_seq_ids_to_length.erase(it); + } + + void HandleSeqRequest(uint64_t seq_id) + { + live_seq_ids_to_length[seq_id]++; + + if (seq_ids_to_count.find(seq_id) == seq_ids_to_count.end()) { + seq_ids_to_count[seq_id] = 1; + } else { + seq_ids_to_count[seq_id]++; + } + + if (seq_ids_to_inflight_count.find(seq_id) == + seq_ids_to_inflight_count.end()) { + seq_ids_to_inflight_count[seq_id] = 1; + } else { + seq_ids_to_inflight_count[seq_id]++; + } + if (seq_ids_to_inflight_count[seq_id] > max_inflight_seq_count) { + max_inflight_seq_count = seq_ids_to_inflight_count[seq_id]; + } + } + + void Reset() + { + // Note that live_seq_ids_to_length is explicitly not reset here. + // This is because we always want to maintain the true status of + // live sequences + + used_seq_ids.clear(); + max_live_seq_count = 0; + seq_lengths.clear(); + seq_ids_to_count.clear(); + } + }; + + std::atomic num_infer_calls{0}; + std::atomic num_async_infer_calls{0}; + std::atomic num_async_stream_infer_calls{0}; + std::atomic num_start_stream_calls{0}; + + std::atomic num_active_infer_calls{0}; + + std::atomic num_append_raw_calls{0}; + std::atomic num_set_shared_memory_calls{0}; + // Struct tracking shared memory method calls + // + struct SharedMemoryStats { + std::atomic num_unregister_all_shared_memory_calls{0}; + std::atomic num_register_system_shared_memory_calls{0}; + std::atomic num_register_cuda_shared_memory_calls{0}; + std::atomic num_register_cuda_memory_calls{0}; + std::atomic num_register_system_memory_calls{0}; + std::atomic num_create_shared_memory_region_calls{0}; + std::atomic num_map_shared_memory_calls{0}; + std::atomic num_close_shared_memory_calls{0}; + std::atomic num_unlink_shared_memory_region_calls{0}; + std::atomic num_unmap_shared_memory_calls{0}; + + // bool operator==(const SharedMemoryStats& lhs, const SharedMemoryStats& + // rhs) + bool operator==(const SharedMemoryStats& rhs) const + { + if (this->num_unregister_all_shared_memory_calls == + rhs.num_unregister_all_shared_memory_calls && + this->num_register_system_shared_memory_calls == + rhs.num_register_system_shared_memory_calls && + this->num_register_cuda_shared_memory_calls == + rhs.num_register_cuda_shared_memory_calls && + this->num_register_cuda_memory_calls == + rhs.num_register_cuda_memory_calls && + this->num_register_system_memory_calls == + rhs.num_register_system_memory_calls && + this->num_create_shared_memory_region_calls == + rhs.num_create_shared_memory_region_calls && + 
this->num_map_shared_memory_calls == + rhs.num_map_shared_memory_calls && + this->num_close_shared_memory_calls == + rhs.num_close_shared_memory_calls && + this->num_unlink_shared_memory_region_calls == + rhs.num_unlink_shared_memory_region_calls && + this->num_unmap_shared_memory_calls == + rhs.num_unmap_shared_memory_calls) { + return true; + } + return false; + } + }; + + /// Determines how long the backend will delay before sending a "response". + /// If a single value vector is passed in, all responses will take that long. + /// If a list of values is passed in, then the mock backend will loop through + /// the values (and loop back to the start when it hits the end of the vector) + /// + void SetDelays(std::vector times) + { + response_delays_.clear(); + for (size_t t : times) { + response_delays_.push_back(std::chrono::milliseconds{t}); + } + } + + /// Determines the return status of requests. + /// If a single value vector is passed in, all responses will return that + /// status. If a list of values is passed in, then the mock backend will loop + /// through the values (and loop back to the start when it hits the end of the + /// vector) + /// + void SetReturnStatuses(std::vector statuses) + { + response_statuses_.clear(); + for (bool success : statuses) { + if (success) { + response_statuses_.push_back(Error::Success); + } else { + response_statuses_.push_back(Error("Injected test error")); + } + } + } + + std::chrono::milliseconds GetNextDelay() + { + std::lock_guard lock(mtx_); + + auto val = response_delays_[response_delays_index_]; + response_delays_index_++; + if (response_delays_index_ == response_delays_.size()) { + response_delays_index_ = 0; + } + return val; + } + + Error GetNextReturnStatus() + { + std::lock_guard lock(mtx_); + + auto val = response_statuses_[response_statuses_index_]; + response_statuses_index_++; + if (response_statuses_index_ == response_statuses_.size()) { + response_statuses_index_ = 0; + } + return val; + } + + bool start_stream_enable_stats_value{false}; + + std::vector> + request_timestamps; + SeqStatus sequence_status; + SharedMemoryStats memory_stats; + + // Each entry in the top vector is a list of all inputs for an inference + // request. 
If there are multiple inputs due to batching and/or the model + // having multiple inputs, all of those from the same request will be in the + // same second level vector + std::vector> recorded_inputs{}; + + void CaptureRequest( + ReqType type, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) + { + num_active_infer_calls++; + + std::lock_guard lock(mtx_); + auto time = std::chrono::system_clock::now(); + request_timestamps.push_back(time); + + // Group all values across all inputs together into a single vector, and + // then record it + std::vector request_inputs; + for (const auto& input : inputs) { + auto recorded_inputs = + static_cast(input)->recorded_inputs_; + request_inputs.insert( + request_inputs.end(), recorded_inputs.begin(), recorded_inputs.end()); + } + recorded_inputs.push_back(request_inputs); + + UpdateCallCount(type); + UpdateSeqStatus(options); + AccumulateInferInputCalls(inputs); + } + + void CaptureRequestEnd(const InferOptions& options) + { + num_active_infer_calls--; + + if (options.sequence_id_ != 0) { + sequence_status.seq_ids_to_inflight_count[options.sequence_id_]--; + } + } + + void CaptureStreamStart() + { + std::lock_guard lock(mtx_); + num_start_stream_calls++; + } + + + void Reset() + { + std::lock_guard lock(mtx_); + num_infer_calls = 0; + num_async_infer_calls = 0; + num_async_stream_infer_calls = 0; + num_start_stream_calls = 0; + request_timestamps.clear(); + sequence_status.Reset(); + } + + private: + std::vector response_delays_{ + std::chrono::milliseconds{0}}; + std::vector response_statuses_{Error::Success}; + std::atomic response_delays_index_{0}; + std::atomic response_statuses_index_{0}; + + std::mutex mtx_; + + void UpdateCallCount(ReqType type) + { + if (type == ReqType::SYNC) { + num_infer_calls++; + } else if (type == ReqType::ASYNC) { + num_async_infer_calls++; + } else { + num_async_stream_infer_calls++; + } + } + + void UpdateSeqStatus(const InferOptions& options) + { + // Seq ID of 0 is reserved for "not a sequence" + // + if (options.sequence_id_ != 0) { + // If a sequence ID is not live, it must be starting + if (!sequence_status.IsSeqLive(options.sequence_id_)) { + REQUIRE(options.sequence_start_ == true); + } + + // If a new sequence is starting, that sequence ID must not already be + // live + if (options.sequence_start_ == true) { + REQUIRE(sequence_status.IsSeqLive(options.sequence_id_) == false); + sequence_status.HandleSeqStart(options.sequence_id_); + } + + sequence_status.HandleSeqRequest(options.sequence_id_); + + // If a sequence is ending, it must be live + if (options.sequence_end_) { + REQUIRE(sequence_status.IsSeqLive(options.sequence_id_) == true); + sequence_status.HandleSeqEnd(options.sequence_id_); + } + } + } + + void AccumulateInferInputCalls(const std::vector& inputs) + { + for (const auto& input : inputs) { + const MockInferInput* mock_input = + static_cast(input); + num_append_raw_calls += mock_input->append_raw_calls_; + num_set_shared_memory_calls += mock_input->set_shared_memory_calls_; + } + } +}; + +/// Mock implementation of ClientBackend interface +/// +class NaggyMockClientBackend : public ClientBackend { + public: + NaggyMockClientBackend(std::shared_ptr stats) : stats_(stats) + { + ON_CALL(*this, AsyncStreamInfer(testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) + -> Error { + stats_->CaptureRequest( + MockClientStats::ReqType::ASYNC_STREAM, options, inputs, + 
outputs); + + LaunchAsyncMockRequest(options, stream_callback_); + + return stats_->GetNextReturnStatus(); + }); + } + + MOCK_METHOD( + Error, ModelConfig, + (rapidjson::Document*, const std::string&, const std::string&), + (override)); + MOCK_METHOD( + Error, AsyncStreamInfer, + (const InferOptions&, const std::vector&, + const std::vector&), + (override)); + + Error Infer( + InferResult** result, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) override + { + stats_->CaptureRequest( + MockClientStats::ReqType::SYNC, options, inputs, outputs); + + std::this_thread::sleep_for(stats_->GetNextDelay()); + + local_completed_req_count_++; + stats_->CaptureRequestEnd(options); + + return stats_->GetNextReturnStatus(); + } + + Error AsyncInfer( + OnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) override + { + stats_->CaptureRequest( + MockClientStats::ReqType::ASYNC, options, inputs, outputs); + + LaunchAsyncMockRequest(options, callback); + + return stats_->GetNextReturnStatus(); + } + + Error StartStream(OnCompleteFn callback, bool enable_stats) + { + stats_->CaptureStreamStart(); + stats_->start_stream_enable_stats_value = enable_stats; + stream_callback_ = callback; + return stats_->GetNextReturnStatus(); + } + + Error ClientInferStat(InferStat* infer_stat) override + { + infer_stat->completed_request_count = local_completed_req_count_; + return Error::Success; + } + + Error UnregisterAllSharedMemory() override + { + stats_->memory_stats.num_unregister_all_shared_memory_calls++; + return Error::Success; + } + + Error RegisterSystemSharedMemory( + const std::string& name, const std::string& key, + const size_t byte_size) override + { + stats_->memory_stats.num_register_system_shared_memory_calls++; + return Error::Success; + } + + Error RegisterCudaSharedMemory( + const std::string& name, const cudaIpcMemHandle_t& handle, + const size_t byte_size) override + { + stats_->memory_stats.num_register_cuda_shared_memory_calls++; + return Error::Success; + } + + Error RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size) override + { + stats_->memory_stats.num_register_cuda_memory_calls++; + return Error::Success; + } + + Error RegisterSystemMemory( + const std::string& name, void* memory_ptr, + const size_t byte_size) override + { + stats_->memory_stats.num_register_system_memory_calls++; + return Error::Success; + } + + Error CreateSharedMemoryRegion( + std::string shm_key, size_t byte_size, int* shm_fd) override + { + stats_->memory_stats.num_create_shared_memory_region_calls++; + return Error::Success; + } + + Error MapSharedMemory( + int shm_fd, size_t offset, size_t byte_size, void** shm_addr) override + { + stats_->memory_stats.num_map_shared_memory_calls++; + return Error::Success; + } + + Error CloseSharedMemory(int shm_fd) override + { + stats_->memory_stats.num_close_shared_memory_calls++; + return Error::Success; + } + + Error UnlinkSharedMemoryRegion(std::string shm_key) override + { + stats_->memory_stats.num_unlink_shared_memory_region_calls++; + return Error::Success; + } + + Error UnmapSharedMemory(void* shm_addr, size_t byte_size) override + { + stats_->memory_stats.num_unmap_shared_memory_calls++; + return Error::Success; + } + + OnCompleteFn stream_callback_; + + private: + void LaunchAsyncMockRequest(const InferOptions options, OnCompleteFn callback) + { + std::thread([this, options, callback]() { + 
std::this_thread::sleep_for(stats_->GetNextDelay()); + local_completed_req_count_++; + + InferResult* result = new MockInferResult(options); + callback(result); + + stats_->CaptureRequestEnd(options); + }).detach(); + } + + // Total count of how many requests this client has handled and finished + size_t local_completed_req_count_ = 0; + + std::shared_ptr stats_; +}; + +using MockClientBackend = testing::NiceMock; + +/// Mock factory that always creates a MockClientBackend instead +/// of a real backend +/// +class MockClientBackendFactory : public ClientBackendFactory { + public: + MockClientBackendFactory(std::shared_ptr stats) + { + stats_ = stats; + } + + Error CreateClientBackend(std::unique_ptr* backend) override + { + std::unique_ptr mock_backend( + new MockClientBackend(stats_)); + *backend = std::move(mock_backend); + return Error::Success; + } + + private: + std::shared_ptr stats_; +}; + +}}} // namespace triton::perfanalyzer::clientbackend diff --git a/client_backend/openai/CMakeLists.txt b/client_backend/openai/CMakeLists.txt new file mode 100644 index 00000000..93963e37 --- /dev/null +++ b/client_backend/openai/CMakeLists.txt @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +cmake_minimum_required (VERSION 3.18) + +set( + OPENAI_CLIENT_BACKEND_SRCS + http_client.cc + openai_client_backend.cc + openai_client.cc + openai_infer_input.cc +) + +set( + OPENAI_CLIENT_BACKEND_HDRS + http_client.h + openai_client_backend.h + openai_client.h + openai_infer_input.h +) + +add_library( + openai-client-backend-library EXCLUDE_FROM_ALL OBJECT + ${OPENAI_CLIENT_BACKEND_SRCS} + ${OPENAI_CLIENT_BACKEND_HDRS} +) + +target_link_libraries( + openai-client-backend-library + PUBLIC CURL::libcurl + PUBLIC httpclient_static +) + +if(${TRITON_ENABLE_GPU}) + target_include_directories(openai-client-backend-library PUBLIC ${CUDA_INCLUDE_DIRS}) + target_link_libraries(openai-client-backend-library PRIVATE ${CUDA_LIBRARIES}) +endif() # TRITON_ENABLE_GPU diff --git a/client_backend/openai/http_client.cc b/client_backend/openai/http_client.cc new file mode 100644 index 00000000..17fb42e0 --- /dev/null +++ b/client_backend/openai/http_client.cc @@ -0,0 +1,301 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
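The `http_client.cc` implementation that follows drives every asynchronous transfer from one worker thread: callers hand over prepared easy handles, the worker waits in `curl_multi_poll`, and `curl_multi_wakeup` interrupts the wait when a new request arrives or the client shuts down. Below is a stripped-down, standalone sketch of that same pattern (my own simplification, not the `HttpClient` class itself; it assumes libcurl >= 7.68 for `curl_multi_poll`/`curl_multi_wakeup` and that `curl_global_init` has already been called):

```cpp
#include <curl/curl.h>

#include <atomic>
#include <mutex>
#include <thread>
#include <vector>

// Minimal illustration of the poll/wakeup pattern; assumes
// curl_global_init(CURL_GLOBAL_ALL) was already called by the process.
struct MultiWorkerSketch {
  CURLM* multi{curl_multi_init()};
  std::mutex mtx;
  std::vector<CURL*> pending;  // easy handles submitted by other threads
  std::atomic<bool> exiting{false};
  std::thread worker{[this] { Run(); }};

  // Called from any thread: queue a prepared easy handle and wake the worker.
  void Submit(CURL* easy)
  {
    {
      std::lock_guard<std::mutex> lk(mtx);
      pending.push_back(easy);
    }
    curl_multi_wakeup(multi);  // interrupts curl_multi_poll() in Run()
  }

  void Run()
  {
    int still_running = 0;
    while (!exiting) {
      {
        // Adopt handles queued since the last iteration.
        std::lock_guard<std::mutex> lk(mtx);
        for (CURL* easy : pending) {
          curl_multi_add_handle(multi, easy);
        }
        pending.clear();
      }
      curl_multi_perform(multi, &still_running);

      int msgs_left = 0;
      while (CURLMsg* msg = curl_multi_info_read(multi, &msgs_left)) {
        if (msg->msg == CURLMSG_DONE) {
          // A real client would look up the request, record the HTTP status,
          // and fire its completion callback here before cleanup.
          curl_multi_remove_handle(multi, msg->easy_handle);
          curl_easy_cleanup(msg->easy_handle);
        }
      }

      int numfds = 0;
      // Sleeps until socket activity, timeout, or curl_multi_wakeup().
      curl_multi_poll(multi, nullptr, 0, 1000 /* ms */, &numfds);
    }
  }

  ~MultiWorkerSketch()
  {
    exiting = true;
    curl_multi_wakeup(multi);
    worker.join();
    curl_multi_cleanup(multi);
  }
};
```

The class below adds what this sketch omits: per-request completion callbacks, HTTP status capture, SSL configuration, and cleanup of requests that were queued but never started.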
+ +#include "http_client.h" + +#include +#include + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +HttpRequest::HttpRequest( + std::function&& completion_callback, const bool verbose) + : completion_callback_(std::move(completion_callback)), verbose_(verbose) +{ +} + +HttpRequest::~HttpRequest() +{ + if (header_list_ != nullptr) { + curl_slist_free_all(header_list_); + header_list_ = nullptr; + } +} + +void +HttpRequest::AddInput(uint8_t* buf, size_t byte_size) +{ + data_buffers_.push_back(std::pair(buf, byte_size)); + total_input_byte_size_ += byte_size; +} + +void +HttpRequest::GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes) +{ + *input_bytes = 0; + + while (!data_buffers_.empty() && size > 0) { + const size_t csz = std::min(data_buffers_.front().second, size); + if (csz > 0) { + const uint8_t* input_ptr = data_buffers_.front().first; + std::copy(input_ptr, input_ptr + csz, buf); + size -= csz; + buf += csz; + *input_bytes += csz; + + data_buffers_.front().first += csz; + data_buffers_.front().second -= csz; + } + if (data_buffers_.front().second == 0) { + data_buffers_.pop_front(); + } + } +} + +std::mutex HttpClient::curl_init_mtx_{}; +HttpClient::HttpClient( + const std::string& server_url, bool verbose, + const HttpSslOptions& ssl_options) + : url_(server_url), verbose_(verbose), ssl_options_(ssl_options) +{ + // [TODO TMA-1670] uncomment below and remove class-wise mutex once confirm + // curl >= 7.84.0 will always be used + // auto* ver = curl_version_info(CURLVERSION_NOW); + // if (ver->features & CURL_VERSION_THREADSAFE == 0) { + // throw std::runtime_error( + // "HTTP client has dependency on CURL library to have thread-safe " + // "support (CURL_VERSION_THREADSAFE set)"); + // } + { + std::lock_guard lk(curl_init_mtx_); + if (curl_global_init(CURL_GLOBAL_ALL) != 0) { + throw std::runtime_error("CURL global initialization failed"); + } + } + + multi_handle_ = curl_multi_init(); + + worker_ = std::thread(&HttpClient::AsyncTransfer, this); +} + +HttpClient::~HttpClient() +{ + { + std::lock_guard lock(mutex_); + exiting_ = true; + } + + curl_multi_wakeup(multi_handle_); + + // thread not joinable if AsyncInfer() is not called + // (it is default constructed thread before the first AsyncInfer() call) + if (worker_.joinable()) { + worker_.join(); + } + + curl_multi_cleanup(multi_handle_); + + { + std::lock_guard lk(curl_init_mtx_); + curl_global_cleanup(); + } +} + +const std::string& +HttpClient::ParseSslCertType(HttpSslOptions::CERTTYPE cert_type) +{ + static std::string pem_str{"PEM"}; + static std::string der_str{"DER"}; + switch (cert_type) { + case HttpSslOptions::CERTTYPE::CERT_PEM: + return pem_str; + case HttpSslOptions::CERTTYPE::CERT_DER: + return der_str; + } + throw std::runtime_error( + "Unexpected SSL certificate type encountered. Only PEM and DER are " + "supported."); +} + +const std::string& +HttpClient::ParseSslKeyType(HttpSslOptions::KEYTYPE key_type) +{ + static std::string pem_str{"PEM"}; + static std::string der_str{"DER"}; + switch (key_type) { + case HttpSslOptions::KEYTYPE::KEY_PEM: + return pem_str; + case HttpSslOptions::KEYTYPE::KEY_DER: + return der_str; + } + throw std::runtime_error( + "unsupported SSL key type encountered. 
Only PEM and DER are " + "supported."); +} + +void +HttpClient::SetSSLCurlOptions(CURL* curl_handle) +{ + curl_easy_setopt( + curl_handle, CURLOPT_SSL_VERIFYPEER, ssl_options_.verify_peer); + curl_easy_setopt( + curl_handle, CURLOPT_SSL_VERIFYHOST, ssl_options_.verify_host); + if (!ssl_options_.ca_info.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_CAINFO, ssl_options_.ca_info.c_str()); + } + const auto& curl_cert_type = ParseSslCertType(ssl_options_.cert_type); + curl_easy_setopt(curl_handle, CURLOPT_SSLCERTTYPE, curl_cert_type.c_str()); + if (!ssl_options_.cert.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_SSLCERT, ssl_options_.cert.c_str()); + } + const auto& curl_key_type = ParseSslKeyType(ssl_options_.key_type); + curl_easy_setopt(curl_handle, CURLOPT_SSLKEYTYPE, curl_key_type.c_str()); + if (!ssl_options_.key.empty()) { + curl_easy_setopt(curl_handle, CURLOPT_SSLKEY, ssl_options_.key.c_str()); + } +} + +void +HttpClient::Send(CURL* handle, std::unique_ptr&& request) +{ + { + std::lock_guard lock(mutex_); + + if (exiting_) { + return; + } + + auto insert_result = new_async_requests_.emplace(std::make_pair( + reinterpret_cast(handle), std::move(request))); + if (!insert_result.second) { + curl_easy_cleanup(handle); + throw std::runtime_error( + "Failed to insert new asynchronous request context."); + } + } + curl_multi_wakeup(multi_handle_); +} + +void +HttpClient::AsyncTransfer() +{ + int messages_in_queue = 0; + int still_running = 0; + int numfds = 0; + CURLMsg* msg = nullptr; + AsyncReqMap ongoing_async_requests; + + do { + { + // Check for new requests and add them to ongoing requests + + std::lock_guard lock(mutex_); + + for (auto& pair : new_async_requests_) { + curl_multi_add_handle( + multi_handle_, reinterpret_cast(pair.first)); + + ongoing_async_requests[pair.first] = std::move(pair.second); + } + new_async_requests_.clear(); + } + + CURLMcode mc = curl_multi_perform(multi_handle_, &still_running); + + if (mc != CURLM_OK) { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; + continue; + } + + while ((msg = curl_multi_info_read(multi_handle_, &messages_in_queue))) { + if (msg->msg != CURLMSG_DONE) { + // Something wrong happened. 
+ std::cerr << "Unexpected error: received CURLMsg=" << msg->msg + << std::endl; + continue; + } + + uintptr_t identifier = reinterpret_cast(msg->easy_handle); + auto itr = ongoing_async_requests.find(identifier); + // This shouldn't happen + if (itr == ongoing_async_requests.end()) { + std::cerr << "Unexpected error: received completed request that is not " + "in the list of asynchronous requests" + << std::endl; + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + continue; + } + + uint32_t http_code = 400; + if (msg->data.result == CURLE_OK) { + curl_easy_getinfo(msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code); + } else if (msg->data.result == CURLE_OPERATION_TIMEDOUT) { + http_code = 499; + } + + itr->second->http_code_ = http_code; + itr->second->completion_callback_(itr->second.get()); + ongoing_async_requests.erase(itr); + curl_multi_remove_handle(multi_handle_, msg->easy_handle); + curl_easy_cleanup(msg->easy_handle); + } + + + // Wait for activity on existing requests or + // explicit curl_multi_wakeup call + // + // If there are no descriptors in the multi_handle_ + // then curl_multi_poll will wait until curl_multi_wakeup + // is called + // + // curl_multi_wakeup is called when adding a new request + // or exiting + + mc = curl_multi_poll(multi_handle_, NULL, 0, INT_MAX, &numfds); + + if (mc != CURLM_OK) { + std::cerr << "Unexpected error: curl_multi failed. Code:" << mc + << std::endl; + } + + } while (!exiting_); + + for (auto& request : ongoing_async_requests) { + CURL* easy_handle = reinterpret_cast(request.first); + curl_multi_remove_handle(multi_handle_, easy_handle); + curl_easy_cleanup(easy_handle); + } + + for (auto& request : new_async_requests_) { + CURL* easy_handle = reinterpret_cast(request.first); + curl_easy_cleanup(easy_handle); + } +} + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/http_client.h b/client_backend/openai/http_client.h new file mode 100644 index 00000000..7ff9bb14 --- /dev/null +++ b/client_backend/openai/http_client.h @@ -0,0 +1,172 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +// The options for authorizing and authenticating SSL/TLS connections. +struct HttpSslOptions { + enum CERTTYPE { CERT_PEM = 0, CERT_DER = 1 }; + enum KEYTYPE { + KEY_PEM = 0, + KEY_DER = 1 + // TODO TMA-1645: Support loading private key from crypto engine + // KEY_ENG = 2 + }; + explicit HttpSslOptions() + : verify_peer(1), verify_host(2), cert_type(CERTTYPE::CERT_PEM), + key_type(KEYTYPE::KEY_PEM) + { + } + // This option determines whether curl verifies the authenticity of the peer's + // certificate. A value of 1 means curl verifies; 0 (zero) means it does not. + // Default value is 1. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html + long verify_peer; + // This option determines whether libcurl verifies that the server cert is for + // the server it is known as. The default value for this option is 2 which + // means that certificate must indicate that the server is the server to which + // you meant to connect, or the connection fails. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html + long verify_host; + // File holding one or more certificates to verify the peer with. If not + // specified, client will look for the system path where cacert bundle is + // assumed to be stored, as established at build time. See here for more + // information: https://curl.se/libcurl/c/CURLOPT_CAINFO.html + std::string ca_info; + // The format of client certificate. By default it is CERT_PEM. See here for + // more details: https://curl.se/libcurl/c/CURLOPT_SSLCERTTYPE.html + CERTTYPE cert_type; + // The file name of your client certificate. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSLCERT.html + std::string cert; + // The format of the private key. By default it is KEY_PEM. See here for more + // details: https://curl.se/libcurl/c/CURLOPT_SSLKEYTYPE.html. + KEYTYPE key_type; + // The private key. See here for more details: + // https://curl.se/libcurl/c/CURLOPT_SSLKEY.html. + std::string key; +}; + +// HttpRequest object representing the context of a HTTP transaction. Currently +// it is also designed to be the placeholder for response data, but how the +// response is stored can be revisited later. +// 'completion_callback' doesn't transfer ownership of HttpRequest, caller must +// not keep the reference and access HttpRequest object after +// 'completion_callback' returns +class HttpRequest { + public: + HttpRequest( + std::function&& completion_callback, + const bool verbose = false); + virtual ~HttpRequest(); + + // Adds the input data to be delivered to the server, note that the HTTP + // request does not own the buffer. + void AddInput(uint8_t* buf, size_t byte_size); + + // Helper function for CURL + // Copy into 'buf' up to 'size' bytes of input data. 
Return the + // actual amount copied in 'input_bytes'. + void GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes); + + // Buffer that accumulates the response body. + std::string response_buffer_; + + size_t total_input_byte_size_{0}; + + // HTTP response code for the inference request + uint32_t http_code_{200}; + + std::function completion_callback_{nullptr}; + + // Pointer to the list of the HTTP request header, keep it such that it will + // be valid during the transfer and can be freed once transfer is completed. + struct curl_slist* header_list_{nullptr}; + + protected: + const bool verbose_{false}; + + // Pointers to the input data. + std::deque> data_buffers_; +}; + +// Base class for common HTTP functionalities +class HttpClient { + public: + enum class CompressionType { NONE, DEFLATE, GZIP }; + + virtual ~HttpClient(); + + protected: + void SetSSLCurlOptions(CURL* curl_handle); + + HttpClient( + const std::string& server_url, bool verbose = false, + const HttpSslOptions& ssl_options = HttpSslOptions()); + + // Note that this function does not block + void Send(CURL* handle, std::unique_ptr&& request); + + protected: + void AsyncTransfer(); + + bool exiting_{false}; + + std::thread worker_; + std::mutex mutex_; + + // The server url + const std::string url_; + // The options for authorizing and authenticating SSL/TLS connections + HttpSslOptions ssl_options_; + + using AsyncReqMap = std::map>; + // curl multi handle for processing asynchronous requests + void* multi_handle_; + // map to record new asynchronous requests with pointer to easy handle + // or tag id as key + AsyncReqMap new_async_requests_; + + bool verbose_; + + private: + const std::string& ParseSslKeyType(HttpSslOptions::KEYTYPE key_type); + const std::string& ParseSslCertType(HttpSslOptions::CERTTYPE cert_type); + static std::mutex curl_init_mtx_; +}; +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_client.cc b/client_backend/openai/openai_client.cc new file mode 100644 index 00000000..cd517f6a --- /dev/null +++ b/client_backend/openai/openai_client.cc @@ -0,0 +1,314 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Include this first to make sure we are a friend of common classes. +#define TRITON_INFERENCE_SERVER_CLIENT_CLASS InferenceServerHttpClient +#include "openai_client.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#ifdef TRITON_ENABLE_ZLIB +#include +#endif + +extern "C" { +#include "cencode.h" +} + +#ifdef _WIN32 +#define strncasecmp(x, y, z) _strnicmp(x, y, z) +#undef min // NOMINMAX did not resolve std::min compile error +#endif //_WIN32 + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================================== + +void +ChatCompletionRequest::SendResponse(bool is_final, bool is_null) +{ + response_callback_(new ChatCompletionResult( + http_code_, std::move(response_buffer_), is_final, is_null, request_id_)); +} + +ChatCompletionClient::ChatCompletionClient( + const std::string& url, const std::string& endpoint, bool verbose, + const HttpSslOptions& ssl_options) + : HttpClient(std::string(url + "/" + endpoint), verbose, ssl_options) +{ +} + +size_t +ChatCompletionClient::RequestProvider( + void* contents, size_t size, size_t nmemb, void* userp) +{ + auto request = reinterpret_cast(userp); + + size_t input_bytes = 0; + request->GetNextInput( + reinterpret_cast(contents), size * nmemb, &input_bytes); + + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_END); + + return input_bytes; +} + +size_t +ChatCompletionClient::ResponseHeaderHandler( + void* contents, size_t size, size_t nmemb, void* userp) +{ + auto request = reinterpret_cast(userp); + + char* buf = reinterpret_cast(contents); + size_t byte_size = size * nmemb; + + std::string hdr(buf, byte_size); + std::transform(hdr.begin(), hdr.end(), hdr.begin(), [](unsigned char c) { + return std::tolower(c); + }); + if (hdr.find("content-type") != std::string::npos && + hdr.find("text/event-stream") != std::string::npos) { + request->is_stream_ = true; + } + return byte_size; +} + +size_t +ChatCompletionClient::ResponseHandler( + void* contents, size_t size, size_t nmemb, void* userp) +{ + // [TODO TMA-1666] verify if the SSE responses received are complete, or the + // response need to be stitched first. To verify, print out the received + // responses from SendResponse() to make sure the OpenAI server doesn't chunk + // the HTTP responses in the way that misaligns with the SSE responses. Reason + // of not stitching responses now is that it is a bit complicated that to make + // the write callback bulletproof is to assume the response can be chunked at + // arbitrary position, then bake in checking for SSE style (data:.*\n\n) by + // iterating all received buffer character by character. + size_t result_bytes = size * nmemb; + // return early if the response is empty as the response handling is + // triggered by the content of the response. 
+ if (result_bytes == 0) { + return result_bytes; + } + + auto request = reinterpret_cast(userp); + if (request->timer_.Timestamp( + triton::client::RequestTimers::Kind::RECV_START) == 0) { + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::RECV_START); + } + + char* buf = reinterpret_cast(contents); + request->response_buffer_.append(buf, result_bytes); + // Send response now if streaming, otherwise wait until request has been + // completed + if (request->is_stream_) { + auto done_signal = + (request->response_buffer_.find("data: [DONE]") != std::string::npos); + request->SendResponse( + done_signal /* is_final */, done_signal /* is_null */); + } + + // ResponseHandler may be called multiple times so we overwrite + // RECV_END so that we always have the time of the last. + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::RECV_END); + + return result_bytes; +} + + +Error +ChatCompletionClient::AsyncInfer( + std::function callback, + std::string& serialized_request_body, const std::string& request_id, + const Headers& headers) +{ + if (callback == nullptr) { + return Error( + "Callback function must be provided along with AsyncInfer() call."); + } + + auto completion_callback = [this](HttpRequest* req) { + auto request = static_cast(req); + request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::REQUEST_END); + UpdateInferStat(request->timer_); + if (!request->is_stream_) { + request->SendResponse(true /* is_final */, false /* is_null */); + } + }; + std::unique_ptr request(new ChatCompletionRequest( + std::move(completion_callback), std::move(callback), request_id, + verbose_)); + auto raw_request = static_cast(request.get()); + raw_request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::REQUEST_START); + request->AddInput( + reinterpret_cast(serialized_request_body.data()), + serialized_request_body.size()); + + CURL* multi_easy_handle = curl_easy_init(); + Error err = PreRunProcessing(multi_easy_handle, raw_request, headers); + if (!err.IsOk()) { + curl_easy_cleanup(multi_easy_handle); + return err; + } + + raw_request->timer_.CaptureTimestamp( + triton::client::RequestTimers::Kind::SEND_START); + Send(multi_easy_handle, std::move(request)); + return Error::Success; +} + +Error +ChatCompletionClient::PreRunProcessing( + CURL* curl, ChatCompletionRequest* request, const Headers& headers) +{ + curl_easy_setopt(curl, CURLOPT_URL, url_.c_str()); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1L); + + if (verbose_) { + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); + } + + const long buffer_byte_size = 16 * 1024 * 1024; + curl_easy_setopt(curl, CURLOPT_UPLOAD_BUFFERSIZE, buffer_byte_size); + curl_easy_setopt(curl, CURLOPT_BUFFERSIZE, buffer_byte_size); + + // request data provided by RequestProvider() + curl_easy_setopt(curl, CURLOPT_READFUNCTION, RequestProvider); + curl_easy_setopt(curl, CURLOPT_READDATA, request); + + // response headers handled by ResponseHeaderHandler() + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, ResponseHeaderHandler); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, request); + + // response data handled by ResponseHandler() + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, request); + + const curl_off_t post_byte_size = request->total_input_byte_size_; + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE_LARGE, post_byte_size); 
+ + SetSSLCurlOptions(curl); + + struct curl_slist* list = nullptr; + list = curl_slist_append(list, "Expect:"); + list = curl_slist_append(list, "Content-Type: application/json"); + + for (const auto& pr : headers) { + std::string hdr = pr.first + ": " + pr.second; + list = curl_slist_append(list, hdr.c_str()); + } + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, list); + + // The list will be freed when the request is destructed + request->header_list_ = list; + + return Error::Success; +} + +Error +ChatCompletionClient::UpdateInferStat( + const triton::client::RequestTimers& timer) +{ + const uint64_t request_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::REQUEST_START, + triton::client::RequestTimers::Kind::REQUEST_END); + const uint64_t send_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::SEND_START, + triton::client::RequestTimers::Kind::SEND_END); + const uint64_t recv_time_ns = timer.Duration( + triton::client::RequestTimers::Kind::RECV_START, + triton::client::RequestTimers::Kind::RECV_END); + + if ((request_time_ns == std::numeric_limits::max()) || + (send_time_ns == std::numeric_limits::max()) || + (recv_time_ns == std::numeric_limits::max())) { + return Error( + "Timer not set correctly." + + ((timer.Timestamp(triton::client::RequestTimers::Kind::REQUEST_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::REQUEST_END)) + ? (" Request time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::REQUEST_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::REQUEST_END)) + + ".") + : "") + + ((timer.Timestamp(triton::client::RequestTimers::Kind::SEND_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::SEND_END)) + ? (" Send time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::SEND_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::SEND_END)) + + ".") + : "") + + ((timer.Timestamp(triton::client::RequestTimers::Kind::RECV_START) > + timer.Timestamp(triton::client::RequestTimers::Kind::RECV_END)) + ? (" Receive time from " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::RECV_START)) + + " to " + + std::to_string(timer.Timestamp( + triton::client::RequestTimers::Kind::RECV_END)) + + ".") + : "")); + } + + infer_stat_.completed_request_count++; + infer_stat_.cumulative_total_request_time_ns += request_time_ns; + infer_stat_.cumulative_send_time_ns += send_time_ns; + infer_stat_.cumulative_receive_time_ns += recv_time_ns; + + return Error::Success; +} + +//============================================================================== + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_client.h b/client_backend/openai/openai_client.h new file mode 100644 index 00000000..aadcb325 --- /dev/null +++ b/client_backend/openai/openai_client.h @@ -0,0 +1,180 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include + +#include "../client_backend.h" +#include "common.h" +#include "http_client.h" + + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +class ChatCompletionResult : public InferResult { + public: + ChatCompletionResult( + uint32_t http_code, std::string&& serialized_response, bool is_final, + bool is_null, const std::string& request_id) + : http_code_(http_code), + serialized_response_(std::move(serialized_response)), + is_final_(is_final), is_null_(is_null), request_id_(request_id) + { + } + virtual ~ChatCompletionResult() = default; + + /// Get the id of the request which generated this response. + /// \param id Returns the request id that generated the result. + /// \return Error object indicating success or failure. + Error Id(std::string* id) const override + { + *id = request_id_; + return Error::Success; + } + + + /// Returns the status of the request. + /// \return Error object indicating the success or failure of the + /// request. + Error RequestStatus() const override + { + if ((http_code_ >= 400) && (http_code_ <= 599)) { + return Error( + "OpenAI response returns HTTP code " + std::to_string(http_code_)); + } + return Error::Success; + } + + /// Returns the raw data of the output. + /// \return Error object indicating the success or failure of the + /// request. + Error RawData( + const std::string& output_name, const uint8_t** buf, + size_t* byte_size) const override + { + // There is only a single output (and it has no defined name), so we can + // disregard output_name + *buf = reinterpret_cast(serialized_response_.c_str()); + *byte_size = serialized_response_.size(); + return Error::Success; + } + + /// Get final response bool for this response. + /// \return Error object indicating the success or failure. + Error IsFinalResponse(bool* is_final_response) const override + { + *is_final_response = is_final_; + return Error::Success; + }; + + /// Get null response bool for this response. + /// \return Error object indicating the success or failure. 
+ Error IsNullResponse(bool* is_null_response) const override + { + *is_null_response = is_null_; + return Error::Success; + }; + + private: + const uint32_t http_code_{200}; + const std::string serialized_response_; + const bool is_final_{false}; + const bool is_null_{false}; + const std::string request_id_; +}; + + +class ChatCompletionRequest : public HttpRequest { + public: + virtual ~ChatCompletionRequest() {} + ChatCompletionRequest( + std::function&& completion_callback, + std::function&& response_callback, + const std::string& request_id, const bool verbose = false) + : HttpRequest(std::move(completion_callback), verbose), + response_callback_(std::move(response_callback)), + request_id_(request_id) + { + } + void SendResponse(bool is_final, bool is_null); + bool is_stream_{false}; + std::function response_callback_{nullptr}; + // The timers for infer request. + triton::client::RequestTimers timer_; + const std::string request_id_; +}; + +class ChatCompletionClient : public HttpClient { + public: + virtual ~ChatCompletionClient() = default; + + /// Create a client that can be used to communicate with the server. + /// \param server_url The inference server name, port, optional + /// scheme and optional base path in the following format: + /// host:port/. + /// \param endpoint The name of the endpoint to send requests to + /// \param verbose If true generate verbose output when contacting + /// the inference server. + /// \param ssl_options Specifies the settings for configuring + /// SSL encryption and authorization. Providing these options + /// do not ensure that SSL/TLS will be used in communication. + /// The use of SSL/TLS depends entirely on the server endpoint. + /// These options will be ignored if the server_url does not + /// expose `https://` scheme. + ChatCompletionClient( + const std::string& server_url, const std::string& endpoint, + bool verbose = false, + const HttpSslOptions& ssl_options = HttpSslOptions()); + + /// Simplified AsyncInfer() where the request body is expected to be + /// prepared by the caller, the client here is responsible to communicate + /// with a OpenAI-compatible server in both streaming and non-streaming case. + Error AsyncInfer( + std::function callback, + std::string& serialized_request_body, const std::string& request_id, + const Headers& headers); + + const InferStat& ClientInferStat() { return infer_stat_; } + + private: + // setup curl handle + Error PreRunProcessing( + CURL* curl, ChatCompletionRequest* request, const Headers& headers); + + static size_t ResponseHandler( + void* contents, size_t size, size_t nmemb, void* userp); + static size_t RequestProvider( + void* contents, size_t size, size_t nmemb, void* userp); + static size_t ResponseHeaderHandler( + void* contents, size_t size, size_t nmemb, void* userp); + + Error UpdateInferStat(const triton::client::RequestTimers& timer); + InferStat infer_stat_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_client_backend.cc b/client_backend/openai/openai_client_backend.cc new file mode 100644 index 00000000..15bbbdc6 --- /dev/null +++ b/client_backend/openai/openai_client_backend.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "openai_client_backend.h" + +#include "openai_infer_input.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================================== + +Error +OpenAiClientBackend::Create( + const std::string& url, const std::string& endpoint, + const ProtocolType protocol, std::shared_ptr http_headers, + const bool verbose, std::unique_ptr* client_backend) +{ + if (protocol == ProtocolType::GRPC) { + return Error( + "perf_analyzer does not support gRPC protocol with OpenAI endpoints"); + } + std::unique_ptr openai_client_backend( + new OpenAiClientBackend(http_headers)); + + openai_client_backend->http_client_.reset( + new ChatCompletionClient(url, endpoint, verbose)); + + *client_backend = std::move(openai_client_backend); + + return Error::Success; +} + +Error +OpenAiClientBackend::AsyncInfer( + OnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) +{ + if (inputs.size() != 1) { + return Error("Only expecting one input"); + } + + auto raw_input = dynamic_cast(inputs[0]); + raw_input->PrepareForRequest(); + RETURN_IF_CB_ERROR(http_client_->AsyncInfer( + callback, raw_input->GetRequestBody(), options.request_id_, + *http_headers_)); + return Error::Success; +} + + +Error +OpenAiClientBackend::ClientInferStat(InferStat* infer_stat) +{ + *infer_stat = http_client_->ClientInferStat(); + return Error::Success; +} + +//============================================================================== + +Error +OpenAiInferRequestedOutput::Create( + InferRequestedOutput** infer_output, const std::string& name, + const std::string& datatype) +{ + OpenAiInferRequestedOutput* local_infer_output = + new OpenAiInferRequestedOutput(name, datatype); + + tc::InferRequestedOutput* openai_infer_output; + RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create( + &openai_infer_output, name, 0, datatype)); + local_infer_output->output_.reset(openai_infer_output); + + *infer_output = 
local_infer_output; + + return Error::Success; +} + +OpenAiInferRequestedOutput::OpenAiInferRequestedOutput( + const std::string& name, const std::string& datatype) + : InferRequestedOutput(BackendKind::OPENAI, name, datatype) +{ +} + +//============================================================================== + + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_client_backend.h b/client_backend/openai/openai_client_backend.h new file mode 100644 index 00000000..2d475eac --- /dev/null +++ b/client_backend/openai/openai_client_backend.h @@ -0,0 +1,111 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "../../perf_utils.h" +#include "../client_backend.h" +#include "openai_client.h" +#include "openai_infer_input.h" + +#define RETURN_IF_TRITON_ERROR(S) \ + do { \ + const tc::Error& status__ = (S); \ + if (!status__.IsOk()) { \ + return Error(status__.Message()); \ + } \ + } while (false) + +namespace tc = triton::client; +namespace cb = triton::perfanalyzer::clientbackend; + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + + +//============================================================================== +/// OpenAiClientBackend is used to generate load on the serving instance, +/// which supports OpenAI Chat Completions API +/// +class OpenAiClientBackend : public ClientBackend { + public: + /// Create an OpenAI client backend which can be used to interact with the + /// server. + /// \param url The inference server url and port. + /// \param endpoint The endpoint on the inference server to send requests to + /// \param protocol The protocol type used. + /// \param http_headers Map of HTTP headers. The map key/value indicates + /// the header name/value. + /// \param verbose Enables the verbose mode. + /// \param client_backend Returns a new OpenAiClientBackend + /// object. + /// \return Error object indicating success or failure. 
+ static Error Create( + const std::string& url, const std::string& endpoint, + const ProtocolType protocol, std::shared_ptr http_headers, + const bool verbose, std::unique_ptr* client_backend); + + /// See ClientBackend::AsyncInfer() + Error AsyncInfer( + OnCompleteFn callback, const InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) override; + + /// See ClientBackend::ClientInferStat() + Error ClientInferStat(InferStat* infer_stat) override; + + private: + OpenAiClientBackend(std::shared_ptr http_headers) + : ClientBackend(BackendKind::OPENAI), http_headers_(http_headers) + { + } + + std::unique_ptr http_client_; + std::shared_ptr http_headers_; +}; + +//============================================================== +/// OpenAiInferRequestedOutput is a wrapper around +/// InferRequestedOutput object of triton common client library. +/// +class OpenAiInferRequestedOutput : public InferRequestedOutput { + public: + static Error Create( + InferRequestedOutput** infer_output, const std::string& name, + const std::string& datatype); + /// Returns the raw InferRequestedOutput object required by OpenAi client + /// library. + tc::InferRequestedOutput* Get() const { return output_.get(); } + + private: + explicit OpenAiInferRequestedOutput( + const std::string& name, const std::string& datatype); + + std::unique_ptr output_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_infer_input.cc b/client_backend/openai/openai_infer_input.cc new file mode 100644 index 00000000..dcf213fc --- /dev/null +++ b/client_backend/openai/openai_infer_input.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
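Before the `OpenAiInferInput` implementation that follows, a hedged sketch of how a caller is expected to drive the backend declared above. The element types of the input/output vectors, the `OnCompleteFn` signature, and the `InferOptions` construction live in `client_backend.h` (outside this diff) and are assumed here; the model name, endpoint, input name, shape, and datatype are illustrative, since this backend forwards the single input's bytes verbatim and ignores its metadata.

```cpp
// Sketch only: assumes a backend created via OpenAiClientBackend::Create()
// with an endpoint such as "v1/chat/completions" and an already populated
// InferOptions.
#include <iostream>
#include <string>

#include "openai_client_backend.h"
#include "openai_infer_input.h"

namespace cb = triton::perfanalyzer::clientbackend;

void
SendChatCompletion(cb::ClientBackend* backend, const cb::InferOptions& options)
{
  // The OpenAI backend expects exactly one input and sends its bytes verbatim
  // as the HTTP request body.
  std::string body = R"({
    "model": "example-model",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": true
  })";

  cb::InferInput* input = nullptr;
  cb::openai::OpenAiInferInput::Create(&input, "payload", {1}, "BYTES");
  input->AppendRaw(
      reinterpret_cast<const uint8_t*>(body.data()), body.size());

  backend->AsyncInfer(
      [](cb::InferResult* result) {
        bool is_final = false;
        result->IsFinalResponse(&is_final);
        if (!result->RequestStatus().IsOk()) {
          std::cerr << "chat completion request failed" << std::endl;
        }
        // Each streamed response arrives as a separately allocated
        // ChatCompletionResult, assumed (as with the other backends) to be
        // owned by the callback.
        delete result;
      },
      options, {input}, {} /* outputs */);
}
```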
+ +#include "openai_infer_input.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +Error +OpenAiInferInput::Create( + InferInput** infer_input, const std::string& name, + const std::vector& dims, const std::string& datatype) +{ + OpenAiInferInput* local_infer_input = + new OpenAiInferInput(name, dims, datatype); + + *infer_input = local_infer_input; + return Error::Success; +} + +Error +OpenAiInferInput::SetShape(const std::vector& shape) +{ + shape_ = shape; + return Error::Success; +} + +Error +OpenAiInferInput::Reset() +{ + data_str_.clear(); + + bufs_.clear(); + buf_byte_sizes_.clear(); + byte_size_ = 0; + return Error::Success; +} + +Error +OpenAiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size) +{ + data_str_.clear(); + + byte_size_ += input_byte_size; + + bufs_.push_back(input); + buf_byte_sizes_.push_back(input_byte_size); + return Error::Success; +} + +Error +OpenAiInferInput::RawData(const uint8_t** buf, size_t* byte_size) +{ + // TMA-1775 - handle multi-batch case + *buf = bufs_[0]; + *byte_size = buf_byte_sizes_[0]; + return Error::Success; +} + +Error +OpenAiInferInput::PrepareForRequest() +{ + // Reset position so request sends entire input. + if (data_str_.empty() && (byte_size_ != 0)) { + for (size_t i = 0; i < bufs_.size(); ++i) { + data_str_.append( + reinterpret_cast(bufs_[i]), buf_byte_sizes_[i]); + } + } + return Error::Success; +} + +OpenAiInferInput::OpenAiInferInput( + const std::string& name, const std::vector& dims, + const std::string& datatype) + : InferInput(BackendKind::OPENAI, name, datatype), shape_(dims) +{ +} + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/openai/openai_infer_input.h b/client_backend/openai/openai_infer_input.h new file mode 100644 index 00000000..93a12b51 --- /dev/null +++ b/client_backend/openai/openai_infer_input.h @@ -0,0 +1,75 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#pragma once + +#include + +#include "../../perf_utils.h" +#include "../client_backend.h" + + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace openai { + +//============================================================== +/// OpenAiInferInput instance holds the information regarding +/// model input tensors and their corresponding generated data. +/// +class OpenAiInferInput : public InferInput { + public: + static Error Create( + InferInput** infer_input, const std::string& name, + const std::vector& dims, const std::string& datatype); + /// See InferInput::Shape() + const std::vector& Shape() const override { return shape_; } + /// See InferInput::SetShape() + Error SetShape(const std::vector& shape) override; + /// See InferInput::Reset() + Error Reset() override; + /// See InferInput::AppendRaw() + Error AppendRaw(const uint8_t* input, size_t input_byte_size) override; + /// See InferInput::RawData() + Error RawData(const uint8_t** buf, size_t* byte_size) override; + /// Prepare the input to be in the form expected by an OpenAI client, + /// must call before accessing the data. + Error PrepareForRequest(); + /// Get the contiguous request body string + std::string& GetRequestBody() { return data_str_; } + + private: + explicit OpenAiInferInput( + const std::string& name, const std::vector& dims, + const std::string& datatype); + + std::vector shape_; + size_t byte_size_{0}; + + std::vector bufs_; + std::vector buf_byte_sizes_; + std::string data_str_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::openai diff --git a/client_backend/tensorflow_serving/CMakeLists.txt b/client_backend/tensorflow_serving/CMakeLists.txt index 7a1ad6eb..ba1c2fa4 100644 --- a/client_backend/tensorflow_serving/CMakeLists.txt +++ b/client_backend/tensorflow_serving/CMakeLists.txt @@ -71,7 +71,7 @@ file(GLOB TF_EXAMPLE_PROTOS ${CMAKE_BINARY_DIR}/protos/tensorflow/core/example/* file(GLOB TF_FW_PROTOS ${CMAKE_BINARY_DIR}/protos/tensorflow/core/framework/*.proto) file(GLOB TF_PROTOBUF_PROTOS ${CMAKE_BINARY_DIR}/protos/tensorflow/core/protobuf/*.proto) -# This is a dirty hack to prevent unneccesary leaking dependency +# This is a dirty hack to prevent unnecessary leaking dependency list(FILTER TF_PROTOBUF_PROTOS EXCLUDE REGEX "autotuning.proto$|conv_autotuning.proto$") # Compiling CPP sources from proto files. 
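Stepping back to the OpenAI additions above before the remaining TensorFlow Serving and TorchServe cleanups: streaming is detected by `ResponseHeaderHandler()` from a `Content-Type: text/event-stream` header, and `ResponseHandler()` then emits one `ChatCompletionResult` per write callback, flagging the response as final (and null) once the SSE terminator arrives. A hedged sketch of that terminator check, with an illustrative payload:

```cpp
#include <string>

// Shape of a server-sent-events stream as accumulated in
// HttpRequest::response_buffer_; the JSON payload text is illustrative.
std::string buffer =
    "data: {\"choices\":[{\"delta\":{\"content\":\"Hel\"}}]}\n\n"
    "data: {\"choices\":[{\"delta\":{\"content\":\"lo\"}}]}\n\n"
    "data: [DONE]\n\n";

// The same check ResponseHandler() applies on every write callback: only once
// the [DONE] sentinel is present is the response reported as final and null.
bool is_final = buffer.find("data: [DONE]") != std::string::npos;
```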
diff --git a/client_backend/tensorflow_serving/tfserve_client_backend.h b/client_backend/tensorflow_serving/tfserve_client_backend.h index 324f4abe..bd6b5db8 100644 --- a/client_backend/tensorflow_serving/tfserve_client_backend.h +++ b/client_backend/tensorflow_serving/tfserve_client_backend.h @@ -26,6 +26,7 @@ #pragma once #include + #include "../../perf_utils.h" #include "../client_backend.h" #include "tfserve_grpc_client.h" @@ -47,7 +48,7 @@ namespace tfserving { //============================================================================== -/// TFServeClientBackend is used to generate load on the TF serving isntance +/// TFServeClientBackend is used to generate load on the TF serving instance /// class TFServeClientBackend : public ClientBackend { public: diff --git a/client_backend/tensorflow_serving/tfserve_grpc_client.cc b/client_backend/tensorflow_serving/tfserve_grpc_client.cc index 16391234..f53e4d17 100644 --- a/client_backend/tensorflow_serving/tfserve_grpc_client.cc +++ b/client_backend/tensorflow_serving/tfserve_grpc_client.cc @@ -32,6 +32,7 @@ #include #include #include + #include "tfserve_client_backend.h" /// Type alias for string-TensorProto map. diff --git a/client_backend/tensorflow_serving/tfserve_grpc_client.h b/client_backend/tensorflow_serving/tfserve_grpc_client.h index 88b9d9a1..bfa475b8 100644 --- a/client_backend/tensorflow_serving/tfserve_grpc_client.h +++ b/client_backend/tensorflow_serving/tfserve_grpc_client.h @@ -26,6 +26,7 @@ #pragma once #include + #include "../client_backend.h" #include "common.h" #include "tensorflow_serving/apis/prediction_service.grpc.pb.h" @@ -128,7 +129,7 @@ class GrpcClient : public tc::InferenceServerClient { /// Run asynchronous inference on server. /// Once the request is completed, the InferResult pointer will be passed to /// the provided 'callback' function. Upon the invocation of callback - /// function, the ownership of InferResult object is transfered to the + /// function, the ownership of InferResult object is transferred to the /// function caller. It is then the caller's choice on either retrieving the /// results inside the callback function or deferring it to a different thread /// so that the client is unblocked. In order to prevent memory leak, user diff --git a/client_backend/tensorflow_serving/tfserve_infer_input.h b/client_backend/tensorflow_serving/tfserve_infer_input.h index 3062cbda..ec1a35dd 100644 --- a/client_backend/tensorflow_serving/tfserve_infer_input.h +++ b/client_backend/tensorflow_serving/tfserve_infer_input.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ #pragma once #include + #include "../../perf_utils.h" #include "../client_backend.h" @@ -54,7 +55,7 @@ class TFServeInferInput : public InferInput { /// \param byte_size The size of data added in bytes. /// \return Error object indicating success or failure. Error ByteSize(size_t* byte_size) const; - /// Resets the heads to start providing data from the begining. + /// Resets the heads to start providing data from the beginning. Error PrepareForRequest(); /// Get the next chunk of data if available. 
Error GetNext(const uint8_t** buf, size_t* input_bytes, bool* end_of_input); @@ -65,7 +66,7 @@ class TFServeInferInput : public InferInput { const std::string& datatype); std::vector shape_; - size_t byte_size_; + size_t byte_size_{0}; size_t bufs_idx_, buf_pos_; std::vector bufs_; diff --git a/client_backend/torchserve/torchserve_client_backend.h b/client_backend/torchserve/torchserve_client_backend.h index f1d9e2e3..25566256 100644 --- a/client_backend/torchserve/torchserve_client_backend.h +++ b/client_backend/torchserve/torchserve_client_backend.h @@ -26,6 +26,7 @@ #pragma once #include + #include "../../perf_utils.h" #include "../client_backend.h" #include "torchserve_http_client.h" @@ -47,7 +48,7 @@ namespace torchserve { //============================================================================== -/// TorchServeClientBackend is used to generate load on the Torchserve isntance +/// TorchServeClientBackend is used to generate load on the Torchserve instance /// class TorchServeClientBackend : public ClientBackend { public: diff --git a/client_backend/torchserve/torchserve_http_client.cc b/client_backend/torchserve/torchserve_http_client.cc index 995b9011..c835ab10 100644 --- a/client_backend/torchserve/torchserve_http_client.cc +++ b/client_backend/torchserve/torchserve_http_client.cc @@ -28,6 +28,7 @@ #include #include + #include "torchserve_client_backend.h" namespace triton { namespace perfanalyzer { namespace clientbackend { diff --git a/client_backend/torchserve/torchserve_http_client.h b/client_backend/torchserve/torchserve_http_client.h index 4c498db7..ede9cdfd 100644 --- a/client_backend/torchserve/torchserve_http_client.h +++ b/client_backend/torchserve/torchserve_http_client.h @@ -28,6 +28,7 @@ #include #include #include + #include "../client_backend.h" #include "common.h" #include "torchserve_infer_input.h" diff --git a/client_backend/torchserve/torchserve_infer_input.h b/client_backend/torchserve/torchserve_infer_input.h index 0ecbad13..cc629cd1 100644 --- a/client_backend/torchserve/torchserve_infer_input.h +++ b/client_backend/torchserve/torchserve_infer_input.h @@ -26,6 +26,7 @@ #pragma once #include + #include "../../perf_utils.h" #include "../client_backend.h" @@ -55,7 +56,7 @@ class TorchServeInferInput : public InferInput { /// \param byte_size The size of data added in bytes. /// \return Error object indicating success or failure. Error ByteSize(size_t* byte_size) const; - /// Resets the heads to start providing data from the begining. + /// Resets the heads to start providing data from the beginning. Error PrepareForRequest(); /// Get the next chunk of data if available. Error GetNext(const uint8_t** buf, size_t* input_bytes, bool* end_of_input); diff --git a/client_backend/triton/test_triton_client_backend.cc b/client_backend/triton/test_triton_client_backend.cc index 4c726c97..c32ad17b 100644 --- a/client_backend/triton/test_triton_client_backend.cc +++ b/client_backend/triton/test_triton_client_backend.cc @@ -27,6 +27,7 @@ #include #include #include + #include "../../doctest.h" #include "triton_client_backend.h" diff --git a/client_backend/triton/triton_client_backend.cc b/client_backend/triton/triton_client_backend.cc index 0213ba8f..419123e5 100644 --- a/client_backend/triton/triton_client_backend.cc +++ b/client_backend/triton/triton_client_backend.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,8 +27,10 @@ #include "triton_client_backend.h" #include + #include #include + #include "../../constants.h" #include "../../perf_analyzer_exception.h" #include "json_utils.h" @@ -93,12 +95,14 @@ TritonClientBackend::Create( const std::map> trace_options, const grpc_compression_algorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, - const std::string& metrics_url, + const std::string& metrics_url, const TensorFormat input_tensor_format, + const TensorFormat output_tensor_format, std::unique_ptr* client_backend) { std::unique_ptr triton_client_backend( new TritonClientBackend( - protocol, compression_algorithm, http_headers, metrics_url)); + protocol, compression_algorithm, http_headers, metrics_url, + input_tensor_format, output_tensor_format)); if (protocol == ProtocolType::HTTP) { triton::client::HttpSslOptions http_ssl_options = ParseHttpSslOptions(ssl_options); @@ -409,7 +413,7 @@ TritonClientBackend::AccessMetricsEndpoint(std::string& metrics_endpoint_text) if (res != CURLE_OK) { throw triton::perfanalyzer::PerfAnalyzerException( - "curl_easy_perform() failed: " + std::string(curl_easy_strerror(res)), + "Unable to connect to Metrics endpoint " + metrics_url_, triton::perfanalyzer::GENERIC_ERROR); } @@ -550,7 +554,9 @@ TritonClientBackend::ParseInferInputToTriton( std::vector* triton_inputs) { for (const auto input : inputs) { - triton_inputs->push_back((dynamic_cast(input))->Get()); + tc::InferInput* triton_input{dynamic_cast(input)->Get()}; + triton_input->SetBinaryData(input_tensor_format_ == TensorFormat::BINARY); + triton_inputs->push_back(triton_input); } } @@ -560,8 +566,10 @@ TritonClientBackend::ParseInferRequestedOutputToTriton( std::vector* triton_outputs) { for (const auto output : outputs) { - triton_outputs->push_back( - (dynamic_cast(output))->Get()); + tc::InferRequestedOutput* triton_output{ + dynamic_cast(output)->Get()}; + triton_output->SetBinaryData(input_tensor_format_ == TensorFormat::BINARY); + triton_outputs->push_back(triton_output); } } @@ -580,6 +588,16 @@ TritonClientBackend::ParseInferOptionsToTriton( triton_options->sequence_start_ = options.sequence_start_; triton_options->sequence_end_ = options.sequence_end_; } + triton_options->triton_enable_empty_final_response_ = + options.triton_enable_empty_final_response_; + + for (auto& map_entry : options.request_parameters_) { + auto rp = tc::RequestParameter(); + rp.name = map_entry.second.name; + rp.value = map_entry.second.value; + rp.type = map_entry.second.type; + triton_options->request_parameters[map_entry.first] = rp; + } } @@ -738,6 +756,13 @@ TritonInferInput::SetSharedMemory( return Error::Success; } +Error +TritonInferInput::RawData(const uint8_t** buf, size_t* byte_size) +{ + RETURN_IF_TRITON_ERROR(input_->RawData(buf, byte_size)); + return Error::Success; +} + TritonInferInput::TritonInferInput( const std::string& name, const std::string& datatype) : InferInput(BackendKind::TRITON, name, datatype) @@ -750,14 +775,14 @@ TritonInferInput::TritonInferInput( Error TritonInferRequestedOutput::Create( InferRequestedOutput** infer_output, const std::string& name, - const size_t class_count) + const size_t class_count, const std::string& datatype) { TritonInferRequestedOutput* local_infer_output = - new TritonInferRequestedOutput(name); + new TritonInferRequestedOutput(name, datatype); tc::InferRequestedOutput* 
triton_infer_output; RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create( - &triton_infer_output, name, class_count)); + &triton_infer_output, name, class_count, datatype)); local_infer_output->output_.reset(triton_infer_output); *infer_output = local_infer_output; @@ -775,8 +800,9 @@ TritonInferRequestedOutput::SetSharedMemory( } -TritonInferRequestedOutput::TritonInferRequestedOutput(const std::string& name) - : InferRequestedOutput(BackendKind::TRITON, name) +TritonInferRequestedOutput::TritonInferRequestedOutput( + const std::string& name, const std::string& datatype) + : InferRequestedOutput(BackendKind::TRITON, name, datatype) { } @@ -810,6 +836,20 @@ TritonInferResult::RawData( return Error::Success; } +Error +TritonInferResult::IsFinalResponse(bool* is_final_response) const +{ + RETURN_IF_TRITON_ERROR(result_->IsFinalResponse(is_final_response)); + return Error::Success; +} + +Error +TritonInferResult::IsNullResponse(bool* is_null_response) const +{ + RETURN_IF_TRITON_ERROR(result_->IsNullResponse(is_null_response)); + return Error::Success; +} + //============================================================================== }}}} // namespace triton::perfanalyzer::clientbackend::tritonremote diff --git a/client_backend/triton/triton_client_backend.h b/client_backend/triton/triton_client_backend.h index b573d8e3..fd48d32c 100644 --- a/client_backend/triton/triton_client_backend.h +++ b/client_backend/triton/triton_client_backend.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -30,6 +30,7 @@ #include #include #include + #include "../../constants.h" #include "../../metrics.h" #include "../../perf_utils.h" @@ -79,6 +80,10 @@ class TritonClientBackend : public ClientBackend { /// the header name/value. /// \param verbose Enables the verbose mode. /// \param metrics_url The inference server metrics url and port. + /// \param input_tensor_format The Triton inference request input tensor + /// format. + /// \param output_tensor_format The Triton inference response output tensor + /// format. /// \param client_backend Returns a new TritonClientBackend object. /// \return Error object indicating success or failure. 
static Error Create( @@ -88,6 +93,8 @@ class TritonClientBackend : public ClientBackend { const grpc_compression_algorithm compression_algorithm, std::shared_ptr http_headers, const bool verbose, const std::string& metrics_url, + const cb::TensorFormat input_tensor_format, + const cb::TensorFormat output_tensor_format, std::unique_ptr* client_backend); /// See ClientBackend::ServerExtensions() @@ -169,10 +176,14 @@ class TritonClientBackend : public ClientBackend { TritonClientBackend( const ProtocolType protocol, const grpc_compression_algorithm compression_algorithm, - std::shared_ptr http_headers, const std::string& metrics_url) + std::shared_ptr http_headers, const std::string& metrics_url, + const cb::TensorFormat input_tensor_format, + const cb::TensorFormat output_tensor_format) : ClientBackend(BackendKind::TRITON), protocol_(protocol), compression_algorithm_(compression_algorithm), - http_headers_(http_headers), metrics_url_(metrics_url) + http_headers_(http_headers), metrics_url_(metrics_url), + input_tensor_format_(input_tensor_format), + output_tensor_format_(output_tensor_format) { } @@ -239,11 +250,13 @@ class TritonClientBackend : public ClientBackend { const grpc_compression_algorithm compression_algorithm_{GRPC_COMPRESS_NONE}; std::shared_ptr http_headers_; const std::string metrics_url_{""}; + const cb::TensorFormat input_tensor_format_{cb::TensorFormat::UNKNOWN}; + const cb::TensorFormat output_tensor_format_{cb::TensorFormat::UNKNOWN}; #ifndef DOCTEST_CONFIG_DISABLE friend TestTritonClientBackend; - protected: + public: TritonClientBackend() = default; #endif }; @@ -270,6 +283,8 @@ class TritonInferInput : public InferInput { /// See InferInput::SetSharedMemory() Error SetSharedMemory( const std::string& name, size_t byte_size, size_t offset = 0) override; + /// See InferInput::RawData() + Error RawData(const uint8_t** buf, size_t* byte_size) override; private: explicit TritonInferInput( @@ -286,7 +301,7 @@ class TritonInferRequestedOutput : public InferRequestedOutput { public: static Error Create( InferRequestedOutput** infer_output, const std::string& name, - const size_t class_count = 0); + const size_t class_count = 0, const std::string& datatype = ""); /// Returns the raw InferRequestedOutput object required by triton client /// library. tc::InferRequestedOutput* Get() const { return output_.get(); } @@ -296,7 +311,8 @@ class TritonInferRequestedOutput : public InferRequestedOutput { const size_t offset = 0) override; private: - explicit TritonInferRequestedOutput(const std::string& name); + explicit TritonInferRequestedOutput( + const std::string& name, const std::string& datatype); std::unique_ptr output_; }; @@ -316,6 +332,10 @@ class TritonInferResult : public InferResult { Error RawData( const std::string& output_name, const uint8_t** buf, size_t* byte_size) const override; + /// See InferResult::IsFinalResponse() + Error IsFinalResponse(bool* is_final_response) const override; + /// See InferResult::IsNullResponse() + Error IsNullResponse(bool* is_null_response) const override; private: std::unique_ptr result_; diff --git a/client_backend/triton_c_api/CMakeLists.txt b/client_backend/triton_c_api/CMakeLists.txt index 705d25b0..5e21b744 100644 --- a/client_backend/triton_c_api/CMakeLists.txt +++ b/client_backend/triton_c_api/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -31,14 +31,18 @@ set( triton_c_api_backend.cc shared_library.cc triton_loader.cc + shared_memory_manager.cc + scoped_defer.cc ) set( TRITON_C_API_CLIENT_BACKEND_HDRS triton_c_api_backend.h shared_library.h + shared_memory_manager.h triton_loader.h c_api_infer_results.h + scoped_defer.h ) add_library( diff --git a/client_backend/triton_c_api/scoped_defer.cc b/client_backend/triton_c_api/scoped_defer.cc new file mode 100644 index 00000000..ff25eb0f --- /dev/null +++ b/client_backend/triton_c_api/scoped_defer.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "scoped_defer.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace tritoncapi { +ScopedDefer::ScopedDefer(std::function task) +{ + task_ = task; + done_ = false; +} + +void +ScopedDefer::Complete() +{ + if (!done_) { + task_(); + done_ = true; + } +} + +ScopedDefer::~ScopedDefer() +{ + if (!done_) { + task_(); + } +} + +}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/client_backend/triton_c_api/scoped_defer.h b/client_backend/triton_c_api/scoped_defer.h new file mode 100644 index 00000000..c5fcc7ea --- /dev/null +++ b/client_backend/triton_c_api/scoped_defer.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once +#include + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace tritoncapi { + +class ScopedDefer { + public: + ScopedDefer(std::function task); + ~ScopedDefer(); + void Complete(); + + private: + std::function task_; + bool done_; +}; + +}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/client_backend/triton_c_api/shared_library.cc b/client_backend/triton_c_api/shared_library.cc index 67a8dc79..8c06860e 100644 --- a/client_backend/triton_c_api/shared_library.cc +++ b/client_backend/triton_c_api/shared_library.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,7 +24,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "shared_library.h" + #include + #include /// FIXME: Duplication of server/src/core/shared_library.cc @@ -36,7 +38,6 @@ namespace tritoncapi { Error OpenLibraryHandle(const std::string& path, void** handle) { - std::cout << "OpenLibraryHandle: " << path << std::endl; *handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); if (*handle == nullptr) { return Error("unable to load backend library: " + std::string(dlerror())); diff --git a/client_backend/triton_c_api/shared_library.h b/client_backend/triton_c_api/shared_library.h index 45eea356..dbc49e4d 100644 --- a/client_backend/triton_c_api/shared_library.h +++ b/client_backend/triton_c_api/shared_library.h @@ -26,6 +26,7 @@ #pragma once #include + #include "../client_backend.h" /// FIXME: Duplication of server/src/core/shared_library.h /// Separate shared_library to common library and delete this diff --git a/client_backend/triton_c_api/shared_memory_manager.cc b/client_backend/triton_c_api/shared_memory_manager.cc new file mode 100644 index 00000000..0658daed --- /dev/null +++ b/client_backend/triton_c_api/shared_memory_manager.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
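A minimal usage sketch for ScopedDefer, assuming hypothetical DoWork() and ReleaseBuffers() helpers: the deferred task runs when the object leaves scope unless Complete() has already executed it, which is how TritonLoader::Infer() later in this patch guarantees the response and allocator are released on every exit path:

    {
      ScopedDefer cleanup([&]() { ReleaseBuffers(); });  // ReleaseBuffers() is assumed
      if (!DoWork()) {                                   // DoWork() is assumed
        return;           // destructor still runs ReleaseBuffers()
      }
      cleanup.Complete();  // run the task now; the destructor becomes a no-op
    }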
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "shared_memory_manager.h" + +#include +#include +#include +#include + +#include "common.h" + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace tritoncapi { + +SharedMemoryManager::~SharedMemoryManager() +{ + UnregisterAll(TRITONSERVER_MEMORY_CPU); + UnregisterAll(TRITONSERVER_MEMORY_GPU); +} + +#ifdef TRITON_ENABLE_GPU +Error +SharedMemoryManager::RegisterCUDAMemory( + const std::string& name, void* dev_ptr, const size_t byte_size, + const int device_id) +{ + // Serialize all operations that write/read current shared memory regions + std::lock_guard lock(mu_); + + // If name is already in shared_memory_map_ then return error saying already + // registered + if (shared_memory_map_.find(name) != shared_memory_map_.end()) { + return Error( + std::string("shared memory region '" + name + "' already in manager")); + } + + shared_memory_map_.insert(std::make_pair( + name, std::unique_ptr(new MemoryInfo( + name, 0 /* offset */, byte_size, dev_ptr, + TRITONSERVER_MEMORY_GPU, device_id)))); + return Error::Success; +} +#endif // TRITON_ENABLE_GPU + +Error +SharedMemoryManager::RegisterSystemMemory( + const std::string& name, void* ptr, const size_t byte_size) +{ + // Serialize all operations that write/read current shared memory regions + std::lock_guard lock(mu_); + + // If name is already in shared_memory_map_ then return error saying already + // registered + if (shared_memory_map_.find(name) != shared_memory_map_.end()) { + return Error("shared memory region '" + name + "' already in manager"); + } + + shared_memory_map_.insert(std::make_pair( + name, std::make_unique( + name, 0 /* offset */, byte_size, ptr, TRITONSERVER_MEMORY_CPU, + 0 /* device id */))); + + return Error::Success; +} + +Error +SharedMemoryManager::GetMemoryInfo( + const std::string& name, size_t offset, size_t byte_size, + void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, + int64_t* device_id) +{ + // protect shared_memory_map_ from concurrent access + std::lock_guard lock(mu_); + + auto it = shared_memory_map_.find(name); + if (it == shared_memory_map_.end()) { + return Error( + std::string("Unable to find shared memory region: '" + name + "'")); + } + + // validate offset + size_t shm_region_end = 0; + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { + shm_region_end = 
it->second->offset_; + } + if (it->second->byte_size_ > 0) { + shm_region_end += it->second->byte_size_ - 1; + } + if (offset > shm_region_end) { + return Error( + std::string("Invalid offset for shared memory region: '" + name + "'") + .c_str()); + } + // validate byte_size + offset is within memory bounds + size_t total_req_shm = offset + byte_size - 1; + if (total_req_shm > shm_region_end) { + return Error(std::string( + "Invalid offset + byte size for shared memory region: '" + + name + "'") + .c_str()); + } + + if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) { + *shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + + it->second->offset_ + offset); + } else { + *shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + offset); + } + + *memory_type = it->second->kind_; + *device_id = it->second->device_id_; + + return Error::Success; +} + + +Error +SharedMemoryManager::Unregister( + const std::string& name, TRITONSERVER_MemoryType memory_type) +{ + // Serialize all operations that write/read current shared memory regions + std::lock_guard lock(mu_); + + return UnregisterHelper(name, memory_type); +} + +Error +SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type) +{ + // Serialize all operations that write/read current shared memory regions + std::lock_guard lock(mu_); + std::string error_message = "Failed to unregister the following "; + std::vector unregister_fails; + + if (memory_type == TRITONSERVER_MEMORY_CPU) { + error_message += "system shared memory regions: "; + for (auto& it : shared_memory_map_) { + if (it.second->kind_ == TRITONSERVER_MEMORY_CPU) { + Error err = UnregisterHelper(it.first, memory_type); + if (!err.IsOk()) { + unregister_fails.push_back(it.first); + } + } + } + } else if (memory_type == TRITONSERVER_MEMORY_GPU) { + error_message += "cuda shared memory regions: "; + for (auto& it : shared_memory_map_) { + if (it.second->kind_ == TRITONSERVER_MEMORY_GPU) { + Error err = UnregisterHelper(it.first, memory_type); + if (!err.IsOk()) { + unregister_fails.push_back(it.first); + } + } + } + } + + if (!unregister_fails.empty()) { + for (auto unreg_fail : unregister_fails) { + error_message += unreg_fail + " ,"; + } + return Error(error_message); + } + + return Error::Success; +} + +Error +SharedMemoryManager::UnregisterHelper( + const std::string& name, TRITONSERVER_MemoryType memory_type) +{ + // Must hold the lock on register_mu_ while calling this function. + auto it = shared_memory_map_.find(name); + + if (it == shared_memory_map_.end()) { + return Error("Shared memory region " + name + " doesn't exist."); + } + + // Remove region information from shared_memory_map_ + shared_memory_map_.erase(it); + + return Error::Success; +} + +}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/client_backend/triton_c_api/shared_memory_manager.h b/client_backend/triton_c_api/shared_memory_manager.h new file mode 100644 index 00000000..6b2082c4 --- /dev/null +++ b/client_backend/triton_c_api/shared_memory_manager.h @@ -0,0 +1,141 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
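A minimal sketch of the SharedMemoryManager calls implemented above, assuming a hypothetical CPU buffer and region name: register the region, resolve an address inside it, then unregister everything, roughly mirroring how TritonLoader uses the manager for shared-memory inputs and outputs later in this patch:

    std::vector<uint8_t> region(1024);
    SharedMemoryManager shm_manager;

    Error err = shm_manager.RegisterSystemMemory(
        "io_region", region.data(), region.size());

    void* mapped_addr{nullptr};
    TRITONSERVER_MemoryType memory_type{TRITONSERVER_MEMORY_CPU};
    int64_t device_id{0};
    if (err.IsOk()) {
      err = shm_manager.GetMemoryInfo(
          "io_region", 0 /* offset */, 64 /* byte_size */, &mapped_addr,
          &memory_type, &device_id);
    }

    shm_manager.UnregisterAll(TRITONSERVER_MEMORY_CPU);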
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include +#include +#include +#include + +#include "../client_backend.h" + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +namespace triton { namespace perfanalyzer { namespace clientbackend { +namespace tritoncapi { + +class SharedMemoryManager { + public: + SharedMemoryManager() = default; + ~SharedMemoryManager(); + +#ifdef TRITON_ENABLE_GPU + /// Add a memory block representing memory in CUDA (GPU) memory + /// to the manager. Return an Error if a memory block of the same name + /// already exists in the manager. + /// \param name The name of the memory block. + /// \param dev_ptr The device pointer + /// \param byte_size The size, in bytes of the block. + /// \param device id The GPU number the memory region is in. + /// \return an Error indicating success or failure. + Error RegisterCUDAMemory( + const std::string& name, void* dev_ptr, const size_t byte_size, + const int device_id); +#endif // TRITON_ENABLE_GPU + + /// Add a system memory block to the manager. + /// Return an Error if a shared memory block of the same name + /// already exists in the manager. + /// \param name The name of the memory block. + /// \param ptr The device pointer + /// \param byte_size The size, in bytes of the block. + /// \return an Error indicating success or failure. + Error RegisterSystemMemory( + const std::string& name, void* ptr, const size_t byte_size); + + /// Get the access information for the shared memory block + /// with the specified name. Return an Error + /// if named block doesn't exist. + /// \param name The name of the shared memory block to get. + /// \param offset The offset in the block + /// \param byte_size The byte size to request for the shm region + /// \param shm_mapped_addr Returns the pointer to the shared + /// memory block with the specified name and offset + /// \param memory_type Returns the type of the memory + /// \param device_id Returns the device id associated with the + /// memory block + /// \return an Error indicating success or failure. + Error GetMemoryInfo( + const std::string& name, size_t offset, size_t byte_size, + void** shm_mapped_addr, TRITONSERVER_MemoryType* memory_type, + int64_t* device_id); + + /// Removes the named shared memory block of the specified type from + /// the manager. 
Any future attempt to get the details of this block + /// will result in an array till another block with the same name is + /// added to the manager. + /// \param name The name of the shared memory block to remove. + /// \param memory_type The type of memory to unregister. + /// \return an Error indicating success or failure. + Error Unregister( + const std::string& name, TRITONSERVER_MemoryType memory_type); + + /// Unregister all shared memory blocks of specified type from the manager. + /// \param memory_type The type of memory to unregister. + /// \return an Error indicating success or failure. + Error UnregisterAll(TRITONSERVER_MemoryType memory_type); + + private: + /// A helper function to remove the named shared memory blocks of + /// specified type + Error UnregisterHelper( + const std::string& name, TRITONSERVER_MemoryType memory_type); + + /// A struct that records the shared memory regions registered by the shared + /// memory manager. + struct MemoryInfo { + MemoryInfo( + const std::string& name, const size_t offset, const size_t byte_size, + void* mapped_addr, const TRITONSERVER_MemoryType kind, + const int64_t device_id) + : name_(name), offset_(offset), byte_size_(byte_size), + mapped_addr_(mapped_addr), kind_(kind), device_id_(device_id) + { + } + + std::string name_; + size_t offset_; + size_t byte_size_; + void* mapped_addr_; + TRITONSERVER_MemoryType kind_; + int64_t device_id_; + }; + + using SharedMemoryStateMap = + std::map>; + + // A map between the name and the details of the associated + // shared memory block + SharedMemoryStateMap shared_memory_map_; + + // A mutex to protect the concurrent access to shared_memory_map_ + std::mutex mu_; +}; +}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/client_backend/triton_c_api/triton_c_api_backend.cc b/client_backend/triton_c_api/triton_c_api_backend.cc index e09b8ffe..e97f1ea8 100644 --- a/client_backend/triton_c_api/triton_c_api_backend.cc +++ b/client_backend/triton_c_api/triton_c_api_backend.cc @@ -38,21 +38,25 @@ namespace tritoncapi { Error TritonCApiClientBackend::Create( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const bool verbose, std::unique_ptr* client_backend) + const std::string& model_repository_path, const bool verbose, + std::unique_ptr* client_backend) { - if (triton_server_path.empty() || model_repository_path.empty() || - memory_type.empty()) { - return Error(std::string( - "Unable to create Triton C-API client backend. 
/lib/libtritonserver.so " - "directory:" + - triton_server_path + " model repo:" + model_repository_path + - " memory type:" + memory_type)); + if (triton_server_path.empty()) { + return Error( + "--triton-server-path should not be empty when using " + "service-kind=triton_c_api."); } + + if (model_repository_path.empty()) { + return Error( + "--model-repository should not be empty when using " + "service-kind=triton_c_api."); + } + std::unique_ptr triton_client_backend( new TritonCApiClientBackend()); - TritonLoader::Create( - triton_server_path, model_repository_path, memory_type, verbose); + RETURN_IF_ERROR( + TritonLoader::Create(triton_server_path, model_repository_path, verbose)); *client_backend = std::move(triton_client_backend); return Error::Success; } @@ -61,7 +65,7 @@ Error TritonCApiClientBackend::ServerExtensions(std::set* extensions) { rapidjson::Document server_metadata_json; - RETURN_IF_ERROR(TritonLoader::ServerMetaData(&server_metadata_json)); + RETURN_IF_ERROR(triton_loader_->ServerMetaData(&server_metadata_json)); for (const auto& extension : server_metadata_json["extensions"].GetArray()) { extensions->insert( std::string(extension.GetString(), extension.GetStringLength())); @@ -74,10 +78,10 @@ TritonCApiClientBackend::ModelMetadata( rapidjson::Document* model_metadata, const std::string& model_name, const std::string& model_version) { - if (!TritonLoader::ModelIsLoaded()) { - TritonLoader::LoadModel(model_name, model_version); + if (!triton_loader_->ModelIsLoaded()) { + triton_loader_->LoadModel(model_name, model_version); } - RETURN_IF_ERROR(TritonLoader::ModelMetadata(model_metadata)); + RETURN_IF_ERROR(triton_loader_->ModelMetadata(model_metadata)); return Error::Success; } @@ -86,10 +90,11 @@ TritonCApiClientBackend::ModelConfig( rapidjson::Document* model_config, const std::string& model_name, const std::string& model_version) { - if (!TritonLoader::ModelIsLoaded()) { - TritonLoader::LoadModel(model_name, model_version); + if (!triton_loader_->ModelIsLoaded()) { + triton_loader_->LoadModel(model_name, model_version); } - RETURN_IF_ERROR(TritonLoader::ModelConfig(model_config)); + RETURN_IF_ERROR( + triton_loader_->ModelConfig(model_config, model_name, model_version)); return Error::Success; } @@ -109,7 +114,7 @@ TritonCApiClientBackend::Infer( ParseInferOptionsToTriton(options, &triton_options); capi::InferResult* triton_result; - RETURN_IF_ERROR(TritonLoader::Infer( + RETURN_IF_ERROR(triton_loader_->Infer( triton_options, triton_inputs, triton_outputs, &triton_result)); *result = new TritonCApiInferResult(triton_result); @@ -122,7 +127,7 @@ TritonCApiClientBackend::ClientInferStat(InferStat* infer_stat) { tc::InferStat triton_infer_stat; - TritonLoader::ClientInferStat(&triton_infer_stat); + triton_loader_->ClientInferStat(&triton_infer_stat); ParseInferStat(triton_infer_stat, infer_stat); return Error::Success; } @@ -133,13 +138,38 @@ TritonCApiClientBackend::ModelInferenceStatistics( const std::string& model_name, const std::string& model_version) { rapidjson::Document infer_stat_json; - RETURN_IF_ERROR(TritonLoader::ModelInferenceStatistics( + RETURN_IF_ERROR(triton_loader_->ModelInferenceStatistics( model_name, model_version, &infer_stat_json)); ParseStatistics(infer_stat_json, model_stats); return Error::Success; } +Error +TritonCApiClientBackend::UnregisterAllSharedMemory() +{ + RETURN_IF_ERROR(triton_loader_->UnregisterAllSharedMemory()); + return Error::Success; +} + +Error +TritonCApiClientBackend::RegisterSystemMemory( + const std::string& name, void* 
ptr, const size_t byte_size) +{ + RETURN_IF_ERROR(triton_loader_->RegisterSystemMemory(name, ptr, byte_size)); + return Error::Success; +} + +#ifdef TRITON_ENABLE_GPU +Error +TritonCApiClientBackend::RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size) +{ + RETURN_IF_ERROR(triton_loader_->RegisterCudaMemory(name, handle, byte_size)); + return Error::Success; +} +#endif // TRITON_ENABLE_GPU + void TritonCApiClientBackend::ParseInferInputToTriton( const std::vector& inputs, @@ -286,6 +316,14 @@ TritonCApiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size) return Error::Success; } +Error +TritonCApiInferInput::SetSharedMemory( + const std::string& name, size_t byte_size, size_t offset) +{ + RETURN_IF_TRITON_ERROR(input_->SetSharedMemory(name, byte_size, offset)); + return Error::Success; +} + TritonCApiInferInput::TritonCApiInferInput( const std::string& name, const std::string& datatype) : InferInput(BackendKind::TRITON_C_API, name, datatype) @@ -298,14 +336,14 @@ TritonCApiInferInput::TritonCApiInferInput( Error TritonCApiInferRequestedOutput::Create( InferRequestedOutput** infer_output, const std::string& name, - const size_t class_count) + const size_t class_count, const std::string& datatype) { TritonCApiInferRequestedOutput* local_infer_output = new TritonCApiInferRequestedOutput(name); tc::InferRequestedOutput* triton_infer_output; RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create( - &triton_infer_output, name, class_count)); + &triton_infer_output, name, class_count, datatype)); local_infer_output->output_.reset(triton_infer_output); *infer_output = local_infer_output; @@ -313,6 +351,14 @@ TritonCApiInferRequestedOutput::Create( return Error::Success; } +Error +TritonCApiInferRequestedOutput::SetSharedMemory( + const std::string& name, size_t byte_size, size_t offset) +{ + RETURN_IF_TRITON_ERROR(output_->SetSharedMemory(name, byte_size, offset)); + return Error::Success; +} + TritonCApiInferRequestedOutput::TritonCApiInferRequestedOutput( const std::string& name) : InferRequestedOutput(BackendKind::TRITON_C_API, name) diff --git a/client_backend/triton_c_api/triton_c_api_backend.h b/client_backend/triton_c_api/triton_c_api_backend.h index 86e08b75..0f9f5def 100644 --- a/client_backend/triton_c_api/triton_c_api_backend.h +++ b/client_backend/triton_c_api/triton_c_api_backend.h @@ -26,7 +26,9 @@ #pragma once #include + #include "../client_backend.h" +#include "shared_memory_manager.h" #include "triton_loader.h" #define RETURN_IF_TRITON_ERROR(S) \ @@ -66,17 +68,16 @@ class TritonCApiClientBackend : public ClientBackend { /// \param triton_server_path Tritonserver library that contains /// lib/libtritonserver.so. /// \param model_repository_path The model repository. - /// \param memory_type Type of memory used in Triton Server. /// \param verbose Enables the verbose mode of TritonServer. /// \param client_backend Returns a new TritonCApiClientBackend object. /// \return Error object indicating success /// or failure. 
static Error Create( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - const bool verbose, std::unique_ptr* client_backend); + const std::string& model_repository_path, const bool verbose, + std::unique_ptr* client_backend); - ~TritonCApiClientBackend() { TritonLoader::Delete(); } + ~TritonCApiClientBackend() { triton_loader_->Delete(); } /// See ClientBackend::ServerExtensions() Error ServerExtensions(std::set* server_extensions) override; @@ -106,8 +107,25 @@ class TritonCApiClientBackend : public ClientBackend { const std::string& model_name = "", const std::string& model_version = "") override; +#ifdef TRITON_ENABLE_GPU + /// See ClientBackend::RegisterCudaMemory + Error RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size) override; +#endif // TRITON_ENABLE_GPU + + /// See ClientBackend::RegisterSystemMemory + Error RegisterSystemMemory( + const std::string& name, void* ptr, const size_t byte_size) override; + + /// See ClientBackend::UnregisterAllSharedMemory + Error UnregisterAllSharedMemory(); + private: - TritonCApiClientBackend() : ClientBackend(BackendKind::TRITON_C_API) {} + TritonCApiClientBackend() + : ClientBackend(BackendKind::TRITON_C_API), + triton_loader_(TritonLoader::GetSingleton()) + { + } void ParseInferInputToTriton( const std::vector& inputs, std::vector* triton_inputs); @@ -121,6 +139,7 @@ class TritonCApiClientBackend : public ClientBackend { std::map* model_stats); void ParseInferStat( const tc::InferStat& triton_infer_stat, InferStat* infer_stat); + TritonLoader* triton_loader_; }; //============================================================== @@ -132,17 +151,26 @@ class TritonCApiInferInput : public InferInput { static Error Create( InferInput** infer_input, const std::string& name, const std::vector& dims, const std::string& datatype); + /// Returns the raw InferInput object required by triton client library. tc::InferInput* Get() const { return input_.get(); } + /// See InferInput::Shape() const std::vector& Shape() const override; + /// See InferInput::SetShape() Error SetShape(const std::vector& shape) override; + /// See InferInput::Reset() Error Reset() override; + /// See InferInput::AppendRaw() Error AppendRaw(const uint8_t* input, size_t input_byte_size) override; + /// See InferInput::SetSharedMemory() + Error SetSharedMemory( + const std::string& name, size_t byte_size, size_t offset = 0) override; + private: explicit TritonCApiInferInput( const std::string& name, const std::string& datatype); @@ -158,11 +186,15 @@ class TritonCApiInferRequestedOutput : public InferRequestedOutput { public: static Error Create( InferRequestedOutput** infer_output, const std::string& name, - const size_t class_count = 0); + const size_t class_count = 0, const std::string& datatype = ""); /// Returns the raw InferRequestedOutput object required by triton client /// library. 
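A caller-side sketch of the new shared-memory path through the C API backend, assuming a hypothetical `backend` pointer, input buffer, and region name: the buffer is registered with the backend and the input is bound to the region by name, so AddInputs() later resolves it through the SharedMemoryManager instead of appending raw bytes:

    float input_data[16] = {};                    // 64 bytes of FP32, assumed
    const size_t input_byte_size = sizeof(input_data);

    backend->RegisterSystemMemory("input0_data", input_data, input_byte_size);

    InferInput* input{nullptr};
    TritonCApiInferInput::Create(&input, "INPUT0", {1, 16}, "FP32");
    input->SetSharedMemory("input0_data", input_byte_size, 0 /* offset */);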
tc::InferRequestedOutput* Get() const { return output_.get(); } + /// See InferInput::SetSharedMemory() + Error SetSharedMemory( + const std::string& name, size_t byte_size, size_t offset = 0) override; + private: explicit TritonCApiInferRequestedOutput(const std::string& name); diff --git a/client_backend/triton_c_api/triton_loader.cc b/client_backend/triton_c_api/triton_loader.cc index 62be6922..35f7657f 100644 --- a/client_backend/triton_c_api/triton_loader.cc +++ b/client_backend/triton_c_api/triton_loader.cc @@ -32,18 +32,50 @@ #include #include #include + #include #include #include #include #include + #include "c_api_infer_results.h" +#include "scoped_defer.h" namespace triton { namespace perfanalyzer { namespace clientbackend { namespace tritoncapi { namespace { -TRITONSERVER_MemoryType requested_memory_type; +struct AllocPayload { + struct OutputInfo { + enum Kind { BINARY, SHM }; + + Kind kind_; + void* base_; + uint64_t byte_size_; + TRITONSERVER_MemoryType memory_type_; + int64_t device_id_; + + // For shared memory + OutputInfo( + void* base, uint64_t byte_size, TRITONSERVER_MemoryType memory_type, + int64_t device_id) + : kind_(SHM), base_(base), byte_size_(byte_size), + memory_type_(memory_type), device_id_(device_id) + { + } + }; + + ~AllocPayload() + { + for (auto it : output_map_) { + delete it.second; + } + } + + std::unordered_map output_map_; +}; + bool helper_verbose = false; /// Helper function for allocating memory TRITONSERVER_Error* @@ -59,6 +91,11 @@ ResponseAlloc( *actual_memory_type = preferred_memory_type; *actual_memory_type_id = preferred_memory_type_id; + // This variable indicates whether the buffer should be freed or not. + bool* should_free = new bool; + *buffer_userp = should_free; + *should_free = false; + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't // need to do any other book-keeping. if (byte_size == 0) { @@ -69,20 +106,35 @@ ResponseAlloc( << tensor_name << std::endl; } } else { - void* allocated_ptr = nullptr; - *actual_memory_type = TRITONSERVER_MEMORY_CPU; - allocated_ptr = malloc(byte_size); - - // Pass the tensor name with buffer_userp so we can show it when - // releasing the buffer. 
- if (allocated_ptr != nullptr) { - *buffer = allocated_ptr; - *buffer_userp = new std::string(tensor_name); - if (helper_verbose) { - std::cout << "allocated " << byte_size << " bytes in " - << size_t(*actual_memory_type) << " for result tensor " - << tensor_name << std::endl; + AllocPayload* alloc_payload = reinterpret_cast(userp); + auto output_map_it = alloc_payload->output_map_.find(tensor_name); + if (output_map_it == alloc_payload->output_map_.end()) { + void* allocated_ptr = nullptr; + *actual_memory_type = TRITONSERVER_MEMORY_CPU; + *actual_memory_type_id = 0; + allocated_ptr = malloc(byte_size); + *should_free = true; + + if (allocated_ptr != nullptr) { + *buffer = allocated_ptr; + } + } else { + // It is in shared memory + AllocPayload::OutputInfo* output_info = output_map_it->second; + if (byte_size > output_info->byte_size_) { + return TritonLoader::GetSingleton()->ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "shared memory size specified with the request for output '" + + std::string(tensor_name) + "' (" + + std::to_string(output_info->byte_size_) + + " bytes) should be at least " + std::to_string(byte_size) + + " bytes to hold the results") + .c_str()); } + *actual_memory_type = output_info->memory_type_; + *actual_memory_type_id = output_info->device_id_; + *buffer = output_info->base_; } } @@ -96,29 +148,16 @@ ResponseRelease( size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id) { - std::string* name = nullptr; - if (buffer_userp != nullptr) { - name = reinterpret_cast(buffer_userp); - } else { - name = new std::string(""); - } - if (helper_verbose) { - std::cout << "Releasing buffer " << buffer << " of size " << byte_size - << " in " << size_t(memory_type) << " for result '" << *name - << "'" << std::endl; - } + bool* should_free = reinterpret_cast(buffer_userp); switch (memory_type) { case TRITONSERVER_MEMORY_CPU: - free(buffer); - break; - default: - std::cerr << "error: unexpected buffer allocated in CUDA managed memory" - << std::endl; + if (*should_free) { + free(buffer); + } break; } - delete name; - + free(should_free); return nullptr; // Success } @@ -126,7 +165,7 @@ void InferRequestComplete( TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp) { - TritonLoader::DeleteInferRequest(request); + TritonLoader::GetSingleton()->DeleteInferRequest(request); } @@ -157,7 +196,7 @@ GetModelVersionFromString(const std::string& version_string, int64_t* version) catch (std::exception& e) { return Error( std::string( - "failed to get model version from specified version string '" + + "Failed to get model version from specified version string '" + version_string + "' (details: " + e.what() + "), version should be an integral value > 0") .c_str()); @@ -184,25 +223,18 @@ FolderExists(const std::string& path) } } } // namespace + Error TritonLoader::Create( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - bool verbose) + const std::string& model_repository_path, bool verbose) { if (!GetSingleton()->ServerIsReady()) { - if (triton_server_path.empty() || model_repository_path.empty()) { - return Error("cannot load server, paths are empty"); - } GetSingleton()->ClearHandles(); - FAIL_IF_ERR( - GetSingleton()->PopulateInternals( - triton_server_path, model_repository_path, memory_type, verbose), - "Populating internal variables"); - FAIL_IF_ERR( - GetSingleton()->LoadServerLibrary(), "Loading Triton Server library"); - FAIL_IF_ERR( - 
GetSingleton()->StartTriton(memory_type), "Starting Triton Server"); + RETURN_IF_ERROR(GetSingleton()->PopulateInternals( + triton_server_path, model_repository_path, verbose)); + RETURN_IF_ERROR(GetSingleton()->LoadServerLibrary()); + RETURN_IF_ERROR(GetSingleton()->StartTriton()); } return Error::Success; @@ -211,10 +243,10 @@ TritonLoader::Create( Error TritonLoader::Delete() { - if (GetSingleton()->server_ != nullptr) { - GetSingleton()->server_is_ready_ = false; - GetSingleton()->model_is_loaded_ = false; - (GetSingleton()->server_).reset(); + if (server_ != nullptr) { + server_is_ready_ = false; + model_is_loaded_ = false; + server_.reset(); } return Error::Success; } @@ -222,25 +254,25 @@ TritonLoader::Delete() Error TritonLoader::PopulateInternals( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - bool verbose) + const std::string& model_repository_path, bool verbose) { RETURN_IF_ERROR(FolderExists(triton_server_path)); RETURN_IF_ERROR(FolderExists(model_repository_path)); - GetSingleton()->triton_server_path_ = triton_server_path; - GetSingleton()->model_repository_path_ = model_repository_path; - GetSingleton()->verbose_ = verbose; - GetSingleton()->verbose_level_ = GetSingleton()->verbose_ ? 1 : 0; + + triton_server_path_ = triton_server_path; + model_repository_path_ = model_repository_path; + verbose_ = verbose; + verbose_level_ = verbose_ ? 1 : 0; return Error::Success; } Error -TritonLoader::StartTriton(const std::string& memory_type) +TritonLoader::StartTriton() { // Check API version. uint32_t api_version_major, api_version_minor; REPORT_TRITONSERVER_ERROR( - GetSingleton()->api_version_fn_(&api_version_major, &api_version_minor)); + api_version_fn_(&api_version_major, &api_version_minor)); if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) || (TRITONSERVER_API_VERSION_MINOR > api_version_minor)) { std::stringstream sstream; @@ -254,65 +286,58 @@ TritonLoader::StartTriton(const std::string& memory_type) // Create the server... 
TRITONSERVER_ServerOptions* server_options = nullptr; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->options_new_fn_(&server_options), - "creating server options"); + options_new_fn_(&server_options), "creating server options"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->options_set_model_repo_path_fn_( - server_options, GetSingleton()->model_repository_path_.c_str()), + options_set_model_repo_path_fn_( + server_options, model_repository_path_.c_str()), "setting model repository path"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_log_verbose_fn_( - server_options, GetSingleton()->verbose_level_), + set_cuda_memory_pool_byte_size_(server_options, 0, 1073741824), + "setting cuda memory pool byte size failed."); + RETURN_IF_TRITONSERVER_ERROR( + set_log_verbose_fn_(server_options, verbose_level_), "setting verbose logging level"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_log_info_fn_( - server_options, GetSingleton()->verbose_), + set_log_info_fn_(server_options, verbose_), "setting if log verbose level is true"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_backend_directory_fn_( - server_options, - (GetSingleton()->triton_server_path_ + "/backends").c_str()), + set_backend_directory_fn_( + server_options, (triton_server_path_ + "/backends").c_str()), "setting backend directory"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_repo_agent_directory_fn_( - server_options, - (GetSingleton()->triton_server_path_ + "/repoagents").c_str()), + set_repo_agent_directory_fn_( + server_options, (triton_server_path_ + "/repoagents").c_str()), "setting repository agent directory"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_strict_model_config_fn_(server_options, true), + set_strict_model_config_fn_(server_options, true), "setting strict model configuration"); double min_compute_capability = 0; // FIXME: Do not have GPU support right now RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_min_supported_compute_capability_fn_( + set_min_supported_compute_capability_fn_( server_options, min_compute_capability), "setting minimum supported CUDA compute capability"); TRITONSERVER_Server* server_ptr = nullptr; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_new_fn_(&server_ptr, server_options), - "creating server"); + server_new_fn_(&server_ptr, server_options), "creating server"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_options_delete_fn_(server_options), - "deleting server options"); + server_options_delete_fn_(server_options), "deleting server options"); std::shared_ptr shared_server( - server_ptr, GetSingleton()->server_delete_fn_); - GetSingleton()->server_ = shared_server; + server_ptr, server_delete_fn_); + server_ = shared_server; // Wait until the server is both live and ready. size_t health_iters = 0; while (true) { bool live, ready; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_is_live_fn_( - (GetSingleton()->server_).get(), &live), + server_is_live_fn_(server_.get(), &live), "unable to get server liveness"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_is_ready_fn_( - (GetSingleton()->server_).get(), &ready), + server_is_ready_fn_(server_.get(), &ready), "unable to get server readiness"); if (live && ready) { - std::cout << "server is alive!" << std::endl; - GetSingleton()->server_is_ready_ = true; + server_is_ready_ = true; break; } @@ -323,24 +348,20 @@ TritonLoader::StartTriton(const std::string& memory_type) std::this_thread::sleep_for(std::chrono::milliseconds(500)); } // Print status of the server. 
- if (GetSingleton()->verbose_) { + if (verbose_) { TRITONSERVER_Message* server_metadata_message; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_metadata_fn_( - (GetSingleton()->server_).get(), &server_metadata_message), + server_metadata_fn_(server_.get(), &server_metadata_message), "unable to get server metadata message"); const char* buffer; size_t byte_size; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_serialize_to_json_fn_( + message_serialize_to_json_fn_( server_metadata_message, &buffer, &byte_size), "unable to serialize server metadata message"); - std::cout << "Server Status:" << std::endl; - std::cout << std::string(buffer, byte_size) << std::endl; - RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_delete_fn_(server_metadata_message), + message_delete_fn_(server_metadata_message), "deleting status metadata"); } @@ -355,13 +376,12 @@ TritonLoader::ServerMetaData(rapidjson::Document* server_metadata) } TRITONSERVER_Message* server_metadata_message; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->server_metadata_fn_( - (GetSingleton()->server_).get(), &server_metadata_message), + server_metadata_fn_(server_.get(), &server_metadata_message), "unable to get server metadata message"); const char* buffer; size_t byte_size; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_serialize_to_json_fn_( + message_serialize_to_json_fn_( server_metadata_message, &buffer, &byte_size), "unable to serialize server metadata message"); server_metadata->Parse(buffer, byte_size); @@ -372,8 +392,7 @@ TritonLoader::ServerMetaData(rapidjson::Document* server_metadata) " at " + std::to_string(server_metadata->GetErrorOffset())); } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_delete_fn_(server_metadata_message), - "deleting status metadata"); + message_delete_fn_(server_metadata_message), "deleting status metadata"); return Error::Success; } @@ -384,23 +403,21 @@ TritonLoader::LoadModel( if (!ServerIsReady()) { return Error("server is not ready, abort!"); } - GetSingleton()->model_name_ = model_name; + model_name_ = model_name; - RETURN_IF_ERROR(GetModelVersionFromString( - model_version, &(GetSingleton()->model_version_))); + RETURN_IF_ERROR(GetModelVersionFromString(model_version, &model_version_)); // Wait for the model to become available. 
bool is_ready = false; size_t health_iters = 0; // some error handling - if (GetSingleton()->model_repository_path_.empty()) { + if (model_repository_path_.empty()) { return Error("Need to specify model repository"); } while (!is_ready) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->model_is_ready_fn_( - GetSingleton()->server_.get(), GetSingleton()->model_name_.c_str(), - GetSingleton()->model_version_, &is_ready), + model_is_ready_fn_( + server_.get(), model_name_.c_str(), model_version_, &is_ready), "unable to get model readiness"); if (!is_ready) { if (++health_iters >= 10) { @@ -410,8 +427,8 @@ TritonLoader::LoadModel( continue; } } - GetSingleton()->model_is_loaded_ = - true; // flag to confirm model is correct and loaded + // flag to confirm model is correct and loaded + model_is_loaded_ = true; return Error::Success; } @@ -425,14 +442,14 @@ TritonLoader::ModelMetadata(rapidjson::Document* model_metadata) // get model metadata RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->model_metadata_fn_( - (GetSingleton()->server_).get(), GetSingleton()->model_name_.c_str(), - GetSingleton()->model_version_, &model_metadata_message), + model_metadata_fn_( + server_.get(), model_name_.c_str(), model_version_, + &model_metadata_message), "unable to get model metadata message"); const char* buffer; size_t byte_size; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_serialize_to_json_fn_( + message_serialize_to_json_fn_( model_metadata_message, &buffer, &byte_size), "unable to serialize model status protobuf"); @@ -445,21 +462,17 @@ TritonLoader::ModelMetadata(rapidjson::Document* model_metadata) } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_delete_fn_(model_metadata_message), - "deleting status protobuf"); + message_delete_fn_(model_metadata_message), "deleting status protobuf"); - if (strcmp( - (*model_metadata)["name"].GetString(), - GetSingleton()->model_name_.c_str())) { + if (strcmp((*model_metadata)["name"].GetString(), model_name_.c_str())) { return Error("unable to find metadata for model"); } bool found_version = false; if (model_metadata->HasMember("versions")) { for (const auto& version : (*model_metadata)["versions"].GetArray()) { - if (strcmp( - version.GetString(), - std::to_string(GetSingleton()->model_version_).c_str()) == 0) { + if (strcmp(version.GetString(), std::to_string(model_version_).c_str()) == + 0) { found_version = true; break; } @@ -467,15 +480,16 @@ TritonLoader::ModelMetadata(rapidjson::Document* model_metadata) } if (!found_version) { std::string msg = "unable to find version " + - std::to_string(GetSingleton()->model_version_) + - " status for model"; + std::to_string(model_version_) + " status for model"; return Error(msg); } return Error::Success; } Error -TritonLoader::ModelConfig(rapidjson::Document* model_config) +TritonLoader::ModelConfig( + rapidjson::Document* model_config, const std::string& model_name, + const std::string& model_version) { if (!ModelIsLoaded() || !ServerIsReady()) { return Error("Model is not loaded and/or server is not ready"); @@ -483,16 +497,14 @@ TritonLoader::ModelConfig(rapidjson::Document* model_config) TRITONSERVER_Message* model_config_message; uint32_t config_version = 1; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->model_config_fn_( - (GetSingleton()->server_).get(), GetSingleton()->model_name_.c_str(), - GetSingleton()->model_version_, config_version, + model_config_fn_( + (server_).get(), model_name.c_str(), model_version_, config_version, &model_config_message), "unable to get model config message"); 
const char* buffer; size_t byte_size; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_serialize_to_json_fn_( - model_config_message, &buffer, &byte_size), + message_serialize_to_json_fn_(model_config_message, &buffer, &byte_size), "unable to serialize model config status protobuf"); model_config->Parse(buffer, byte_size); @@ -504,7 +516,7 @@ TritonLoader::ModelConfig(rapidjson::Document* model_config) } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_delete_fn_(model_config_message), + message_delete_fn_(model_config_message), "deleting server config status protobuf"); return Error::Success; @@ -513,12 +525,9 @@ TritonLoader::ModelConfig(rapidjson::Document* model_config) Error TritonLoader::LoadServerLibrary() { - std::string full_path = - GetSingleton()->triton_server_path_ + SERVER_LIBRARY_PATH; + std::string full_path = triton_server_path_ + server_library_path_; RETURN_IF_ERROR(FolderExists(full_path)); - FAIL_IF_ERR( - OpenLibraryHandle(full_path, &dlhandle_), - "shared library loading library:" + full_path); + RETURN_IF_ERROR(OpenLibraryHandle(full_path, &dlhandle_)); TritonServerApiVersionFn_t apifn; TritonServerOptionsNewFn_t onfn; @@ -582,6 +591,7 @@ TritonLoader::LoadServerLibrary() TritonSeverUnloadModelFn_t umfn; TritonSeverSetLogInfoFn_t slifn; + TritonServerSetCudaMemoryPoolByteSizeFn_t scmpbsfn; RETURN_IF_ERROR(GetEntrypoint( dlhandle_, "TRITONSERVER_ApiVersion", false /* optional */, @@ -608,6 +618,9 @@ TritonLoader::LoadServerLibrary() RETURN_IF_ERROR(GetEntrypoint( dlhandle_, "TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability", false /* optional */, reinterpret_cast(&smsccfn))); + RETURN_IF_ERROR(GetEntrypoint( + dlhandle_, "TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize", + false /* optional */, reinterpret_cast(&scmpbsfn))); RETURN_IF_ERROR(GetEntrypoint( dlhandle_, "TRITONSERVER_ServerNew", false /* optional */, @@ -717,7 +730,7 @@ TritonLoader::LoadServerLibrary() dlhandle_, "TRITONSERVER_InferenceRequestSetFlags", false /* optional */, reinterpret_cast(&sffn))); RETURN_IF_ERROR(GetEntrypoint( - dlhandle_, "TRITONSERVER_InferenceRequestSetPriority", + dlhandle_, "TRITONSERVER_InferenceRequestSetPriorityUInt64", false /* optional */, reinterpret_cast(&spfn))); RETURN_IF_ERROR(GetEntrypoint( dlhandle_, "TRITONSERVER_InferenceRequestSetTimeoutMicroseconds", @@ -809,6 +822,7 @@ TritonLoader::LoadServerLibrary() unload_model_fn_ = umfn; set_log_info_fn_ = slifn; + set_cuda_memory_pool_byte_size_ = scmpbsfn; return Error::Success; } @@ -899,62 +913,89 @@ TritonLoader::Infer( const std::vector& outputs, InferResult** result) { + Error error = Error::Success; if (!ServerIsReady() || !ModelIsLoaded()) { return Error("Server is not ready and/or requested model is not loaded"); } + TRITONSERVER_ResponseAllocator* allocator = nullptr; TRITONSERVER_InferenceRequest* irequest = nullptr; + TRITONSERVER_InferenceResponse* completed_response = nullptr; tc::RequestTimers timer; timer.Reset(); timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_START); - GetSingleton()->InitializeRequest(options, outputs, &allocator, &irequest); - GetSingleton()->AddInputs(inputs, irequest); - GetSingleton()->AddOutputs(outputs, irequest); - timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_START); + + RETURN_IF_ERROR(InitializeRequest(options, outputs, &allocator, &irequest)); + ScopedDefer error_handler([&error, &completed_response, &allocator, this] { + error = CleanUp(completed_response, allocator); + }); + RETURN_IF_ERROR(AddInputs(inputs, irequest)); + 
RETURN_IF_ERROR(AddOutputs(outputs, irequest)); + + AllocPayload alloc_payload; + for (auto& output : outputs) { + if (output->IsSharedMemory()) { + std::string shm_name; + size_t shm_byte_size; + size_t offset; + // TODO: Error handling + output->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset); + + void* buf; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(shm_manager_->GetMemoryInfo( + shm_name, offset, shm_byte_size, &buf, &memory_type, + &memory_type_id)); + + alloc_payload.output_map_.emplace( + std::piecewise_construct, std::forward_as_tuple(output->Name()), + std::forward_as_tuple(new AllocPayload::OutputInfo( + buf, shm_byte_size, memory_type, memory_type_id))); + } + } + + const char* cid = nullptr; + RETURN_IF_TRITONSERVER_ERROR( + request_id_fn_(irequest, &cid), "Failed to get request id"); + std::string id = cid; + // Perform inference... + timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_START); auto p = new std::promise(); std::future completed = p->get_future(); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_set_response_callback_fn_( - irequest, allocator, nullptr /* response_allocator_userp */, + inference_request_set_response_callback_fn_( + irequest, allocator, &alloc_payload /* response_allocator_userp */, InferResponseComplete, reinterpret_cast(p)), "setting response callback"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->infer_async_fn_( - (GetSingleton()->server_).get(), irequest, nullptr /* trace */), + infer_async_fn_((server_).get(), irequest, nullptr /* trace */), "running inference"); timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_END); + // Wait for the inference to complete. - TRITONSERVER_InferenceResponse* completed_response = completed.get(); - - // check if there completed response is an error and needs to shut down - // gracefully - TRITONSERVER_Error* completed_response_err = - GetSingleton()->inference_response_error_fn_(completed_response); - if (completed_response_err != nullptr) { - // intentionally not using the return value from Clean up here - // it is captured to avoid warnings but at this point, the error from the - // tritonserver (completed_response_err) is more important to bubble up - Error val = CleanUp(completed_response, allocator); - RETURN_IF_TRITONSERVER_ERROR( - completed_response_err, "request failure in triton server"); - } + completed_response = completed.get(); + + RETURN_IF_TRITONSERVER_ERROR( + inference_response_error_fn_(completed_response), + "inference response error"); timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_START); timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_END); timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_END); - tc::Error err = GetSingleton()->UpdateInferStat(timer); + tc::Error err = UpdateInferStat(timer); if (!err.IsOk()) { std::cerr << "Failed to update context stat: " << err << std::endl; } - const char* cid; - RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->request_id_fn_(irequest, &cid), - "Failed to get request id"); - std::string id(cid); + InferResult::Create(result, err, id); - return CleanUp(completed_response, allocator); + + // CleanUp the response allocators + error_handler.Complete(); + + return error; } Error @@ -962,10 +1003,11 @@ TritonLoader::CleanUp( TRITONSERVER_InferenceResponse* completed_response, TRITONSERVER_ResponseAllocator* allocator) { - TRITONSERVER_Error* response_err = - GetSingleton()->inference_response_delete_fn_(completed_response); - TRITONSERVER_Error* allocator_err = - 
GetSingleton()->response_allocator_delete_fn_(allocator); + TRITONSERVER_Error* response_err = nullptr; + if (completed_response != nullptr) { + response_err = inference_response_delete_fn_(completed_response); + } + TRITONSERVER_Error* allocator_err = response_allocator_delete_fn_(allocator); RETURN_IF_TRITONSERVER_ERROR(response_err, "deleting inference response"); RETURN_IF_TRITONSERVER_ERROR(allocator_err, "deleting response allocator"); return Error::Success; @@ -981,39 +1023,45 @@ TritonLoader::InitializeRequest( // Create the allocator that will be used to allocate buffers for // the result tensors. RETURN_IF_TRITONSERVER_ERROR( - GetSingleton() - ->response_allocator_new_fn_( - allocator, - reinterpret_cast< - TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator * allocator, const char* tensor_name, size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, int64_t* actual_memory_type_id)>( - ResponseAlloc), - reinterpret_cast< - TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator * allocator, void* buffer, void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)>( - ResponseRelease), - nullptr /* start_fn */), + GetSingleton()->response_allocator_new_fn_( + allocator, + reinterpret_cast< + TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator, + const char* tensor_name, size_t byte_size, + TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id, void* userp, + void** buffer, void** buffer_userp, + TRITONSERVER_MemoryType* + actual_memory_type, + int64_t* actual_memory_type_id)>( + ResponseAlloc), + reinterpret_cast< + TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator, + void* buffer, void* buffer_userp, + size_t byte_size, + TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id)>(ResponseRelease), + nullptr /* start_fn */), "creating response allocator"); // set up inference request RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_new_fn_( - irequest, (GetSingleton()->server_).get(), - GetSingleton()->model_name_.c_str(), GetSingleton()->model_version_), + inference_request_new_fn_( + irequest, (server_).get(), model_name_.c_str(), model_version_), "creating inference request"); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_set_id_fn_( - *irequest, options.request_id_.c_str()), + inference_request_set_id_fn_(*irequest, options.request_id_.c_str()), "setting ID for the request"); if ((options.sequence_id_ != 0) || (options.sequence_id_str_ != "") || (options.priority_ != 0) || (options.server_timeout_ != 0) || outputs.empty()) { if (options.sequence_id_ != 0) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_correlation_id_fn_( - *irequest, options.sequence_id_), + set_correlation_id_fn_(*irequest, options.sequence_id_), "setting sequence ID for the request"); } else if (options.sequence_id_str_ != "") { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_string_correlation_id_fn_( + set_string_correlation_id_fn_( *irequest, options.sequence_id_str_.c_str()), "setting sequence ID for the request"); } @@ -1021,25 +1069,25 @@ TritonLoader::InitializeRequest( if (options.sequence_start_) { flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_START; } - if (options.sequence_start_) { + if (options.sequence_end_) { flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_END; } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_flags_fn_(*irequest, flags), + 
set_flags_fn_(*irequest, flags), "setting inference flags for the request"); } if (options.priority_ != 0) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_priority_fn_(*irequest, options.priority_), + set_priority_fn_(*irequest, options.priority_), "setting priority for the request"); } if (options.server_timeout_ != 0) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->set_timeout_ms_fn_(*irequest, options.server_timeout_), + set_timeout_ms_fn_(*irequest, options.server_timeout_), "setting timeout for the request"); } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_set_release_callback_fn_( + inference_request_set_release_callback_fn_( *irequest, InferRequestComplete, nullptr /* request_release_userp */), "setting request release callback"); return Error::Success; @@ -1053,19 +1101,16 @@ TritonLoader::AddInputs( for (auto io : inputs) { const char* input_name = io->Name().c_str(); const char* datatype = io->Datatype().c_str(); - const TRITONSERVER_DataType dtype = - GetSingleton()->string_to_datatype_fn_(datatype); + const TRITONSERVER_DataType dtype = string_to_datatype_fn_(datatype); std::vector shape_vec; for (const int64_t dim : io->Shape()) { // this is a vector, just use it shape_vec.push_back(dim); } + RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_add_input_fn_( + inference_request_add_input_fn_( irequest, input_name, dtype, &shape_vec[0], shape_vec.size()), "setting input for the request"); - if (io->IsSharedMemory()) { - return Error("shared library not supported for C API"); - } size_t byte_size; tc::Error err = io->ByteSize(&byte_size); if (!err.IsOk()) { @@ -1073,25 +1118,46 @@ TritonLoader::AddInputs( } if (byte_size == 0) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_append_input_data_fn_( + inference_request_append_input_data_fn_( irequest, input_name, nullptr, 0 /* byte_size */, - TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */), + TRITONSERVER_MEMORY_CPU /* memory type */, + 0 /* memory_type_id */), "appending input data with byte size zero"); } else { - io->PrepareForRequest(); - bool end_of_input = false; - while (!end_of_input) { - const uint8_t* buf; - size_t buf_size; - io->GetNext(&buf, &buf_size, &end_of_input); - if (buf != nullptr) { - RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_append_input_data_fn_( - irequest, input_name, const_cast(buf), buf_size, - GetSingleton()->requested_memory_type_, - 0 /* memory_type_id */), - "appending data to tritonserver"); + if (!io->IsSharedMemory()) { + io->PrepareForRequest(); + bool end_of_input = false; + while (!end_of_input) { + const uint8_t* buf; + size_t buf_size; + io->GetNext(&buf, &buf_size, &end_of_input); + if (buf != nullptr) { + RETURN_IF_TRITONSERVER_ERROR( + inference_request_append_input_data_fn_( + irequest, input_name, const_cast(buf), buf_size, + TRITONSERVER_MEMORY_CPU /* memory_type */, + 0 /* memory_type_id */), + "appending data to tritonserver"); + } } + } else { + std::string shm_name; + size_t shm_byte_size; + size_t offset; + // TODO: Error handling + io->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset); + void* buf; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(shm_manager_->GetMemoryInfo( + shm_name, offset, shm_byte_size, &buf, &memory_type, + &memory_type_id)); + RETURN_IF_TRITONSERVER_ERROR( + inference_request_append_input_data_fn_( + irequest, input_name, buf, byte_size, + memory_type /* memory_type */, + memory_type_id /* memory_type_id */), + "appending data to 
tritonserver"); } } } @@ -1108,8 +1174,7 @@ TritonLoader::AddOutputs( for (auto io : outputs) { const char* output_name = io->Name().c_str(); RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->inference_request_add_requested_output_fn_( - irequest, output_name), + inference_request_add_requested_output_fn_(irequest, output_name), "setting output for the request"); } return Error::Success; @@ -1128,15 +1193,15 @@ TritonLoader::ModelInferenceStatistics( GetModelVersionFromString(model_version, &requested_model_version); if (err.IsOk()) { RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->model_statistics_fn_( - (GetSingleton()->server_).get(), model_name.c_str(), - requested_model_version, &model_stats_message), + model_statistics_fn_( + (server_).get(), model_name.c_str(), requested_model_version, + &model_stats_message), "getting model statistics from server"); const char* buffer; size_t byte_size; RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_serialize_to_json_fn_( + message_serialize_to_json_fn_( model_stats_message, &buffer, &byte_size), "serializing message to json"); @@ -1148,7 +1213,7 @@ TritonLoader::ModelInferenceStatistics( " at " + std::to_string(infer_stat->GetErrorOffset())); } RETURN_IF_TRITONSERVER_ERROR( - GetSingleton()->message_delete_fn_(model_stats_message), + message_delete_fn_(model_stats_message), "deleting inference statistics message"); } return err; @@ -1170,7 +1235,40 @@ TritonLoader::~TritonLoader() { FAIL_IF_ERR(Delete(), "dereferencing server instance..."); FAIL_IF_ERR(CloseLibraryHandle(dlhandle_), "error on closing triton loader"); - GetSingleton()->ClearHandles(); + ClearHandles(); +} + +#ifdef TRITON_ENABLE_GPU +Error +TritonLoader::RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size) +{ + RETURN_IF_ERROR(shm_manager_->RegisterCUDAMemory( + name, handle, byte_size, 0 /* device id */)); + return Error::Success; +} +#endif // TRITON_ENABLE_GPU + +Error +TritonLoader::RegisterSystemMemory( + const std::string& name, void* ptr, const size_t byte_size) +{ + RETURN_IF_ERROR(shm_manager_->RegisterSystemMemory(name, ptr, byte_size)); + return Error::Success; +} + +Error +TritonLoader::UnregisterAllSharedMemory() +{ + RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_GPU)); + RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_GPU)); + return Error::Success; +} + +TRITONSERVER_Error* +TritonLoader::ErrorNew(TRITONSERVER_Error_Code code, const char* message) +{ + return error_new_fn_(code, message); } }}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/client_backend/triton_c_api/triton_loader.h b/client_backend/triton_c_api/triton_loader.h index 8ea48d1c..1a18176c 100644 --- a/client_backend/triton_c_api/triton_loader.h +++ b/client_backend/triton_c_api/triton_loader.h @@ -25,18 +25,20 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#include +#include + #include #include #include #include + #include "../client_backend.h" #include "common.h" #include "shared_library.h" +#include "shared_memory_manager.h" #include "triton/core/tritonserver.h" -#include -#include - // If TRITONSERVER error is non-OK, return the corresponding status. 
#define RETURN_IF_TRITONSERVER_ERROR(E, MSG) \ do { \ @@ -86,48 +88,64 @@ class TritonLoader : public tc::InferenceServerClient { static Error Create( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - bool verbose); + const std::string& model_repository_path, bool verbose); - static Error Delete(); - static Error StartTriton(const std::string& memory_type); + Error Delete(); + Error StartTriton(); - static Error LoadModel( + Error LoadModel( const std::string& model_name, const std::string& model_version); - static Error ModelMetadata(rapidjson::Document* model_metadata); + Error ModelMetadata(rapidjson::Document* model_metadata); - static Error ModelConfig(rapidjson::Document* model_config); + Error ModelConfig( + rapidjson::Document* model_config, const std::string& model_name, + const std::string& model_version); - static Error ServerMetaData(rapidjson::Document* server_metadata); + Error ServerMetaData(rapidjson::Document* server_metadata); - static Error Infer( + Error Infer( const tc::InferOptions& options, const std::vector& inputs, const std::vector& outputs, InferResult** result); - static Error CleanUp( + Error CleanUp( TRITONSERVER_InferenceResponse* completed_response, TRITONSERVER_ResponseAllocator* allocator); - static Error ModelInferenceStatistics( + Error ModelInferenceStatistics( const std::string& model_name, const std::string& model_version, rapidjson::Document* infer_stat); - static Error ClientInferStat(tc::InferStat* infer_stat) + Error ClientInferStat(tc::InferStat* infer_stat) { - *infer_stat = GetSingleton()->infer_stat_; + *infer_stat = infer_stat_; return Error::Success; } - static bool ModelIsLoaded() { return GetSingleton()->model_is_loaded_; } - static bool ServerIsReady() { return GetSingleton()->server_is_ready_; } - static TRITONSERVER_Error* DeleteInferRequest( +#ifdef TRITON_ENABLE_GPU + Error RegisterCudaMemory( + const std::string& name, void* handle, const size_t byte_size); +#endif // TRITON_ENABLE_GPU + + Error RegisterSystemMemory( + const std::string& name, void* ptr, const size_t byte_size); + + Error UnregisterAllSharedMemory(); + + TRITONSERVER_Error* ErrorNew( + TRITONSERVER_Error_Code code, const char* message); + + bool ModelIsLoaded() { return model_is_loaded_; } + bool ServerIsReady() { return server_is_ready_; } + + TRITONSERVER_Error* DeleteInferRequest( TRITONSERVER_InferenceRequest* irequest) { - return GetSingleton()->request_delete_fn_(irequest); + return request_delete_fn_(irequest); } + static TritonLoader* GetSingleton(); // TRITONSERVER_ApiVersion typedef TRITONSERVER_Error* (*TritonServerApiVersionFn_t)( @@ -145,12 +163,15 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_ServerOptionsSetBackendDirectory typedef TRITONSERVER_Error* (*TritonServerSetBackendDirFn_t)( TRITONSERVER_ServerOptions* options, const char* backend_dir); + // TRITONSERVER_ServerOptionsSetRepoAgentDirectory typedef TRITONSERVER_Error* (*TritonServerSetRepoAgentDirFn_t)( TRITONSERVER_ServerOptions* options, const char* repoagent_dir); + // TRITONSERVER_ServerOptionsSetStrictModelConfig typedef TRITONSERVER_Error* (*TritonServerSetStrictModelConfigFn_t)( TRITONSERVER_ServerOptions* options, bool strict); + // TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability typedef TRITONSERVER_Error* ( *TritonServerSetMinSupportedComputeCapabilityFn_t)( @@ -159,12 +180,15 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_ServerNew typedef 
TRITONSERVER_Error* (*TritonServerNewFn_t)( TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* option); + // TRITONSERVER_ServerOptionsDelete typedef TRITONSERVER_Error* (*TritonServerOptionsDeleteFn_t)( TRITONSERVER_ServerOptions* options); + // TRITONSERVER_ServerDelete typedef TRITONSERVER_Error* (*TritonServerDeleteFn_t)( TRITONSERVER_Server* server); + // TRITONSERVER_ServerIsLive typedef TRITONSERVER_Error* (*TritonServerIsLiveFn_t)( TRITONSERVER_Server* server, bool* live); @@ -172,12 +196,15 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_ServerIsReady typedef TRITONSERVER_Error* (*TritonServerIsReadyFn_t)( TRITONSERVER_Server* server, bool* ready); + // TRITONSERVER_ServerMetadata typedef TRITONSERVER_Error* (*TritonServerMetadataFn_t)( TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata); + // TRITONSERVER_MessageSerializeToJson typedef TRITONSERVER_Error* (*TritonServerMessageSerializeToJsonFn_t)( TRITONSERVER_Message* message, const char** base, size_t* byte_size); + // TRITONSERVER_MessageDelete typedef TRITONSERVER_Error* (*TritonServerMessageDeleteFn_t)( TRITONSERVER_Message* message); @@ -186,16 +213,19 @@ class TritonLoader : public tc::InferenceServerClient { typedef TRITONSERVER_Error* (*TritonServerModelIsReadyFn_t)( TRITONSERVER_Server* server, const char* model_name, const int64_t model_version, bool* ready); + // TRITONSERVER_ServerModelMetadata typedef TRITONSERVER_Error* (*TritonServerModelMetadataFn_t)( TRITONSERVER_Server* server, const char* model_name, const int64_t model_version, TRITONSERVER_Message** model_metadata); + // TRITONSERVER_ResponseAllocatorNew typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorNewFn_t)( TRITONSERVER_ResponseAllocator** allocator, TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn, TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn, TRITONSERVER_ResponseAllocatorStartFn_t start_fn); + // TRITONSERVER_InferenceRequestNew typedef TRITONSERVER_Error* (*TritonServerInferenceRequestNewFn_t)( TRITONSERVER_InferenceRequest** inference_request, @@ -205,17 +235,20 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_InferenceRequestSetId typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetIdFn_t)( TRITONSERVER_InferenceRequest* inference_request, const char* id); + // TRITONSERVER_InferenceRequestSetReleaseCallback typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestSetReleaseCallbackFn_t)( TRITONSERVER_InferenceRequest* inference_request, TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn, void* request_release_userp); + // TRITONSERVER_InferenceRequestAddInput typedef TRITONSERVER_Error* (*TritonServerInferenceRequestAddInputFn_t)( TRITONSERVER_InferenceRequest* inference_request, const char* name, const TRITONSERVER_DataType datatype, const int64_t* shape, uint64_t dim_count); + // TRITONSERVER_InferenceRequestAddRequestedOutput typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestAddRequestedOutputFn_t)( @@ -227,6 +260,7 @@ class TritonLoader : public tc::InferenceServerClient { TRITONSERVER_InferenceRequest* inference_request, const char* name, const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_i); + // TRITONSERVER_InferenceRequestSetResponseCallback typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestSetResponseCallbackFn_t)( @@ -235,11 +269,13 @@ class TritonLoader : public tc::InferenceServerClient { void* response_allocator_userp, TRITONSERVER_InferenceResponseCompleteFn_t 
response_fn, void* response_userp); + // TRITONSERVER_ServerInferAsync typedef TRITONSERVER_Error* (*TritonServerInferAsyncFn_t)( TRITONSERVER_Server* server, TRITONSERVER_InferenceRequest* inference_request, TRITONSERVER_InferenceTrace* trace); + // TRITONSERVER_InferenceResponseError typedef TRITONSERVER_Error* (*TritonServerInferenceResponseErrorFn_t)( TRITONSERVER_InferenceResponse* inference_response); @@ -247,13 +283,16 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_InferenceResponseDelete typedef TRITONSERVER_Error* (*TritonServerInferenceResponseDeleteFn_t)( TRITONSERVER_InferenceResponse* inference_response); + // TRITONSERVER_InferenceRequestRemoveAllInputData typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestRemoveAllInputDataFn_t)( TRITONSERVER_InferenceRequest* inference_request, const char* name); + // TRITONSERVER_ResponseAllocatorDelete typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorDeleteFn_t)( TRITONSERVER_ResponseAllocator* allocator); + // TRITONSERVER_ErrorNew typedef TRITONSERVER_Error* (*TritonServerErrorNewFn_t)( TRITONSERVER_Error_Code code, const char* msg); @@ -261,46 +300,57 @@ class TritonLoader : public tc::InferenceServerClient { // TRITONSERVER_MemoryTypeString typedef const char* (*TritonServerMemoryTypeStringFn_t)( TRITONSERVER_MemoryType memtype); + // TRITONSERVER_InferenceResponseOutputCount typedef TRITONSERVER_Error* (*TritonServerInferenceResponseOutputCountFn_t)( TRITONSERVER_InferenceResponse* inference_response, uint32_t* count); + // TRITONSERVER_DataTypeString typedef const char* (*TritonServerDataTypeStringFn_t)( TRITONSERVER_DataType datatype); + // TRITONSERVER_ErrorMessage typedef const char* (*TritonServerErrorMessageFn_t)( TRITONSERVER_Error* error); // TRITONSERVER_ErrorDelete typedef void (*TritonServerErrorDeleteFn_t)(TRITONSERVER_Error* error); + // TRITONSERVER_ErrorCodeString typedef const char* (*TritonServerErrorCodeToStringFn_t)( TRITONSERVER_Error* error); + // TRITONSERVER_ServerModelConfig typedef TRITONSERVER_Error* (*TritonServerModelConfigFn_t)( TRITONSERVER_Server* server, const char* model_name, const int64_t model_version, const uint32_t config_version, TRITONSERVER_Message** model_config); + // TRITONSERVER_InferenceRequestSetCorrelationId typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestSetCorrelationIdFn_t)( TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id); + // TRITONSERVER_InferenceRequestSetCorrelationId typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestSetStringCorrelationIdFn_t)( TRITONSERVER_InferenceRequest* inference_request, const char* correlation_id); + // TRITONSERVER_InferenceRequestSetFlags typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetFlagsFn_t)( TRITONSERVER_InferenceRequest* inference_request, uint32_t flags); - // TRITONSERVER_InferenceRequestSetPriority + + // TRITONSERVER_InferenceRequestSetPriorityUInt64 typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetPriorityFn_t)( - TRITONSERVER_InferenceRequest* inference_request, uint32_t priority); + TRITONSERVER_InferenceRequest* inference_request, uint64_t priority); + // TRITONSERVER_InferenceRequestSetTimeoutMicroseconds typedef TRITONSERVER_Error* ( *TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t)( TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us); + // TRITONSERVER_StringToDataType typedef TRITONSERVER_DataType (*TritonServerStringToDatatypeFn_t)( const char* dtype); @@ -312,23 +362,32 @@ class 
TritonLoader : public tc::InferenceServerClient { uint64_t* dim_count, const void** base, size_t* byte_size, TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id, void** userp); + // TRITONSERVER_InferenceRequestId typedef TRITONSERVER_Error* (*TritonServerRequestIdFn_t)( TRITONSERVER_InferenceRequest* inference_request, const char** id); + // TRITONSERVER_InferenceRequestDelete typedef TRITONSERVER_Error* (*TritonServerRequestDeleteFn_t)( TRITONSERVER_InferenceRequest* inference_request); + // TRITONSERVER_ServerModelStatistics typedef TRITONSERVER_Error* (*TritonServerModelStatisticsFn_t)( TRITONSERVER_Server* server, const char* model_name, const int64_t model_version, TRITONSERVER_Message** model_stats); + // TRITONSERVER_ServerUnloadModel typedef TRITONSERVER_Error* (*TritonSeverUnloadModelFn_t)( TRITONSERVER_Server* server, const char* model_name); + // TRITONSERVER_ServerOptionsSetLogInfo typedef TRITONSERVER_Error* (*TritonSeverSetLogInfoFn_t)( TRITONSERVER_ServerOptions* options, bool log); + // TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize + typedef TRITONSERVER_Error* (*TritonServerSetCudaMemoryPoolByteSizeFn_t)( + TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size); + private: TritonLoader() : InferenceServerClient( @@ -339,14 +398,12 @@ class TritonLoader : public tc::InferenceServerClient { requested_memory_type_ = TRITONSERVER_MEMORY_CPU; model_is_loaded_ = false; server_is_ready_ = false; + shm_manager_ = std::make_unique(); } Error PopulateInternals( const std::string& triton_server_path, - const std::string& model_repository_path, const std::string& memory_type, - bool verbose); - - static TritonLoader* GetSingleton(); + const std::string& model_repository_path, bool verbose); /// Load all tritonserver.h functions onto triton_loader /// internal handles @@ -357,7 +414,7 @@ class TritonLoader : public tc::InferenceServerClient { /// Check if file exists in the current directory /// \param filepath Path of library to check /// \return perfanalyzer::clientbackend::Error - static Error FileExists(std::string& filepath); + Error FileExists(std::string& filepath); Error InitializeRequest( const tc::InferOptions& options, @@ -443,18 +500,20 @@ class TritonLoader : public tc::InferenceServerClient { TritonSeverUnloadModelFn_t unload_model_fn_; TritonSeverSetLogInfoFn_t set_log_info_fn_; - - std::shared_ptr server_; - std::string triton_server_path_; - const std::string SERVER_LIBRARY_PATH = "/lib/libtritonserver.so"; - int verbose_level_; - bool enforce_memory_type_; - std::string model_repository_path_; - std::string model_name_; - int64_t model_version_; - TRITONSERVER_memorytype_enum requested_memory_type_; - bool model_is_loaded_; - bool server_is_ready_; + TritonServerSetCudaMemoryPoolByteSizeFn_t set_cuda_memory_pool_byte_size_; + + std::shared_ptr server_{nullptr}; + std::string triton_server_path_{}; + const std::string server_library_path_{"/lib/libtritonserver.so"}; + int verbose_level_{0}; + TRITONSERVER_MemoryType requested_memory_type_{TRITONSERVER_MEMORY_CPU}; + bool enforce_memory_type_{false}; + std::string model_repository_path_{""}; + std::string model_name_{""}; + int64_t model_version_{-1}; + bool model_is_loaded_{false}; + bool server_is_ready_{false}; + std::unique_ptr shm_manager_{nullptr}; }; }}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi diff --git a/command_line_parser.cc b/command_line_parser.cc index fbf4e6d4..bd3d72d7 100644 --- a/command_line_parser.cc +++ b/command_line_parser.cc @@ -1,4 +1,4 @@ -// 
Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,6 +29,7 @@ #include +#include #include #include #include @@ -46,6 +47,31 @@ CLParser::Parse(int argc, char** argv) return params_; } +std::vector +SplitString(const std::string& str, const std::string& delimiter = ":") +{ + std::vector substrs; + size_t pos = 0; + while (pos != std::string::npos) { + size_t colon_pos = str.find(":", pos); + substrs.push_back(str.substr(pos, colon_pos - pos)); + if (colon_pos == std::string::npos) { + pos = colon_pos; + } else { + pos = colon_pos + 1; + } + } + return substrs; +} + +void +ToLowerCase(std::string& s) +{ + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { + return std::tolower(c); + }); +} + // Used to format the usage message std::string CLParser::FormatMessage(std::string str, int offset) const @@ -66,17 +92,21 @@ void CLParser::Usage(const std::string& msg) { if (!msg.empty()) { - std::cerr << "error: " << msg << std::endl; + std::cerr << "Error: " << msg << std::endl; } std::cerr << "Usage: " << argv_[0] << " [options]" << std::endl; std::cerr << "==== SYNOPSIS ====\n \n"; - std::cerr << "\t--service-kind " - "<\"triton\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">" - << std::endl; + std::cerr << "\t--version " << std::endl; std::cerr << "\t-m " << std::endl; std::cerr << "\t-x " << std::endl; + std::cerr << "\t--bls-composing-models " << std::endl; std::cerr << "\t--model-signature-name " << std::endl; + std::cerr + << "\t--service-kind " + "<\"triton\"|\"openai\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">" + << std::endl; + std::cerr << "\t--endpoint " << std::endl; std::cerr << "\t-v" << std::endl; std::cerr << std::endl; std::cerr << "I. 
MEASUREMENT PARAMETERS: " << std::endl; @@ -85,12 +115,15 @@ CLParser::Usage(const std::string& msg) std::cerr << "\t--measurement-interval (-p) " << std::endl; std::cerr << "\t--concurrency-range " << std::endl; + std::cerr << "\t--periodic-concurrency-range " << std::endl; + std::cerr << "\t--request-period " << std::endl; std::cerr << "\t--request-rate-range " << std::endl; std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">" << std::endl; std::cerr << "\t--request-intervals " << std::endl; + std::cerr << "\t--serial-sequences" << std::endl; std::cerr << "\t--binary-search" << std::endl; std::cerr << "\t--num-of-sequences " << std::endl; @@ -104,6 +137,7 @@ CLParser::Usage(const std::string& msg) "profiling>" << std::endl; std::cerr << "\t--percentile " << std::endl; + std::cerr << "\t--request-count " << std::endl; std::cerr << "\tDEPRECATED OPTIONS" << std::endl; std::cerr << "\t-t " << std::endl; std::cerr << "\t-c " << std::endl; @@ -116,9 +150,12 @@ CLParser::Usage(const std::string& msg) std::cerr << "\t--output-shared-memory-size " << std::endl; std::cerr << "\t--shape " << std::endl; std::cerr << "\t--sequence-length " << std::endl; + std::cerr << "\t--sequence-length-variation " << std::endl; std::cerr << "\t--sequence-id-range " << std::endl; std::cerr << "\t--string-length " << std::endl; std::cerr << "\t--string-data " << std::endl; + std::cerr << "\t--input-tensor-format [binary|json]" << std::endl; + std::cerr << "\t--output-tensor-format [binary|json]" << std::endl; std::cerr << "\tDEPRECATED OPTIONS" << std::endl; std::cerr << "\t-z" << std::endl; std::cerr << "\t--data-directory " << std::endl; @@ -141,11 +178,11 @@ CLParser::Usage(const std::string& msg) std::cerr << std::endl; std::cerr << "IV. OTHER OPTIONS: " << std::endl; std::cerr << "\t-f " << std::endl; + std::cerr << "\t--profile-export-file " << std::endl; std::cerr << "\t-H " << std::endl; std::cerr << "\t--streaming" << std::endl; std::cerr << "\t--grpc-compression-algorithm " << std::endl; - std::cerr << "\t--trace-file" << std::endl; std::cerr << "\t--trace-level" << std::endl; std::cerr << "\t--trace-rate" << std::endl; std::cerr << "\t--trace-count" << std::endl; @@ -156,20 +193,10 @@ CLParser::Usage(const std::string& msg) std::cerr << std::endl; std::cerr << "==== OPTIONS ==== \n \n"; - std::cerr - << FormatMessage( - " --service-kind: Describes the kind of service perf_analyzer to " - "generate load for. The options are \"triton\", \"triton_c_api\", " - "\"tfserving\" and \"torchserve\". Default value is \"triton\". " - "Note in order to use \"torchserve\" backend --input-data option " - "must point to a json file holding data in the following format " - "{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"\"]}, {...}...]}. The type of file here will depend " - "on the model. In order to use \"triton_c_api\" you must specify " - "the Triton server install path and the model repository " - "path via the --library-name and --model-repo flags", - 18) - << std::endl; + std::cerr << FormatMessage( + " --version: print the current version of Perf Analyzer.", + 18) + << std::endl; std::cerr << std::setw(9) << std::left << " -m: " @@ -192,6 +219,33 @@ CLParser::Usage(const std::string& msg) "\"tfserving\".", 18) << std::endl; + + std::cerr + << FormatMessage( + " --service-kind: Describes the kind of service perf_analyzer to " + "generate load for. The options are \"triton\", \"openai\", " + "\"triton_c_api\", \"tfserving\" and \"torchserve\". Default " + "value is \"triton\". 
Note in order to use \"openai\" you must " + "specify an endpoint via --endpoint. " + "Note in order to use \"torchserve\" backend --input-data option " + "must point to a json file holding data in the following format " + "{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"\"]}, {...}...]}. The type of file here will depend " + "on the model. In order to use \"triton_c_api\" you must specify " + "the Triton server install path and the model repository path via " + "the --triton-server-directory and --model-repository flags", + 18) + << std::endl; + + std::cerr + << FormatMessage( + " --endpoint: Describes what endpoint to send requests to on the " + "server. This is required when using \"openai\" service-kind, and " + "is ignored for all other cases. Currently only " + "\"v1/chat/completions\" is confirmed to work.", + 18) + << std::endl; + std::cerr << std::setw(9) << std::left << " -v: " << FormatMessage("Enables verbose mode.", 9) << std::endl; @@ -261,6 +315,45 @@ CLParser::Usage(const std::string& msg) "not be 0 for sequence models while using asynchronous mode.", 18) << std::endl; + std::cerr + << FormatMessage( + " --periodic-concurrency-range : Determines the " + "range of concurrency levels in the similar but slightly " + "different manner as the --concurrency-range. Perf Analyzer will " + "start from the concurrency level of 'start' and increase by " + "'step' each time. Unlike --concurrency-range, the 'end' " + "indicates the *total* number of concurrency since the 'start' " + "(including) and will stop increasing once the cumulative number " + "of concurrent requests has reached the 'end'. The user can " + "specify *when* to periodically increase the concurrency level " + "using the --request-period option. The concurrency level will " + "periodically increase for every n-th response specified by " + "--request-period. Since this disables stability check in Perf " + "Analyzer and reports response timestamps only, the user must " + "provide --profile-export-file to specify where to dump all the " + "measured timestamps. The default values of 'start', 'end', and " + "'step' are 1.", + 18) + << std::endl; + std::cerr + << FormatMessage( + " --request-period : Indicates the number of responses that " + "each request must receive before new, concurrent requests are " + "sent when --periodic-concurrency-range is specified. Default " + "value is 10.", + 18) + << std::endl; + std::cerr + << FormatMessage( + " --request-parameter : Specifies a custom " + "parameter that can be sent to a Triton backend as part of the " + "request. For example, providing '--request-parameter " + "max_tokens:256:int' to the command line will set an additional " + "parameter 'max_tokens' of type 'int' to 256 as part of the " + "request. The --request-parameter may be specified multiple times " + "for different custom parameters.", + 18) + << std::endl; std::cerr << FormatMessage( " --request-rate-range : Determines the range of " @@ -303,7 +396,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--binary-search: Enables the binary search on the specified " + " --binary-search: Enables the binary search on the specified " "search range. This option requires 'start' and 'end' to be " "expilicitly specified in the --concurrency-range or " "--request-rate-range. 
When using this option, 'step' is more " @@ -314,7 +407,7 @@ CLParser::Usage(const std::string& msg) << std::endl; std::cerr << FormatMessage( - "--num-of-sequences: Sets the number of concurrent " + " --num-of-sequences: Sets the number of concurrent " "sequences for sequence models. This option is ignored when " "--request-rate-range is not specified. By default, its " "value is 4.", @@ -371,6 +464,20 @@ CLParser::Usage(const std::string& msg) "that the average latency is used to determine stability", 18) << std::endl; + std::cerr + << FormatMessage( + " --request-count: Specifies a total number of requests to " + "use for measurement. The default is 0, which means that there is " + "no request count and the measurement will proceed using windows " + "until stabilization is detected.", + 18) + << std::endl; + std::cerr << FormatMessage( + " --serial-sequences: Enables serial sequence mode " + "where a maximum of one request is outstanding at a time " + "for any given sequence. The default is false.", + 18) + << std::endl; std::cerr << std::endl; std::cerr << "II. INPUT DATA OPTIONS: " << std::endl; std::cerr << std::setw(9) << std::left @@ -394,7 +501,7 @@ CLParser::Usage(const std::string& msg) "specifying json data users can control data used with every " "request. Multiple data streams can be specified for a sequence " "model and the analyzer will select a data stream in a " - "round-robin fashion for every new sequence. Muliple json files " + "round-robin fashion for every new sequence. Multiple json files " "can also be provided (--input-data json_file1 --input-data " "json-file2 and so on) and the analyzer will append data streams " "from each file. When using --service-kind=torchserve make sure " @@ -433,9 +540,22 @@ CLParser::Usage(const std::string& msg) std::cerr << FormatMessage( " --sequence-length: Indicates the base length of a " "sequence used for sequence models. A sequence with length " - "x will be composed of x requests to be sent as the " - "elements in the sequence. The length of the actual " - "sequence will be within +/- 20% of the base length.", + "X will be composed of X requests to be sent as the " + "elements in the sequence. The actual length of the sequence" + "will be within +/- Y% of the base length, where Y defaults " + "to 20% and is customizable via " + "`--sequence-length-variation`. If sequence length is " + "unspecified and input data is provided, the sequence " + "length will be the number of inputs in the user-provided " + "input data. Default is 20.", + 18) + << std::endl; + std::cerr << FormatMessage( + " --sequence-length-variation: The percentage variation in " + "length of sequences. This flag is only valid when " + "not using user-provided input data or when " + "`--sequence-length` is specified while using user-provided " + "input data. Default is 20.", 18) << std::endl; std::cerr @@ -467,6 +587,18 @@ CLParser::Usage(const std::string& msg) "option is ignored if --input-data points to a directory.", 18) << std::endl; + std::cerr << FormatMessage( + " --input-tensor-format=[binary|json]: Specifies Triton " + "inference request input tensor format. Only valid when " + "HTTP protocol is used. Default is 'binary'.", + 18) + << std::endl; + std::cerr << FormatMessage( + " --output-tensor-format=[binary|json]: Specifies Triton " + "inference response output tensor format. Only valid when " + "HTTP protocol is used. Default is 'binary'.", + 18) + << std::endl; std::cerr << std::endl; std::cerr << "III. 
SERVER DETAILS: " << std::endl; std::cerr << std::setw(38) << std::left << " -u: " @@ -565,6 +697,13 @@ CLParser::Usage(const std::string& msg) "this option. By default, the result is not recorded in a file.", 9) << std::endl; + std::cerr << std::setw(9) << std::left << " --profile-export-file: " + << FormatMessage( + "Specifies the path that the profile export will be " + "generated at. By default, the profile export will not be " + "generated.", + 9) + << std::endl; std::cerr << std::setw(9) << std::left << " -H: " << FormatMessage( @@ -590,19 +729,10 @@ CLParser::Usage(const std::string& msg) std::cerr << FormatMessage( - " --trace-file: Set the file where trace output will be saved." - " If --trace-log-frequency is also specified, this argument " - "value will be the prefix of the files to save the trace " - "output. See --trace-log-frequency for details. Only used for " - "service-kind of triton. Default value is none.", - 18) - << std::endl; - std::cerr - << FormatMessage( - "--trace-level: Specify a trace level. OFF to disable tracing, " + " --trace-level: Specify a trace level. OFF to disable tracing, " "TIMESTAMPS to trace timestamps, TENSORS to trace tensors. It " "may be specified multiple times to trace multiple " - "informations. Default is OFF.", + "information. Default is OFF.", 18) << std::endl; std::cerr @@ -619,7 +749,7 @@ CLParser::Usage(const std::string& msg) << FormatMessage( " --log-frequency: Set the trace log frequency. If the " "value is 0, Triton will only log the trace output to " - " when shutting down. Otherwise, Triton will log " + "the trace file when shutting down. Otherwise, Triton will log " "the trace output to . when it collects the " "specified number of traces. For example, if the log frequency " "is 100, when Triton collects the 100-th trace, it logs the " @@ -667,7 +797,25 @@ CLParser::Usage(const std::string& msg) "inference server metrics. Default is 1000.", 18) << std::endl; - exit(GENERIC_ERROR); + std::cerr << FormatMessage( + " --bls-composing-models: A comma separated list of all " + "BLS composing models (with optional model version number " + "after a colon for each) that may be called by the input " + "BLS model. 
For example, 'modelA:3,modelB' would specify " + "that modelA and modelB are composing models that may be " + "called by the input BLS model, and that modelA will use " + "version 3, while modelB's version is unspecified", + 18) + << std::endl; + throw pa::PerfAnalyzerException(GENERIC_ERROR); +} + +void +CLParser::PrintVersion() +{ + std::cerr << "Perf Analyzer Version " << VERSION << " (commit " << SHA << ")" + << std::endl; + exit(SUCCESS); } void @@ -722,14 +870,25 @@ CLParser::ParseCommandLine(int argc, char** argv) {"ssl-https-private-key-type", required_argument, 0, 41}, {"verbose-csv", no_argument, 0, 42}, {"enable-mpi", no_argument, 0, 43}, - {"trace-file", required_argument, 0, 44}, - {"trace-level", required_argument, 0, 45}, - {"trace-rate", required_argument, 0, 46}, - {"trace-count", required_argument, 0, 47}, - {"log-frequency", required_argument, 0, 48}, - {"collect-metrics", no_argument, 0, 49}, - {"metrics-url", required_argument, 0, 50}, - {"metrics-interval", required_argument, 0, 51}, + {"trace-level", required_argument, 0, 44}, + {"trace-rate", required_argument, 0, 45}, + {"trace-count", required_argument, 0, 46}, + {"log-frequency", required_argument, 0, 47}, + {"collect-metrics", no_argument, 0, 48}, + {"metrics-url", required_argument, 0, 49}, + {"metrics-interval", required_argument, 0, 50}, + {"sequence-length-variation", required_argument, 0, 51}, + {"bls-composing-models", required_argument, 0, 52}, + {"serial-sequences", no_argument, 0, 53}, + {"input-tensor-format", required_argument, 0, 54}, + {"output-tensor-format", required_argument, 0, 55}, + {"version", no_argument, 0, 56}, + {"profile-export-file", required_argument, 0, 57}, + {"periodic-concurrency-range", required_argument, 0, 58}, + {"request-period", required_argument, 0, 59}, + {"request-parameter", required_argument, 0, 60}, + {"endpoint", required_argument, 0, 61}, + {"request-count", required_argument, 0, 62}, {0, 0, 0, 0}}; // Parse commandline... @@ -737,36 +896,52 @@ CLParser::ParseCommandLine(int argc, char** argv) while ((opt = getopt_long( argc, argv, "vdazc:u:m:x:b:t:p:i:H:l:r:s:f:", long_options, NULL)) != -1) { - switch (opt) { - case 0: - params_->streaming = true; - break; - case 1: - params_->max_threads = std::atoi(optarg); - params_->max_threads_specified = true; - break; - case 2: - params_->sequence_length = std::atoi(optarg); - break; - case 3: - params_->percentile = std::atoi(optarg); - break; - case 4: - params_->user_data.push_back(optarg); - break; - case 5: { - std::string arg = optarg; - auto colon_pos = arg.rfind(":"); - if (colon_pos == std::string::npos) { - Usage( - "failed to parse input shape. There must be a colon after input " - "name."); + try { + switch (opt) { + case 0: + params_->streaming = true; + break; + case 1: { + std::string max_threads{optarg}; + if (std::stoi(max_threads) > 0) { + params_->max_threads = std::stoull(max_threads); + params_->max_threads_specified = true; + } else { + Usage("Failed to parse --max-threads. The value must be > 0."); + } + break; + } + case 2: { + std::string sequence_length{optarg}; + if (std::stoi(sequence_length) > 0) { + params_->sequence_length = std::stoull(sequence_length); + } else { + std::cerr << "WARNING: The sequence length must be > 0. Perf " + "Analyzer will use default value if it is measuring " + "on sequence model." 
+ << std::endl; + } + params_->sequence_length_specified = true; + break; } - std::string name = arg.substr(0, colon_pos); - std::string shape_str = arg.substr(name.size() + 1); - size_t pos = 0; - std::vector shape; - try { + case 3: + params_->percentile = std::atoi(optarg); + break; + case 4: + params_->user_data.push_back(optarg); + break; + case 5: { + std::string arg = optarg; + auto colon_pos = arg.rfind(":"); + if (colon_pos == std::string::npos) { + Usage( + "Failed to parse --shape. There must be a colon after input " + "name."); + } + std::string name = arg.substr(0, colon_pos); + std::string shape_str = arg.substr(name.size() + 1); + size_t pos = 0; + std::vector shape; while (pos != std::string::npos) { size_t comma_pos = shape_str.find(",", pos); int64_t dim; @@ -778,115 +953,137 @@ CLParser::ParseCommandLine(int argc, char** argv) pos = comma_pos + 1; } if (dim <= 0) { - Usage("input shape must be > 0"); + Usage( + "Failed to parse --shape. The dimensions of input tensor " + "must be > 0."); } shape.emplace_back(dim); } + + params_->input_shapes[name] = shape; + break; } - catch (const std::invalid_argument& ia) { - Usage("failed to parse input shape: " + std::string(optarg)); + case 6: + case 'p': { + std::string measurement_window_ms{optarg}; + if (std::stoi(measurement_window_ms) > 0) { + params_->measurement_window_ms = std::stoull(measurement_window_ms); + } else { + Usage( + "Failed to parse --measurement-interval (-p). The value must " + "be > 0 msec."); + } + break; } - params_->input_shapes[name] = shape; - break; - } - case 6: { - params_->measurement_window_ms = std::atoi(optarg); - break; - } - case 7: { - params_->using_concurrency_range = true; - std::string arg = optarg; - size_t pos = 0; - int index = 0; - try { - while (pos != std::string::npos) { - size_t colon_pos = arg.find(":", pos); - if (index > 2) { - Usage( - "option concurrency-range can have maximum of three " - "elements"); - } - int64_t val; - if (colon_pos == std::string::npos) { - val = std::stoll(arg.substr(pos, colon_pos)); - pos = colon_pos; - } else { - val = std::stoll(arg.substr(pos, colon_pos - pos)); - pos = colon_pos + 1; - } - switch (index) { - case 0: - params_->concurrency_range.start = val; - break; - case 1: - params_->concurrency_range.end = val; - break; - case 2: - params_->concurrency_range.step = val; - break; + case 7: { + params_->using_concurrency_range = true; + std::string arg = optarg; + std::vector values{SplitString(arg)}; + if (values.size() > 3) { + Usage( + "Failed to parse --concurrency-range. The value does not match " + "."); + } + + for (size_t i = 0; i < values.size(); ++i) { + uint64_t val = std::stoull(values[i]); + if (i == 0) { + params_->concurrency_range.start = val; + } else if (i == 1) { + params_->concurrency_range.end = val; + } else if (i == 2) { + params_->concurrency_range.step = val; } - index++; } + break; } - catch (const std::invalid_argument& ia) { - Usage("failed to parse concurrency range: " + std::string(optarg)); + case 8: + case 'l': { + std::string latency_threshold_ms{optarg}; + if (std::stoi(latency_threshold_ms) == 0) { + params_->latency_threshold_ms = NO_LIMIT; + } else if (std::stoi(latency_threshold_ms) > 0) { + params_->latency_threshold_ms = std::stoull(latency_threshold_ms); + } else { + Usage( + "Failed to parse --latency-threshold (-l). 
The value must be " + ">= 0 msecs."); + } + break; } - break; - } - case 8: { - params_->latency_threshold_ms = std::atoi(optarg); - break; - } - case 9: { - params_->stability_threshold = atof(optarg) / 100; - break; - } - case 10: { - params_->max_trials = std::atoi(optarg); - break; - } - case 11: { - std::string arg = optarg; - // Check whether the argument is a directory - if (IsDirectory(arg) || IsFile(arg)) { - params_->user_data.push_back(optarg); - } else if (arg.compare("zero") == 0) { - params_->zero_input = true; - } else if (arg.compare("random") == 0) { + case 9: + case 's': { + std::string stability_threshold{optarg}; + if (std::stof(stability_threshold) >= 0.0) { + params_->stability_threshold = std::stof(optarg) / 100; + } else { + Usage( + "Failed to parse --stability-percentage (-s). The value must " + "be >= 0.0."); + } break; - } else { - Usage("unsupported input data provided " + std::string(optarg)); } - break; - } - case 12: { - params_->string_length = std::atoi(optarg); - break; - } - case 13: { - params_->string_data = optarg; - break; - } - case 14: { - params_->async = true; - break; - } - case 15: { - params_->forced_sync = true; - break; - } - case 16: { - params_->using_request_rate_range = true; - std::string arg = optarg; - size_t pos = 0; - int index = 0; - try { + case 10: + case 'r': { + std::string max_trials{optarg}; + if (std::stoi(max_trials) > 0) { + params_->max_trials = std::stoull(max_trials); + } else { + Usage("Failed to parse --max-trials (-r). The value must be > 0."); + } + break; + } + case 11: { + std::string arg = optarg; + // Check whether the argument is a directory + if (IsDirectory(arg) || IsFile(arg)) { + params_->user_data.push_back(optarg); + } else if (arg.compare("zero") == 0) { + params_->zero_input = true; + } else if (arg.compare("random") == 0) { + break; + } else { + Usage( + "Failed to parse --input-data. Unsupported type provided: '" + + std::string(optarg) + + "'. The available options are 'zero', 'random', path to a " + "directory, or a json file."); + } + break; + } + case 12: { + std::string string_length{optarg}; + if (std::stoi(string_length) > 0) { + params_->string_length = std::stoull(string_length); + } else { + Usage("Failed to parse --string-length. The value must be > 0"); + } + break; + } + case 13: { + params_->string_data = optarg; + break; + } + case 14: + case 'a': { + params_->async = true; + break; + } + case 15: { + params_->forced_sync = true; + break; + } + case 16: { + params_->using_request_rate_range = true; + std::string arg = optarg; + size_t pos = 0; + int index = 0; while (pos != std::string::npos) { size_t colon_pos = arg.find(":", pos); if (index > 2) { Usage( - "option request_rate_range can have maximum of three " - "elements"); + "Failed to parse --request-rate-range. The value does not " + "match ."); } if (colon_pos == std::string::npos) { params_->request_rate_range[index] = @@ -899,336 +1096,607 @@ CLParser::ParseCommandLine(int argc, char** argv) index++; } } + + break; } - catch (const std::invalid_argument& ia) { - Usage("failed to parse request rate range: " + std::string(optarg)); + case 17: { + std::string num_of_sequences{optarg}; + if (std::stoi(num_of_sequences) > 0) { + params_->num_of_sequences = std::stoul(num_of_sequences); + } else { + Usage("Failed to parse --num-of-sequences. 
The value must be > 0."); + } + break; } - break; - } - case 17: { - params_->num_of_sequences = std::atoi(optarg); - break; - } - case 18: { - params_->search_mode = SearchMode::BINARY; - break; - } - case 19: { - std::string arg = optarg; - if (arg.compare("poisson") == 0) { - params_->request_distribution = Distribution::POISSON; - } else if (arg.compare("constant") == 0) { - params_->request_distribution = Distribution::CONSTANT; - } else { - Usage( - "unsupported request distribution provided " + - std::string(optarg)); + case 18: { + params_->search_mode = SearchMode::BINARY; + break; } - break; - } - case 20: - params_->using_custom_intervals = true; - params_->request_intervals_file = optarg; - break; - case 21: { - std::string arg = optarg; - if (arg.compare("system") == 0) { - params_->shared_memory_type = SharedMemoryType::SYSTEM_SHARED_MEMORY; - } else if (arg.compare("cuda") == 0) { + case 19: { + std::string arg = optarg; + if (arg.compare("poisson") == 0) { + params_->request_distribution = Distribution::POISSON; + } else if (arg.compare("constant") == 0) { + params_->request_distribution = Distribution::CONSTANT; + } else { + Usage( + "Failed to parse --request-distribution. Unsupported type " + "provided: '" + + std::string(optarg) + "'. Choices are 'posson' or 'constant'."); + } + break; + } + case 20: { + std::string request_intervals_file{optarg}; + if (IsFile(request_intervals_file)) { + params_->request_intervals_file = request_intervals_file; + params_->using_custom_intervals = true; + } else { + Usage( + "Failed to parse --request-intervals. The value must be a " + "valid file path"); + } + break; + } + case 21: { + std::string arg = optarg; + if (arg.compare("system") == 0) { + params_->shared_memory_type = + SharedMemoryType::SYSTEM_SHARED_MEMORY; + } else if (arg.compare("cuda") == 0) { #ifdef TRITON_ENABLE_GPU - params_->shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY; + params_->shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY; #else - Usage("cuda shared memory is not supported when TRITON_ENABLE_GPU=0"); + Usage( + "Cuda shared memory is not supported when " + "TRITON_ENABLE_GPU=0."); #endif // TRITON_ENABLE_GPU + } else if (arg.compare("none") == 0) { + params_->shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY; + } else { + Usage( + "Failed to parse --shared-memory. Unsupported type provided: " + "'" + + std::string(optarg) + + "'. The available options are 'system', 'cuda', or 'none'."); + } + break; } - break; - } - case 22: { - params_->output_shm_size = std::atoi(optarg); - break; - } - case 23: { - std::string arg = optarg; - if (arg.compare("triton") == 0) { - params_->kind = cb::TRITON; - } else if (arg.compare("tfserving") == 0) { - params_->kind = cb::TENSORFLOW_SERVING; - } else if (arg.compare("torchserve") == 0) { - params_->kind = cb::TORCHSERVE; - } else if (arg.compare("triton_c_api") == 0) { - params_->kind = cb::TRITON_C_API; - } else { - Usage("unsupported --service-kind specified"); + case 22: { + std::string output_shm_size{optarg}; + if (std::stoi(output_shm_size) >= 0) { + params_->output_shm_size = std::stoull(output_shm_size); + } else { + Usage( + "Failed to parse --output-shared-memory-size. 
The value must " + "be >= 0."); + } + break; } - break; - } - case 24: - params_->model_signature_name = optarg; - break; - case 25: { - params_->using_grpc_compression = true; - std::string arg = optarg; - if (arg.compare("none") == 0) { - params_->compression_algorithm = cb::COMPRESS_NONE; - } else if (arg.compare("deflate") == 0) { - params_->compression_algorithm = cb::COMPRESS_DEFLATE; - } else if (arg.compare("gzip") == 0) { - params_->compression_algorithm = cb::COMPRESS_GZIP; - } else { - Usage("unsupported --grpc-compression-algorithm specified"); + case 23: { + std::string arg = optarg; + if (arg.compare("triton") == 0) { + params_->kind = cb::TRITON; + } else if (arg.compare("tfserving") == 0) { + params_->kind = cb::TENSORFLOW_SERVING; + } else if (arg.compare("torchserve") == 0) { + params_->kind = cb::TORCHSERVE; + } else if (arg.compare("triton_c_api") == 0) { + params_->kind = cb::TRITON_C_API; + } else if (arg.compare("openai") == 0) { + params_->kind = cb::OPENAI; + } else { + Usage( + "Failed to parse --service-kind. Unsupported type provided: '" + + std::string{optarg} + + "'. The available options are 'triton', 'tfserving', " + "'torchserve', or 'triton_c_api'."); + } + break; } - break; - } - case 26: { - std::string arg = optarg; - if (arg.compare("time_windows") == 0) { - params_->measurement_mode = MeasurementMode::TIME_WINDOWS; - } else if (arg.compare("count_windows") == 0) { - params_->measurement_mode = MeasurementMode::COUNT_WINDOWS; - } else { - Usage("unsupported --measurement-mode specified"); + case 24: + params_->model_signature_name = optarg; + break; + case 25: { + std::string arg = optarg; + if (arg.compare("none") == 0) { + params_->compression_algorithm = cb::COMPRESS_NONE; + } else if (arg.compare("deflate") == 0) { + params_->compression_algorithm = cb::COMPRESS_DEFLATE; + } else if (arg.compare("gzip") == 0) { + params_->compression_algorithm = cb::COMPRESS_GZIP; + } else { + Usage( + "Failed to parse --grpc-compression-algorithm. Unsupported " + "type provided: '" + + arg + + "'. The available options are 'gzip', 'deflate', or 'none'."); + } + params_->using_grpc_compression = true; + break; } - break; - } - case 27: { - params_->measurement_request_count = std::atoi(optarg); - break; - } - case 28: { - params_->triton_server_path = optarg; - break; - } - case 29: { - params_->model_repository_path = optarg; - break; - } - case 30: { - std::string arg = optarg; - size_t pos = 0; - int index = 0; - try { + case 26: { + std::string arg = optarg; + if (arg.compare("time_windows") == 0) { + params_->measurement_mode = MeasurementMode::TIME_WINDOWS; + } else if (arg.compare("count_windows") == 0) { + params_->measurement_mode = MeasurementMode::COUNT_WINDOWS; + } else { + Usage( + "Failed to parse --measurement-mode. Unsupported type " + "provided: '" + + arg + + "'. The available options are 'time_windows' or " + "'count_windows'."); + } + break; + } + case 27: { + std::string request_count{optarg}; + if (std::stoi(request_count) > 0) { + params_->measurement_request_count = std::stoull(request_count); + } else { + Usage( + "Failed to parse --measurement-request-count. 
The value must " + "be > 0."); + } + break; + } + case 28: { + params_->triton_server_path = optarg; + break; + } + case 29: { + params_->model_repository_path = optarg; + break; + } + case 30: { + std::string arg = optarg; + int64_t start_id; + int64_t end_id; + size_t pos = 0; + int index = 0; while (pos != std::string::npos) { size_t colon_pos = arg.find(":", pos); if (index > 1) { Usage( - "option sequence-id-range can have maximum of two " - "elements"); + "Failed to parse --sequence-id-range. The value does not " + "match ."); } if (colon_pos == std::string::npos) { + std::string sequence_id{arg.substr(pos, colon_pos)}; if (index == 0) { - params_->start_sequence_id = - std::stoll(arg.substr(pos, colon_pos)); + start_id = std::stoi(sequence_id); } else { - params_->sequence_id_range = - std::stoll(arg.substr(pos, colon_pos)) - - params_->start_sequence_id; + end_id = std::stoi(sequence_id); } pos = colon_pos; } else { - params_->start_sequence_id = - std::stoll(arg.substr(pos, colon_pos - pos)); + std::string sequence_id{arg.substr(pos, colon_pos - pos)}; + start_id = std::stoi(sequence_id); pos = colon_pos + 1; index++; } } + + // Check for invalid inputs + if (start_id < 0 || end_id < 0) { + Usage( + "Failed to parse --sequence-id-range. The range values must be " + ">= 0."); + } else if (start_id > end_id) { + Usage( + "Failed to parse --sequence-id-range. The 'end' value must be " + "greater than 'start' value."); + } + + if (index == 0) { // Only start ID is given + params_->start_sequence_id = start_id; + } else { + params_->start_sequence_id = start_id; + params_->sequence_id_range = end_id - start_id; + } + break; } - catch (const std::invalid_argument& ia) { - Usage("failed to parse concurrency range: " + std::string(optarg)); + case 31: { + params_->ssl_options.ssl_grpc_use_ssl = true; + break; } - break; - } - case 31: { - params_->ssl_options.ssl_grpc_use_ssl = true; - break; - } - case 32: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_grpc_root_certifications_file = optarg; - } else { - Usage( - "--ssl-grpc-root-certifications-file must be a valid file path"); + case 32: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_grpc_root_certifications_file = optarg; + } else { + Usage( + "Failed to parse --ssl-grpc-root-certifications-file. The " + "value must be a valid file path."); + } + break; } - break; - } - case 33: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_grpc_private_key_file = optarg; - } else { - Usage("--ssl-grpc-private-key-file must be a valid file path"); + case 33: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_grpc_private_key_file = optarg; + } else { + Usage( + "Failed to parse --ssl-grpc-private-key-file. The value must " + "be a valid file path."); + } + break; } - break; - } - case 34: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_grpc_certificate_chain_file = optarg; - } else { - Usage("--ssl-grpc-certificate-chain-file must be a valid file path"); + case 34: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_grpc_certificate_chain_file = optarg; + } else { + Usage( + "Failed to parse --ssl-grpc-certificate-chain-file. 
The value " + "must be a valid file path."); + } + break; } - break; - } - case 35: { - if (std::atol(optarg) == 0 || std::atol(optarg) == 1) { - params_->ssl_options.ssl_https_verify_peer = std::atol(optarg); - } else { - Usage("--ssl-https-verify-peer must be 0 or 1"); + case 35: { + if (std::atol(optarg) == 0 || std::atol(optarg) == 1) { + params_->ssl_options.ssl_https_verify_peer = std::atol(optarg); + } else { + Usage( + "Failed to parse --ssl-https-verify-peer. The value must be " + "either 0 or 1."); + } + break; } - break; - } - case 36: { - if (std::atol(optarg) == 0 || std::atol(optarg) == 1 || - std::atol(optarg) == 2) { - params_->ssl_options.ssl_https_verify_host = std::atol(optarg); - } else { - Usage("--ssl-https-verify-host must be 0, 1, or 2"); + case 36: { + if (std::atol(optarg) == 0 || std::atol(optarg) == 1 || + std::atol(optarg) == 2) { + params_->ssl_options.ssl_https_verify_host = std::atol(optarg); + } else { + Usage( + "Failed to parse --ssl-https-verify-host. The value must be " + "either 0, 1, or 2."); + } + break; } - break; - } - case 37: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_https_ca_certificates_file = optarg; - } else { - Usage("--ssl-https-ca-certificates-file must be a valid file path"); + case 37: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_https_ca_certificates_file = optarg; + } else { + Usage( + "Failed to parse --ssl-https-ca-certificates-file. The value " + "must be a valid file path."); + } + break; } - break; - } - case 38: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_https_client_certificate_file = optarg; - } else { - Usage( - "--ssl-https-client-certificate-file must be a valid file path"); + case 38: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_https_client_certificate_file = optarg; + } else { + Usage( + "Failed to parse --ssl-https-client-certificate-file. The " + "value must be a valid file path."); + } + break; } - break; - } - case 39: { - if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") { - params_->ssl_options.ssl_https_client_certificate_type = optarg; - } else { - Usage("--ssl-https-client-certificate-type must be 'PEM' or 'DER'"); + case 39: { + if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") { + params_->ssl_options.ssl_https_client_certificate_type = optarg; + } else { + Usage( + "Failed to parse --ssl-https-client-certificate-type. " + "Unsupported type provided: '" + + std::string{optarg} + + "'. The available options are 'PEM' or 'DER'."); + } + break; } - break; - } - case 40: { - if (IsFile(optarg)) { - params_->ssl_options.ssl_https_private_key_file = optarg; - } else { - Usage("--ssl-https-private-key-file must be a valid file path"); + case 40: { + if (IsFile(optarg)) { + params_->ssl_options.ssl_https_private_key_file = optarg; + } else { + Usage( + "Failed to parse --ssl-https-private-key-file. The value must " + "be a valid file path."); + } + break; } - break; - } - case 41: { - if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") { - params_->ssl_options.ssl_https_private_key_type = optarg; - } else { - Usage("--ssl-https-private-key-type must be 'PEM' or 'DER'"); + case 41: { + if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") { + params_->ssl_options.ssl_https_private_key_type = optarg; + } else { + Usage( + "Failed to parse --ssl-https-private-key-type. Unsupported " + "type provided: '" + + std::string{optarg} + + "'. 
The available options are 'PEM' or 'DER'."); + } + break; } - break; - } - case 42: { - params_->verbose_csv = true; - break; - } - case 43: { - params_->enable_mpi = true; - break; - } - case 44: { - params_->trace_options["trace_file"] = {optarg}; - break; - } - case 45: { - params_->trace_options["trace_level"] = {optarg}; - break; - } - case 46: { - params_->trace_options["trace_rate"] = {optarg}; - break; - } - case 47: { - params_->trace_options["trace_count"] = {optarg}; - break; - } - case 48: { - params_->trace_options["log_frequency"] = {optarg}; - break; - } - case 49: { - params_->should_collect_metrics = true; - break; - } - case 50: { - params_->metrics_url = optarg; - params_->metrics_url_specified = true; - break; - } - case 51: { - params_->metrics_interval_ms = std::stoull(optarg); - params_->metrics_interval_ms_specified = true; - break; + case 42: { + params_->verbose_csv = true; + break; + } + case 43: { + params_->enable_mpi = true; + break; + } + case 44: { + std::string trace_level{optarg}; + if (trace_level == "OFF" || trace_level == "TIMESTAMPS" || + trace_level == "TENSORS") { + params_->trace_options["trace_level"] = {trace_level}; + } else { + Usage( + "Failed to parse --trace-level. Unsupported type provided: '" + + trace_level + + "'. The available options are 'OFF', 'TIMESTAMPS', or " + "'TENSORS'."); + } + break; + } + case 45: { + params_->trace_options["trace_rate"] = {optarg}; + break; + } + case 46: { + std::string trace_count{optarg}; + if (std::stoi(trace_count) >= -1) { + params_->trace_options["trace_count"] = {trace_count}; + } else { + Usage( + "Failed to parse --trace-count. The value must be >= 0 or set " + "to -1 (default)."); + } + break; + } + case 47: { + std::string log_frequency{optarg}; + if (std::stoi(log_frequency) >= 0) { + params_->trace_options["log_frequency"] = {log_frequency}; + } else { + Usage("Failed to parse --log-frequency. The value must be >= 0."); + } + break; + } + case 48: { + params_->should_collect_metrics = true; + break; + } + case 49: { + params_->metrics_url = optarg; + params_->metrics_url_specified = true; + break; + } + case 50: { + std::string metrics_interval_ms{optarg}; + if (std::stoi(metrics_interval_ms) > 0) { + params_->metrics_interval_ms = std::stoull(metrics_interval_ms); + params_->metrics_interval_ms_specified = true; + } else { + Usage( + "Failed to parse --metrics-interval. The value must be > 0 " + "msecs."); + } + break; + } + case 51: { + params_->sequence_length_variation = std::stod(optarg); + break; + } + case 52: { + std::string arg = optarg; + + // Remove all spaces in the string + arg.erase( + std::remove_if(arg.begin(), arg.end(), ::isspace), arg.end()); + + std::stringstream ss(arg); + while (ss.good()) { + std::string model_name; + std::string model_version{""}; + std::string tmp_model_name; + + getline(ss, tmp_model_name, ','); + + size_t colon_pos = tmp_model_name.find(":"); + + if (colon_pos == std::string::npos) { + model_name = tmp_model_name; + } else { + model_name = tmp_model_name.substr(0, colon_pos); + model_version = tmp_model_name.substr(colon_pos + 1); + } + + params_->bls_composing_models.push_back( + {model_name, model_version}); + } + break; + } + case 53: { + params_->serial_sequences = true; + break; + } + case 54: { + cb::TensorFormat input_tensor_format{ParseTensorFormat(optarg)}; + if (input_tensor_format == cb::TensorFormat::UNKNOWN) { + Usage( + "Failed to parse --input-tensor-format. Unsupported type " + "provided: '" + + std::string{optarg} + + "'. 
The available options are 'binary' or 'json'."); + } + params_->input_tensor_format = input_tensor_format; + break; + } + case 55: { + cb::TensorFormat output_tensor_format{ParseTensorFormat(optarg)}; + if (output_tensor_format == cb::TensorFormat::UNKNOWN) { + Usage( + "Failed to parse --output-tensor-format. Unsupported type " + "provided: '" + + std::string{optarg} + + "'. The available options are 'binary' or 'json'."); + } + params_->output_tensor_format = output_tensor_format; + break; + } + case 56: { + PrintVersion(); + break; + } + case 57: { + std::string profile_export_file{optarg}; + if (IsFile(profile_export_file) || IsDirectory(profile_export_file)) { + Usage( + "Failed to parse --profile-export-file. Path must not already " + "exist."); + } + params_->profile_export_file = profile_export_file; + break; + } + case 58: { + params_->is_using_periodic_concurrency_mode = true; + std::string arg = optarg; + std::vector values{SplitString(arg)}; + if (values.size() < 2) { + Usage( + "Failed to parse --periodic-concurrency-range. Both " + "and values must be provided."); + } else if (values.size() > 3) { + Usage( + "Failed to parse --periodic-concurrency-range. The value does " + "not match ."); + } + + for (size_t i = 0; i < values.size(); ++i) { + uint64_t val = std::stoull(values[i]); + if (i == 0) { + params_->periodic_concurrency_range.start = val; + } else if (i == 1) { + params_->periodic_concurrency_range.end = val; + } else if (i == 2) { + params_->periodic_concurrency_range.step = val; + } + } + + Range range{params_->periodic_concurrency_range}; + if (range.step == 0) { + Usage( + "Failed to parse --periodic-concurrency-range. The " + "value must be > 0."); + } else if (range.start > range.end) { + Usage( + "Failed to parse --periodic-concurrency-range. The " + "must be <= ."); + } else if ((range.end - range.start) % range.step != 0) { + Usage( + "Failed to parse --periodic-concurrency-range. The " + "value must be a factor of the range size ( - )."); + } + break; + } + case 59: { + std::string request_period{optarg}; + if (std::stoi(request_period) > 0) { + params_->request_period = std::stoull(request_period); + } else { + Usage("Failed to parse --request-period. The value must be > 0"); + } + break; + } + case 60: { + std::string arg = optarg; + std::vector values{SplitString(arg)}; + if (values.size() != 3) { + Usage( + "Failed to parse --request-parameter. The value does not match " + "."); + } + + std::for_each(values.begin(), values.end(), ToLowerCase); + std::string name{values[0]}; + std::string value{values[1]}; + std::string type{values[2]}; + + cb::RequestParameter param; + param.name = name; + param.value = value; + param.type = type; + params_->request_parameters[name] = param; + break; + } + case 61: { + params_->endpoint = optarg; + break; + } + case 62: { + if (std::stoi(optarg) < 0) { + Usage("Failed to parse --request-count. 
The value must be > 0."); + } + params_->request_count = std::stoi(optarg); + break; + } + case 'v': + params_->extra_verbose = params_->verbose; + params_->verbose = true; + break; + case 'z': + params_->zero_input = true; + break; + case 'd': + params_->using_old_options = true; + params_->dynamic_concurrency_mode = true; + break; + case 'u': + params_->url_specified = true; + params_->url = optarg; + break; + case 'm': + params_->model_name = optarg; + break; + case 'x': + params_->model_version = optarg; + break; + case 'b': { + std::string batch_size{optarg}; + if (std::stoi(batch_size) > 0) { + params_->batch_size = std::stoull(batch_size); + params_->using_batch_size = true; + } else { + Usage("Failed to parse -b (batch size). The value must be > 0."); + } + break; + } + case 't': + params_->using_old_options = true; + params_->concurrent_request_count = std::atoi(optarg); + break; + case 'i': + params_->protocol = ParseProtocol(optarg); + break; + case 'H': { + std::string arg = optarg; + std::string header = arg.substr(0, arg.find(":")); + (*params_->http_headers)[header] = arg.substr(header.size() + 1); + break; + } + case 'c': + params_->using_old_options = true; + params_->max_concurrency = std::atoi(optarg); + break; + case 'f': + params_->filename = optarg; + break; + case '?': + Usage(); + break; } - case 'v': - params_->extra_verbose = params_->verbose; - params_->verbose = true; - break; - case 'z': - params_->zero_input = true; - break; - case 'd': - params_->using_old_options = true; - params_->dynamic_concurrency_mode = true; - break; - case 'u': - params_->url_specified = true; - params_->url = optarg; - break; - case 'm': - params_->model_name = optarg; - break; - case 'x': - params_->model_version = optarg; - break; - case 'b': - params_->batch_size = std::atoi(optarg); - params_->using_batch_size = true; - break; - case 't': - params_->using_old_options = true; - params_->concurrent_request_count = std::atoi(optarg); - break; - case 'p': - params_->measurement_window_ms = std::atoi(optarg); - break; - case 'i': - params_->protocol = ParseProtocol(optarg); - break; - case 'H': { - std::string arg = optarg; - std::string header = arg.substr(0, arg.find(":")); - (*params_->http_headers)[header] = arg.substr(header.size() + 1); - break; + } + catch (const std::invalid_argument& ia) { + if (opt >= 'A') { // short options + Usage( + "Failed to parse -" + std::string{(char)opt} + + ". Invalid value provided: " + std::string{optarg}); + } else { + Usage( + "Failed to parse --" + std::string{long_options[opt].name} + + ". Invalid value provided: " + std::string{optarg}); } - case 'l': - params_->latency_threshold_ms = std::atoi(optarg); - break; - case 'c': - params_->using_old_options = true; - params_->max_concurrency = std::atoi(optarg); - break; - case 'r': - params_->max_trials = std::atoi(optarg); - break; - case 's': - params_->stability_threshold = atof(optarg) / 100; - break; - case 'f': - params_->filename = optarg; - break; - case 'a': - params_->async = true; - break; - case '?': - Usage(); - break; } } @@ -1254,48 +1722,46 @@ CLParser::ParseCommandLine(int argc, char** argv) // Will be using user-provided time intervals, hence no control variable. 
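The numeric long options handled above (for example --measurement-request-count, --request-period, and -b) all follow the same parse-then-validate pattern: convert the raw optarg inside the enclosing try block, reject out-of-range values through Usage(), and let the shared catch clause report values that fail conversion. The helper below is a minimal standalone sketch of that pattern; its name and wording are illustrative only and do not appear in this patch.

// Illustrative sketch only -- not part of command_line_parser.cc.
#include <cstdint>
#include <stdexcept>
#include <string>

inline uint64_t
ParsePositiveValue(const std::string& flag_name, const std::string& raw_value)
{
  // std::stoll throws std::invalid_argument on non-numeric input, matching
  // how the catch clause above reports unparsable option values.
  int64_t parsed{std::stoll(raw_value)};
  if (parsed <= 0) {
    throw std::invalid_argument(
        "Failed to parse " + flag_name + ". The value must be > 0.");
  }
  return static_cast<uint64_t>(parsed);
}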
params_->search_mode = SearchMode::NONE; } + + // When the request-count feature is enabled, override the measurement mode to + // be count windows with a window size of the requested count + if (params_->request_count) { + params_->measurement_mode = MeasurementMode::COUNT_WINDOWS; + params_->measurement_request_count = params_->request_count; + } } void CLParser::VerifyOptions() { if (params_->model_name.empty()) { - Usage("-m flag must be specified"); - } - if (params_->batch_size <= 0) { - Usage("batch size must be > 0"); - } - if (params_->measurement_window_ms <= 0) { - Usage("measurement window must be > 0 in msec"); - } - if (params_->measurement_request_count <= 0) { - Usage("measurement request count must be > 0"); + Usage("Failed to parse -m (model name). The value must be specified."); } if (params_->concurrency_range.start <= 0 || params_->concurrent_request_count < 0) { Usage("The start of the search range must be > 0"); } if (params_->request_rate_range[SEARCH_RANGE::kSTART] <= 0) { - Usage("The start of the search range must be > 0"); + Usage( + "Failed to parse --request-rate-range. The start of the search range " + "must be > 0."); } if (params_->protocol == cb::ProtocolType::UNKNOWN) { - Usage("protocol should be either HTTP or gRPC"); + Usage( + "Failed to parse -i (protocol). The value should be either HTTP or " + "gRPC."); } if (params_->streaming && (params_->protocol != cb::ProtocolType::GRPC)) { - Usage("streaming is only allowed with gRPC protocol"); + Usage("Streaming is only allowed with gRPC protocol."); } if (params_->using_grpc_compression && (params_->protocol != cb::ProtocolType::GRPC)) { - Usage("compression is only allowed with gRPC protocol"); - } - if (params_->max_threads == 0) { - Usage("maximum number of threads must be > 0"); + Usage("Using compression algorithm is only allowed with gRPC protocol."); } - if (params_->sequence_length == 0) { - params_->sequence_length = 20; - std::cerr << "WARNING: using an invalid sequence length. Perf Analyzer will" - << " use default value if it is measuring on sequence model." - << std::endl; + if (params_->sequence_length_variation < 0.0) { + Usage( + "Failed to parse --sequence-length-variation. The value must be >= " + "0.0."); } if (params_->start_sequence_id == 0) { params_->start_sequence_id = 1; @@ -1305,17 +1771,19 @@ CLParser::VerifyOptions() } if (params_->percentile != -1 && (params_->percentile > 99 || params_->percentile < 1)) { - Usage("percentile must be -1 for not reporting or in range (0, 100)"); + Usage( + "Failed to parse --percentile. 
The value must be -1 for not reporting " + "or in range (0, 100)."); } if (params_->zero_input && !params_->user_data.empty()) { - Usage("zero input can't be set when data directory is provided"); + Usage("The -z flag cannot be set when --data-directory is provided."); } if (params_->async && params_->forced_sync) { - Usage("Both --async and --sync can not be specified simultaneously."); + Usage("Cannot specify --async and --sync simultaneously."); } if (params_->using_concurrency_range && params_->using_old_options) { - Usage("can not use deprecated options with --concurrency-range"); + Usage("Cannot use deprecated options with --concurrency-range."); } else if (params_->using_old_options) { if (params_->dynamic_concurrency_mode) { params_->concurrency_range.end = params_->max_concurrency; @@ -1324,36 +1792,80 @@ CLParser::VerifyOptions() } if (params_->using_request_rate_range && params_->using_old_options) { - Usage("can not use concurrency options with --request-rate-range"); + Usage("Cannot use concurrency options with --request-rate-range."); } - if (params_->using_request_rate_range && params_->using_concurrency_range) { + std::vector load_modes{ + params_->is_using_periodic_concurrency_mode, + params_->using_concurrency_range, params_->using_request_rate_range, + params_->using_custom_intervals}; + if (std::count(load_modes.begin(), load_modes.end(), true) > 1) { Usage( - "can not specify concurrency_range and request_rate_range " - "simultaneously"); + "Cannot specify more then one inference load mode. Please choose only " + "one of the following modes: --concurrency-range, " + "--periodic-concurrency-range, --request-rate-range, or " + "--request-intervals."); + } + + if (params_->is_using_periodic_concurrency_mode && !params_->streaming) { + Usage( + "The --periodic-concurrency-range option requires bi-directional gRPC " + "streaming."); + } + + if (params_->is_using_periodic_concurrency_mode && + (params_->profile_export_file == "")) { + Usage( + "Must provide --profile-export-file when using the " + "--periodic-concurrency-range option."); + } + + if (params_->is_using_periodic_concurrency_mode) { + if (params_->periodic_concurrency_range.end == pa::NO_LIMIT) { + std::cerr + << "WARNING: The maximum attainable concurrency will be limited by " + "max_threads specification." + << std::endl; + params_->periodic_concurrency_range.end = params_->max_threads; + } else { + if (params_->max_threads_specified) { + std::cerr << "WARNING: Overriding max_threads specification to ensure " + "requested concurrency range." 
+ << std::endl; + } + params_->max_threads = std::max( + params_->max_threads, params_->periodic_concurrency_range.end); + } + } + + if (params_->request_parameters.size() > 0 && + params_->protocol != cb::ProtocolType::GRPC) { + Usage( + "The --request-parameter option is currently only supported by gRPC " + "protocol."); } if (params_->using_request_rate_range && params_->mpi_driver->IsMPIRun() && (params_->request_rate_range[SEARCH_RANGE::kEND] != 1.0 || params_->request_rate_range[SEARCH_RANGE::kSTEP] != 1.0)) { - Usage("cannot use request rate range with multi-model mode"); + Usage("Cannot specify --request-rate-range when in multi-model mode."); } if (params_->using_custom_intervals && params_->using_old_options) { - Usage("can not use deprecated options with --request-intervals"); + Usage("Cannot use deprecated options with --request-intervals."); } if ((params_->using_custom_intervals) && (params_->using_request_rate_range || params_->using_concurrency_range)) { Usage( - "can not use --concurrency-range or --request-rate-range " - "along with --request-intervals"); + "Cannot use --concurrency-range or --request-rate-range " + "along with --request-intervals."); } if (params_->using_concurrency_range && params_->mpi_driver->IsMPIRun() && (params_->concurrency_range.end != 1 || params_->concurrency_range.step != 1)) { - Usage("cannot use concurrency range with multi-model mode"); + Usage("Cannot specify --concurrency-range when in multi-model mode."); } if (((params_->concurrency_range.end == NO_LIMIT) || @@ -1374,7 +1886,7 @@ CLParser::VerifyOptions() if ((params_->search_mode == SearchMode::BINARY) && (params_->latency_threshold_ms == NO_LIMIT)) { - Usage("The latency threshold can not be 0 for binary search mode."); + Usage("The --latency-threshold cannot be 0 for binary search mode."); } if (((params_->concurrency_range.end < params_->concurrency_range.start) || @@ -1386,61 +1898,89 @@ CLParser::VerifyOptions() "binary search mode."); } + if (params_->request_count != 0) { + if (params_->using_concurrency_range) { + if (params_->request_count < params_->concurrency_range.start) { + Usage("request-count can not be less than concurrency"); + } + if (params_->concurrency_range.start < params_->concurrency_range.end) { + Usage( + "request-count not supported with multiple concurrency values in " + "one run"); + } + } + if (params_->using_request_rate_range) { + if (params_->request_count < + static_cast(params_->request_rate_range[0])) { + Usage("request-count can not be less than request-rate"); + } + if (params_->request_rate_range[SEARCH_RANGE::kSTART] < + params_->request_rate_range[SEARCH_RANGE::kEND]) { + Usage( + "request-count not supported with multiple request-rate values in " + "one run"); + } + } + } + if (params_->kind == cb::TENSORFLOW_SERVING) { if (params_->protocol != cb::ProtocolType::GRPC) { - std::cerr - << "perf_analyzer supports only grpc protocol for TensorFlow Serving." - << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); + Usage( + "perf_analyzer supports only grpc protocol for TensorFlow Serving."); } else if (params_->streaming) { - std::cerr - << "perf_analyzer does not support streaming for TensorFlow Serving." - << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); + Usage("perf_analyzer does not support streaming for TensorFlow Serving."); } else if (params_->async) { - std::cerr - << "perf_analyzer does not support async API for TensorFlow Serving." 
- << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); + Usage("perf_analyzer does not support async API for TensorFlow Serving."); } else if (!params_->using_batch_size) { params_->batch_size = 0; } } else if (params_->kind == cb::TORCHSERVE) { if (params_->user_data.empty()) { - std::cerr << "--input-data should be provided with a json file with " - "input data for torchserve" - << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); + Usage( + "--input-data should be provided with a json file with " + "input data for torchserve."); } } if (params_->kind == cb::BackendKind::TRITON_C_API) { - std::cout << " USING C API: only default functionalities supported " - << std::endl; - if (!params_->targeting_concurrency()) { - std::cerr << "Only target concurrency is supported by C API" << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); - } else if (params_->shared_memory_type != NO_SHARED_MEMORY) { - std::cerr << "Shared memory not yet supported by C API" << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); - } else if ( - params_->triton_server_path.empty() || - params_->model_repository_path.empty() || - params_->memory_type.empty()) { - std::cerr - << "Not enough information to create C API. /lib/libtritonserver.so " - "directory:" - << params_->triton_server_path - << " model repo:" << params_->model_repository_path - << " memory type:" << params_->memory_type << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); - } else if (params_->async) { - std::cerr << "Async API not yet supported by C API" << std::endl; - throw PerfAnalyzerException(GENERIC_ERROR); + if (params_->triton_server_path.empty()) { + Usage( + "--triton-server-path should not be empty when using " + "service-kind=triton_c_api."); + } + + if (params_->model_repository_path.empty()) { + Usage( + "--model-repository should not be empty when using " + "service-kind=triton_c_api."); } + + if (params_->async) { + Usage( + "Async mode is not supported by triton_c_api service " + "kind."); + } + params_->protocol = cb::ProtocolType::UNKNOWN; } + if (params_->kind == cb::BackendKind::OPENAI) { + if (params_->user_data.empty()) { + Usage("Must supply --input-data for OpenAI service kind."); + } + if (params_->endpoint.empty()) { + Usage( + "Must supply --endpoint for OpenAI service kind. 
For example, " + "\"v1/chat/completions\"."); + } + if (!params_->async) { + Usage("Only async mode is currently supported for OpenAI service-kind"); + } + if (params_->batch_size != 1) { + Usage("Batching is not currently supported with OpenAI service-kind"); + } + } + if (params_->should_collect_metrics && params_->kind != cb::BackendKind::TRITON) { Usage( @@ -1448,11 +1988,6 @@ CLParser::VerifyOptions() "backend."); } - if (params_->should_collect_metrics && params_->verbose_csv == false) { - Usage( - "Must specify --verbose-csv when using the --collect-metrics option."); - } - if (params_->metrics_url_specified && params_->should_collect_metrics == false) { Usage( @@ -1466,8 +2001,15 @@ CLParser::VerifyOptions() "option."); } - if (params_->metrics_interval_ms == 0) { - Usage("Metrics interval must be larger than 0 milliseconds."); + if (params_->should_collect_metrics && !params_->metrics_url_specified) { + // Update the default metrics URL to be associated with the input URL + // instead of localhost + // + size_t colon_pos = params_->url.find(':'); + if (colon_pos != std::string::npos) { + params_->metrics_url = + params_->url.substr(0, colon_pos) + ":8002/metrics"; + } } } diff --git a/command_line_parser.h b/command_line_parser.h index 2a955d2c..461e24e2 100644 --- a/command_line_parser.h +++ b/command_line_parser.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -30,6 +30,7 @@ #include #include #include + #include "constants.h" #include "mpi_utils.h" #include "perf_utils.h" @@ -48,15 +49,20 @@ struct PerfAnalyzerParameters { size_t max_threads = 4; bool max_threads_specified = false; size_t sequence_length = 20; // average length of a sentence + bool sequence_length_specified = false; + double sequence_length_variation = 20.0; int32_t percentile = -1; std::vector user_data; std::unordered_map> input_shapes; + std::vector bls_composing_models; uint64_t measurement_window_ms = 5000; bool using_concurrency_range = false; Range concurrency_range{1, 1, 1}; + std::unordered_map request_parameters; uint64_t latency_threshold_ms = NO_LIMIT; double stability_threshold = 0.1; size_t max_trials = 10; + size_t request_count = 0; bool zero_input = false; size_t string_length = 128; std::string string_data; @@ -65,6 +71,7 @@ struct PerfAnalyzerParameters { bool using_request_rate_range = false; double request_rate_range[3] = {1.0, 1.0, 1.0}; uint32_t num_of_sequences = 4; + bool serial_sequences = false; SearchMode search_mode = SearchMode::LINEAR; Distribution request_distribution = Distribution::CONSTANT; bool using_custom_intervals = false; @@ -78,7 +85,7 @@ struct PerfAnalyzerParameters { clientbackend::GrpcCompressionAlgorithm::COMPRESS_NONE; MeasurementMode measurement_mode = MeasurementMode::TIME_WINDOWS; uint64_t measurement_request_count = 50; - std::string triton_server_path; + std::string triton_server_path = "/opt/tritonserver"; std::string model_repository_path; uint64_t start_sequence_id = 1; uint64_t sequence_id_range = UINT32_MAX; @@ -94,9 +101,10 @@ struct PerfAnalyzerParameters { bool dynamic_concurrency_mode = false; bool url_specified = false; std::string url{"localhost:8000"}; + std::string endpoint{""}; std::string model_name; std::string model_version; - int32_t batch_size = 1; + uint64_t batch_size = 1; bool 
using_batch_size = false; int32_t concurrent_request_count = 1; clientbackend::ProtocolType protocol = clientbackend::ProtocolType::HTTP; @@ -125,8 +133,29 @@ struct PerfAnalyzerParameters { { return ( using_concurrency_range || using_old_options || - !(using_request_rate_range || using_custom_intervals)); + !(using_request_rate_range || using_custom_intervals || + is_using_periodic_concurrency_mode)); } + + // Sets the threshold for PA client overhead. + // Overhead is defined as the percentage of time when PA is doing work and + // requests are not outstanding to the triton server. If the overhead + // percentage exceeds the threshold, a warning is displayed. + // + double overhead_pct_threshold{50.0}; + + // Triton inference request input tensor format. + cb::TensorFormat input_tensor_format{cb::TensorFormat::BINARY}; + + // Triton inference response output tensor format. + cb::TensorFormat output_tensor_format{cb::TensorFormat::BINARY}; + + // The profile export file path. + std::string profile_export_file{""}; + + bool is_using_periodic_concurrency_mode{false}; + Range periodic_concurrency_range{1, 1, 1}; + uint64_t request_period{10}; }; using PAParamsPtr = std::shared_ptr; @@ -135,7 +164,7 @@ class CLParser { public: CLParser() : params_(new PerfAnalyzerParameters{}) {} - // Parse command line arguements into a parameters struct + // Parse command line arguments into a parameters struct // PAParamsPtr Parse(int argc, char** argv); @@ -146,6 +175,7 @@ class CLParser { std::string FormatMessage(std::string str, int offset) const; virtual void Usage(const std::string& msg = std::string()); + void PrintVersion(); void ParseCommandLine(int argc, char** argv); void VerifyOptions(); }; diff --git a/concurrency_ctx_id_tracker.h b/concurrency_ctx_id_tracker.h new file mode 100644 index 00000000..9699fa30 --- /dev/null +++ b/concurrency_ctx_id_tracker.h @@ -0,0 +1,48 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
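The tracker declared just below is deliberately small: it reuses a queue-backed base class (base_queue_ctx_id_tracker.h, which this hunk does not include) and only overrides Reset() to enqueue 'count' copies of context id 0, so at most 'count' requests can be outstanding at once. As a hedged sketch of the base-class contract it appears to rely on (the names here are assumptions, since that header is outside this diff):

// Illustrative only -- the real interface lives in ictx_id_tracker.h and
// base_queue_ctx_id_tracker.h, which are not part of this hunk.
#include <cstddef>
#include <queue>

class IllustrativeQueueCtxIdTracker {
 public:
  // A context id is available while the free queue is non-empty.
  bool IsAvailable() const { return !free_ctx_ids_.empty(); }
  std::size_t Get()
  {
    std::size_t id{free_ctx_ids_.front()};
    free_ctx_ids_.pop();
    return id;
  }
  void Restore(std::size_t id) { free_ctx_ids_.push(id); }

 protected:
  void Clear() { free_ctx_ids_ = std::queue<std::size_t>(); }
  std::queue<std::size_t> free_ctx_ids_;
};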
+#pragma once + +#include "base_queue_ctx_id_tracker.h" + +namespace triton { namespace perfanalyzer { + +// Context ID Tracker that always returns context 0, but ensures that only X +// requests are outstanding at a time +// +class ConcurrencyCtxIdTracker : public BaseQueueCtxIdTracker { + public: + ConcurrencyCtxIdTracker() = default; + void Reset(size_t count) override + { + Clear(); + + for (size_t i = 0; i < count; ++i) { + free_ctx_ids_.push(0); + } + } +}; + +}}; // namespace triton::perfanalyzer diff --git a/concurrency_manager.cc b/concurrency_manager.cc index 54557b58..28386184 100644 --- a/concurrency_manager.cc +++ b/concurrency_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -41,29 +41,17 @@ cb::Error ConcurrencyManager::Create( const bool async, const bool streaming, const int32_t batch_size, const size_t max_threads, const size_t max_concurrency, - const size_t sequence_length, const size_t string_length, - const std::string& string_data, const bool zero_input, - std::vector& user_data, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager) + std::unique_ptr* manager, + const std::unordered_map& + request_parameters) { std::unique_ptr local_manager(new ConcurrencyManager( async, streaming, batch_size, max_threads, max_concurrency, - sequence_length, shared_memory_type, output_shm_size, start_sequence_id, - sequence_id_range, parser, factory)); - - local_manager->threads_config_.reserve(max_threads); - - RETURN_IF_ERROR(local_manager->InitManagerInputs( - string_length, string_data, zero_input, user_data)); - - if (local_manager->shared_memory_type_ != - SharedMemoryType::NO_SHARED_MEMORY) { - RETURN_IF_ERROR(local_manager->InitSharedMemory()); - } + shared_memory_type, output_shm_size, parser, factory, + request_parameters)); *manager = std::move(local_manager); @@ -73,29 +61,43 @@ ConcurrencyManager::Create( ConcurrencyManager::ConcurrencyManager( const bool async, const bool streaming, const int32_t batch_size, const size_t max_threads, const size_t max_concurrency, - const size_t sequence_length, const SharedMemoryType shared_memory_type, - const size_t output_shm_size, const uint64_t start_sequence_id, - const uint64_t sequence_id_range, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, const std::shared_ptr& parser, - const std::shared_ptr& factory) + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters) : LoadManager( - async, streaming, batch_size, max_threads, sequence_length, - shared_memory_type, output_shm_size, start_sequence_id, - sequence_id_range, parser, factory), + async, streaming, batch_size, max_threads, shared_memory_type, + output_shm_size, parser, factory, request_parameters), execute_(true), max_concurrency_(max_concurrency) +{ + threads_config_.reserve(max_threads); +} + +void +ConcurrencyManager::InitManagerFinalize() { if (on_sequence_model_) { - for (uint64_t i = 0; i < max_concurrency_; i++) { - sequence_stat_.emplace_back(new SequenceStat(0)); - } + sequence_manager_->InitSequenceStatuses(max_concurrency_); } } cb::Error 
ConcurrencyManager::ChangeConcurrencyLevel( - const size_t concurrent_request_count) + const size_t concurrent_request_count, const size_t request_count) { - if (on_sequence_model_ && async_) { + PauseSequenceWorkers(); + ReconfigThreads(concurrent_request_count, request_count); + ResumeSequenceWorkers(); + + std::cout << "Request concurrency: " << concurrent_request_count << std::endl; + return cb::Error::Success; +} + +void +ConcurrencyManager::PauseSequenceWorkers() +{ + if (on_sequence_model_) { execute_ = false; // Wait to see all threads are paused. for (auto& thread_config : threads_config_) { @@ -104,426 +106,88 @@ ConcurrencyManager::ChangeConcurrencyLevel( } } } +} + +void +ConcurrencyManager::ReconfigThreads( + size_t concurrent_request_count, size_t request_count) +{ // Always prefer to create new threads if the maximum limit has not been met + // + // While operating in synchronous mode, each context can send only one + // request at a time, hence the number of worker threads should be equal to + // the requested concurrency levels. + // while ((concurrent_request_count > threads_.size()) && (threads_.size() < max_threads_)) { // Launch new thread for inferencing threads_stat_.emplace_back(new ThreadStat()); threads_config_.emplace_back(new ThreadConfig(threads_config_.size())); - // Worker maintains concurrency in different ways. - // For sequence models, multiple contexts must be created for multiple - // concurrent sequences. - // For non-sequence models, one context can send out multiple requests - // at the same time. Thus it uses one single context as every infer context - // creates a worker thread implicitly. - // While operating in synchronous mode, each context can send only one - // request at a time, hence the number of worker threads should be equal to - // the requested concurrency levels. - threads_.emplace_back( - &ConcurrencyManager::Infer, this, threads_stat_.back(), - threads_config_.back()); - } - - // Compute the new concurrency level for each thread (take floor) - // and spread the remaining value - size_t avg_concurrency = concurrent_request_count / threads_.size(); - size_t threads_add_one = concurrent_request_count % threads_.size(); + workers_.push_back( + MakeWorker(threads_stat_.back(), threads_config_.back())); - active_threads_ = 0; - for (size_t i = 0; i < threads_stat_.size(); i++) { - threads_config_[i]->concurrency_ = - avg_concurrency + (i < threads_add_one ? 1 : 0); - if (threads_config_[i]->concurrency_) { - active_threads_++; - } + threads_.emplace_back(&IWorker::Infer, workers_.back()); } - if (on_sequence_model_ && async_) { - execute_ = true; - } + { + // Make sure all threads are reconfigured before they are woken up + std::lock_guard lock(wake_mutex_); - // Make sure all threads will check their updated concurrency level - wake_signal_.notify_all(); - - std::cout << "Request concurrency: " << concurrent_request_count << std::endl; - return cb::Error::Success; -} + // Compute the new concurrency level for each thread (take floor) + // and spread the remaining value + size_t avg_concurrency = concurrent_request_count / threads_.size(); + size_t threads_add_one = concurrent_request_count % threads_.size(); -// Function for worker threads. -// If the model is non-sequence model, each worker uses only one context -// to maintain concurrency assigned to worker. -// If the model is sequence model, each worker has to use multiples contexts -// to maintain (sequence) concurrency assigned to worker. 
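ChangeConcurrencyLevel() above is now a thin three-step sequence: pause any sequence workers, reconfigure (and possibly grow) the worker pool, then resume and wake everyone. The loop below is a hedged sketch of how a caller might drive a concurrency sweep with it; it is illustrative and is not the actual InferenceProfiler code.

// Illustrative caller only; assumes 'manager' was created via
// ConcurrencyManager::Create() and that a measurement window runs per step.
cb::Error
SweepConcurrency(
    ConcurrencyManager& manager, size_t start, size_t end, size_t step)
{
  for (size_t concurrency = start; concurrency <= end; concurrency += step) {
    RETURN_IF_ERROR(manager.ChangeConcurrencyLevel(concurrency));
    // ... collect one measurement window at this concurrency ...
  }
  return cb::Error::Success;
}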
-void -ConcurrencyManager::Infer( - std::shared_ptr thread_stat, - std::shared_ptr thread_config) -{ - std::vector> ctxs; - uint32_t seq_id = 0, ctx_id = 0; - std::queue free_ctx_ids; + size_t avg_req_count = request_count / threads_.size(); + size_t req_count_add_one = request_count % threads_.size(); - // Reserve the vectors in case of sequence models. In non-sequence or - // synchronous mode only one context will be opened hence no need of - // reserving. - if (on_sequence_model_ && async_) { - thread_stat->contexts_stat_.reserve(max_concurrency_); - ctxs.reserve(max_concurrency_); - } + size_t seq_stat_index_offset = 0; + active_threads_ = 0; + for (size_t i = 0; i < threads_stat_.size(); i++) { + size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0); - // Variable used to signal request completion - bool notified = false; - std::mutex cb_mtx; - std::condition_variable cb_cv; + threads_config_[i]->concurrency_ = concurrency; + threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset; - std::atomic total_ongoing_requests(0); - uint64_t request_id = 0; + size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0); + threads_config_[i]->num_requests_ = thread_num_reqs; - // request_id to start timestamp map - std::map async_req_map; + seq_stat_index_offset += concurrency; - // Callback function for handling asynchronous requests - const auto callback_func = [&](cb::InferResult* result) { - uint32_t ctx_id = 0; - std::shared_ptr result_ptr(result); - if (thread_stat->cb_status_.IsOk()) { - // Add the request timestamp to thread Timestamp vector with - // proper locking - std::lock_guard lock(thread_stat->mu_); - thread_stat->cb_status_ = result_ptr->RequestStatus(); - if (thread_stat->cb_status_.IsOk()) { - std::chrono::time_point end_time_async; - end_time_async = std::chrono::system_clock::now(); - std::string request_id; - thread_stat->cb_status_ = result_ptr->Id(&request_id); - const auto& it = async_req_map.find(request_id); - if (it != async_req_map.end()) { - thread_stat->request_timestamps_.emplace_back(std::make_tuple( - it->second.start_time_, end_time_async, it->second.sequence_end_, - false /* delayed */)); - ctx_id = it->second.ctx_id_; - ctxs[ctx_id]->infer_backend_->ClientInferStat( - &(thread_stat->contexts_stat_[ctx_id])); - thread_stat->cb_status_ = ValidateOutputs(*ctxs[ctx_id], result); - async_req_map.erase(request_id); - } + if (concurrency) { + active_threads_++; } } - // avoid competition over 'cb_mtx' - { - std::lock_guard lk(cb_mtx); - free_ctx_ids.push(ctx_id); - notified = true; - } - - total_ongoing_requests--; - - cb_cv.notify_all(); - }; - - // Specify the function as lambda here to work around the possible callback - // lifecycle issue when making this a class member function. 
- // Note that 'free_ctx_ids' must be reconstruct after the call because - // this function doesn't utilize 'free_ctx_ids' in the same way as in main - // loop - const auto complete_onging_sequence_func = [&]() { - if (!on_sequence_model_) { - return cb::Error::Success; - } - size_t offset = 0; - for (size_t i = 0; i < thread_config->thread_id_; i++) { - offset += threads_config_[i]->concurrency_; - } - - for (size_t ctx_id = 0; ctx_id < ctxs.size(); ++ctx_id) { - size_t seq_id = offset + ctx_id; - std::lock_guard guard(sequence_stat_[seq_id]->mtx_); - // Complete the sequence if there are remaining queries - while (sequence_stat_[seq_id]->remaining_queries_ != 0) { - SetInferSequenceOptions(seq_id, ctxs[ctx_id]->options_); - - // Update the inputs if required - if (using_json_data_) { - int step_id = data_loader_->GetTotalSteps( - sequence_stat_[seq_id]->data_stream_id_) - - sequence_stat_[seq_id]->remaining_queries_; - - RETURN_IF_ERROR(UpdateInputs( - ctxs[ctx_id]->inputs_, ctxs[ctx_id]->valid_inputs_, - sequence_stat_[seq_id]->data_stream_id_, step_id)); - RETURN_IF_ERROR(UpdateValidationOutputs( - ctxs[ctx_id]->outputs_, sequence_stat_[seq_id]->data_stream_id_, - step_id, ctxs[ctx_id]->expected_outputs_)); - } - sequence_stat_[seq_id]->remaining_queries_--; - - if (async_) { - ctxs[ctx_id]->options_->request_id_ = "0"; - if (streaming_) { - RETURN_IF_ERROR(ctxs[ctx_id]->infer_backend_->AsyncStreamInfer( - *(ctxs[ctx_id]->options_), ctxs[ctx_id]->inputs_, - ctxs[ctx_id]->outputs_)); - } else { - RETURN_IF_ERROR(ctxs[ctx_id]->infer_backend_->AsyncInfer( - callback_func, *(ctxs[ctx_id]->options_), ctxs[ctx_id]->inputs_, - ctxs[ctx_id]->outputs_)); - } - total_ongoing_requests++; - } else { - cb::InferResult* results = nullptr; - auto err = ctxs[ctx_id]->infer_backend_->Infer( - &results, *(ctxs[ctx_id]->options_), ctxs[ctx_id]->inputs_, - ctxs[ctx_id]->outputs_); - if (results != nullptr) { - delete results; - } - RETURN_IF_ERROR(err); - } - } - } - return cb::Error::Success; - }; - - // run inferencing until receiving exit signal to maintain server load. - do { - if (on_sequence_model_ && async_) { - if (!execute_) { - // Ensures the clean exit of the sequences - auto status = complete_onging_sequence_func(); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = status; - } - while (total_ongoing_requests != 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - // Reconstruct 'free_ctx_ids' because complete_onging_sequence_func() - // has destructive side affects - free_ctx_ids = std::queue(); - for (size_t i = 0; i < ctxs.size(); ++i) { - free_ctx_ids.push(i); - } - // Wait if no request should be sent and it is not exiting - thread_config->is_paused_ = true; - std::unique_lock lock(wake_mutex_); - wake_signal_.wait(lock, [this]() { return early_exit || execute_; }); - } - } - - thread_config->is_paused_ = false; - - // Only interact with synchronous mechanism if the worker should wait - if (thread_config->concurrency_ == 0) { - // Wait if no request should be sent and it is not exiting - std::unique_lock lock(wake_mutex_); - wake_signal_.wait(lock, [&thread_config]() { - return early_exit || (thread_config->concurrency_ > 0); - }); - // Stop executing if concurrency is 0 and early exit is requested - if (early_exit && thread_config->concurrency_ == 0) { - break; - } - } - - size_t num_reqs = thread_config->concurrency_; - - // If the model is non-sequence model, use one InferContext to maintain - // concurrency for this thread. 
- size_t active_ctx_cnt = on_sequence_model_ ? num_reqs : 1; - - while (active_ctx_cnt > ctxs.size()) { - { - std::lock_guard lock(cb_mtx); - free_ctx_ids.push(ctxs.size()); - } - ctxs.emplace_back(new InferContext()); - thread_stat->status_ = - factory_->CreateClientBackend(&(ctxs.back()->infer_backend_)); - ctxs.back()->options_.reset(new cb::InferOptions(parser_->ModelName())); - ctxs.back()->options_->model_version_ = parser_->ModelVersion(); - ctxs.back()->options_->model_signature_name_ = - parser_->ModelSignatureName(); - thread_stat->contexts_stat_.emplace_back(); - if (shared_memory_type_ == SharedMemoryType::NO_SHARED_MEMORY) { - thread_stat->status_ = PrepareInfer(ctxs.back().get()); - } else { - thread_stat->status_ = PrepareSharedMemoryInfer(ctxs.back().get()); - } - if (!thread_stat->status_.IsOk()) { - return; - } - if (streaming_) { - // Decoupled models should not collect client side statistics - thread_stat->status_ = ctxs.back()->infer_backend_->StartStream( - callback_func, (!parser_->IsDecoupled())); - if (!thread_stat->status_.IsOk()) { - return; - } - } - } - - // Create async requests such that the number of ongoing requests - // matches the concurrency level - // Non-sequence model is 'num_reqs' * 1 ctx - // Sequence model is 1 request of 1 sequence * 'active_ctx_cnt' ctxs - while (total_ongoing_requests < (int)num_reqs) { - // Update the inputs if required for non-sequence - if (using_json_data_ && (!on_sequence_model_)) { - int step_id = (thread_config->non_sequence_data_step_id_ % - data_loader_->GetTotalStepsNonSequence()) * - batch_size_; - thread_config->non_sequence_data_step_id_ += active_threads_; - // There will be only one ctx in non-sequence case - thread_stat->status_ = UpdateInputs( - ctxs[ctx_id]->inputs_, ctxs[ctx_id]->valid_inputs_, 0, step_id); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = UpdateValidationOutputs( - ctxs[ctx_id]->outputs_, 0, step_id, - ctxs[ctx_id]->expected_outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - } - - if (on_sequence_model_) { - size_t offset = 0; - for (size_t i = 0; i < thread_config->thread_id_; i++) { - offset += threads_config_[i]->concurrency_; - } - - // Find the next available context id to use for this request - { - std::lock_guard lk(cb_mtx); - ctx_id = free_ctx_ids.front(); - free_ctx_ids.pop(); - } - seq_id = offset + ctx_id; - - { - std::lock_guard guard(sequence_stat_[seq_id]->mtx_); - SetInferSequenceOptions(seq_id, ctxs[ctx_id]->options_); + // TODO REFACTOR TMA-1043 the memory manager should have API to set + // num_active_threads in constructor, as well as overwrite it here + } +} - // Update the inputs if required - if (using_json_data_) { - int step_id = data_loader_->GetTotalSteps( - sequence_stat_[seq_id]->data_stream_id_) - - sequence_stat_[seq_id]->remaining_queries_; +void +ConcurrencyManager::ResumeSequenceWorkers() +{ + if (on_sequence_model_) { + execute_ = true; + } - thread_stat->status_ = UpdateInputs( - ctxs[ctx_id]->inputs_, ctxs[ctx_id]->valid_inputs_, - sequence_stat_[seq_id]->data_stream_id_, step_id); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = UpdateValidationOutputs( - ctxs[ctx_id]->outputs_, - sequence_stat_[seq_id]->data_stream_id_, step_id, - ctxs[ctx_id]->expected_outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - } - sequence_stat_[seq_id]->remaining_queries_--; - } - } - if (async_) { - ctxs[ctx_id]->options_->request_id_ = std::to_string(request_id++); - { - std::lock_guard lock(thread_stat->mu_); - 
auto it = async_req_map - .emplace( - ctxs[ctx_id]->options_->request_id_, - AsyncRequestProperties()) - .first; - it->second.start_time_ = std::chrono::system_clock::now(); - it->second.ctx_id_ = ctx_id; - it->second.sequence_end_ = ctxs[ctx_id]->options_->sequence_end_; - } - if (streaming_) { - thread_stat->status_ = ctxs[ctx_id]->infer_backend_->AsyncStreamInfer( - *(ctxs[ctx_id]->options_), ctxs[ctx_id]->valid_inputs_, - ctxs[ctx_id]->outputs_); - } else { - thread_stat->status_ = ctxs[ctx_id]->infer_backend_->AsyncInfer( - callback_func, *(ctxs[ctx_id]->options_), - ctxs[ctx_id]->valid_inputs_, ctxs[ctx_id]->outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - } else { - std::chrono::time_point start_time_sync, - end_time_sync; - start_time_sync = std::chrono::system_clock::now(); - cb::InferResult* results = nullptr; - thread_stat->status_ = ctxs[ctx_id]->infer_backend_->Infer( - &results, *(ctxs[ctx_id]->options_), ctxs[ctx_id]->valid_inputs_, - ctxs[ctx_id]->outputs_); - if (results != nullptr) { - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = ValidateOutputs(*ctxs[ctx_id], results); - } - delete results; - } - if (!thread_stat->status_.IsOk()) { - return; - } - end_time_sync = std::chrono::system_clock::now(); - { - // Add the request timestamp to thread Timestamp vector with proper - // locking - std::lock_guard lock(thread_stat->mu_); - thread_stat->request_timestamps_.emplace_back(std::make_tuple( - start_time_sync, end_time_sync, - ctxs[ctx_id]->options_->sequence_end_, false /* delayed */)); - thread_stat->status_ = ctxs[ctx_id]->infer_backend_->ClientInferStat( - &(thread_stat->contexts_stat_[ctx_id])); - if (!thread_stat->status_.IsOk()) { - return; - } - } - { - std::lock_guard lock(cb_mtx); - free_ctx_ids.push(ctx_id); - } - } - total_ongoing_requests++; - } + // Make sure all threads will check their updated concurrency level + wake_signal_.notify_all(); +} - if (async_) { - { - // If async, then wait for signal from callback. - std::unique_lock lk(cb_mtx); - cb_cv.wait(lk, [¬ified] { - if (notified) { - notified = false; - return true; - } - return false; - }); - } - } else { - // If synchronous, then all the requests have already been completed. - total_ongoing_requests = 0; - } +std::shared_ptr +ConcurrencyManager::MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) +{ + uint32_t id = workers_.size(); - if (early_exit || (!thread_stat->cb_status_.IsOk())) { - if (async_) { - // Wait for all callbacks to complete. - // Loop to ensure all the inflight requests have been completed. - auto status = complete_onging_sequence_func(); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = status; - } - while (total_ongoing_requests != 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - } - // end loop - break; - } - } while (true); + return std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, max_concurrency_, using_json_data_, + streaming_, batch_size_, wake_signal_, wake_mutex_, active_threads_, + execute_, infer_data_manager_, sequence_manager_); } }} // namespace triton::perfanalyzer diff --git a/concurrency_manager.h b/concurrency_manager.h index 6e77d673..c6c90f1d 100644 --- a/concurrency_manager.h +++ b/concurrency_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,10 +25,15 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#include "concurrency_worker.h" #include "load_manager.h" namespace triton { namespace perfanalyzer { +#ifndef DOCTEST_CONFIG_DISABLE +class TestConcurrencyManager; +#endif + //============================================================================== /// ConcurrencyManager is a helper class to send inference requests to inference /// server consistently, based on the specified setting, so that the @@ -57,7 +62,6 @@ class ConcurrencyManager : public LoadManager { /// \param batch_size The batch size used for each request. /// \param max_threads The maximum number of working threads to be spawned. /// \param max_concurrency The maximum concurrency which will be requested. - /// \param sequence_length The base length of each sequence. /// \param string_length The length of the string to create for input. /// \param string_data The data to use for generating string input. /// \param zero_input Whether to fill the input tensors with zero. @@ -70,58 +74,40 @@ class ConcurrencyManager : public LoadManager { /// \param factory The ClientBackendFactory object used to create /// client to the server. /// \param manager Returns a new ConcurrencyManager object. + /// \param request_parameters Custom request parameters to send to the server /// \return cb::Error object indicating success or failure. static cb::Error Create( const bool async, const bool streaming, const int32_t batch_size, const size_t max_threads, const size_t max_concurrency, - const size_t sequence_length, const size_t string_length, - const std::string& string_data, const bool zero_input, - std::vector& user_data, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager); + std::unique_ptr* manager, + const std::unordered_map& + request_parameters); /// Adjusts the number of concurrent requests to be the same as /// 'concurrent_request_count' (by creating or pausing threads) /// \param concurent_request_count The number of concurrent requests. + /// \param request_count The number of requests to generate. If 0, then + /// there is no limit, and it will generate until told to stop. /// \return cb::Error object indicating success or failure. 
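Create() above now takes the custom request parameters directly and no longer carries the sequence/string/input-data arguments, which have moved into the data and sequence managers. Below is a hedged sketch of a call site against the new signature; the surrounding params, parser, and factory objects (and the LoadManager pointer type) are assumptions, and this is not code taken from perf_analyzer.cc.

// Illustrative call site only (argument names and setup are assumptions).
std::unique_ptr<LoadManager> manager;
RETURN_IF_ERROR(ConcurrencyManager::Create(
    params->async, params->streaming, params->batch_size, params->max_threads,
    params->max_concurrency, params->shared_memory_type,
    params->output_shm_size, parser, factory, &manager,
    params->request_parameters));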
- cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count); + cb::Error ChangeConcurrencyLevel( + const size_t concurrent_request_count, const size_t request_count = 0); + + protected: + // Makes a new worker + virtual std::shared_ptr MakeWorker( + std::shared_ptr, std::shared_ptr); - private: ConcurrencyManager( const bool async, const bool streaming, const int32_t batch_size, const size_t max_threads, const size_t max_concurrency, - const size_t sequence_length, const SharedMemoryType shared_memory_type, - const size_t output_shm_size, const uint64_t start_sequence_id, - const uint64_t sequence_id_range, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, const std::shared_ptr& parser, - const std::shared_ptr& factory); - - struct ThreadConfig { - ThreadConfig(size_t thread_id) - : thread_id_(thread_id), concurrency_(0), - non_sequence_data_step_id_(thread_id), is_paused_(false) - { - } - - // ID of corresponding worker thread - size_t thread_id_; - // The concurrency level that the worker should produce - size_t concurrency_; - // The current data step id in case of non-sequence model - size_t non_sequence_data_step_id_; - // Whether or not the thread is issuing new inference requests - bool is_paused_; - }; - - /// Function for worker that sends inference requests. - /// \param thread_stat Worker thread status specific data. - /// \param thread_config Worker thread configuration specific data. - void Infer( - std::shared_ptr thread_stat, - std::shared_ptr thread_config); + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters); // The number of worker threads with non-zero concurrencies size_t active_threads_; @@ -129,7 +115,31 @@ class ConcurrencyManager : public LoadManager { bool execute_; size_t max_concurrency_; + std::vector> threads_config_; + + private: + void InitManagerFinalize() override; + + // Pause all worker threads that are working on sequences + // + void PauseSequenceWorkers(); + + // Create new threads (if necessary), and then reconfigure all worker threads + // to handle the new concurrent request count + // + void ReconfigThreads(size_t concurrent_request_count, size_t request_count); + + // Restart all worker threads that were working on sequences + // + void ResumeSequenceWorkers(); + +#ifndef DOCTEST_CONFIG_DISABLE + friend TestConcurrencyManager; + + public: + ConcurrencyManager() = default; +#endif }; }} // namespace triton::perfanalyzer diff --git a/concurrency_worker.cc b/concurrency_worker.cc new file mode 100644 index 00000000..37a562f7 --- /dev/null +++ b/concurrency_worker.cc @@ -0,0 +1,208 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "concurrency_worker.h" + +#include + +#include "client_backend/client_backend.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +// Function for worker threads. +// If the model is non-sequence model, each worker uses only one context +// to maintain concurrency assigned to worker. +// If the model is sequence model, each worker has to use multiples contexts +// to maintain (sequence) concurrency assigned to worker. +void +ConcurrencyWorker::Infer() +{ + CreateCtxIdTracker(); + ReserveContexts(); + + // run inferencing until receiving exit signal to maintain server load. + do { + if (RunInference()) { + break; + } + } while (true); +} + +bool +ConcurrencyWorker::RunInference() +{ + HandleExecuteOff(); + if (HandleNoConcurrency()) { + return true; + } + CreateContextsAsNecessary(); + if (HandleExitConditions()) { + return true; + } + SendInferRequests(); + if (HandleExitConditions()) { + return true; + } + WaitForResponses(); + if (HandleExitConditions()) { + return true; + } + return false; +} + +void +ConcurrencyWorker::CreateCtxIdTracker() +{ + bool is_concurrency = true; + bool serial_sequences = false; + ctx_id_tracker_ = CtxIdTrackerFactory::CreateTracker( + is_concurrency, on_sequence_model_, serial_sequences); +} + +void +ConcurrencyWorker::ReserveContexts() +{ + // Reserve the vectors in case of sequence models. In non-sequence or + // synchronous mode only one context will be opened hence no need of + // reserving. 
+ if (on_sequence_model_ && async_) { + thread_stat_->contexts_stat_.reserve(max_concurrency_); + ctxs_.reserve(max_concurrency_); + } +} + +void +ConcurrencyWorker::HandleExecuteOff() +{ + if (on_sequence_model_) { + if (!execute_) { + // Ensures the clean exit of the sequences + CompleteOngoingSequences(); + WaitForOngoingRequests(); + + // Reset Ctx IDs because CompleteOngoingSequences() + // has destructive side affects + ResetFreeCtxIds(); + + // Wait if no request should be sent and it is not exiting + thread_config_->is_paused_ = true; + std::unique_lock lock(wake_mutex_); + wake_signal_.wait(lock, [this]() { return early_exit || execute_; }); + + // TODO REFACTOR TMA-1043 - memory manager should be handling this instead + // of here + for (auto ctx : ctxs_) { + ctx->SetNumActiveThreads(active_threads_); + } + } + } + thread_config_->is_paused_ = false; +} + +bool +ConcurrencyWorker::HandleNoConcurrency() +{ + // Only interact with synchronous mechanism if the worker should wait + if (thread_config_->concurrency_ == 0) { + // Wait if no request should be sent and it is not exiting + std::unique_lock lock(wake_mutex_); + wake_signal_.wait(lock, [this]() { + return early_exit || (thread_config_->concurrency_ > 0); + }); + // Stop executing if concurrency is 0 and early exit is requested + if (early_exit && thread_config_->concurrency_ == 0) { + return true; + } + } + return false; +} + +void +ConcurrencyWorker::CreateContextsAsNecessary() +{ + // If the model is non-sequence model, use one InferContext to + // maintain concurrency for this thread. + size_t active_ctx_cnt = on_sequence_model_ ? thread_config_->concurrency_ : 1; + + if (active_ctx_cnt > ctxs_.size()) { + while (active_ctx_cnt > ctxs_.size()) { + CreateContext(); + } + ResetFreeCtxIds(); + } + + // TODO REFACTOR TMA-1043 -- this shouldn't be handled here + for (auto ctx : ctxs_) { + ctx->SetNumActiveThreads(active_threads_); + } +} + +void +ConcurrencyWorker::SendInferRequests() +{ + while (ctx_id_tracker_->IsAvailable() && execute_ && !ShouldExit()) { + uint32_t ctx_id = GetCtxId(); + SendInferRequest(ctx_id); + RestoreFreeCtxId(ctx_id); + } +} + + +void +ConcurrencyWorker::WaitForResponses() +{ + if (async_) { + { + // If async, then wait for signal from callback. + std::unique_lock lk(cb_mtx_); + thread_stat_->idle_timer.Start(); + cb_cv_.wait(lk, [this] { + if (notified_) { + notified_ = false; + return true; + } + return false; + }); + thread_stat_->idle_timer.Stop(); + } + } +} + +void +ConcurrencyWorker::ResetFreeCtxIds() +{ + std::lock_guard lock(cb_mtx_); + ctx_id_tracker_->Reset(thread_config_->concurrency_); +} + +uint32_t +ConcurrencyWorker::GetSeqStatIndex(uint32_t ctx_id) +{ + return (thread_config_->seq_stat_index_offset_ + ctx_id); +} + +}} // namespace triton::perfanalyzer diff --git a/concurrency_worker.h b/concurrency_worker.h new file mode 100644 index 00000000..4645f07a --- /dev/null +++ b/concurrency_worker.h @@ -0,0 +1,122 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "load_worker.h" +#include "sequence_manager.h" +#include "thread_config.h" + +namespace triton { namespace perfanalyzer { + + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockConcurrencyWorker; +#endif + +/// Worker thread for the ConcurrencyManager +/// +/// The worker maintains concurrency in different ways: +/// For sequence models, multiple contexts must be created for multiple +/// concurrent sequences. +/// +/// For non-sequence models, one context can send out multiple requests +/// at the same time. Thus it uses one single context as every infer context +/// creates a worker thread implicitly. +/// +class ConcurrencyWorker : public LoadWorker { + public: + ConcurrencyWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, + const size_t max_concurrency, const bool using_json_data, + const bool streaming, const int32_t batch_size, + std::condition_variable& wake_signal, std::mutex& wake_mutex, + size_t& active_threads, bool& execute, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : LoadWorker( + id, thread_stat, thread_config, parser, data_loader, factory, + on_sequence_model, async, streaming, batch_size, using_json_data, + wake_signal, wake_mutex, execute, infer_data_manager, + sequence_manager), + max_concurrency_(max_concurrency), active_threads_(active_threads) + { + } + + virtual void Infer() override; + + protected: + bool RunInference(); + + void CreateCtxIdTracker(); + + // Reserve vector size for contexts + void ReserveContexts(); + + private: + const size_t max_concurrency_; + // TODO REFACTOR TMA-1020 can we decouple this thread from the total count of + // threads? 
+ size_t& active_threads_; + + // Handle the case where execute_ is false + void HandleExecuteOff(); + + // Handle the case where this thread is configured to do nothing + // Returns true if an exit condition was met + bool HandleNoConcurrency(); + + // Create and populate contexts if needed + void CreateContextsAsNecessary(); + + // Send out the desired concurrency of requests + void SendInferRequests(); + + void WaitForResponses(); + + void ResetFreeCtxIds(); + + uint32_t GetSeqStatIndex(uint32_t ctx_id) override; + + void CreateContextFinalize(std::shared_ptr ctx) override + { + ctx->RegisterAsyncCallbackFinalize(std::bind( + &ConcurrencyWorker::AsyncCallbackFinalize, this, + std::placeholders::_1)); + } + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockConcurrencyWorker; +#endif +}; + +}} // namespace triton::perfanalyzer diff --git a/constants.h b/constants.h index 2de968f4..44380678 100644 --- a/constants.h +++ b/constants.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,11 +26,15 @@ #pragma once #include +#include -// namespace pa = triton::perfanalyzer; - +#define STRINGIFY_(x) #x +#define STRINGIFY(x) STRINGIFY_(x) namespace triton { namespace perfanalyzer { +const std::string SHA{STRINGIFY(GIT_SHA)}; +const std::string VERSION{STRINGIFY(PERF_ANALYZER_VERSION)}; + constexpr static const uint32_t SUCCESS = 0; constexpr static const uint32_t STABILITY_ERROR = 2; @@ -38,6 +42,8 @@ constexpr static const uint32_t OPTION_ERROR = 3; constexpr static const uint32_t GENERIC_ERROR = 99; +const double DELAY_PCT_THRESHOLD{1.0}; + /// Different measurement modes possible. enum MeasurementMode { TIME_WINDOWS = 0, COUNT_WINDOWS = 1 }; diff --git a/ctx_id_tracker_factory.h b/ctx_id_tracker_factory.h new file mode 100644 index 00000000..0a455fc9 --- /dev/null +++ b/ctx_id_tracker_factory.h @@ -0,0 +1,67 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include <memory>
+
+#include "concurrency_ctx_id_tracker.h"
+#include "fifo_ctx_id_tracker.h"
+#include "rand_ctx_id_tracker.h"
+
+namespace triton { namespace perfanalyzer {
+
+// Factory for creating the context ID tracker appropriate to the load mode
+// and model type
+//
+class CtxIdTrackerFactory {
+ public:
+  CtxIdTrackerFactory() = delete;
+
+  /// Creates and returns a Context Id Tracker
+  ///
+  /// \param is_concurrency True if targeting Concurrency
+  /// \param is_sequence_model True if the model is a sequence model
+  /// \param serial_sequences True if in serial sequence mode
+  ///
+  static std::shared_ptr<ICtxIdTracker> CreateTracker(
+      bool is_concurrency, bool is_sequence_model, bool serial_sequences)
+  {
+    if (is_concurrency) {
+      if (is_sequence_model) {
+        return std::make_shared<FifoCtxIdTracker>();
+      } else {
+        return std::make_shared<ConcurrencyCtxIdTracker>();
+      }
+    } else {
+      if (is_sequence_model && serial_sequences) {
+        return std::make_shared<FifoCtxIdTracker>();
+      } else {
+        return std::make_shared<RandCtxIdTracker>();
+      }
+    }
+  }
+};
+
+}} // namespace triton::perfanalyzer
diff --git a/custom_load_manager.cc b/custom_load_manager.cc
index 6d5aaac7..55a20a69 100644
--- a/custom_load_manager.cc
+++ b/custom_load_manager.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,38 +26,30 @@ #include "custom_load_manager.h" +#include + +#include "constants.h" + namespace triton { namespace perfanalyzer { cb::Error CustomLoadManager::Create( const bool async, const bool streaming, - const uint64_t measurement_window_ms, + const uint64_t measurement_window_ms, const size_t max_trials, const std::string& request_intervals_file, const int32_t batch_size, const size_t max_threads, const uint32_t num_of_sequences, - const size_t sequence_length, const size_t string_length, - const std::string& string_data, const bool zero_input, - std::vector& user_data, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, + const bool serial_sequences, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager) + std::unique_ptr* manager, + const std::unordered_map& + request_parameters) { std::unique_ptr local_manager(new CustomLoadManager( async, streaming, request_intervals_file, batch_size, - measurement_window_ms, max_threads, num_of_sequences, sequence_length, - shared_memory_type, output_shm_size, start_sequence_id, sequence_id_range, - parser, factory)); - - local_manager->threads_config_.reserve(max_threads); - - RETURN_IF_ERROR(local_manager->InitManagerInputs( - string_length, string_data, zero_input, user_data)); - - if (local_manager->shared_memory_type_ != - SharedMemoryType::NO_SHARED_MEMORY) { - RETURN_IF_ERROR(local_manager->InitSharedMemory()); - } + measurement_window_ms, max_trials, max_threads, num_of_sequences, + shared_memory_type, output_shm_size, serial_sequences, parser, factory, + request_parameters)); *manager = std::move(local_manager); @@ -67,42 +59,83 @@ CustomLoadManager::Create( CustomLoadManager::CustomLoadManager( const bool async, const bool streaming, const std::string& request_intervals_file, int32_t batch_size, - const uint64_t measurement_window_ms, const size_t max_threads, - const uint32_t num_of_sequences, const size_t sequence_length, + const uint64_t measurement_window_ms, const size_t max_trials, + const size_t max_threads, const uint32_t num_of_sequences, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, - const std::shared_ptr& factory) + const bool serial_sequences, const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters) : RequestRateManager( async, streaming, Distribution::CUSTOM, batch_size, - measurement_window_ms, max_threads, num_of_sequences, sequence_length, - shared_memory_type, output_shm_size, start_sequence_id, - sequence_id_range, parser, factory), + measurement_window_ms, max_trials, max_threads, num_of_sequences, + shared_memory_type, output_shm_size, serial_sequences, parser, + factory, request_parameters), request_intervals_file_(request_intervals_file) { } cb::Error -CustomLoadManager::InitCustomIntervals() +CustomLoadManager::InitCustomIntervals(const size_t request_count) +{ + PauseWorkers(); + ConfigureThreads(request_count); + auto status = GenerateSchedule(); + ResumeWorkers(); + return status; +} + +cb::Error +CustomLoadManager::GenerateSchedule() { - schedule_.clear(); - schedule_.emplace_back(0); - if 
(!request_intervals_file_.empty()) { - RETURN_IF_ERROR( - ReadTimeIntervalsFile(request_intervals_file_, &custom_intervals_)); - size_t index = 0; - while (schedule_.back() < *gen_duration_) { - std::chrono::nanoseconds next_timestamp( - schedule_.back() + custom_intervals_[index++]); - schedule_.emplace_back(next_timestamp); - if (index == custom_intervals_.size()) { - index = 0; - } - } + if (request_intervals_file_.empty()) { + return cb::Error::Success; } + + RETURN_IF_ERROR( + ReadTimeIntervalsFile(request_intervals_file_, &custom_intervals_)); + + auto worker_schedules = CreateWorkerSchedules(); + GiveSchedulesToWorkers(worker_schedules); return cb::Error::Success; } +std::vector +CustomLoadManager::CreateWorkerSchedules() +{ + std::vector worker_schedules = + CreateEmptyWorkerSchedules(); + std::vector thread_ids{CalculateThreadIds()}; + + size_t thread_id_index = 0; + size_t worker_index = 0; + size_t intervals_index = 0; + + std::chrono::nanoseconds next_timestamp(0); + + bool started = false; + + // Keep filling the schedule until both the thread_ids (which can differ if + // sequences are enabled) and the intervals are both at the end of their + // lists. This effectively finds the least common multiple of the two sizes + // and makes sure that the schedule is complete and can be repeated + // indefinitely + // + while (!started || thread_id_index != 0 || intervals_index != 0) { + started = true; + next_timestamp += custom_intervals_[intervals_index]; + worker_index = thread_ids[thread_id_index]; + worker_schedules[worker_index]->intervals.emplace_back(next_timestamp); + + thread_id_index = (thread_id_index + 1) % thread_ids.size(); + intervals_index = (intervals_index + 1) % custom_intervals_.size(); + } + + SetScheduleDurations(worker_schedules); + + return worker_schedules; +} + cb::Error CustomLoadManager::GetCustomRequestRate(double* request_rate) { @@ -115,7 +148,30 @@ CustomLoadManager::GetCustomRequestRate(double* request_rate) } *request_rate = - (custom_intervals_.size() * 1000 * 1000 * 1000) / (total_time_ns); + (custom_intervals_.size() * NANOS_PER_SECOND) / (total_time_ns); + return cb::Error::Success; +} + +cb::Error +CustomLoadManager::ReadTimeIntervalsFile( + const std::string& path, NanoIntervals* contents) +{ + std::ifstream in(path); + if (!in) { + return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR); + } + + std::string current_string; + while (std::getline(in, current_string)) { + std::chrono::nanoseconds curent_time_interval_ns( + std::stol(current_string) * 1000); + contents->push_back(curent_time_interval_ns); + } + in.close(); + + if (contents->size() == 0) { + return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); + } return cb::Error::Success; } diff --git a/custom_load_manager.h b/custom_load_manager.h index 30bd559a..39c51d99 100644 --- a/custom_load_manager.h +++ b/custom_load_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,10 +25,19 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
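For a concrete sense of how the interval file handling above works: `ReadTimeIntervalsFile` reads one integer per line, treats it as microseconds, and scales it to nanoseconds, while `GetCustomRequestRate` divides the number of intervals by the total time they span. The standalone sketch below reproduces that arithmetic in isolation; the file name and the `NANOS_PER_SECOND` value are assumptions for illustration and are not taken from the patch.

```cpp
// Standalone sketch (not part of the patch): given a request-intervals file
// with one microsecond value per line, compute the equivalent request rate
// the same way GetCustomRequestRate does above.
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

int
main()
{
  constexpr int64_t NANOS_PER_SECOND = 1000000000;  // assumed constant value

  // Each line holds an interval in microseconds, e.g. "5000" == 5 ms.
  std::ifstream in("request_intervals.txt");  // hypothetical file name
  std::vector<std::chrono::nanoseconds> intervals;
  std::string line;
  while (std::getline(in, line)) {
    intervals.emplace_back(std::stol(line) * 1000);  // usec -> nsec
  }
  if (intervals.empty()) {
    std::cerr << "no intervals read" << std::endl;
    return 1;
  }

  // rate = number of intervals / total time spanned by those intervals
  const int64_t total_ns =
      std::accumulate(
          intervals.begin(), intervals.end(), std::chrono::nanoseconds(0))
          .count();
  const double request_rate =
      static_cast<double>(intervals.size() * NANOS_PER_SECOND) / total_ns;
  std::cout << "request rate: " << request_rate << " infer/sec" << std::endl;
  return 0;
}
```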
#pragma once +#include +#include +#include + +#include "client_backend/client_backend.h" #include "request_rate_manager.h" namespace triton { namespace perfanalyzer { +#ifndef DOCTEST_CONFIG_DISABLE +class TestCustomLoadManager; +#endif + //============================================================================== /// CustomLoadManager is a helper class to send inference requests to /// inference server in accordance with user provided time intervals. This @@ -44,13 +53,13 @@ class CustomLoadManager : public RequestRateManager { /// request. /// \param streaming Whether to use gRPC streaming API for infer request /// \param measurement_window_ms The time window for measurements. + /// \param max_trials The maximum number of windows that will be measured /// \param request_intervals_file The path to the file to use to pick up the /// time intervals between the successive requests. /// \param batch_size The batch size used for each request. /// \param max_threads The maximum number of working threads to be spawned. /// \param num_of_sequences The number of concurrent sequences that must be /// maintained on the server. - /// \param sequence_length The base length of each sequence. /// \param zero_input Whether to fill the input tensors with zero. /// \param input_shapes The shape of the input tensors. /// \param user_data The vector containing path/paths to user-provided data @@ -58,29 +67,31 @@ class CustomLoadManager : public RequestRateManager { /// \param shared_memory_type The type of shared memory to use for inputs. /// \param output_shm_size The size of the shared memory to allocate for the /// output. + /// \param serial_sequences Enable serial sequence mode. /// \param parser The ModelParser object to get the model details. /// \param factory The ClientBackendFactory object used to create /// client to the server. /// \param manager Returns a new ConcurrencyManager object. + /// \param request_parameters Custom request parameters to send to the server /// \return cb::Error object indicating success or failure. static cb::Error Create( const bool async, const bool streaming, - const uint64_t measurement_window_ms, + const uint64_t measurement_window_ms, const size_t max_trials, const std::string& request_intervals_file, const int32_t batch_size, const size_t max_threads, const uint32_t num_of_sequences, - const size_t sequence_length, const size_t string_length, - const std::string& string_data, const bool zero_input, - std::vector& user_data, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, + const bool serial_sequences, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager); + std::unique_ptr* manager, + const std::unordered_map& + request_parameter); /// Initializes the load manager with the provided file containing request /// intervals + /// \param request_count The number of requests to generate. If 0, then + /// there is no limit, and it will generate until told to stop. /// \return cb::Error object indicating success or failure. - cb::Error InitCustomIntervals(); + cb::Error InitCustomIntervals(const size_t request_count); /// Computes the request rate from the time interval file. Fails with an error /// if the file is not present or is empty. 
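To make the schedule-splitting logic of `CreateWorkerSchedules` above more concrete, here is a small standalone sketch of the same round-robin fill: it walks the per-request worker assignments and the custom intervals in lockstep until both wrap around together, so the combined schedule length is effectively the least common multiple of the two list sizes. The types and sample values are simplified stand-ins for illustration, not the classes used by the patch.

```cpp
// Standalone sketch (simplified types, not the patch's classes): distribute a
// repeating list of intervals across workers the way CreateWorkerSchedules
// does, stopping once both index sequences wrap around at the same time.
#include <chrono>
#include <cstddef>
#include <iostream>
#include <vector>

int
main()
{
  using std::chrono::nanoseconds;

  // Which worker owns each successive request (differs when sequences are on).
  const std::vector<size_t> thread_ids{0, 1, 0};
  // Custom gaps between successive requests.
  const std::vector<nanoseconds> intervals{nanoseconds(100), nanoseconds(250)};

  std::vector<std::vector<nanoseconds>> schedules(2);  // one per worker

  size_t thread_id_index = 0;
  size_t intervals_index = 0;
  nanoseconds next_timestamp(0);
  bool started = false;

  // Loop until both lists are back at index 0 (their least common multiple).
  while (!started || thread_id_index != 0 || intervals_index != 0) {
    started = true;
    next_timestamp += intervals[intervals_index];
    schedules[thread_ids[thread_id_index]].push_back(next_timestamp);

    thread_id_index = (thread_id_index + 1) % thread_ids.size();
    intervals_index = (intervals_index + 1) % intervals.size();
  }

  // With 3 assignments and 2 intervals this emits 6 timestamps in total.
  for (size_t w = 0; w < schedules.size(); w++) {
    std::cout << "worker " << w << ":";
    for (const auto& ts : schedules[w]) {
      std::cout << " " << ts.count() << "ns";
    }
    std::cout << "\n";
  }
  return 0;
}
```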
@@ -93,15 +104,34 @@ class CustomLoadManager : public RequestRateManager { CustomLoadManager( const bool async, const bool streaming, const std::string& request_intervals_file, const int32_t batch_size, - const uint64_t measurement_window_ms, const size_t max_threads, - const uint32_t num_of_sequences, const size_t sequence_length, + const uint64_t measurement_window_ms, const size_t max_trials, + const size_t max_threads, const uint32_t num_of_sequences, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, - const std::shared_ptr& factory); + const bool serial_sequences, const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters); + + cb::Error GenerateSchedule(); + + std::vector CreateWorkerSchedules(); + + /// Reads the time intervals file and stores intervals in vector + /// \param path Filesystem path of the time intervals file. + /// \param contents Output intervals vector. + /// \return cb::Error object indicating success or failure. + virtual cb::Error ReadTimeIntervalsFile( + const std::string& path, NanoIntervals* contents); std::string request_intervals_file_; - std::vector custom_intervals_; + NanoIntervals custom_intervals_; + +#ifndef DOCTEST_CONFIG_DISABLE + friend TestCustomLoadManager; + + public: + CustomLoadManager() = default; +#endif }; }} // namespace triton::perfanalyzer diff --git a/data_loader.cc b/data_loader.cc index cf131b07..38bfe940 100644 --- a/data_loader.cc +++ b/data_loader.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,6 +28,7 @@ #include #include + #include namespace triton { namespace perfanalyzer { @@ -37,6 +38,34 @@ DataLoader::DataLoader(const size_t batch_size) { } +cb::Error +DataLoader::ValidateIOExistsInModel( + const std::shared_ptr& inputs, + const std::shared_ptr& outputs, + const std::string& data_directory) +{ + if (!std::filesystem::exists(data_directory) || + !std::filesystem::is_directory(data_directory)) { + return cb::Error( + "Error: Directory does not exist or is not a directory: " + + std::string(data_directory), + pa::GENERIC_ERROR); + } + + for (const auto& file : std::filesystem::directory_iterator(data_directory)) { + std::string io_name = file.path().filename().string(); + if (inputs->find(io_name) == inputs->end() && + outputs->find(io_name) == outputs->end()) { + return cb::Error( + "Provided data file '" + io_name + + "' does not correspond to a valid model input or output.", + pa::GENERIC_ERROR); + } + } + + return cb::Error::Success; +} + cb::Error DataLoader::ReadDataFromDir( const std::shared_ptr& inputs, @@ -90,8 +119,8 @@ DataLoader::ReadDataFromDir( if (input_string_data.size() != batch1_num_strings) { return cb::Error( "provided data for input " + input.second.name_ + " has " + - std::to_string(it->second.size()) + " byte elements, expect " + - std::to_string(batch1_num_strings), + std::to_string(input_string_data.size()) + + " elements, expect " + std::to_string(batch1_num_strings), pa::GENERIC_ERROR); } } @@ -142,25 +171,36 @@ DataLoader::ReadDataFromJSON( const unsigned int parseFlags = rapidjson::kParseNanAndInfFlag; d.ParseStream(fs); - if (d.HasParseError()) { - std::cerr << "cb::Error : " << d.GetParseError() << '\n' - << "Offset : " << d.GetErrorOffset() << '\n'; + fclose(data_file); + + return ParseData(d, inputs, outputs); +} + +cb::Error +DataLoader::ParseData( + const rapidjson::Document& json, + const std::shared_ptr& inputs, + const std::shared_ptr& outputs) +{ + if (json.HasParseError()) { + std::cerr << "cb::Error : " << json.GetParseError() << '\n' + << "Offset : " << json.GetErrorOffset() << '\n'; return cb::Error( "failed to parse the specified json file for reading provided data", pa::GENERIC_ERROR); } - if (!d.HasMember("data")) { + if (!json.HasMember("data")) { return cb::Error( "The json file doesn't contain data field", pa::GENERIC_ERROR); } - const rapidjson::Value& streams = d["data"]; + const rapidjson::Value& streams = json["data"]; // Validation data is optional, once provided, it must align with 'data' const rapidjson::Value* out_streams = nullptr; - if (d.HasMember("validation_data")) { - out_streams = &d["validation_data"]; + if (json.HasMember("validation_data")) { + out_streams = &json["validation_data"]; if (out_streams->Size() != streams.Size()) { return cb::Error( "The 'validation_data' field doesn't align with 'data' field in the " @@ -177,6 +217,9 @@ DataLoader::ReadDataFromJSON( const rapidjson::Value& steps = streams[i - offset]; const rapidjson::Value* output_steps = (out_streams == nullptr) ? 
nullptr : &(*out_streams)[i - offset]; + + RETURN_IF_ERROR(ValidateParsingMode(steps)); + if (steps.IsArray()) { step_num_.push_back(steps.Size()); for (size_t k = 0; k < step_num_[i]; k++) { @@ -222,9 +265,7 @@ DataLoader::ReadDataFromJSON( } } - max_non_sequence_step_id_ = std::max(1, (int)(step_num_[0] / batch_size_)); - fclose(data_file); return cb::Error::Success; } @@ -310,46 +351,31 @@ DataLoader::GenerateData( cb::Error DataLoader::GetInputData( const ModelTensor& input, const int stream_id, const int step_id, - const uint8_t** data_ptr, size_t* batch1_size) + TensorData& data) { - bool data_found = false; + data.data_ptr = nullptr; + data.batch1_size = 0; + data.is_valid = false; // If json data is available then try to retrieve the data from there if (!input_data_.empty()) { - // validate if the indices conform to the vector sizes - if (stream_id < 0 || stream_id >= (int)data_stream_cnt_) { - return cb::Error( - "stream_id for retrieving the data should be less than " + - std::to_string(data_stream_cnt_) + ", got " + - std::to_string(stream_id), - pa::GENERIC_ERROR); - } - if (step_id < 0 || step_id >= (int)step_num_[stream_id]) { - return cb::Error( - "step_id for retrieving the data should be less than " + - std::to_string(step_num_[stream_id]) + ", got " + - std::to_string(step_id), - pa::GENERIC_ERROR); - } + RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id)); + std::string key_name( input.name_ + "_" + std::to_string(stream_id) + "_" + std::to_string(step_id)); + // Get the data and the corresponding byte-size auto it = input_data_.find(key_name); if (it != input_data_.end()) { - if (input.datatype_.compare("BYTES") != 0) { - *batch1_size = it->second.size(); - } else { - std::vector* string_data; - string_data = &it->second; - *batch1_size = string_data->size(); - } - *data_ptr = (const uint8_t*)&((it->second)[0]); - data_found = true; + std::vector* data_vec = &it->second; + data.is_valid = true; + data.batch1_size = data_vec->size(); + data.data_ptr = (const uint8_t*)data_vec->data(); } } - if (!data_found) { + if (!data.is_valid) { if ((input.datatype_.compare("BYTES") != 0) && (input_buf_.size() != 0)) { int64_t byte_size = ByteSize(input.shape_, input.datatype_); if (byte_size < 0) { @@ -357,13 +383,13 @@ DataLoader::GetInputData( "failed to get correct byte size for '" + input.name_ + "'.", pa::GENERIC_ERROR); } - *batch1_size = (size_t)byte_size; - *data_ptr = &input_buf_[0]; - data_found = true; + data.batch1_size = (size_t)byte_size; + data.data_ptr = &input_buf_[0]; + data.is_valid = true; } } - if (input.is_optional_ == false && !data_found) { + if (input.is_optional_ == false && !data.is_valid) { return cb::Error( "unable to find data for input '" + input.name_ + "'.", pa::GENERIC_ERROR); @@ -375,40 +401,54 @@ DataLoader::GetInputData( cb::Error DataLoader::GetOutputData( const std::string& output_name, const int stream_id, const int step_id, - const uint8_t** data_ptr, size_t* batch1_size) + TensorData& data) { - *data_ptr = nullptr; - *batch1_size = 0; + data.data_ptr = nullptr; + data.batch1_size = 0; + data.is_valid = false; + data.name = ""; + // If json data is available then try to retrieve the data from there if (!output_data_.empty()) { - // validate if the indices conform to the vector sizes - if (stream_id < 0 || stream_id >= (int)data_stream_cnt_) { - return cb::Error( - "stream_id for retrieving the data should be less than " + - std::to_string(data_stream_cnt_) + ", got " + - std::to_string(stream_id), - pa::GENERIC_ERROR); - } - if (step_id < 0 
|| step_id >= (int)step_num_[stream_id]) { - return cb::Error( - "step_id for retrieving the data should be less than " + - std::to_string(step_num_[stream_id]) + ", got " + - std::to_string(step_id), - pa::GENERIC_ERROR); - } + RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id)); + std::string key_name( output_name + "_" + std::to_string(stream_id) + "_" + std::to_string(step_id)); // Get the data and the corresponding byte-size auto it = output_data_.find(key_name); if (it != output_data_.end()) { - *batch1_size = it->second.size(); - *data_ptr = (const uint8_t*)&((it->second)[0]); + std::vector* data_vec = &it->second; + data.is_valid = true; + data.batch1_size = data_vec->size(); + data.data_ptr = (const uint8_t*)data_vec->data(); + data.name = output_name; } } return cb::Error::Success; } +cb::Error +DataLoader::ValidateIndexes(int stream_id, int step_id) +{ + if (stream_id < 0 || stream_id >= (int)data_stream_cnt_) { + return cb::Error( + "stream_id for retrieving the data should be less than " + + std::to_string(data_stream_cnt_) + ", got " + + std::to_string(stream_id), + pa::GENERIC_ERROR); + } + if (step_id < 0 || step_id >= (int)step_num_[stream_id]) { + return cb::Error( + "step_id for retrieving the data should be less than " + + std::to_string(step_num_[stream_id]) + ", got " + + std::to_string(step_id), + pa::GENERIC_ERROR); + } + return cb::Error::Success; +} + + cb::Error DataLoader::GetInputShape( const ModelTensor& input, const int stream_id, const int step_id, @@ -437,9 +477,11 @@ DataLoader::ReadTensorData( const std::shared_ptr& tensors, const int stream_index, const int step_index, const bool is_input) { + std::unordered_set model_io_names; auto& tensor_data = is_input ? input_data_ : output_data_; auto& tensor_shape = is_input ? input_shapes_ : output_shapes_; for (const auto& io : *tensors) { + model_io_names.insert(io.first); if (step.HasMember(io.first.c_str())) { std::string key_name( io.first + "_" + std::to_string(stream_index) + "_" + @@ -458,8 +500,6 @@ DataLoader::ReadTensorData( if (tensor.IsArray()) { content = &tensor; - } else if (tensor.HasMember("b64")) { - content = &tensor; } else { // Populate the shape values first if available if (tensor.HasMember("shape")) { @@ -474,22 +514,26 @@ DataLoader::ReadTensorData( } } - if (!tensor.HasMember("content")) { - return cb::Error( - "missing content field. ( Location stream id: " + - std::to_string(stream_index) + - ", step id: " + std::to_string(step_index) + ")", - pa::GENERIC_ERROR); - } + if (tensor.HasMember("b64")) { + content = &tensor; + } else { + if (!tensor.HasMember("content")) { + return cb::Error( + "missing content field. 
( Location stream id: " + + std::to_string(stream_index) + + ", step id: " + std::to_string(step_index) + ")", + pa::GENERIC_ERROR); + } - content = &tensor["content"]; + content = &tensor["content"]; + } } if (content->IsArray()) { RETURN_IF_ERROR(SerializeExplicitTensor( *content, io.second.datatype_, &it->second)); } else { - if (content->HasMember("b64")) { + if (content->IsObject() && content->HasMember("b64")) { if ((*content)["b64"].IsString()) { const std::string& encoded = (*content)["b64"].GetString(); it->second.resize(encoded.length()); @@ -497,25 +541,6 @@ DataLoader::ReadTensorData( int size = D.decode(encoded.c_str(), encoded.length(), &it->second[0]); it->second.resize(size); - - int64_t batch1_byte; - auto shape_it = tensor_shape.find(key_name); - if (shape_it == tensor_shape.end()) { - batch1_byte = ByteSize(io.second.shape_, io.second.datatype_); - } else { - batch1_byte = ByteSize(shape_it->second, io.second.datatype_); - } - if (batch1_byte > 0 && (size_t)batch1_byte != it->second.size()) { - return cb::Error( - "mismatch in the data provided. " - "Expected: " + - std::to_string(batch1_byte) + - " bytes, Got: " + std::to_string(it->second.size()) + - " bytes ( Location stream id: " + - std::to_string(stream_index) + - ", step id: " + std::to_string(step_index) + ")", - pa::GENERIC_ERROR); - } } else { return cb::Error( "the value of b64 field should be of type string ( " @@ -534,20 +559,8 @@ DataLoader::ReadTensorData( } } - // Validate if a fixed shape is available for the tensor. - int element_count; - auto shape_it = tensor_shape.find(key_name); - if (shape_it != tensor_shape.end()) { - element_count = ElementCount(shape_it->second); - } else { - element_count = ElementCount(io.second.shape_); - } - if (element_count < 0) { - return cb::Error( - "The variable-sized tensor \"" + io.second.name_ + - "\" is missing shape, see --shape option.", - pa::GENERIC_ERROR); - } + RETURN_IF_ERROR(ValidateTensor(io.second, stream_index, step_index)); + } else if (io.second.is_optional_ == false) { return cb::Error( "missing tensor " + io.first + @@ -557,6 +570,174 @@ DataLoader::ReadTensorData( } } + // Add allowed non-model inputs/outputs to the model_io_names set + model_io_names.insert("model"); + + for (auto itr = step.MemberBegin(); itr != step.MemberEnd(); ++itr) { + if (model_io_names.find(itr->name.GetString()) == model_io_names.end()) { + return cb::Error( + "The input or output '" + std::string(itr->name.GetString()) + + "' is not found in the model configuration", + pa::GENERIC_ERROR); + } + } + + + return cb::Error::Success; +} + + +cb::Error +DataLoader::ReadFile(const std::string& path, std::vector* contents) +{ + std::ifstream in(path, std::ios::in | std::ios::binary); + if (!in) { + return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR); + } + + in.seekg(0, std::ios::end); + + int file_size = in.tellg(); + if (file_size > 0) { + contents->resize(file_size); + in.seekg(0, std::ios::beg); + in.read(&(*contents)[0], contents->size()); + } + + in.close(); + + // If size is invalid, report after ifstream is closed + if (file_size < 0) { + return cb::Error( + "failed to get size for file '" + path + "'", pa::GENERIC_ERROR); + } else if (file_size == 0) { + return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); + } + + return cb::Error::Success; +} + +cb::Error +DataLoader::ReadTextFile( + const std::string& path, std::vector* contents) +{ + std::ifstream in(path); + if (!in) { + return cb::Error("failed to open file '" + path + "'", 
pa::GENERIC_ERROR); + } + + std::string current_string; + while (std::getline(in, current_string)) { + contents->push_back(current_string); + } + in.close(); + + if (contents->size() == 0) { + return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); + } + return cb::Error::Success; +} + +cb::Error +DataLoader::ValidateTensor( + const ModelTensor& model_tensor, const int stream_index, + const int step_index) +{ + std::string key_name( + model_tensor.name_ + "_" + std::to_string(stream_index) + "_" + + std::to_string(step_index)); + + auto data_it = input_data_.find(key_name); + if (data_it == input_data_.end()) { + data_it = output_data_.find(key_name); + } + if (data_it == output_data_.end()) { + return cb::Error("Can't validate a nonexistent tensor"); + } + + auto shape_it = input_shapes_.find(key_name); + + const std::vector& data = data_it->second; + const std::vector& shape = (shape_it == input_shapes_.end()) + ? model_tensor.shape_ + : shape_it->second; + + int64_t batch1_byte = ByteSize(shape, model_tensor.datatype_); + + RETURN_IF_ERROR(ValidateTensorShape(shape, model_tensor)); + RETURN_IF_ERROR(ValidateTensorDataSize(data, batch1_byte, model_tensor)); + + return cb::Error::Success; +} + +cb::Error +DataLoader::ValidateTensorShape( + const std::vector& shape, const ModelTensor& model_tensor) +{ + int element_count = ElementCount(shape); + if (element_count < 0) { + return cb::Error( + "The variable-sized tensor \"" + model_tensor.name_ + + "\" with model shape " + ShapeVecToString(model_tensor.shape_) + + " needs to have its shape fully defined. See the --shape option.", + pa::GENERIC_ERROR); + } + + bool is_error = false; + + if (shape.size() != model_tensor.shape_.size()) { + is_error = true; + } + + for (size_t i = 0; i < shape.size() && !is_error; i++) { + if (shape[i] != model_tensor.shape_[i] && model_tensor.shape_[i] != -1) { + is_error = true; + } + } + + if (is_error) { + return cb::Error( + "The supplied shape of " + ShapeVecToString(shape) + " for input \"" + + model_tensor.name_ + + "\" is incompatible with the model's input shape of " + + ShapeVecToString(model_tensor.shape_)); + } + + return cb::Error::Success; +} + +cb::Error +DataLoader::ValidateTensorDataSize( + const std::vector& data, int64_t batch1_byte, + const ModelTensor& model_tensor) +{ + // Validate that the supplied data matches the amount of data expected based + // on the shape + if (batch1_byte > 0 && (size_t)batch1_byte != data.size()) { + return cb::Error( + "mismatch in the data provided for " + model_tensor.name_ + + ". Expected: " + std::to_string(batch1_byte) + + " bytes, Got: " + std::to_string(data.size()) + " bytes", + pa::GENERIC_ERROR); + } + + return cb::Error::Success; +} + +cb::Error +DataLoader::ValidateParsingMode(const rapidjson::Value& steps) +{ + // If our first time parsing data, capture the mode + if (step_num_.size() == 0) { + multiple_stream_mode_ = steps.IsArray(); + } else { + if (steps.IsArray() != multiple_stream_mode_) { + return cb::Error( + "Inconsistency in input-data provided. Can not have a combination of " + "objects and arrays inside of the Data array", + pa::GENERIC_ERROR); + } + } return cb::Error::Success; } diff --git a/data_loader.h b/data_loader.h index 6775e34d..2f83f959 100644 --- a/data_loader.h +++ b/data_loader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,27 +25,32 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#include #include +#include + #include "model_parser.h" #include "perf_utils.h" +#include "tensor_data.h" namespace triton { namespace perfanalyzer { +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockDataLoader; +#endif + + class DataLoader { public: DataLoader(size_t batch_size); - /// Returns the total number of data steps that can be supported by a - /// non-sequence model. - size_t GetTotalStepsNonSequence() { return max_non_sequence_step_id_; } - /// Returns the total number of data streams available. size_t GetDataStreamsCount() { return data_stream_cnt_; } /// Returns the total data steps supported for a requested data stream /// id. /// \param stream_id The target stream id - size_t GetTotalSteps(size_t stream_id) + virtual size_t GetTotalSteps(size_t stream_id) { if (stream_id < data_stream_cnt_) { return step_num_[stream_id]; @@ -53,9 +58,22 @@ class DataLoader { return 0; } + /// Validate user-supplied inputs and outputs exist in the model + /// \param inputs The pointer to the map holding the information about + /// input tensors of a model + /// \param outputs The pointer to the map holding the information about + /// output tensors of a model + /// \param data_directory The path to the directory containing the data + cb::Error ValidateIOExistsInModel( + const std::shared_ptr& inputs, + const std::shared_ptr& outputs, + const std::string& data_directory); + /// Reads the input data from the specified data directory. /// \param inputs The pointer to the map holding the information about /// input tensors of a model + /// \param outputs The pointer to the map holding the information about + /// output tensors of a model /// \param data_directory The path to the directory containing the data cb::Error ReadDataFromDir( const std::shared_ptr& inputs, @@ -68,7 +86,7 @@ class DataLoader { /// \param json_file The json file containing the user-provided input /// data. /// Returns error object indicating status - cb::Error ReadDataFromJSON( + virtual cb::Error ReadDataFromJSON( const std::shared_ptr& inputs, const std::shared_ptr& outputs, const std::string& json_file); @@ -91,12 +109,11 @@ class DataLoader { /// \param input The target model input tensor /// \param stream_id The data stream_id to use for retrieving input data. /// \param step_id The data step_id to use for retrieving input data. - /// \param data Returns the pointer to the data for the requested input. - /// \param batch1_size Returns the size of the input data in bytes. + /// \param data Returns the input TensorData /// Returns error object indicating status cb::Error GetInputData( const ModelTensor& input, const int stream_id, const int step_id, - const uint8_t** data_ptr, size_t* batch1_size); + TensorData& data); /// Helper function to get the shape values to the input /// \param input The target model input tensor @@ -114,14 +131,43 @@ class DataLoader { /// \param output_name The name of the output tensor /// \param stream_id The data stream_id to use for retrieving output data. /// \param step_id The data step_id to use for retrieving output data. - /// \param data Returns the pointer to the data for the requested output. - /// \param batch1_size Returns the size of the output data in bytes. 
+ /// \param data Returns the output TensorData /// Returns error object indicating status cb::Error GetOutputData( const std::string& output_name, const int stream_id, const int step_id, - const uint8_t** data_ptr, size_t* batch1_size); + TensorData& data); + + /// Return an error if the stream index or step index are invalid + cb::Error ValidateIndexes(int stream_index, int step_index); + + protected: + /// Parses the input and output data from the json document + /// \param inputs The input tensors of a model + /// \param outputs The output tensors of a model + /// \param json The json document containing the raw json inputs/outputs + /// \return Returns error object indicating status + cb::Error ParseData( + const rapidjson::Document& json, + const std::shared_ptr& inputs, + const std::shared_ptr& outputs); private: + /// Reads the data from file specified by path into vector of characters + /// \param path The complete path to the file to be read + /// \param contents The character vector that will contain the data read + /// \return error status. Returns Non-Ok if an error is encountered during + /// read operation. + virtual cb::Error ReadFile( + const std::string& path, std::vector* contents); + + /// Reads the string from file specified by path into vector of strings + /// \param path The complete path to the file to be read + /// \param contents The string vector that will contain the data read + /// \return error status. Returns Non-Ok if an error is encountered during + /// read operation. + virtual cb::Error ReadTextFile( + const std::string& path, std::vector* contents); + /// Helper function to read data for the specified input from json /// \param step the DOM for current step /// \param inputs The pointer to the map holding the information about @@ -134,15 +180,45 @@ class DataLoader { const std::shared_ptr& tensors, const int stream_index, const int step_index, const bool is_input); + /// Helper function to validate the provided data and shape for the tensor + /// \param input The target model input or output tensor + /// \param stream_index the stream index the data should be exported to. + /// \param step_index the step index the data should be exported to. + /// Returns error object indicating status + cb::Error ValidateTensor( + const ModelTensor& model_tensor, const int stream_index, + const int step_index); + + /// Helper function to validate the provided shape for a tensor + /// \param shape Shape for the tensor + /// \param model_tensor The tensor to validate + /// Returns error object indicating status + cb::Error ValidateTensorShape( + const std::vector& shape, const ModelTensor& model_tensor); + + /// Helper function to validate the provided data's size + /// \param data The provided data for the tensor + /// \param batch1_byte The expected number of bytes of data + /// \param model_tensor The tensor to validate + /// Returns error object indicating status + cb::Error ValidateTensorDataSize( + const std::vector& data, int64_t batch1_byte, + const ModelTensor& model_tensor); + + /// Helper function to validate consistency of parsing mode for provided input + /// data. The code explicitly does not support a mixture of objects (multiple + /// entries of a single stream) and arrays (multiple streams) + /// + /// \param steps The json data provided for one or multiple streams + cb::Error ValidateParsingMode(const rapidjson::Value& steps); + // The batch_size_ for the data - size_t batch_size_; + size_t batch_size_{1}; // The total number of data streams available. 
- size_t data_stream_cnt_; + size_t data_stream_cnt_{0}; // A vector containing the supported step number for respective stream // ids. std::vector step_num_; - // The maximum supported data step id for non-sequence model. - size_t max_non_sequence_step_id_; // User provided input data, it will be preferred over synthetic data std::unordered_map> input_data_; @@ -155,6 +231,16 @@ class DataLoader { // Placeholder for generated input data, which will be used for all inputs // except string std::vector input_buf_; + + // Tracks what type of input data has been provided + bool multiple_stream_mode_ = false; + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockDataLoader; + + public: + DataLoader() = default; +#endif }; }} // namespace triton::perfanalyzer diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..34f33475 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,55 @@ + + +# Perf Analyzer Documentation + +| [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) | +| -------------------------------------- | -------------------------------------------- | ---------------------------------- | + +## **Installation** + +See the [Installation Guide](install.md) for details on how to install Perf +Analyzer. + +## **Getting Started** + +The [Quick Start Guide](quick_start.md) will show you how to use Perf +Analyzer to profile a simple PyTorch model. + +## **User Guide** + +The User Guide describes the Perf Analyzer command line options, how to specify +model input data, the performance measurement modes, the performance metrics and +outputs, how to benchmark different servers, and more. + +- [Perf Analyzer CLI](cli.md) +- [Inference Load Modes](inference_load_modes.md) +- [Input Data](input_data.md) +- [Measurements & Metrics](measurements_metrics.md) +- [Benchmarking](benchmarking.md) +- [Large Language Models (LLMs)](../genai-perf/README.md) diff --git a/docs/benchmarking.md b/docs/benchmarking.md new file mode 100644 index 00000000..96f1ad3a --- /dev/null +++ b/docs/benchmarking.md @@ -0,0 +1,250 @@ + + +# Benchmarking Triton via HTTP or gRPC endpoint + +This is the default mode for Perf Analyzer. + +# Benchmarking Triton directly via C API + +Besides using HTTP or gRPC server endpoints to communicate with Triton, Perf +Analyzer also allows users to benchmark Triton directly using the C API. HTTP +and gRPC endpoints introduce an additional latency in the pipeline which may not +be of interest to users who are using Triton via C API within their application. +Specifically, this feature is useful to benchmark a bare minimum Triton without +additional overheads from HTTP/gRPC communication. + +## Prerequisite + +Pull the Triton SDK and the Triton Server container images on target machine. +Since you will need access to the `tritonserver` install, it might be easier if +you copy the `perf_analyzer` binary to the Inference Server container. + +## Required parameters + +Use the [`--help`](cli.md#--help) option to see a complete list of supported +command line arguments. By default, Perf Analyzer expects the Triton instance to +already be running. You can configure C API mode using the +[`--service-kind`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve) +option. 
In addition, you will need to point Perf Analyzer to the Triton server +library path using the +[`--triton-server-directory`](cli.md#--triton-server-directorypath) option and +the model repository path using the +[`--model-repository`](cli.md#--model-repositorypath) option. + +An example run would look like: + +``` +$ perf_analyzer -m my_model --service-kind=triton_c_api --triton-server-directory=/opt/tritonserver --model-repository=/my/model/repository +... +*** Measurement Settings *** + Service Kind: Triton C-API + Using "time_windows" mode for stabilization + Measurement window: 5000 msec + Using synchronous calls for inference + Stabilizing using average latency + +Request concurrency: 1 + Client: + Request count: 353 + Throughput: 19.6095 infer/sec + Avg latency: 50951 usec (standard deviation 2265 usec) + p50 latency: 50833 usec + p90 latency: 50923 usec + p95 latency: 50940 usec + p99 latency: 50985 usec + + Server: + Inference count: 353 + Execution count: 353 + Successful request count: 353 + Avg request latency: 50841 usec (overhead 20 usec + queue 63 usec + compute input 35 usec + compute infer 50663 usec + compute output 59 usec) + +Inferences/Second vs. Client Average Batch Latency +Concurrency: 1, throughput: 19.6095 infer/sec, latency 50951 usec +``` + +## Non-supported functionalities + +There are a few functionalities that are missing from C API mode. They are: + +1. Async mode ([`--async`](cli.md#--async)) +2. For additional known non-working cases, please refer to + [qa/L0_perf_analyzer_capi/test.sh](https://github.com/triton-inference-server/server/blob/main/qa/L0_perf_analyzer_capi/test.sh#L239-L277) + +# Benchmarking TensorFlow Serving + +Perf Analyzer can also be used to benchmark models deployed on +[TensorFlow Serving](https://github.com/tensorflow/serving) using the +[`--service-kind=tfserving`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve) +option. Only gRPC protocol is supported. + +The following invocation demonstrates how to configure Perf Analyzer to issue +requests to a running instance of `tensorflow_model_server`: + +``` +$ perf_analyzer -m resnet50 --service-kind tfserving -i grpc -b 1 -p 5000 -u localhost:8500 +*** Measurement Settings *** + Batch size: 1 + Using "time_windows" mode for stabilization + Measurement window: 5000 msec + Using synchronous calls for inference + Stabilizing using average latency +Request concurrency: 1 + Client: + Request count: 829 + Throughput: 165.8 infer/sec + Avg latency: 6032 usec (standard deviation 569 usec) + p50 latency: 5863 usec + p90 latency: 6655 usec + p95 latency: 6974 usec + p99 latency: 8093 usec + Avg gRPC time: 5984 usec ((un)marshal request/response 257 usec + response wait 5727 usec) +Inferences/Second vs. Client Average Batch Latency +Concurrency: 1, throughput: 165.8 infer/sec, latency 6032 usec +``` + +You might have to specify a different url ([`-u`](cli.md#-u-url)) to access +wherever the server is running. The report of Perf Analyzer will only include +statistics measured at the client-side. + +**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee +optimal tuning for TensorFlow Serving. However, a single benchmarking tool that +can be used to stress the inference servers in an identical manner is important +for performance analysis. + +The following points are important for interpreting the results: + +1. 
`Concurrent Request Execution`: + TensorFlow Serving (TFS), as of version 2.8.0, by default creates threads for + each request that individually submits requests to TensorFlow Session. There + is a resource limit on the number of concurrent threads serving requests. + When benchmarking at a higher request concurrency, you can see higher + throughput because of this. Unlike TFS, by default Triton is configured with + only a single + [instance count](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups). + Hence, at a higher request concurrency, most of the requests are blocked on + the instance availability. To configure Triton to behave like TFS, set the + instance count to a reasonably high value and then set + [MAX_SESSION_SHARE_COUNT](https://github.com/triton-inference-server/tensorflow_backend#parameters) + parameter in the model `config.pbtxt` to the same value. For some context, + the TFS sets its thread constraint to four times the num of schedulable CPUs. +2. `Different library versions`: + The version of TensorFlow might differ between Triton and TensorFlow Serving + being benchmarked. Even the versions of CUDA libraries might differ between + the two solutions. The performance of models can be susceptible to the + versions of these libraries. For a single request concurrency, if the + `compute_infer` time reported by Perf Analyzer when benchmarking Triton is as + large as the latency reported by Perf Analyzer when benchmarking TFS, then + the performance difference is likely because of the difference in the + software stack and outside the scope of Triton. +3. `CPU Optimization`: + TFS has separate builds for CPU and GPU targets. They have target-specific + optimization. Unlike TFS, Triton has a single build which is optimized for + execution on GPUs. When collecting performance on CPU models on Triton, try + running Triton with the environment variable `TF_ENABLE_ONEDNN_OPTS=1`. + +# Benchmarking TorchServe + +Perf Analyzer can also be used to benchmark +[TorchServe](https://github.com/pytorch/serve) using the +[`--service-kind=torchserve`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve) +option. Only HTTP protocol is supported. It also requires input to be provided +via JSON file. + +The following invocation demonstrates how to configure Perf Analyzer to issue +requests to a running instance of `torchserve` assuming the location holds +`kitten_small.jpg`: + +``` +$ perf_analyzer -m resnet50 --service-kind torchserve -i http -u localhost:8080 -b 1 -p 5000 --input-data data.json + Successfully read data for 1 stream/streams with 1 step/steps. +*** Measurement Settings *** + Batch size: 1 + Using "time_windows" mode for stabilization + Measurement window: 5000 msec + Using synchronous calls for inference + Stabilizing using average latency +Request concurrency: 1 + Client: + Request count: 799 + Throughput: 159.8 infer/sec + Avg latency: 6259 usec (standard deviation 397 usec) + p50 latency: 6305 usec + p90 latency: 6448 usec + p95 latency: 6494 usec + p99 latency: 7158 usec + Avg HTTP time: 6272 usec (send/recv 77 usec + response wait 6195 usec) +Inferences/Second vs. Client Average Batch Latency +Concurrency: 1, throughput: 159.8 infer/sec, latency 6259 usec +``` + +The content of `data.json`: + +```json + { + "data" : + [ + { + "TORCHSERVE_INPUT" : ["kitten_small.jpg"] + } + ] + } +``` + +You might have to specify a different url ([`-u`](cli.md#-u-url)) to access +wherever the server is running. 
The report of Perf Analyzer will only include +statistics measured at the client-side. + +**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee +optimal tuning for TorchServe. However, a single benchmarking tool that can be +used to stress the inference servers in an identical manner is important for +performance analysis. + +# Advantages of using Perf Analyzer over third-party benchmark suites + +Triton Inference Server offers the entire serving solution which includes +[client libraries](https://github.com/triton-inference-server/client) that are +optimized for Triton. Using third-party benchmark suites like `jmeter` fails to +take advantage of the optimized libraries. Some of these optimizations includes +but are not limited to: + +1. Using + [binary tensor data extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#binary-tensor-data-extension) + with HTTP requests. +2. Effective re-use of gRPC message allocation in subsequent requests. +3. Avoiding extra memory copy via libcurl interface. + +These optimizations can have a tremendous impact on overall performance. Using +Perf Analyzer for benchmarking directly allows a user to access these +optimizations in their study. + +Not only that, Perf Analyzer is also very customizable and supports many Triton +features as described in this document. This, along with a detailed report, +allows a user to identify performance bottlenecks and experiment with different +features before deciding upon what works best for them. diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 00000000..399596fd --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,656 @@ + + +# Perf Analyzer CLI + +This document details the Perf Analyzer command line interface: + +- [General Options](#general-options) +- [Measurement Options](#measurement-options) +- [Sequence Model Options](#sequence-model-options) +- [Input Data Options](#input-data-options) +- [Request Options](#request-options) +- [Server Options](#server-options) +- [Prometheus Metrics Options](#prometheus-metrics-options) +- [Report Options](#report-options) +- [Trace Options](#trace-options) +- [Deprecated Options](#deprecated-options) + +## General Options + +#### `-?` +#### `-h` +#### `--help` + +Prints a description of the Perf Analyzer command line interface. + +#### `-m ` + +Specifies the model name for Perf Analyzer to run. + +This is a required option. + +#### `-x ` + +Specifies the version of the model to be used. If not specified the most +recent version (the highest numbered version) of the model will be used. + +#### `--service-kind=[triton|triton_c_api|tfserving|torchserve]` + +Specifies the kind of service for Perf Analyzer to generate load for. Note: in +order to use `torchserve` backend, the `--input-data` option must point to a +JSON file holding data in the following format: + +``` +{ + "data": [ + { + "TORCHSERVE_INPUT": [ + "" + ] + }, + {...}, + ... + ] +} +``` + +The type of file here will depend on the model. In order to use `triton_c_api` +you must specify the Triton server install path and the model repository path +via the `--triton-server-directory` and `--model-repository` options. + +Default is `triton`. + +#### `--bls-composing-models=` + +Specifies the list of all BLS composing models as a comma separated list of +model names (with optional model version number after a colon for each) that may +be called by the input BLS model. 
For example, +`--bls-composing-models=modelA:3,modelB` would specify that modelA and modelB +are composing models that may be called by the input BLS model, and that modelA +will use version 3, while modelB's version is unspecified. + +#### `--model-signature-name=` + +Specifies the signature name of the saved model to use. + +Default is `serving_default`. This option will be ignored if `--service-kind` +is not `tfserving`. + +#### `-v` + +Enables verbose mode. May be specified an additional time (`-v -v`) to enable +extra verbose mode. + +## Measurement Options + +#### `--measurement-mode=[time_windows|count_windows]` + +Specifies the mode used for stabilizing measurements. 'time_windows' will +create windows such that the duration of each window is equal to +`--measurement-interval`. 'count_windows' will create windows such that there +are at least `--measurement-request-count` requests in each window and that +the window is at least one second in duration (adding more requests if +necessary). + +Default is `time_windows`. + +#### `-p ` +#### `--measurement-interval=` + +Specifies the time interval used for each measurement in milliseconds when +`--measurement-mode=time_windows` is used. Perf Analyzer will sample a time +interval specified by this option and take measurement over the requests +completed within that time interval. + +Default is `5000`. + +#### `--measurement-request-count=` + +Specifies the minimum number of requests to be collected in each measurement +window when `--measurement-mode=count_windows` is used. + +Default is `50`. + +#### `-s ` +#### `--stability-percentage=` + +Specifies the allowed variation in latency measurements when determining if a +result is stable. The measurement is considered stable if the ratio of max / +min from the recent 3 measurements is within (stability percentage)% in terms +of both inferences per second and latency. + +Default is `10`(%). + +#### `--percentile=` + +Specifies the confidence value as a percentile that will be used to determine +if a measurement is stable. For example, a value of `85` indicates that the +85th percentile latency will be used to determine stability. The percentile +will also be reported in the results. + +Default is `-1` indicating that the average latency is used to determine +stability. + +#### `-r ` +#### `--max-trials=` + +Specifies the maximum number of measurements when attempting to reach stability +of inferences per second and latency for each concurrency or request rate +during the search. Perf Analyzer will terminate if the measurement is still +unstable after the maximum number of trials. + +Default is `10`. + +#### `--concurrency-range=` + +Specifies the range of concurrency levels covered by Perf Analyzer. Perf +Analyzer will start from the concurrency level of 'start' and go until 'end' +with a stride of 'step'. + +Default of 'start', 'end', and 'step' are `1`. If 'end' is not specified then +Perf Analyzer will run for a single concurrency level determined by 'start'. If +'end' is set as `0`, then the concurrency limit will be incremented by 'step' +until the latency threshold is met. 'end' and `--latency-threshold` cannot +both be `0`. 'end' cannot be `0` for sequence models while using asynchronous +mode. + +#### `--periodic-concurrency-range=` + +Specifies the range of concurrency levels in the similar but slightly different +manner as the `--concurrency-range`. Perf Analyzer will start from the +concurrency level of 'start' and increase by 'step' each time. 
Unlike +`--concurrency-range`, the 'end' indicates the *total* number of concurrency +since the 'start' (including) and will stop increasing once the cumulative +number of concurrent requests has reached the 'end'. The user can specify +*when* to periodically increase the concurrency level using the +`--request-period` option. The concurrency level will periodically increase for +every `n`-th response specified by `--request-period`. Since this disables +stability check in Perf Analyzer and reports response timestamps only, the user +must provide `--profile-export-file` to specify where to dump all the measured +timestamps. + +The default values of 'start', 'end', and 'step' are `1`. + +#### `--request-period=` + +Specifies the number of responses that each request must receive before new, +concurrent requests are sent when `--periodic-concurrency-range` is specified. + +Default value is `10`. + +#### `--request-parameter=` + +Specifies a custom parameter that can be sent to a Triton backend as part of +the request. For example, providing '--request-parameter max_tokens:256:int' +to the command line will set an additional parameter 'max_tokens' of type +'int' to 256 as part of the request. The --request-parameter may be specified +multiple times for different custom parameters. + +Valid `type` values are: `bool`, `int`, and `string`. + +> **NOTE** +> +> The `--request-parameter` is currently only supported by gRPC protocol. + +#### `--request-rate-range=` + +Specifies the range of request rates for load generated by Perf Analyzer. This +option can take floating-point values. The search along the request rate range +is enabled only when using this option. + +If not specified, then Perf Analyzer will search along the concurrency range. +Perf Analyzer will start from the request rate of 'start' and go until 'end' +with a stride of 'step'. Default values of 'start', 'end' and 'step' are all +`1.0`. If 'end' is not specified, then Perf Analyzer will run for a single +request rate as determined by 'start'. If 'end' is set as `0.0`, then the +request rate will be incremented by 'step' until the latency threshold is met. +'end' and `--latency-threshold` can not be both `0`. + +#### `--request-distribution=[constant|poisson]` + +Specifies the time interval distribution between dispatching inference requests +to the server. Poisson distribution closely mimics the real-world work load on +a server. This option is ignored if not using `--request-rate-range`. + +Default is `constant`. + +#### `-l ` +#### `--latency-threshold=` + +Specifies the limit on the observed latency, in milliseconds. Perf Analyzer +will terminate the concurrency or request rate search once the measured latency +exceeds this threshold. + +Default is `0` indicating that Perf Analyzer will run for the entire +concurrency or request rate range. + +#### `--binary-search` + +Enables binary search on the specified search range (concurrency or request +rate). This option requires 'start' and 'end' to be expilicitly specified in +the concurrency range or request rate range. When using this option, 'step' is +more like the precision. When the 'step' is lower, there are more iterations +along the search path to find suitable convergence. + +When `--binary-search` is not specified, linear search is used. + +#### `--request-intervals=` + +Specifies a path to a file containing time intervals in microseconds. Each time +interval should be in a new line. 
Perf Analyzer will try to maintain time +intervals between successive generated requests to be as close as possible in +this file. This option can be used to apply custom load to server with a +certain pattern of interest. Perf Analyzer will loop around the file if the +duration of execution exceeds the amount of time specified by the intervals. +This option can not be used with `--request-rate-range` or +`--concurrency-range`. + +#### `--max-threads=` + +Specifies the maximum number of threads that will be created for providing +desired concurrency or request rate. However, when running in synchronous mode +with `--concurrency-range` having explicit 'end' specification, this value will +be ignored. + +Default is `4` if `--request-rate-range` is specified, otherwise default is +`16`. + +## Sequence Model Options + +#### `--num-of-sequences=` + +Specifies the number of concurrent sequences for sequence models. This option +is ignored when `--request-rate-range` is not specified. + +Default is `4`. + +#### `--sequence-length=` + +Specifies the base length of a sequence used for sequence models. A sequence +with length X will be composed of X requests to be sent as the elements in the +sequence. The actual length of the sequencewill be within +/- Y% of the base +length, where Y defaults to 20% and is customizable via +`--sequence-length-variation`. If sequence length is unspecified and input data +is provided, the sequence length will be the number of inputs in the +user-provided input data. + +Default is `20`. + +#### `--sequence-length-variation=` + +Specifies the percentage variation in length of sequences. This option is only +valid when not using user-provided input data or when `--sequence-length` is +specified while using user-provided input data. + +Default is `20`(%). + +#### `--sequence-id-range=` + +Specifies the range of sequence IDs used by Perf Analyzer. Perf Analyzer will +start from the sequence ID of 'start' and go until 'end' (excluded). If 'end' +is not specified then Perf Analyzer will generate new sequence IDs without +bounds. If 'end' is specified and the concurrency setting may result in +maintaining a number of sequences more than the range of available sequence +IDs, Perf Analyzer will exit with an error due to possible sequence ID +collisions. + +The default for 'start is `1`, and 'end' is not specified (no bounds). + +#### `--serial-sequences` + +Enables the serial sequence mode where a maximum of one request is live per sequence. +Note: It is possible that this mode can cause the request rate mode to not achieve the +desired rate, especially if num-of-sequences is too small. + +## Input Data Options + +#### `--input-data=[zero|random|]` + +Specifies type of data that will be used for input in inference requests. The +available options are `zero`, `random`, and a path to a directory or a JSON +file. + +When pointing to a JSON file, the user must adhere to the format described in +the [input data documentation](input_data.md). By specifying JSON data, users +can control data used with every request. Multiple data streams can be specified +for a sequence model, and Perf Analyzer will select a data stream in a +round-robin fashion for every new sequence. Multiple JSON files can also be +provided (`--input-data json_file1.json --input-data json_file2.json` and so on) +and Perf Analyzer will append data streams from each file. When using +`--service-kind=torchserve`, make sure this option points to a JSON file. 
+ +If the option is path to a directory then the directory must contain a binary +text file for each non-string/string input respectively, named the same as the +input. Each file must contain the data required for that input for a batch-1 +request. Each binary file should contain the raw binary representation of the +input in row-major order for non-string inputs. The text file should contain +all strings needed by batch-1, each in a new line, listed in row-major order. + +Default is `random`. + +#### `-b ` + +Specifies the batch size for each request sent. + +Default is `1`. + +#### `--shape=` + +Specifies the shape used for the specified input. The argument must be +specified as 'name:shape' where the shape is a comma-separated list for +dimension sizes. For example `--shape=input_name:1,2,3` indicates that the +input `input_name` has tensor shape [ 1, 2, 3 ]. `--shape` may be specified +multiple times to specify shapes for different inputs. + +#### `--string-data=` + +Specifies the string to initialize string input buffers. Perf Analyzer will +replicate the given string to build tensors of required shape. +`--string-length` will not have any effect. This option is ignored if +`--input-data` points to a JSON file or directory. + +#### `--string-length=` + +Specifies the length of the random strings to be generated by Perf Analyzer +for string input. This option is ignored if `--input-data` points to a +JSON file or directory. + +Default is `128`. + +#### `--shared-memory=[none|system|cuda]` + +Specifies the type of the shared memory to use for input and output data. + +Default is `none`. + +#### `--output-shared-memory-size=` + +Specifies The size, in bytes, of the shared memory region to allocate per +output tensor. Only needed when one or more of the outputs are of string type +and/or variable shape. The value should be larger than the size of the largest +output tensor that the model is expected to return. Perf Analyzer will use the +following formula to calculate the total shared memory to allocate: +output_shared_memory_size * number_of_outputs * batch_size. + +Default is `102400` (100 KB). + +#### `--input-tensor-format=[binary|json]` + +Specifies the Triton inference request input tensor format. Only valid when HTTP +protocol is used. + +Default is `binary`. + +#### `--output-tensor-format=[binary|json]` + +Specifies the Triton inference response output tensor format. Only valid when +HTTP protocol is used. + +Default is `binary`. + +## Request Options + +#### `-i [http|grpc]` + +Specifies the communication protocol to use. The available protocols are HTTP +and gRPC. + +Default is `http`. + +#### `-a` +#### `--async` + +Enables asynchronous mode in Perf Analyzer. + +By default, Perf Analyzer will use a synchronous request API for inference. +However, if the model is sequential, then the default mode is asynchronous. +Specify `--sync` to operate sequential models in synchronous mode. In +synchronous mode, Perf Analyzer will start threads equal to the concurrency +level. Use asynchronous mode to limit the number of threads, yet maintain the +concurrency. + +#### `--sync` + +Enables synchronous mode in Perf Analyzer. Can be used to operate Perf +Analyzer with sequential model in synchronous mode. + +#### `--streaming` + +Enables the use of streaming API. This option is only valid with gRPC protocol. + +#### `-H ` + +Specifies the header that will be added to HTTP requests (ignored for gRPC +requests). The header must be specified as 'Header:Value'. 
`-H` may be +specified multiple times to add multiple headers. + +#### `--grpc-compression-algorithm=[none|gzip|deflate]` + +Specifies the compression algorithm to be used by gRPC when sending requests. +Only supported when gRPC protocol is being used. + +Default is `none`. + +## Server Options + +#### `-u ` + +Specifies the URL for the server. + +Default is `localhost:8000` when using `--service-kind=triton` with HTTP. +Default is `localhost:8001` when using `--service-kind=triton` with gRPC. +Default is `localhost:8500` when using `--service-kind=tfserving`. + +#### `--ssl-grpc-use-ssl` + +Enables usage of an encrypted channel to the server. + +#### `--ssl-grpc-root-certifications-file=` + +Specifies the path to file containing the PEM encoding of the server root +certificates. + +#### `--ssl-grpc-private-key-file=` + +Specifies the path to file containing the PEM encoding of the client's private +key. + +#### `--ssl-grpc-certificate-chain-file=` + +Specifies the path to file containing the PEM encoding of the client's +certificate chain. + +#### `--ssl-https-verify-peer=[0|1]` + +Specifies whether to verify the peer's SSL certificate. See +https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html for the meaning of each +value. + +Default is `1`. + +#### `--ssl-https-verify-host=[0|1|2]` + +Specifies whether to verify the certificate's name against host. See +https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html for the meaning of each +value. + +Default is `2`. + +#### `--ssl-https-ca-certificates-file=` + +Specifies the path to Certificate Authority (CA) bundle. + +#### `--ssl-https-client-certificate-file=` + +Specifies the path to the SSL client certificate. + +#### `--ssl-https-client-certificate-type=[PEM|DER]` + +Specifies the type of the client SSL certificate. + +Default is `PEM`. + +#### `--ssl-https-private-key-file=` + +Specifies the path to the private keyfile for TLS and SSL client cert. + +#### `--ssl-https-private-key-type=[PEM|DER]` + +Specifies the type of the private key file. + +Default is `PEM`. + +#### `--triton-server-directory=` + +Specifies the Triton server install path. Required by and only used when C API +is used (`--service-kind=triton_c_api`). + +Default is `/opt/tritonserver`. + +#### `--model-repository=` + +Specifies the model repository directory path for loading models. Required by +and only used when C API is used (`--service-kind=triton_c_api`). + +## Prometheus Metrics Options + +#### `--collect-metrics` + +Enables the collection of server-side inference server metrics. Perf Analyzer +will output metrics in the CSV file generated with the `-f` option. Only valid +when `--verbose-csv` option also used. + +#### `--metrics-url=` + +Specifies the URL to query for server-side inference server metrics. + +Default is `localhost:8002/metrics`. + +#### `--metrics-interval=` + +Specifies how often within each measurement window, in milliseconds, Perf +Analyzer should query for server-side inference server metrics. + +Default is `1000`. + +## Report Options + +#### `-f ` + +Specifies the path that the latency report file will be generated at. + +When `-f` is not specified, a latency report will not be generated. + +#### `--profile-export-file ` + +Specifies the path that the profile export will be generated at. + +When `--profile-export-file` is not specified, a profile export will not be +generated. + +#### `--verbose-csv` + +Enables additional information being output to the CSV file generated by Perf +Analyzer. 
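+
+As an illustration of how the report options above can be combined in a single
+run, here is a sketch (the model name and output file name are placeholders):
+
+```bash
+# Hypothetical example: write a verbose CSV report and also collect
+# server-side metrics (--collect-metrics requires --verbose-csv)
+perf_analyzer -m my_model \
+    --collect-metrics \
+    --verbose-csv \
+    -f perf_report.csv
+```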
+ +## Trace Options + +#### `--trace-level=[OFF|TIMESTAMPS|TENSORS]` + +Specifies a trace level. `OFF` disables tracing. `TIMESTAMPS` traces +timestamps. `TENSORS` traces tensors. It may be specified multiple times to +trace multiple information. Only used for `--service-kind=triton`. + +Default is `OFF`. + +#### `--trace-rate=` + +Specifies the trace sampling rate (traces per second). + +Default is `1000`. + +#### `--trace-count=` + +Specifies the number of traces to be sampled. If the value is `-1`, the number +of traces to be sampled will not be limited. + +Default is `-1`. + +#### `--log-frequency=` + +Specifies the trace log frequency. If the value is `0`, Triton will only log +the trace output to the trace file when shutting down. +Otherwise, Triton will log the trace output to ``. when it +collects the specified number of traces. +For example, if the trace file is `trace_file.log`, and if the log +frequency is `100`, when Triton collects the 100th trace, it logs the traces +to file `trace_file.log.0`, and when it collects the 200th trace, it logs the +101st to the 200th traces to file `trace_file.log.1`. + +Default is `0`. + +## Deprecated Options + +#### `--data-directory=` + +**DEPRECATED** + +Alias for `--input-data=` where `` is the path to a directory. See +`--input-data` option documentation for details. + +#### `-c ` + +**DEPRECATED** + +Specifies the maximum concurrency that Perf Analyzer will search up to. Cannot +be used with `--concurrency-range`. + +#### `-d` + +**DEPRECATED** + +Enables dynamic concurrency mode. Perf Analyzer will search along +concurrencies up to the maximum concurrency specified via `-c `. Cannot be +used with `--concurrency-range`. + +#### `-t ` + +**DEPRECATED** + +Specifies the number of concurrent requests. Cannot be used with +`--concurrency-range`. + +Default is `1`. + +#### `-z` + +**DEPRECATED** + +Alias for `--input-data=zero`. See `--input-data` option documentation for +details. diff --git a/docs/inference_load_modes.md b/docs/inference_load_modes.md new file mode 100644 index 00000000..83fa83eb --- /dev/null +++ b/docs/inference_load_modes.md @@ -0,0 +1,100 @@ + + +# Inference Load Modes + +Perf Analyzer has several modes for generating inference request load for a +model. + +## Concurrency Mode + +In concurrency mode, Perf Analyzer attempts to send inference requests to the +server such that N requests are always outstanding during profiling. For +example, when using +[`--concurrency-range=4`](cli.md#--concurrency-rangestartendstep), Perf Analyzer +will to attempt to have 4 outgoing inference requests at all times during +profiling. + +## Periodic Concurrency Mode + +In periodic concurrency mode, Perf Analyzer will periodically launch a new set +of inference requests until the total number of inference requests that has been +launched since the beginning reaches N requests. + +For example, when using `--periodic-concurrency-range 10:100:30`, Perf Analyzer +will start with 10 concurrent requests and for every step, it will launch 30 new +inference requests until the total number of requests launched since the +beginning reaches 100. Additionally, the user can also specify *when* to launch +the new requests by specifying `--request-period M`. This will set Perf Analyzer +to launch a new set of requests whenever *all* of the latest set of launched +concurrent requests received M number of responses back from the server. + +The user can also specify custom parameters to the model using +`--request-parameter ` option. 
+For instance, passing `--request-parameter max_tokens:256:int` will set an
+additional parameter `max_tokens` of type `int` to 256 as part of the request.
+
+```bash
+perf_analyzer -m <model_name> -i grpc --async --streaming \
+    --profile-export-file profile.json \
+    --periodic-concurrency-range 10:100:30 \
+    --request-period 10 \
+    --request-parameter max_tokens:256:int
+```
+
+> **Note**
+>
+> The periodic concurrency mode is currently supported only by the gRPC protocol
+> and only with [decoupled models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md).
+> Additionally, the user must specify a file where Perf Analyzer can dump all the
+> profiled data using `--profile-export-file`.
+
+## Request Rate Mode
+
+In request rate mode, Perf Analyzer attempts to send N inference requests per
+second to the server during profiling. For example, when using
+[`--request-rate-range=20`](cli.md#--request-rate-rangestartendstep), Perf
+Analyzer will attempt to send 20 requests per second during profiling.
+
+## Custom Interval Mode
+
+In custom interval mode, Perf Analyzer attempts to send inference requests
+according to intervals (between requests, looping if necessary) provided by the
+user in the form of a text file with one time interval (in microseconds) per
+line. For example, when using
+[`--request-intervals=my_intervals.txt`](cli.md#--request-intervalspath),
+where `my_intervals.txt` contains:
+
+```
+100000
+200000
+500000
+```
+
+Perf Analyzer will attempt to send requests at the following times: 0.1s, 0.3s,
+0.8s, 0.9s, 1.1s, 1.6s, and so on, during profiling.
diff --git a/docs/input_data.md b/docs/input_data.md
new file mode 100644
index 00000000..af2328fc
--- /dev/null
+++ b/docs/input_data.md
@@ -0,0 +1,306 @@
+
+
+# Input Data
+
+Use the [`--help`](cli.md#--help) option to see complete documentation for all
+input data options. By default Perf Analyzer sends random data to all the inputs
+of your model. You can select a different input data mode with the
+[`--input-data`](cli.md#--input-datazerorandompath) option:
+
+- _random_: (default) Send random data for each input. Note: Perf Analyzer only
+  generates random data once per input and reuses it for all inferences.
+- _zero_: Send zeros for each input.
+- directory path: A path to a directory containing a binary file for each input,
+  named the same as the input (and optionally a binary file for each output for
+  validation, named the same as the output). Each binary file must contain the
+  data required for that input/output for a batch-1 request. Each file should
+  contain the raw binary representation of the input/output in row-major order.
+- file path: A path to a JSON file containing data to be used with every
+  inference request. See the "Real Input Data" section for further details.
+  [`--input-data`](cli.md#--input-datazerorandompath) can be provided multiple
+  times with different file paths to specify multiple JSON files.
+
+For tensors with `STRING`/`BYTES` datatype, the
+[`--string-length`](cli.md#--string-lengthn) and
+[`--string-data`](cli.md#--string-datastring) options may be used in some cases
+(see [`--help`](cli.md#--help) for full documentation).
+
+For models that support batching you can use the [`-b`](cli.md#-b-n) option to
+indicate the batch size of the requests that Perf Analyzer should send. For
+models with variable-sized inputs you must provide the
+[`--shape`](cli.md#--shapestring) argument so that Perf Analyzer knows what
+shape tensors to use.
For example, for a model that has an input called +`IMAGE` that has shape `[3, N, M]`, where `N` and `M` are variable-size +dimensions, to tell Perf Analyzer to send batch size 4 requests of shape +`[3, 224, 224]`: + +``` +$ perf_analyzer -m mymodel -b 4 --shape IMAGE:3,224,224 +``` + +## Real Input Data + +The performance of some models is highly dependent on the data used. For such +cases you can provide data to be used with every inference request made by Perf +Analyzer in a JSON file. Perf Analyzer will use the provided data in a +round-robin order when sending inference requests. For sequence models, if a +sequence length is specified via +[`--sequence-length`](cli.md#--sequence-lengthn), Perf Analyzer will also loop +through the provided data in a round-robin order up to the specified sequence +length (with a percentage variation customizable via +[`--sequence-length-variation`](cli.md#--sequence-length-variationn)). +Otherwise, the sequence length will be the number of inputs specified in +user-provided input data. + +Each entry in the `"data"` array must specify all input tensors with the exact +size expected by the model for a single batch. The following example describes +data for a model with inputs named, `INPUT0` and `INPUT1`, shape `[4, 4]` and +data type `INT32`: + +```json +{ + "data": + [ + { + "INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + }, + { + "INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + }, + { + "INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + }, + { + "INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + ] +} +``` + +Note that the `[4, 4]` tensor has been flattened in a row-major format for the +inputs. In addition to specifying explicit tensors, you can also provide Base64 +encoded binary data for the tensors. Each data object must list its data in a +row-major order. Binary data must be in little-endian byte order. The following +example highlights how this can be achieved: + +```json +{ + "data": + [ + { + "INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="}, + "INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="} + }, + { + "INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="}, + "INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="} + }, + { + "INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="}, + "INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="} + } + ] +} +``` + +In case of sequence models, multiple data streams can be specified in the JSON +file. Each sequence will get a data stream of its own and Perf Analyzer will +ensure the data from each stream is played back to the same correlation ID. The +below example highlights how to specify data for multiple streams for a sequence +model with a single input named `INPUT`, shape `[1]` and data type `STRING`: + +```json +{ + "data": + [ + [ + { + "INPUT": ["1"] + }, + { + "INPUT": ["2"] + }, + { + "INPUT": ["3"] + }, + { + "INPUT": ["4"] + } + ], + [ + { + "INPUT": ["1"] + }, + { + "INPUT": ["1"] + }, + { + "INPUT": ["1"] + } + ], + [ + { + "INPUT": ["1"] + }, + { + "INPUT": ["1"] + } + ] + ] +} +``` + +The above example describes three data streams with lengths 4, 3 and 2 +respectively. Perf Analyzer will hence produce sequences of length 4, 3 and 2 in +this case. + +You can also provide an optional `"shape"` field to the tensors. 
This is +especially useful while profiling the models with variable-sized tensors as +input. Additionally note that when providing the `"shape"` field, tensor +contents must be provided separately in a "content" field in row-major order. +The specified shape values will override default input shapes provided as a +command line option (see [`--shape`](cli.md#--shapestring)) for variable-sized +inputs. In the absence of a `"shape"` field, the provided defaults will be used. +There is no need to specify shape as a command line option if all the input data +provide shape values for variable tensors. Below is an example JSON file for a +model with a single input `INPUT`, shape `[-1, -1]` and data type `INT32`: + +```json +{ + "data": + [ + { + "INPUT": + { + "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "shape": [2,8] + } + }, + { + "INPUT": + { + "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "shape": [8,2] + } + }, + { + "INPUT": + { + "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + }, + { + "INPUT": + { + "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "shape": [4,4] + } + } + ] +} +``` + +The following is the example to provide contents as base64 string with explicit +shapes: + +```json +{ + "data": + [ + { + "INPUT": + { + "content": {"b64": "/9j/4AAQSkZ(...)"}, + "shape": [7964] + } + }, + { + "INPUT": + { + "content": {"b64": "/9j/4AAQSkZ(...)"}, + "shape": [7964] + } + } + ] +} +``` + +Note that for `STRING` type, an element is represented by a 4-byte unsigned +integer giving the length followed by the actual bytes. The byte array to be +encoded using base64 must include the 4-byte unsigned integers. + +### Output Validation + +When real input data is provided, it is optional to request Perf Analyzer to +validate the inference output for the input data. + +Validation output can be specified in the `"validation_data"` field have the +same format as the `"data"` field for real input. Note that the entries in +`"validation_data"` must align with `"data"` for proper mapping. The following +example describes validation data for a model with inputs named `INPUT0` and +`INPUT1`, outputs named `OUTPUT0` and `OUTPUT1`, all tensors have shape `[4, 4]` +and data type `INT32`: + +```json +{ + "data": + [ + { + "INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + } + ], + "validation_data": + [ + { + "OUTPUT0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "OUTPUT1": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + } + ] +} +``` + +Besides the above example, the validation outputs can be specified in the same +variations described in the real input data section. + +# Shared Memory + +By default Perf Analyzer sends input tensor data and receives output tensor data +over the network. You can instead instruct Perf Analyzer to use system shared +memory or CUDA shared memory to communicate tensor data. By using these options +you can model the performance that you can achieve by using shared memory in +your application. Use +[`--shared-memory=system`](cli.md#--shared-memorynonesystemcuda) to use system +(CPU) shared memory or +[`--shared-memory=cuda`](cli.md#--shared-memorynonesystemcuda) to use CUDA +shared memory. 
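+
+As a rough sketch of how these options are used (the model name below is a
+placeholder, and `--output-shared-memory-size` is only needed when outputs are
+string-typed or variable-shaped):
+
+```bash
+# Hypothetical example: exchange input/output tensors via system shared memory
+perf_analyzer -m my_model --shared-memory=system
+
+# Hypothetical example: use CUDA shared memory, reserving ~1 MB per output tensor
+perf_analyzer -m my_model --shared-memory=cuda --output-shared-memory-size=1048576
+```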
diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 00000000..5390dc00 --- /dev/null +++ b/docs/install.md @@ -0,0 +1,106 @@ + + +# Recommended Installation Method + +## Triton SDK Container + +The recommended way to "install" Perf Analyzer is to run the pre-built +executable from within the Triton SDK docker container available on the +[NVIDIA GPU Cloud Catalog](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver). +As long as the SDK container has its network exposed to the address and port of +the inference server, Perf Analyzer will be able to run. + +```bash +export RELEASE= # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02` + +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# inside container +perf_analyzer -m +``` + +# Alternative Installation Methods + +- [Pip](#pip) +- [Build from Source](#build-from-source) + +## Pip + +```bash +pip install tritonclient + +perf_analyzer -m +``` + +**Warning**: If any runtime dependencies are missing, Perf Analyzer will produce +errors showing which ones are missing. You will need to manually install them. + +## Build from Source + +The Triton SDK container is used for building, so some build and runtime +dependencies are already installed. + +```bash +export RELEASE= # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02` + +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +# inside container +# prep installing newer version of cmake +apt update && apt install -y gpg wget && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && . /etc/os-release && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null + +# install build/runtime dependencies +apt update && apt install -y cmake-data=3.27.7* cmake=3.27.7* libcurl4-openssl-dev rapidjson-dev + +rm -rf client ; git clone --depth 1 https://github.com/triton-inference-server/client + +mkdir client/build ; cd client/build + +cmake -DTRITON_ENABLE_PERF_ANALYZER=ON .. + +make -j8 cc-clients + +cc-clients/perf_analyzer/perf_analyzer -m +``` + +- To enable + [CUDA shared memory](input_data.md#shared-memory), add + `-DTRITON_ENABLE_GPU=ON` to the `cmake` command. +- To enable + [C API mode](benchmarking.md#benchmarking-triton-directly-via-c-api), add + `-DTRITON_ENABLE_PERF_ANALYZER_C_API=ON` to the `cmake` command. +- To enable [TorchServe backend](benchmarking.md#benchmarking-torchserve), add + `-DTRITON_ENABLE_PERF_ANALYZER_TS=ON` to the `cmake` command. +- To enable + [Tensorflow Serving backend](benchmarking.md#benchmarking-tensorflow-serving), + add `-DTRITON_ENABLE_PERF_ANALYZER_TFS=ON` to the `cmake` command. diff --git a/docs/measurements_metrics.md b/docs/measurements_metrics.md new file mode 100644 index 00000000..3f5b6434 --- /dev/null +++ b/docs/measurements_metrics.md @@ -0,0 +1,225 @@ + + +# Measurement Modes + +Currently, Perf Analyzer has 2 measurement modes. 
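+
+Both modes, described below, are selected via the `--measurement-mode` option.
+For instance (the model name is a placeholder, and the values shown override
+the defaults documented in the CLI reference):
+
+```bash
+# Time windows (default): measure over repeated 10-second windows
+perf_analyzer -m my_model --measurement-mode=time_windows --measurement-interval=10000
+
+# Count windows: require at least 100 completed requests per window
+perf_analyzer -m my_model --measurement-mode=count_windows --measurement-request-count=100
+```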
+
+## Time Windows
+
+When using time windows measurement mode
+([`--measurement-mode=time_windows`](cli.md#--measurement-modetime_windowscount_windows)),
+Perf Analyzer will count how many requests have completed during a window of
+duration `X` (in milliseconds, via
+[`--measurement-interval=X`](cli.md#--measurement-intervaln), default is
+`5000`). This is the default measurement mode.
+
+## Count Windows
+
+When using count windows measurement mode
+([`--measurement-mode=count_windows`](cli.md#--measurement-modetime_windowscount_windows)),
+Perf Analyzer will start the window duration at 1 second and dynamically
+increase it as needed until `X` requests have completed (via
+[`--measurement-request-count=X`](cli.md#--measurement-request-countn), default
+is `50`).
+
+# Metrics
+
+## How Throughput is Calculated
+
+Perf Analyzer calculates throughput as the total number of requests completed
+during a measurement, divided by the duration of the measurement in seconds.
+
+## How Latency is Calculated
+
+For each request concurrency level, Perf Analyzer reports the latency and
+throughput as seen by Perf Analyzer, as well as the average request latency on
+the server.
+
+The server latency measures the total time from when the request is received at
+the server until the response is sent from the server. Because of the HTTP and
+gRPC libraries used to implement the server endpoints, total server latency is
+typically more accurate for HTTP requests, as it measures the time from the
+first byte received until the last byte is sent. For both HTTP and gRPC, the
+total server latency is broken down into the following components:
+
+- _queue_: The average time spent in the inference schedule queue by a request
+  waiting for an instance of the model to become available.
+- _compute_: The average time spent performing the actual inference, including
+  any time needed to copy data to/from the GPU.
+- _overhead_: The average time spent in the endpoint that cannot be correctly
+  captured in the send/receive time with the way the gRPC and HTTP libraries are
+  structured.
+
+The client latency is broken down further for HTTP and gRPC as follows:
+
+- HTTP: _send/recv_ indicates the time on the client spent sending the request
+  and receiving the response. _response wait_ indicates the time spent waiting
+  for the response from the server.
+- gRPC: _(un)marshal request/response_ indicates the time spent marshalling the
+  request data into the gRPC protobuf and unmarshalling the response data from
+  the gRPC protobuf. _response wait_ indicates the time spent writing the gRPC
+  request to the network, waiting for the response, and reading the gRPC
+  response from the network.
+
+Use the verbose ([`-v`](cli.md#-v)) option to see more output, including the
+stabilization passes run for each request concurrency level or request rate.
+
+# Reports
+
+## Visualizing Latency vs. Throughput
+
+Perf Analyzer provides the [`-f`](cli.md#-f-path) option to generate a file
+containing CSV output of the results.
+
+```
+$ perf_analyzer -m inception_graphdef --concurrency-range 1:4 -f perf.csv
+...
+$ cat perf.csv +Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,Client Recv,p50 latency,p90 latency,p95 latency,p99 latency +1,69.2,225,2148,64,206,11781,19,0,13891,18795,19753,21018 +3,84.2,237,1768,21673,209,11742,17,0,35398,43984,47085,51701 +4,84.2,279,1604,33669,233,11731,18,1,47045,56545,59225,64886 +2,87.2,235,1973,9151,190,11346,17,0,21874,28557,29768,34766 +``` + +NOTE: The rows in the CSV file are sorted in an increasing order of throughput +(Inferences/Second). + +You can import the CSV file into a spreadsheet to help visualize the latency vs +inferences/second tradeoff as well as see some components of the latency. Follow +these steps: + +- Open + [this spreadsheet](https://docs.google.com/spreadsheets/d/1S8h0bWBBElHUoLd2SOvQPzZzRiQ55xjyqodm_9ireiw) +- Make a copy from the File menu "Make a copy..." +- Open the copy +- Select the A1 cell on the "Raw Data" tab +- From the File menu select "Import..." +- Select "Upload" and upload the file +- Select "Replace data at selected cell" and then select the "Import data" + button + +## Server-side Prometheus metrics + +Perf Analyzer can collect +[server-side metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md#gpu-metrics), +such as GPU utilization and GPU power usage. To enable the collection of these +metrics, use the [`--collect-metrics`](cli.md#--collect-metrics) option. + +By default, Perf Analyzer queries the metrics endpoint at the URL +`localhost:8002/metrics`. If the metrics are accessible at a different url, use +the [`--metrics-url=`](cli.md#--metrics-urlurl) option to specify that. + +By default, Perf Analyzer queries the metrics endpoint every 1000 milliseconds. +To use a different querying interval, use the +[`--metrics-interval=`](cli.md#--metrics-intervaln) option (specify in +milliseconds). + +Because Perf Analyzer can collect the server-side metrics multiple times per +run, these metrics are aggregated in specific ways to produce one final number +per searched concurrency or request rate. Here are how the metrics are +aggregated: + +| Metric | Aggregation | +| - | - | +| GPU Utilization | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. | +| GPU Power Usage | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. | +| GPU Used Memory | Maximum from all collections taken during a stable pass. Users are typically curious what the peak memory usage is for determining model/hardware viability. | +| GPU Total Memory | First from any collection taken during a stable pass. All of the collections should produce the same value for total memory available on the GPU. | + +Note that all metrics are per-GPU in the case of multi-GPU systems. + +To output these server-side metrics to a CSV file, use the +[`-f `](cli.md#-f-path) and [`--verbose-csv`](cli.md#--verbose-csv) +options. The output CSV will contain one column per metric. The value of each +column will be a `key:value` pair (`GPU UUID:metric value`). Each `key:value` +pair will be delimited by a semicolon (`;`) to indicate metric values for each +GPU accessible by the server. There is a trailing semicolon. 
See below: + +`:;:;...;` + +Here is a simplified CSV output: + +``` +$ perf_analyzer -m resnet50_libtorch --collect-metrics -f output.csv --verbose-csv +$ cat output.csv +Concurrency,...,Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory +1,...,gpu_uuid_0:0.33;gpu_uuid_1:0.5;,gpu_uuid_0:55.3;gpu_uuid_1:56.9;,gpu_uuid_0:10000;gpu_uuid_1:11000;,gpu_uuid_0:50000;gpu_uuid_1:75000;, +2,...,gpu_uuid_0:0.25;gpu_uuid_1:0.6;,gpu_uuid_0:25.6;gpu_uuid_1:77.2;,gpu_uuid_0:11000;gpu_uuid_1:17000;,gpu_uuid_0:50000;gpu_uuid_1:75000;, +3,...,gpu_uuid_0:0.87;gpu_uuid_1:0.9;,gpu_uuid_0:87.1;gpu_uuid_1:71.7;,gpu_uuid_0:15000;gpu_uuid_1:22000;,gpu_uuid_0:50000;gpu_uuid_1:75000;, +``` + +## Communication Protocol + +By default, Perf Analyzer uses HTTP to communicate with Triton. The gRPC +protocol can be specified with the [`-i [http|grpc]`](cli.md#-i-httpgrpc) +option. If gRPC is selected the [`--streaming`](cli.md#--streaming) option can +also be specified for gRPC streaming. + +### SSL/TLS Support + +Perf Analyzer can be used to benchmark Triton service behind SSL/TLS-enabled +endpoints. These options can help in establishing secure connection with the +endpoint and profile the server. + +For gRPC, see the following options: + +- [`--ssl-grpc-use-ssl`](cli.md#--ssl-grpc-use-ssl) +- [`--ssl-grpc-root-certifications-file=`](cli.md#--ssl-grpc-root-certifications-filepath) +- [`--ssl-grpc-private-key-file=`](cli.md#--ssl-grpc-private-key-filepath) +- [`--ssl-grpc-certificate-chain-file=`](cli.md#--ssl-grpc-certificate-chain-filepath) + +More details here: +https://grpc.github.io/grpc/cpp/structgrpc_1_1_ssl_credentials_options.html + +The +[inference protocol gRPC SSL/TLS section](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#ssltls) +describes server-side options to configure SSL/TLS in Triton's gRPC endpoint. + +For HTTPS, the following options are exposed: + +- [`--ssl-https-verify-peer`](cli.md#--ssl-https-verify-peer01) +- [`--ssl-https-verify-host`](cli.md#--ssl-https-verify-host012) +- [`--ssl-https-ca-certificates-file`](cli.md#--ssl-https-ca-certificates-filepath) +- [`--ssl-https-client-certificate-file`](cli.md#--ssl-https-client-certificate-filepath) +- [`--ssl-https-client-certificate-type`](cli.md#--ssl-https-client-certificate-typepemder) +- [`--ssl-https-private-key-file`](cli.md#--ssl-https-private-key-filepath) +- [`--ssl-https-private-key-type`](cli.md#--ssl-https-private-key-typepemder) + +See [`--help`](cli.md#--help) for full documentation. + +Unlike gRPC, Triton's HTTP server endpoint can not be configured with SSL/TLS +support. + +Note: Just providing these `--ssl-http-*` options to Perf Analyzer does not +ensure that SSL/TLS is used in communication. If SSL/TLS is not enabled on the +service endpoint, these options have no effect. The intent of exposing these +options to a user of Perf Analyzer is to allow them to configure Perf Analyzer +to benchmark a Triton service behind SSL/TLS-enabled endpoints. In other words, +if Triton is running behind a HTTPS server proxy, then these options would allow +Perf Analyzer to profile Triton via exposed HTTPS proxy. diff --git a/docs/quick_start.md b/docs/quick_start.md new file mode 100644 index 00000000..17d63f56 --- /dev/null +++ b/docs/quick_start.md @@ -0,0 +1,114 @@ + + +# Quick Start + +The steps below will guide you on how to start using Perf Analyzer. + +### Step 1: Start Triton Container + +```bash +export RELEASE= # e.g. 
to use the release from the end of February of 2023, do `export RELEASE=23.02` + +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3 + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3 +``` + +### Step 2: Download `simple` Model + +```bash +# inside triton container +git clone --depth 1 https://github.com/triton-inference-server/server + +mkdir model_repository ; cp -r server/docs/examples/model_repository/simple model_repository +``` + +### Step 3: Start Triton Server + +```bash +# inside triton container +tritonserver --model-repository $(pwd)/model_repository &> server.log & + +# confirm server is ready, look for 'HTTP/1.1 200 OK' +curl -v localhost:8000/v2/health/ready + +# detach (CTRL-p CTRL-q) +``` + +### Step 4: Start Triton SDK Container + +```bash +docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk + +docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +### Step 5: Run Perf Analyzer + +```bash +# inside sdk container +perf_analyzer -m simple +``` + +### Step 6: Observe and Analyze Output + +``` +$ perf_analyzer -m simple +*** Measurement Settings *** + Batch size: 1 + Service Kind: Triton + Using "time_windows" mode for stabilization + Measurement window: 5000 msec + Using synchronous calls for inference + Stabilizing using average latency + +Request concurrency: 1 + Client: + Request count: 25348 + Throughput: 1407.84 infer/sec + Avg latency: 708 usec (standard deviation 663 usec) + p50 latency: 690 usec + p90 latency: 881 usec + p95 latency: 926 usec + p99 latency: 1031 usec + Avg HTTP time: 700 usec (send/recv 102 usec + response wait 598 usec) + Server: + Inference count: 25348 + Execution count: 25348 + Successful request count: 25348 + Avg request latency: 382 usec (overhead 41 usec + queue 41 usec + compute input 26 usec + compute infer 257 usec + compute output 16 usec) + +Inferences/Second vs. Client Average Batch Latency +Concurrency: 1, throughput: 1407.84 infer/sec, latency 708 usec +``` + +We can see from the output that the model was able to complete approximately +1407.84 inferences per second, with an average latency of 708 microseconds per +inference request. Concurrency of 1 meant that Perf Analyzer attempted to always +have 1 outgoing request at all times. diff --git a/doctest.h b/doctest.h index 56778b01..adda4134 100644 --- a/doctest.h +++ b/doctest.h @@ -166,7 +166,7 @@ // ================================================================================================= // both the header and the implementation suppress all of these, -// so it only makes sense to aggregrate them like so +// so it only makes sense to aggregate them like so #define DOCTEST_SUPPRESS_COMMON_WARNINGS_PUSH \ DOCTEST_CLANG_SUPPRESS_WARNING_PUSH \ DOCTEST_CLANG_SUPPRESS_WARNING("-Wunknown-pragmas") \ @@ -897,8 +897,7 @@ struct ContextOptions //! 
OCLINT too many fields namespace detail { template -struct enable_if { -}; +struct enable_if {}; template struct enable_if { @@ -910,9 +909,9 @@ struct enable_if { template struct remove_reference { typedef T type; }; template struct remove_reference { typedef T type; }; - template U declval(int); + template U declval(int); - template T declval(long); + template T declval(long); template auto declval() DOCTEST_NOEXCEPT -> decltype(declval(0)) ; @@ -1241,8 +1240,7 @@ namespace detail { #endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING // clang-format on -struct DOCTEST_INTERFACE TestFailureException { -}; +struct DOCTEST_INTERFACE TestFailureException {}; DOCTEST_INTERFACE bool checkIfShouldThrow(assertType::Enum at); @@ -1280,10 +1278,8 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") // global scope is defined after this template, the template won't be // instantiated due to SFINAE. Once the template is not instantiated it can look // for global operator using normal conversions. -#define SFINAE_OP(ret, op) \ - decltype( \ - (void)(doctest::detail::declval() op doctest::detail::declval()), \ - ret{}) +#define SFINAE_OP(ret, op) \ + decltype((void)(doctest::detail::declval() op doctest::detail::declval()), ret{}) #define DOCTEST_DO_BINARY_EXPRESSION_COMPARISON(op, op_str, op_macro) \ template \ @@ -1798,8 +1794,7 @@ struct StringStreamBase { template struct StringStream - : public StringStreamBase::value> { -}; + : public StringStreamBase::value> {}; template void @@ -1935,7 +1930,8 @@ DOCTEST_DEFINE_DECORATOR(should_fail, bool, true); DOCTEST_DEFINE_DECORATOR(expected_failures, int, 0); template -int registerExceptionTranslator(String (*translateFunction)(T)) +int +registerExceptionTranslator(String (*translateFunction)(T)) { DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wexit-time-destructors") static detail::ExceptionTranslator exceptionTranslator(translateFunction); @@ -1956,7 +1952,8 @@ DOCTEST_INTERFACE doctest::detail::TestSuite& getCurrentTestSuite(); namespace doctest { #else // DOCTEST_CONFIG_DISABLE template -int registerExceptionTranslator(String (*)(T)) +int +registerExceptionTranslator(String (*)(T)) { return 0; } @@ -2181,7 +2178,10 @@ registerReporter(const char* name, int priority, bool isReporter) static void f() #define DOCTEST_CREATE_AND_REGISTER_FUNCTION_IN_CLASS(f, proxy, decorators) \ - static doctest::detail::funcType proxy() { return f; } \ + static doctest::detail::funcType proxy() \ + { \ + return f; \ + } \ DOCTEST_REGISTER_FUNCTION(inline, proxy(), decorators) \ static void f() @@ -3271,41 +3271,41 @@ DOCTEST_MSVC_SUPPRESS_WARNING( DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN // required includes - will go only in one translation unit! 
-#include -#include #include +#include +#include // borland (Embarcadero) compiler requires math.h and not cmath - // https://github.com/doctest/doctest/pull/37 #ifdef __BORLANDC__ #include #endif // __BORLANDC__ -#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include +#include #include -#include -#include -#include #include -#include -#include +#include +#include +#include #include +#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #ifdef DOCTEST_PLATFORM_MAC +#include #include #include -#include #endif // DOCTEST_PLATFORM_MAC #ifdef DOCTEST_PLATFORM_WINDOWS @@ -3919,12 +3919,14 @@ String::operator=(String&& other) return *this; } -char String::operator[](unsigned i) const +char +String::operator[](unsigned i) const { return const_cast(this)->operator[](i); // NOLINT } -char& String::operator[](unsigned i) +char& +String::operator[](unsigned i) { if (isOnStack()) return reinterpret_cast(buf)[i]; @@ -4164,7 +4166,8 @@ DOCTEST_TO_STRING_OVERLOAD(int long unsigned, "%lu") DOCTEST_TO_STRING_OVERLOAD(int long long, "%lld") DOCTEST_TO_STRING_OVERLOAD(int long long unsigned, "%llu") -String toString(std::nullptr_t) +String +toString(std::nullptr_t) { return "NULL"; } @@ -4324,7 +4327,10 @@ void Context::setAsDefaultForAssertsOutOfTestCases() { } -void Context::setAssertHandler(detail::assert_handler) {} +void +Context::setAssertHandler(detail::assert_handler) +{ +} void Context::setCout(std::ostream* out) { @@ -4488,7 +4494,7 @@ wildcmp(const char* str, const char* wild, bool caseSensitive) } //// C string hash function (djb2) - taken from -///http://www.cse.yorku.ca/~oz/hash.html +/// http://www.cse.yorku.ca/~oz/hash.html // unsigned hashStr(unsigned const char* str) { // unsigned long hash = 5381; // char c; @@ -4601,7 +4607,8 @@ Result::Result(bool passed, const String& decomposition) ExpressionDecomposer::ExpressionDecomposer(assertType::Enum at) : m_at(at) {} -TestSuite& TestSuite::operator*(const char* in) +TestSuite& +TestSuite::operator*(const char* in) { m_test_suite = in; return *this; @@ -4652,7 +4659,8 @@ TestCase::operator=(const TestCase& other) } DOCTEST_MSVC_SUPPRESS_WARNING_POP -TestCase& TestCase::operator*(const char* in) +TestCase& +TestCase::operator*(const char* in) { m_name = in; // make a new name with an appended type for templated test case @@ -6220,7 +6228,8 @@ fulltext_log_assert_to_stream(std::ostream& s, const AssertData& rb) : "did NOT throw at all!") << Color::Cyan << rb.m_exception << "\n"; } else if (rb.m_at & assertType::is_throws_with) { //! OCLINT bitwise - //! operator in conditional + //! operator in + //! conditional s << Color::Cyan << assertString(rb.m_at) << "( " << rb.m_expr << ", \"" << rb.m_exception_string << "\" ) " << Color::None << (rb.m_threw ? (!rb.m_failed ? "threw as expected!" @@ -6542,8 +6551,9 @@ struct ConsoleReporter : public IReporter { Color::Enum getSuccessOrFailColor(bool success, assertType::Enum at) { - return success ? Color::BrightGreen - : (at & assertType::is_warn) ? Color::Yellow : Color::Red; + return success ? Color::BrightGreen + : (at & assertType::is_warn) ? 
Color::Yellow + : Color::Red; } void successOrFailColoredStringToStream( @@ -7192,10 +7202,10 @@ parseIntOption( if (type == 0) { // boolean - const char positive[][5] = {"1", "true", "on", - "yes"}; // 5 - strlen("true") + 1 - const char negative[][6] = {"0", "false", "off", - "no"}; // 6 - strlen("false") + 1 + const char positive[][5] = { + "1", "true", "on", "yes"}; // 5 - strlen("true") + 1 + const char negative[][6] = { + "0", "false", "off", "no"}; // 6 - strlen("false") + 1 // if the value matches any of the positive/negative possibilities for (unsigned i = 0; i < 4; i++) { @@ -7689,8 +7699,8 @@ Context::run() #ifndef DOCTEST_CONFIG_NO_EXCEPTIONS try { #endif // DOCTEST_CONFIG_NO_EXCEPTIONS - // MSVC 2015 diagnoses fatalConditionHandler as unused (because - // reset() is a static method) + // MSVC 2015 diagnoses fatalConditionHandler as unused (because + // reset() is a static method) DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH( 4101) // unreferenced local variable FatalConditionHandler fatalConditionHandler; // Handle signals diff --git a/fifo_ctx_id_tracker.h b/fifo_ctx_id_tracker.h new file mode 100644 index 00000000..750fc63b --- /dev/null +++ b/fifo_ctx_id_tracker.h @@ -0,0 +1,48 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#pragma once + +#include "base_queue_ctx_id_tracker.h" + +namespace triton { namespace perfanalyzer { + +// Context ID Tracker that reuses IDs in a roughly round-robin manner using a +// FIFO +// +class FifoCtxIdTracker : public BaseQueueCtxIdTracker { + public: + FifoCtxIdTracker() = default; + void Reset(size_t count) override + { + Clear(); + + for (size_t i = 0; i < count; ++i) { + free_ctx_ids_.push(i); + } + } +}; + +}}; // namespace triton::perfanalyzer diff --git a/genai-perf/.gitignore b/genai-perf/.gitignore new file mode 100644 index 00000000..d4f588ed --- /dev/null +++ b/genai-perf/.gitignore @@ -0,0 +1 @@ +artifacts/ diff --git a/genai-perf/README.md b/genai-perf/README.md new file mode 100644 index 00000000..24c1efe3 --- /dev/null +++ b/genai-perf/README.md @@ -0,0 +1,515 @@ + + +# GenAI-Perf + +GenAI-Perf is a command line tool for measuring the throughput and latency of +generative AI models as served through an inference server. For large language +models (LLMs), GenAI-Perf provides metrics such as +[output token throughput](#output_token_throughput_metric), +[time to first token](#time_to_first_token_metric), +[inter token latency](#inter_token_latency_metric), and +[request throughput](#request_throughput_metric). For a full list of metrics +please see the [Metrics section](#metrics). + +Users specify a model name, an inference server URL, the type of inputs to use +(synthetic or from dataset), and the type of load to generate (number of +concurrent requests, request rate). + +GenAI-Perf generates the specified load, measures the performance of the +inference server and reports the metrics in a simple table as console output. +The tool also logs all results in a csv file that can be used to derive +additional metrics and visualizations. The inference server must already be +running when GenAI-Perf is run. + +> [!Note] +> GenAI-Perf is currently in early release and under rapid development. While we +> will try to remain consistent, command line options and functionality are +> subject to change as the tool matures. + +# Installation + +## Triton SDK Container + +Available starting with the 24.03 release of the +[Triton Server SDK container](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver). + +Run the Triton Inference Server SDK docker container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +
+ +Alternatively, to install from source: + +## From Source + +GenAI-Perf depends on Perf Analyzer. Here is how to install Perf Analyzer: + +### Install Perf Analyzer (Ubuntu, Python 3.8+) + +Note: you must already have CUDA 12 installed. + +```bash +pip install tritonclient + +apt update && apt install -y --no-install-recommends libb64-0d libcurl4 +``` + +Alternatively, you can install Perf Analyzer +[from source](../docs/install.md#build-from-source). + +### Install GenAI-Perf from source + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +pip install "git+https://github.com/triton-inference-server/client.git@r${RELEASE}#subdirectory=src/c++/perf_analyzer/genai-perf" +``` + +
+
+ +Run GenAI-Perf: + +```bash +genai-perf --help +``` + +# Quick Start + +## Measuring Throughput and Latency of GPT2 using Triton + TensorRT-LLM + +### Running GPT2 on Triton Inference Server using TensorRT-LLM + +
+See instructions + +1. Run Triton Inference Server with TensorRT-LLM backend container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 +``` + +2. Install Triton CLI (~5 min): + +```bash +pip install \ + --extra-index-url https://pypi.nvidia.com \ + -U \ + psutil \ + "pynvml>=11.5.0" \ + torch==2.1.2 \ + tensorrt_llm==0.8.0 \ + "git+https://github.com/triton-inference-server/triton_cli@0.0.6" +``` + +3. Download model: + +```bash +triton import -m gpt2 --backend tensorrtllm +``` + +4. Run server: + +```bash +triton start +``` + +
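+Optionally, you can verify that the server is ready before benchmarking. This
+assumes Triton's default HTTP port (8000); adjust if you changed it:
+
+```bash
+# Expect an HTTP 200 response once the server is ready to receive requests
+curl -v localhost:8000/v2/health/ready
+```
+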
+ +### Running GenAI-Perf + +1. Run Triton Inference Server SDK container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +2. Run GenAI-Perf: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 13,266,974 │ 11,818,732 │ 18,351,779 │ 16,513,479 │ 13,741,986 │ 13,544,376 │ +│ Inter token latency (ns) │ 2,069,766 │ 42,023 │ 15,307,799 │ 3,256,375 │ 3,020,580 │ 2,090,930 │ +│ Request latency (ns) │ 223,532,625 │ 219,123,330 │ 241,004,192 │ 238,198,306 │ 229,676,183 │ 224,715,918 │ +│ Output sequence length │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 460.42 +Request throughput (per sec): 4.44 +``` + +See [Tutorial](docs/tutorial.md) for additional examples. + +
+ +# Visualization + +GenAI-Perf can also generate various plots that visualize the performance of the +current profile run. This is disabled by default but users can easily enable it +by passing the `--generate-plots` option when running the benchmark: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --streaming \ + --concurrency 1 \ + --generate-plots +``` + +This will generate a [set of default plots](docs/compare.md#example-plots) such as: +- Time to first token (TTFT) analysis +- Request latency analysis +- TTFT vs Input sequence lengths +- Inter token latencies vs Token positions +- Input sequence lengths vs Output sequence lengths + + +## Using `compare` Subcommand to Visualize Multiple Runs + +The `compare` subcommand in GenAI-Perf facilitates users in comparing multiple +profile runs and visualizing the differences through plots. + +### Usage +Assuming the user possesses two profile export JSON files, +namely `profile1.json` and `profile2.json`, +they can execute the `compare` subcommand using the `--files` option: + +```bash +genai-perf compare --files profile1.json profile2.json +``` + +Executing the above command will perform the following actions under the +`compare` directory: +1. Generate a YAML configuration file (e.g. `config.yaml`) containing the +metadata for each plot generated during the comparison process. +2. Automatically generate the [default set of plots](docs/compare.md#example-plots) +(e.g. TTFT vs. Input Sequence Lengths) that compare the two profile runs. + +``` +compare +├── config.yaml +├── distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg +├── request_latency.jpeg +├── time_to_first_token.jpeg +├── time_to_first_token_vs_input_sequence_lengths.jpeg +├── token-to-token_latency_vs_output_token_position.jpeg +└── ... +``` + +### Customization +Users have the flexibility to iteratively modify the generated YAML configuration +file to suit their specific requirements. +They can make alterations to the plots according to their preferences and execute +the command with the `--config` option followed by the path to the modified +configuration file: + +```bash +genai-perf compare --config compare/config.yaml +``` + +This command will regenerate the plots based on the updated configuration settings, +enabling users to refine the visual representation of the comparison results as +per their needs. + +See [Compare documentation](docs/compare.md) for more details. + +
+ +# Model Inputs + +GenAI-Perf supports model input prompts from either synthetically generated +inputs, or from the HuggingFace +[OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) or +[CNN_DailyMail](https://huggingface.co/datasets/cnn_dailymail) datasets. This is +specified using the `--input-dataset` CLI option. + +When the dataset is synthetic, you can specify the following options: +* `--num-prompts `: The number of unique prompts to generate as stimulus, >= 1. +* `--synthetic-input-tokens-mean `: The mean of number of tokens in the + generated prompts when using synthetic data, >= 1. +* `--synthetic-input-tokens-stddev `: The standard deviation of number of + tokens in the generated prompts when using synthetic data, >= 0. +* `--random-seed `: The seed used to generate random values, >= 0. + +When the dataset is coming from HuggingFace, you can specify the following +options: +* `--input-dataset {openorca,cnn_dailymail}`: HuggingFace dataset to use for + benchmarking. +* `--num-prompts `: The number of unique prompts to generate as stimulus, >= 1. + +When the dataset is coming from a file, you can specify the following +options: +* `--input-file `: The input file containing the single prompt to + use for benchmarking. + +For any dataset, you can specify the following options: +* `--output-tokens-mean `: The mean number of tokens in each output. Ensure + the `--tokenizer` value is set correctly, >= 1. +* `--output-tokens-stddev `: The standard deviation of the number of tokens + in each output. This is only used when output-tokens-mean is provided, >= 1. +* `--output-tokens-mean-deterministic`: When using `--output-tokens-mean`, this + flag can be set to improve precision by setting the minimum number of tokens + equal to the requested number of tokens. This is currently supported with the + Triton service-kind. Note that there is still some variability in the + requested number of output tokens, but GenAi-Perf attempts its best effort + with your model to get the right number of output tokens. + +You can optionally set additional model inputs with the following option: +* `--extra-inputs :`: An additional input for use with the + model with a singular value, such as `stream:true` or `max_tokens:5`. This + flag can be repeated to supply multiple extra inputs. + +
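+As a sketch of how these options combine, the following command benchmarks with
+prompts sampled from OpenOrca and passes one extra request input. The model
+name, URL, and the `max_tokens` input name are illustrative; use values that
+match your deployment and what your model actually accepts:
+
+```bash
+genai-perf \
+  -m gpt2 \
+  --service-kind triton \
+  --backend tensorrtllm \
+  --input-dataset openorca \
+  --num-prompts 100 \
+  --random-seed 123 \
+  --extra-inputs max_tokens:100 \
+  --url localhost:8001
+```
+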
+ +# Metrics + +GenAI-Perf collects a diverse set of metrics that captures the performance of +the inference server. + +| Metric | Description | Aggregations | +| - | - | - | +| Time to First Token | Time between when a request is sent and when its first response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Inter Token Latency | Time between intermediate responses for a single request divided by the number of generated tokens of the latter response, one value per response per request in benchmark | Avg, min, max, p99, p90, p75 | +| Request Latency | Time between when a request is sent and when its final response is received, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Output Sequence Length | Total number of output tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Input Sequence Length | Total number of input tokens of a request, one value per request in benchmark | Avg, min, max, p99, p90, p75 | +| Output Token Throughput | Total number of output tokens from benchmark divided by benchmark duration | None–one value per benchmark | +| Request Throughput | Number of final responses from benchmark divided by benchmark duration | None–one value per benchmark | + +
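+To make these definitions concrete, here is a minimal, illustrative Python
+sketch (not GenAI-Perf's implementation; all names are hypothetical) that
+derives a few of the metrics above from per-request response timestamps:
+
+```python
+# Each request record holds the send time, the arrival time of every streamed
+# response, and the number of output tokens in each response (all hypothetical).
+def summarize(requests, benchmark_duration_s):
+    ttfts, itls, latencies, output_tokens = [], [], [], 0
+    for r in requests:
+        times, tokens = r["response_times"], r["tokens_per_response"]
+        ttfts.append(times[0] - r["send_time"])       # time to first token
+        latencies.append(times[-1] - r["send_time"])  # request latency
+        output_tokens += sum(tokens)
+        # inter token latency: gap between consecutive responses divided by
+        # the token count of the latter response
+        for prev, cur, n in zip(times, times[1:], tokens[1:]):
+            if n > 0:
+                itls.append((cur - prev) / n)
+    return {
+        "time_to_first_token_avg": sum(ttfts) / len(ttfts),
+        "inter_token_latency_avg": sum(itls) / len(itls) if itls else None,
+        "request_latency_avg": sum(latencies) / len(latencies),
+        "output_token_throughput": output_tokens / benchmark_duration_s,
+        "request_throughput": len(requests) / benchmark_duration_s,
+    }
+```
+
+In practice the raw timestamps come from the Perf Analyzer profile export, and
+GenAI-Perf reports min/max and percentile aggregations in addition to the
+averages shown here.
+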
+ +# Command Line Options + +##### `-h` +##### `--help` + +Show the help message and exit. + +## Endpoint Options: + +##### `-m ` +##### `--model ` + +The names of the models to benchmark. +A single model is recommended, unless you are +[profiling multiple LoRA adapters](docs/lora.md). (default: `None`) + +##### `--model-selection-strategy {round_robin, random}` + +When multiple models are specified, this is how a specific model +is assigned to a prompt. Round robin means that each model receives +a request in order. Random means that assignment is uniformly random +(default: `round_robin`) + +##### `--backend {tensorrtllm,vllm}` + +When using the "triton" service-kind, this is the backend of the model. For the +TRT-LLM backend, you currently must set `exclude_input_in_output` to true in the +model config to not echo the input tokens in the output. (default: tensorrtllm) + +##### `--endpoint ` + +Set a custom endpoint that differs from the OpenAI defaults. (default: `None`) + +##### `--endpoint-type {chat,completions}` + +The endpoint-type to send requests to on the server. This is only used with the +`openai` service-kind. (default: `None`) + +##### `--service-kind {triton,openai}` + +The kind of service perf_analyzer will generate load for. In order to use +`openai`, you must specify an api via `--endpoint-type`. (default: `triton`) + +##### `--streaming` + +An option to enable the use of the streaming API. (default: `False`) + +##### `-u ` +##### `--url ` + +URL of the endpoint to target for benchmarking. (default: `None`) + +## Input Options + +##### `--extra-inputs ` + +Provide additional inputs to include with every request. You can repeat this +flag for multiple inputs. Inputs should be in an input_name:value format. +Alternatively, a string representing a json formatted dict can be provided. +(default: `None`) + +##### `--input-dataset {openorca,cnn_dailymail}` + +The HuggingFace dataset to use for prompts. +(default: `openorca`) + +##### `--input-file ` + +The input file containing the prompts to use for profiling. +Each line should be a JSON object with a 'text_input' field in JSONL format. +Example: {\"text_input\": \"Your prompt here\"}" + +##### `--num-prompts ` + +The number of unique prompts to generate as stimulus. (default: `100`) + +##### `--output-tokens-mean ` + +The mean number of tokens in each output. Ensure the `--tokenizer` value is set +correctly. (default: `-1`) + +##### `--output-tokens-mean-deterministic` + +When using `--output-tokens-mean`, this flag can be set to improve precision by +setting the minimum number of tokens equal to the requested number of tokens. +This is currently supported with the Triton service-kind. Note that there is +still some variability in the requested number of output tokens, but GenAi-Perf +attempts its best effort with your model to get the right number of output +tokens. (default: `False`) + +##### `--output-tokens-stddev ` + +The standard deviation of the number of tokens in each output. This is only used +when `--output-tokens-mean` is provided. (default: `0`) + +##### `--random-seed ` + +The seed used to generate random values. (default: `0`) + +##### `--synthetic-input-tokens-mean ` + +The mean of number of tokens in the generated prompts when using synthetic +data. (default: `550`) + +##### `--synthetic-input-tokens-stddev ` + +The standard deviation of number of tokens in the generated prompts when +using synthetic data. (default: `0`) + +## Profiling Options + +##### `--concurrency ` + +The concurrency value to benchmark. 
(default: `None`) + +##### `--measurement-interval ` +##### `-p ` + +The time interval used for each measurement in milliseconds. Perf Analyzer +will sample a time interval specified and take measurement over the requests +completed within that time interval. (default: `10000`) + +##### `--request-rate ` + +Sets the request rate for the load generated by PA. (default: `None`) + +##### `-s ` +##### `--stability-percentage ` + +The allowed variation in latency measurements when determining if a result is +stable. The measurement is considered as stable if the ratio of max / min from +the recent 3 measurements is within (stability percentage) in terms of both +infer per second and latency. (default: `999`) + +## Output Options + +##### `--artifact-dir` + +The directory to store all the (output) artifacts generated by GenAI-Perf and +Perf Analyzer. (default: `artifacts`) + +##### `--generate-plots` + +An option to enable the generation of plots. (default: False) + +##### `--profile-export-file ` + +The path where the perf_analyzer profile export will be generated. By default, +the profile export will be to `profile_export.json`. The genai-perf file will be +exported to `_genai_perf.csv`. For example, if the profile +export file is `profile_export.json`, the genai-perf file will be exported to +`profile_export_genai_perf.csv`. (default: `profile_export.json`) + +## Other Options + +##### `--tokenizer ` + +The HuggingFace tokenizer to use to interpret token metrics from prompts and +responses. (default: `hf-internal-testing/llama-tokenizer`) + +##### `-v` +##### `--verbose` + +An option to enable verbose mode. (default: `False`) + +##### `--version` + +An option to print the version and exit. + +# Known Issues + +* GenAI-Perf can be slow to finish if a high request-rate is provided +* Token counts may not be exact diff --git a/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg b/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg new file mode 100644 index 00000000..1f9b2cba Binary files /dev/null and b/genai-perf/docs/assets/distribution_of_input_sequence_lengths_to_output_sequence_lengths.jpeg differ diff --git a/genai-perf/docs/assets/request_latency.jpeg b/genai-perf/docs/assets/request_latency.jpeg new file mode 100644 index 00000000..d681195f Binary files /dev/null and b/genai-perf/docs/assets/request_latency.jpeg differ diff --git a/genai-perf/docs/assets/time_to_first_token.jpeg b/genai-perf/docs/assets/time_to_first_token.jpeg new file mode 100644 index 00000000..99ca06ee Binary files /dev/null and b/genai-perf/docs/assets/time_to_first_token.jpeg differ diff --git a/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg b/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg new file mode 100644 index 00000000..1b81ef53 Binary files /dev/null and b/genai-perf/docs/assets/time_to_first_token_vs_input_sequence_lengths.jpeg differ diff --git a/genai-perf/docs/assets/token-to-token_latency_vs_output_token_position.jpeg b/genai-perf/docs/assets/token-to-token_latency_vs_output_token_position.jpeg new file mode 100644 index 00000000..4a179ef8 Binary files /dev/null and b/genai-perf/docs/assets/token-to-token_latency_vs_output_token_position.jpeg differ diff --git a/genai-perf/docs/compare.md b/genai-perf/docs/compare.md new file mode 100644 index 00000000..5d1a3641 --- /dev/null +++ b/genai-perf/docs/compare.md @@ -0,0 +1,251 @@ + + +# GenAI-Perf Compare Subcommand + 
+There are two approaches for the users to use the `compare` subcommand to create +plots across multiple runs. First is to directly pass the profile export files +with `--files` option + +## Running initially with `--files` option + +If the user does not have a YAML configuration file, +they can run the `compare` subcommand with the `--files` option to generate a +set of default plots as well as a pre-filled YAML config file for the plots. + +```bash +genai-perf compare --files profile1.json profile2.json profile3.json +``` + +This will generate the default plots and compare across the three runs. +GenAI-Perf will also generate an initial YAML configuration file `config.yaml` +that is pre-filled with plot configurations as following: + +```yaml +plot1: + title: Time to First Token + x_metric: '' + y_metric: time_to_first_tokens + x_label: Time to First Token (ms) + y_label: '' + width: 1200 + height: 700 + type: box + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot2: + title: Request Latency + x_metric: '' + y_metric: request_latencies + x_label: Request Latency (ms) + y_label: '' + width: 1200 + height: 700 + type: box + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot3: + title: Distribution of Input Sequence Lengths to Output Sequence Lengths + x_metric: input_sequence_lengths + y_metric: output_sequence_lengths + x_label: Input Sequence Length + y_label: Output Sequence Length + width: 1200 + height: 450 + type: heatmap + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot4: + title: Time to First Token vs Input Sequence Lengths + x_metric: input_sequence_lengths + y_metric: time_to_first_tokens + x_label: Input Sequence Length + y_label: Time to First Token (ms) + width: 1200 + height: 700 + type: scatter + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +plot5: + title: Token-to-Token Latency vs Output Token Position + x_metric: token_positions + y_metric: inter_token_latencies + x_label: Output Token Position + y_label: Token-to-Token Latency (ms) + width: 1200 + height: 700 + type: scatter + paths: + - profile1.json + - profile2.json + - profile3.json + output: compare +``` + +Once the user has the YAML configuration file, +they can repeat the process of editing the config file and running with +`--config` option to re-generate the plots iteratively. + +```bash +# edit +vi config.yaml + +# re-generate the plots +genai-perf compare --config config.yaml +``` + +## Running directly with `--config` option + +If the user would like to create a custom plot (other than the default ones provided), +they can build their own YAML configuration file that contains the information +about the plots they would like to generate. 
+For instance, if the user would like to see how the inter token latencies change +by the number of output tokens, which is not part of the default plots, +they could add the following YAML block to the file: + +```yaml +plot1: + title: Inter Token Latency vs Output Tokens + x_metric: num_output_tokens + y_metric: inter_token_latencies + x_label: Num Output Tokens + y_label: Avg ITL (ms) + width: 1200 + height: 450 + type: scatter + paths: + - + - + output: compare +``` + +After adding the lines, the user can run the following command to generate the +plots specified in the configuration file (in this case, `config.yaml`): + +```bash +genai-perf compare --config config.yaml +``` + +The user can check the generated plots under the output directory: +``` +compare/ +├── inter_token_latency_vs_output_tokens.jpeg +└── ... +``` + +## YAML Schema + +Here are more details about the YAML configuration file and its stricture. +The general YAML schema for the plot configuration looks as following: + +```yaml +plot1: + title: [str] + x_metric: [str] + y_metric: [str] + x_label: [str] + y_label: [str] + width: [int] + height: [int] + type: [scatter,box,heatmap] + paths: + - [str] + - ... + output: [str] + +plot2: + title: [str] + x_metric: [str] + y_metric: [str] + x_label: [str] + y_label: [str] + width: [int] + height: [int] + type: [scatter,box,heatmap] + paths: + - [str] + - ... + output: [str] + +# add more plots +``` + +The user can add as many plots they would like to generate by adding the plot +blocks in the configuration file (they have a key pattern of `plot<#>`, +but that is not required and the user can set it to any arbitrary string). +For each plot block, the user can specify the following configurations: +- `title`: The title of the plot. +- `x_metric`: The name of the metric to be used on the x-axis. +- `y_metric`: The name of the metric to be used on the y-axis. +- `x_label`: The x-axis label (or description) +- `y_label`: The y-axis label (or description) +- `width`: The width of the entire plot +- `height`: The height of the entire plot +- `type`: The type of the plot. It must be one of the three: `scatter`, `box`, +or `heatmap`. +- `paths`: List of paths to the profile export files to compare. +- `output`: The path to the output directory to store all the plots and YAML +configuration file. + +> [!Note] +> User *MUST* provide at least one valid path to the profile export file. + + + +## Example Plots + +Here are the list of sample plots that gets created by default from running the +`compare` subcommand: + +### Distribution of Input Sequence Lengths to Output Sequence Lengths + + +### Request Latency Analysis + + +### Time to First Token Analysis + + +### Time to First Token vs. Input Sequence Lengths + + +### Token-to-Token Latency vs. Output Token Position + + diff --git a/genai-perf/docs/files.md b/genai-perf/docs/files.md new file mode 100644 index 00000000..6ebdf69f --- /dev/null +++ b/genai-perf/docs/files.md @@ -0,0 +1,129 @@ + + +# Generated File Structures + +## Overview + +This document serves as a guide to understanding the structure and contents of +the files generated by GenAi-Perf. + +## Directory Structure + +After running GenAi-Perf, your file tree should contain the following: + +``` +genai-perf/ +├── artifacts/ +│ ├── data/ +│ └── images/ +``` + +## File Types +Within the artifacts and docs directories, several file types are generated, +including .gzip, .csv, .json, .html, and .jpeg. Below is a detailed +explanation of each file and its purpose. 
+
+### Artifacts Directory
+
+#### Data Subdirectory
+
+The data subdirectory contains the raw and processed performance data files.
+
+##### GZIP Files
+
+- all_data.gzip: Aggregated performance data from all collected metrics.
+- input_sequence_lengths_vs_output_sequence_lengths.gzip: This contains data on
+the input sequence lengths versus the output sequence lengths for each request.
+- request_latency.gzip: This contains the latency for each request.
+- time_to_first_token.gzip: This contains the time to first token for each request.
+- token_to_token_vs_output_position.gzip: This contains the time from one token
+generation to the next versus the position of the output token for each token.
+- ttft_vs_input_sequence_lengths.gzip: This contains the time to first token
+versus the input sequence length for each request.
+
+##### JSON Files
+
+- llm_inputs.json: This contains the input prompts provided to the LLM during testing.
+- profile_export.json: This is provided by Perf Analyzer and contains the timestamps
+for each event in the lifecycle of each request. This is low-level data used by
+GenAI-Perf to calculate metrics.
+
+##### CSV File
+
+- profile_export_genai_perf.csv: A CSV of the output tables printed
+in the GenAI-Perf output. These may have more detail than the printed tables.
+
+#### Images Subdirectory
+
+The images subdirectory contains visual representations of the performance
+data. All images are provided in both HTML and JPEG formats.
+
+##### HTML and JPEG Files
+- input_sequence_lengths_vs_output_sequence_lengths: A heat map showing the
+relationship between input and generated tokens.
+- request_latency: A box plot showing request latency.
+- time_to_first_token: A box plot showing time to first token.
+- token_to_token_vs_output_position: A scatterplot showing token-to-token
+time versus output token position.
+- ttft_vs_input_sequence_lengths: A scatterplot showing time to first token
+versus input sequence length.
+
+## Usage Instructions
+
+To use the generated files, navigate to the artifacts/data directory. Then,
+the next steps depend on the file format you wish to work with.
+
+### GZIP Files
+
+The GZIP files contain Parquet files with calculated data, which can be read
+with Pandas in Python. For example, you can create a dataframe from one of
+these files:
+
+```python
+import pandas
+df = pandas.read_parquet(path_to_file)
+```
+
+You can then use Pandas to work with the data.
+
+```python
+print(df.head())      # See the first few rows of the data.
+print(df.describe())  # Get summary statistics for the data.
+```
+
+### CSV and JSON Files
+Open .csv and .json files with spreadsheet or JSON parsing tools for structured
+data analysis. These can also be read in a text editor, such as Vim.
+
+### HTML Files
+
+View .html visualizations in a web browser for interactive data exploration.
+
+### JPEG Files
+
+Use image viewing software to open .jpeg images for static visual representations.
diff --git a/genai-perf/docs/lora.md b/genai-perf/docs/lora.md
new file mode 100644
index 00000000..60be30c9
--- /dev/null
+++ b/genai-perf/docs/lora.md
@@ -0,0 +1,53 @@
+
+
+# Profiling Multiple LoRA Adapters
+GenAI-Perf allows you to profile multiple LoRA adapters on top of a base model.
+ +## Selecting LoRA Adapters +To do this, list multiple adapters after the model name option `-m`: + +```bash +genai-perf -m lora_adapter1 lora_adapter2 lora_adapter3 +``` + +## Choosing a Strategy for Selecting Models +When profiling with multiple models, you can specify how the models should be +assigned to prompts using the `--model-selection-strategy` option: + +```bash +genai-perf \ + -m lora_adapter1 lora_adapter2 lora_adapter3 \ + --model-selection-strategy round_robin +``` + +This setup will cycle through the lora_adapter1, lora_adapter2, and +lora_adapter3 models in a round-robin manner for each prompt. + +For more details on additional options and configurations, refer to the +[Command Line Options section](../README.md#command-line-options) in the README. \ No newline at end of file diff --git a/genai-perf/docs/tutorial.md b/genai-perf/docs/tutorial.md new file mode 100644 index 00000000..bc9dec71 --- /dev/null +++ b/genai-perf/docs/tutorial.md @@ -0,0 +1,330 @@ + + +# Tutorials + +- [Profile GPT2 running on Triton + TensorRT-LLM](#tensorrt-llm) +- [Profile GPT2 running on Triton + vLLM](#triton-vllm) +- [Profile GPT2 running on OpenAI API-Compatible Server](#openai) + +--- + +## Profile GPT2 running on Triton + TensorRT-LLM + +### Running GPT2 on Triton Inference Server using TensorRT-LLM + +
+See instructions + +1. Run Triton Inference Server with TensorRT-LLM backend container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-trtllm-python-py3 +``` + +2. Install Triton CLI (~5 min): + +```bash +pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" +``` + +3. Download model: + +```bash +triton import -m gpt2 --backend tensorrtllm +``` + +4. Run server: + +```bash +triton start +``` + +
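+Optionally, confirm that the gpt2 model has loaded before benchmarking
+(assuming Triton's default HTTP port, 8000):
+
+```bash
+# Expect an HTTP 200 response once the model is ready
+curl -v localhost:8000/v2/models/gpt2/ready
+```
+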
+ +### Running GenAI-Perf + +1. Run Triton Inference Server SDK container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +2. Run GenAI-Perf: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 13,266,974 │ 11,818,732 │ 18,351,779 │ 16,513,479 │ 13,741,986 │ 13,544,376 │ +│ Inter token latency (ns) │ 2,069,766 │ 42,023 │ 15,307,799 │ 3,256,375 │ 3,020,580 │ 2,090,930 │ +│ Request latency (ns) │ 223,532,625 │ 219,123,330 │ 241,004,192 │ 238,198,306 │ 229,676,183 │ 224,715,918 │ +│ Output sequence length │ 104 │ 100 │ 129 │ 128 │ 109 │ 105 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 460.42 +Request throughput (per sec): 4.44 +``` + +## Profile GPT2 running on Triton + vLLM + +### Running GPT2 on Triton Inference Server using vLLM + +
+See instructions + +1. Run Triton Inference Server with vLLM backend container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3 +``` + +2. Install Triton CLI (~5 min): + +```bash +pip install "git+https://github.com/triton-inference-server/triton_cli@0.0.8" +``` + +3. Download model: + +```bash +triton import -m gpt2 --backend vllm +``` + +4. Run server: + +```bash +triton start +``` + +
+ +### Running GenAI-Perf + +1. Run Triton Inference Server SDK container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +2. Run GenAI-Perf: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind triton \ + --backend vllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 15,786,560 │ 11,437,189 │ 49,550,549 │ 40,129,652 │ 21,248,091 │ 17,824,695 │ +│ Inter token latency (ns) │ 3,543,380 │ 591,898 │ 10,013,690 │ 6,152,260 │ 5,039,278 │ 4,060,982 │ +│ Request latency (ns) │ 388,415,721 │ 312,552,612 │ 528,229,817 │ 518,189,390 │ 484,281,365 │ 459,417,637 │ +│ Output sequence length │ 113 │ 105 │ 123 │ 122 │ 119 │ 115 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 290.24 +Request throughput (per sec): 2.57 +``` + +## Profile GPT2 running on OpenAI API-Compatible Server + +### OpenAI Chat Completions API + +#### Running GPT2 on [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)-compatible server + +
+See instructions + +1. Run the vLLM inference server: + +```bash +docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +``` + +
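+Optionally, confirm the OpenAI-compatible endpoint is serving the model before
+benchmarking (vLLM listens on port 8000 by default):
+
+```bash
+# The response should list gpt2 among the served models
+curl -s localhost:8000/v1/models
+```
+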
+ +#### Running GenAI-Perf + +1. Run Triton Inference Server SDK container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +2. Run GenAI-Perf: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind openai \ + --endpoint v1/chat/completions \ + --endpoint-type chat \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Time to first token (ns) │ 13,546,815 │ 9,821,658 │ 48,317,756 │ 34,361,913 │ 16,541,625 │ 14,612,026 │ +│ Inter token latency (ns) │ 2,560,813 │ 457,703 │ 6,507,334 │ 3,754,617 │ 3,059,158 │ 2,953,540 │ +│ Request latency (ns) │ 283,597,027 │ 240,098,890 │ 361,730,568 │ 349,164,037 │ 323,279,761 │ 306,507,562 │ +│ Output sequence length │ 114 │ 103 │ 142 │ 136 │ 122 │ 119 │ +│ Input sequence length │ 199 │ 199 │ 199 │ 199 │ 199 │ 199 │ +└──────────────────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 401.62 +Request throughput (per sec): 3.52 +``` + +### OpenAI Completions API + +#### Running GPT2 on [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions)-compatible server + +
+See instructions + +1. Run the vLLM inference server: + +```bash +docker run -it --net=host --rm --gpus=all vllm/vllm-openai:latest --model gpt2 --dtype float16 --max-model-len 1024 +``` + +
+ +#### Running GenAI-Perf + +1. Run Triton Inference Server SDK container: + +```bash +export RELEASE="yy.mm" # e.g. export RELEASE="24.03" + +docker run -it --net=host --rm --gpus=all nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk +``` + +2. Run GenAI-Perf: + +```bash +genai-perf \ + -m gpt2 \ + --service-kind openai \ + --endpoint v1/completions \ + --endpoint-type completions \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 +``` + +Example output: + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ +│ Request latency (ns) │ 296,990,497 │ 43,312,449 │ 332,788,242 │ 327,475,292 │ 317,392,767 │ 310,343,333 │ +│ Output sequence length │ 109 │ 11 │ 158 │ 142 │ 118 │ 113 │ +│ Input sequence length │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +└────────────────────────┴─────────────┴────────────┴─────────────┴─────────────┴─────────────┴─────────────┘ +Output token throughput (per sec): 366.78 +Request throughput (per sec): 3.37 +``` diff --git a/genai-perf/genai_perf/.gitignore b/genai-perf/genai_perf/.gitignore new file mode 100644 index 00000000..973a71df --- /dev/null +++ b/genai-perf/genai_perf/.gitignore @@ -0,0 +1,2 @@ +*.json +*.cache diff --git a/genai-perf/genai_perf/__init__.py b/genai-perf/genai_perf/__init__.py new file mode 100644 index 00000000..025456b0 --- /dev/null +++ b/genai-perf/genai_perf/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +__version__ = "0.0.3dev" diff --git a/genai-perf/genai_perf/constants.py b/genai-perf/genai_perf/constants.py new file mode 100644 index 00000000..b951524b --- /dev/null +++ b/genai-perf/genai_perf/constants.py @@ -0,0 +1,38 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +DEFAULT_HTTP_URL = "localhost:8000" +DEFAULT_GRPC_URL = "localhost:8001" + + +OPEN_ORCA = "openorca" +CNN_DAILY_MAIL = "cnn_dailymail" +DEFAULT_INPUT_DATA_JSON = "llm_inputs.json" + + +DEFAULT_ARTIFACT_DIR = "artifacts" +DEFAULT_COMPARE_DIR = "compare" +DEFAULT_PARQUET_FILE = "all_data" diff --git a/genai-perf/genai_perf/exceptions.py b/genai-perf/genai_perf/exceptions.py new file mode 100644 index 00000000..ff4170af --- /dev/null +++ b/genai-perf/genai_perf/exceptions.py @@ -0,0 +1,21 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class GenAIPerfException(Exception): + """ + A custom exception specific to the genai-perf + """ + + pass diff --git a/genai-perf/genai_perf/export_data/console_exporter.py b/genai-perf/genai_perf/export_data/console_exporter.py new file mode 100644 index 00000000..bbd02b75 --- /dev/null +++ b/genai-perf/genai_perf/export_data/console_exporter.py @@ -0,0 +1,109 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.llm_metrics import Metrics +from rich.console import Console +from rich.table import Table + + +class ConsoleExporter: + """ + A class to export the statistics and arg values to the console. + """ + + def __init__(self, config: ExporterConfig): + self._stats = config.stats + + def export(self) -> None: + singular_metric_rows = [] + table = Table(title="LLM Metrics") + + table.add_column("Statistic", justify="right", style="cyan", no_wrap=True) + stats = ["avg", "min", "max", "p99", "p90", "p75"] + for stat in stats: + table.add_column(stat, justify="right", style="green") + + for metric in Metrics.metric_labels: + formatted_metric = metric.replace("_", " ").capitalize() + + # Throughput fields are printed after the table + is_throughput_field = metric in Metrics.throughput_fields + if is_throughput_field: + value = self._stats.get(f"{metric}", -1).get(stats[0], -1) + formatted_metric += f" (per sec): {value:.2f}" + singular_metric_rows.append(formatted_metric) + continue + + # TODO (TMA-1712): need to decide if we need this metric. Remove + # from statistics display for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics to + # be displayed outside of the statistics table. + if metric == "output_token_throughput_per_request": + formatted_metric += f" (per sec)" + continue + + is_time_field = metric in Metrics.time_fields + if is_time_field: + formatted_metric += " (ms)" + + row_values = [formatted_metric] + for stat in stats: + value = self._stats.get(f"{metric}", -1) + # Need to check for -1 for the non streaming case + if value == -1: + row_values.append(f"{value:,.2f}") + else: + value = value.get(stat, -1) + row_values.append(f"{value:,.2f}") + + # Without streaming, there is no inter-token latency available, so do not print it. 
+ if metric == "inter_token_latency": + if all(float(value) < 0 for value in row_values[1:]): + continue + # Without streaming, TTFT and request latency are the same, so do not print TTFT. + elif metric == "time_to_first_token": + unique_values = False + for stat in stats: + value_ttft = self._stats.get(f"{metric}", -1).get(stat, -1) + value_req_latency = self._stats.get("request_latency", -1).get( + stat, -1 + ) + if value_ttft != value_req_latency: + unique_values = True + break + if not unique_values: + continue + + table.add_row(*row_values) + + console = Console() + console.print(table) + + for row in singular_metric_rows: + print(row) diff --git a/genai-perf/genai_perf/export_data/csv_exporter.py b/genai-perf/genai_perf/export_data/csv_exporter.py new file mode 100644 index 00000000..3677fe35 --- /dev/null +++ b/genai-perf/genai_perf/export_data/csv_exporter.py @@ -0,0 +1,137 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import csv + +import genai_perf.logging as logging +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.llm_metrics import Metrics + +DEFAULT_OUTPUT_DATA_CSV = "profile_export_genai_perf.csv" + +logger = logging.getLogger(__name__) + + +class CsvExporter: + """ + A class to export the statistics and arg values in a csv format. 
+ """ + + def __init__(self, config: ExporterConfig): + self._stats = config.stats + self._output_dir = config.artifact_dir + + def export(self) -> None: + csv_filename = self._output_dir / DEFAULT_OUTPUT_DATA_CSV + logger.info(f"Generating {csv_filename}") + + multiple_metric_header = [ + "Metric", + "avg", + "min", + "max", + "p99", + "p95", + "p90", + "p75", + "p50", + "p25", + ] + + single_metric_header = [ + "Metric", + "Value", + ] + + with open(csv_filename, mode="w", newline="") as csvfile: + singular_metric_rows = [] + + csv_writer = csv.writer(csvfile) + csv_writer.writerow(multiple_metric_header) + + for metric in Metrics.metric_labels: + formatted_metric = metric.replace("_", " ").title() + + is_throughput_field = metric in Metrics.throughput_fields + is_time_field = metric in Metrics.time_fields + + if is_time_field: + formatted_metric += " (ms)" + elif is_throughput_field: + formatted_metric += " (per sec)" + # TODO (TMA-1712): need to decide if we need this metric. Do not + # include in the csv for now. + # TODO (TMA-1678): output_token_throughput_per_request is treated + # separately since the current code treats all throughput metrics + # to be displayed outside of the statistics table. + elif metric == "output_token_throughput_per_request": + formatted_metric += " (per sec)" + continue + + row_values = [formatted_metric] + + if is_throughput_field: + value = self._stats.get(f"{metric}", -1).get( + multiple_metric_header[1], -1 + ) + row_values.append(f"{value:.2f}") + singular_metric_rows.append(row_values) + continue + + for stat in multiple_metric_header[1:]: + value = self._stats.get(f"{metric}", -1) + # Need to check for -1 for the non streaming case + if value == -1: + row_values.append(f"{value:,.2f}") + else: + value = value.get(stat, -1) + row_values.append(f"{value:,.2f}") + + # Without streaming, there is no inter-token latency available, so do not print it. + if metric == "inter_token_latency": + if all(value == "-1" for value in row_values[1:]): + continue + # Without streaming, TTFT and request latency are the same, so do not print TTFT. + elif metric == "time_to_first_token": + unique_values = False + for stat in multiple_metric_header[1:]: + value_ttft = self._stats.get(f"{metric}", -1).get(stat, -1) + value_req_latency = self._stats.get("request_latency", -1).get( + stat, -1 + ) + if value_ttft != value_req_latency: + unique_values = True + break + if not unique_values: + continue + + csv_writer.writerow(row_values) + + csv_writer.writerow([]) + csv_writer.writerow(single_metric_header) + for row in singular_metric_rows: + csv_writer.writerow(row) diff --git a/genai-perf/genai_perf/export_data/data_exporter_factory.py b/genai-perf/genai_perf/export_data/data_exporter_factory.py new file mode 100644 index 00000000..ac226bdf --- /dev/null +++ b/genai-perf/genai_perf/export_data/data_exporter_factory.py @@ -0,0 +1,42 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from typing import List + +from genai_perf.export_data.console_exporter import ConsoleExporter +from genai_perf.export_data.csv_exporter import CsvExporter +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.export_data.json_exporter import JsonExporter + +DataExporterList = [ConsoleExporter, JsonExporter, CsvExporter] + + +class DataExporterFactory: + def create_data_exporters(self, config: ExporterConfig) -> List: + data_exporters = [] + for exporter in DataExporterList: + data_exporters.append(exporter(config)) + return data_exporters diff --git a/genai-perf/genai_perf/export_data/data_exporter_interface.py b/genai-perf/genai_perf/export_data/data_exporter_interface.py new file mode 100644 index 00000000..56bde9a5 --- /dev/null +++ b/genai-perf/genai_perf/export_data/data_exporter_interface.py @@ -0,0 +1,33 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +from typing import Protocol + + +class DataExporterInterface(Protocol): + def export(self): + pass diff --git a/genai-perf/genai_perf/export_data/exporter_config.py b/genai-perf/genai_perf/export_data/exporter_config.py new file mode 100644 index 00000000..3f045196 --- /dev/null +++ b/genai-perf/genai_perf/export_data/exporter_config.py @@ -0,0 +1,65 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class ExporterConfig: + def __init__(self): + self._stats = None + self._args = None + self._extra_inputs = None + self._artifact_dir = None + + @property + def stats(self): + return self._stats + + @stats.setter + def stats(self, stats_value): + self._stats = stats_value + + @property + def args(self): + return self._args + + @args.setter + def args(self, args_value): + self._args = args_value + + @property + def extra_inputs(self): + return self._extra_inputs + + @extra_inputs.setter + def extra_inputs(self, extra_inputs_value): + self._extra_inputs = extra_inputs_value + + @property + def artifact_dir(self): + return self._artifact_dir + + @artifact_dir.setter + def artifact_dir(self, artifact_dir_value): + self._artifact_dir = artifact_dir_value diff --git a/genai-perf/genai_perf/export_data/json_exporter.py b/genai-perf/genai_perf/export_data/json_exporter.py new file mode 100644 index 00000000..c5a0f36c --- /dev/null +++ b/genai-perf/genai_perf/export_data/json_exporter.py @@ -0,0 +1,76 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import json +from enum import Enum +from typing import Dict + +import genai_perf.logging as logging +from genai_perf.export_data.exporter_config import ExporterConfig + +DEFAULT_OUTPUT_DATA_JSON = "profile_export_genai_perf.json" + +logger = logging.getLogger(__name__) + + +class JsonExporter: + """ + A class to export the statistics and arg values in a json format. + """ + + def __init__(self, config: ExporterConfig): + self._stats: Dict = config.stats + self._args = dict(vars(config.args)) + self._extra_inputs = config.extra_inputs + self._output_dir = config.artifact_dir + self._stats_and_args: Dict = {} + self._prepare_args_for_export() + self._merge_stats_and_args() + + def export(self) -> None: + filename = self._output_dir / DEFAULT_OUTPUT_DATA_JSON + logger.info(f"Generating {filename}") + with open(str(filename), "w") as f: + f.write(json.dumps(self._stats_and_args, indent=2)) + + def _prepare_args_for_export(self) -> None: + del self._args["func"] + del self._args["output_format"] + self._args["profile_export_file"] = str(self._args["profile_export_file"]) + self._args["artifact_dir"] = str(self._args["artifact_dir"]) + for k, v in self._args.items(): + if isinstance(v, Enum): + self._args[k] = v.name.lower() + self._add_extra_inputs_to_args() + + def _add_extra_inputs_to_args(self) -> None: + del self._args["extra_inputs"] + self._args.update({"extra_inputs": self._extra_inputs}) + + def _merge_stats_and_args(self) -> None: + self._stats_and_args = dict(self._stats) + self._stats_and_args.update({"input_config": self._args}) diff --git a/genai-perf/genai_perf/export_data/output_reporter.py b/genai-perf/genai_perf/export_data/output_reporter.py new file mode 100644 index 00000000..0189ccfa --- /dev/null +++ b/genai-perf/genai_perf/export_data/output_reporter.py @@ -0,0 +1,60 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from argparse import Namespace + +from genai_perf.export_data.data_exporter_factory import DataExporterFactory +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.llm_metrics import Statistics +from genai_perf.parser import get_extra_inputs_as_dict + + +class OutputReporter: + """ + A class to orchestrate output generation. + """ + + def __init__(self, stats: Statistics, args: Namespace): + self.args = args + self.stats = stats + self.stats.scale_data() + + def report_output(self) -> None: + factory = DataExporterFactory() + exporter_config = self._create_exporter_config() + data_exporters = factory.create_data_exporters(exporter_config) + + for exporter in data_exporters: + exporter.export() + + def _create_exporter_config(self) -> ExporterConfig: + config = ExporterConfig() + config.stats = self.stats.stats_dict + config.args = self.args + config.artifact_dir = self.args.artifact_dir + config.extra_inputs = get_extra_inputs_as_dict(self.args) + return config diff --git a/genai-perf/genai_perf/llm_inputs/__init__.py b/genai-perf/genai_perf/llm_inputs/__init__.py new file mode 100644 index 00000000..c6959fce --- /dev/null +++ b/genai-perf/genai_perf/llm_inputs/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
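`OutputReporter` is the piece that ties the exporters together: it scales time metrics to milliseconds, builds a single `ExporterConfig`, and runs every exporter the factory returns. A hedged usage sketch, assuming a `Statistics` object and the parsed CLI `Namespace` already exist (both are produced elsewhere in genai-perf):

```python
# Usage sketch, not part of the diff: drive the export pipeline defined above.
from argparse import Namespace

from genai_perf.export_data.output_reporter import OutputReporter
from genai_perf.llm_metrics import Statistics


def write_artifacts(stats: Statistics, args: Namespace) -> None:
    # Runs the console, JSON (profile_export_genai_perf.json), and CSV
    # exporters, writing file artifacts into args.artifact_dir.
    OutputReporter(stats, args).report_output()
```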
diff --git a/genai-perf/genai_perf/llm_inputs/farewell.txt b/genai-perf/genai_perf/llm_inputs/farewell.txt new file mode 100644 index 00000000..cfbe41a7 --- /dev/null +++ b/genai-perf/genai_perf/llm_inputs/farewell.txt @@ -0,0 +1,104 @@ +The period for a new election of a citizen to +administer the executive government of the United +States being not far distant, and the time actually +arrived when your thoughts must be employed in +designating the person who is to be clothed with that +important trust, it appears to me proper, especially as +it may conduce to a more distinct expression of the +public voice, that I should now apprise you of the +resolution I have formed, to decline being considered +among the number of those out of whom a choice is to be made. +I beg you, at the same time, to do me the justice to +be assured that this resolution has not been taken +without a strict regard to all the considerations +appertaining to the relation which binds a dutiful +citizen to his country—and that, in withdrawing the +tender of service which silence in my situation might +imply, I am influenced by no diminution of zeal for +your future interest, no deficiency of grateful respect +for your past kindness; but am supported by a full +conviction that the step is compatible with both. +The acceptance of, and continuance hitherto in, the +office to which your suffrages have twice called me, +have been a uniform sacrifice of inclination to the +opinion of duty and to a deference for what appeared +to be your desire. I constantly hoped that it would +have been much earlier in my power, consistently with +motives which I was not at liberty to disregard, to +return to that retirement from which I had been +reluctantly drawn. The strength of my inclination to +do this, previous to the last election, had even led to +the preparation of an address to declare it to you; but +mature reflection on the then perplexed and critical +posture of our affairs with foreign nations, and the +unanimous advice of persons entitled to my +confidence, impelled me to abandon the idea. +I rejoice that the state of your concerns, external as +well as internal, no longer renders the pursuit of +inclination incompatible with the sentiment of duty or +propriety, and am persuaded whatever partiality may +be retained for my services, that in the present +circumstances of our country, you will not disapprove +my determination to retire. +The impressions with which I first undertook the +arduous trust were explained on the proper occasion. +In the discharge of this trust, I will only say that I +have, with good intentions, contributed towards the +organization and administration of the government, +the best exertions of which a very fallible judgment +was capable. Not unconscious in the outset of the +inferiority of my qualifications, experience in my +own eyes, perhaps still more in the eyes of others, +has strengthened the motives to diffidence of myself; +and every day the increasing weight of years +admonishes me more and more that the shade of +retirement is as necessary to me as it will be +welcome. Satisfied that if any circumstances have +given peculiar value to my services, they were +temporary, I have the consolation to believe, that +while choice and prudence invite me to quit the +political scene, patriotism does not forbid it. 
+In looking forward to the moment which is +intended to terminate the career of my public life, my +feelings do not permit me to suspend the deep +acknowledgment of that debt of gratitude which I +owe to my beloved country for the many honors it has +conferred upon me; still more for the steadfast +confidence with which it has supported me; and for +the opportunities I have thence enjoyed of manifesting +my inviolable attachment, by services faithful and +persevering, though in usefulness unequal to my zeal. +If benefits have resulted to our country from these +services, let it always be remembered to your praise, +and as an instructive example in our annals that +under circumstances in which the passions agitated in +every direction were liable to mislead, amidst +appearances sometimes dubious, vicissitudes of +fortune often discouraging, in situations in which not +unfrequently want of success has countenanced the +spirit of criticism, the constancy of your support was +the essential prop of the efforts, and a guarantee of +the plans by which they were effected. Profoundly +penetrated with this idea, I shall carry it with me to +my grave, as a strong incitement to unceasing vows +that Heaven may continue to you the choicest tokens +of its beneficence; that your Union and brotherly +affection may be perpetual; that the free constitution, +which is the work of your hands, may be sacredly +maintained; that its administration in every +department may be stamped with wisdom and virtue; +that, in fine, the happiness of the people of these +states, under the auspices of liberty, may be made +complete by so careful a preservation and so prudent +a use of this blessing as will acquire to them the glory +of recommending it to the applause, the affection, +and adoption of every nation which is yet a stranger to it. +Here, perhaps, I ought to stop. But a solicitude for +your welfare, which cannot end but with my life, and +the apprehension of danger, natural to that +solicitude, urge me on an occasion like the present, +to offer to your solemn contemplation, and to +recommend to your frequent review, some sentiments +which are the result of much reflection, of no +inconsiderable observation, and which appear to me +all important to the permanency of your felicity as a \ No newline at end of file diff --git a/genai-perf/genai_perf/llm_inputs/llm_inputs.py b/genai-perf/genai_perf/llm_inputs/llm_inputs.py new file mode 100644 index 00000000..3613e564 --- /dev/null +++ b/genai-perf/genai_perf/llm_inputs/llm_inputs.py @@ -0,0 +1,1150 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
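The `llm_inputs.py` module added next exposes `LlmInputs.create_llm_inputs()` as its entry point. A hedged sketch of a typical invocation with synthetic prompts; the model name, tokenizer name, and output directory are illustrative assumptions, not values taken from this PR:

```python
# Illustrative call into the LlmInputs class defined below.
from pathlib import Path

from genai_perf.llm_inputs.llm_inputs import LlmInputs, OutputFormat, PromptSource
from genai_perf.tokenizer import get_tokenizer

pa_json = LlmInputs.create_llm_inputs(
    input_type=PromptSource.SYNTHETIC,
    output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS,
    model_name=["gpt2"],                  # assumed model identifier
    tokenizer=get_tokenizer("gpt2"),      # assumed tokenizer name
    num_of_output_prompts=10,
    prompt_tokens_mean=550,
    prompt_tokens_stddev=0,
    add_model_name=True,
    add_stream=True,
    output_dir=Path("./artifacts"),       # assumed output directory
)
# pa_json holds the Perf Analyzer input payloads; the same JSON is also
# written to <output_dir>/<DEFAULT_INPUT_DATA_JSON>.
```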
+ +import json +import random +from copy import deepcopy +from enum import Enum, auto +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import requests +from genai_perf.constants import CNN_DAILY_MAIL, DEFAULT_INPUT_DATA_JSON, OPEN_ORCA +from genai_perf.exceptions import GenAIPerfException +from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator +from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer +from requests import Response + + +class ModelSelectionStrategy(Enum): + ROUND_ROBIN = auto() + RANDOM = auto() + + +class PromptSource(Enum): + SYNTHETIC = auto() + DATASET = auto() + FILE = auto() + + +class OutputFormat(Enum): + OPENAI_CHAT_COMPLETIONS = auto() + OPENAI_COMPLETIONS = auto() + TENSORRTLLM = auto() + VLLM = auto() + + def to_lowercase(self): + return self.name.lower() + + +class LlmInputs: + """ + A library of methods that control the generation of LLM Inputs + """ + + OPEN_ORCA_URL = "https://datasets-server.huggingface.co/rows?dataset=Open-Orca%2FOpenOrca&config=default&split=train" + CNN_DAILYMAIL_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=train" + + DEFAULT_STARTING_INDEX = 0 + MINIMUM_STARTING_INDEX = 0 + + DEFAULT_LENGTH = 100 + MINIMUM_LENGTH = 1 + + DEFAULT_TENSORRTLLM_MAX_TOKENS = 256 + + DEFAULT_RANDOM_SEED = 0 + DEFAULT_PROMPT_TOKENS_MEAN = 550 + DEFAULT_PROMPT_TOKENS_STDDEV = 0 + DEFAULT_OUTPUT_TOKENS_MEAN = -1 + DEFAULT_OUTPUT_TOKENS_STDDEV = 0 + DEFAULT_NUM_PROMPTS = 100 + + EMPTY_JSON_IN_VLLM_PA_FORMAT: Dict = {"data": []} + EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT: Dict = {"data": []} + EMPTY_JSON_IN_OPENAI_PA_FORMAT: Dict = {"data": []} + + dataset_url_map = {OPEN_ORCA: OPEN_ORCA_URL, CNN_DAILY_MAIL: CNN_DAILYMAIL_URL} + + @classmethod + def create_llm_inputs( + cls, + input_type: PromptSource, + output_format: OutputFormat, + dataset_name: str = "", + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + input_filename: Optional[Path] = Path(""), + starting_index: int = DEFAULT_STARTING_INDEX, + length: int = DEFAULT_LENGTH, + output_tokens_mean: int = DEFAULT_OUTPUT_TOKENS_MEAN, + output_tokens_stddev: int = DEFAULT_OUTPUT_TOKENS_STDDEV, + output_tokens_deterministic: bool = False, + prompt_tokens_mean: int = DEFAULT_PROMPT_TOKENS_MEAN, + prompt_tokens_stddev: int = DEFAULT_PROMPT_TOKENS_STDDEV, + random_seed: int = DEFAULT_RANDOM_SEED, + num_of_output_prompts: int = DEFAULT_NUM_PROMPTS, + add_model_name: bool = False, + add_stream: bool = False, + tokenizer: Tokenizer = get_tokenizer(DEFAULT_TOKENIZER), + extra_inputs: Optional[Dict] = None, + output_dir: Path = Path(""), + ) -> Dict: + """ + Given an input type, input format, and output type. Output a string of LLM Inputs + (in a JSON dictionary) to a file + + Required Parameters + ------------------- + input_type: + Specify how the input is received + output_format: + Specify the output format + + Optional Parameters + ------------------- + dataset_name: + The name of the dataset + model_name: + The model name + starting_index: + Offset from within the list to start gathering inputs + length: + Number of entries to gather + add_model_name: + If true, adds a model name field to each payload + add_stream: + If true, adds a steam field to each payload + extra_inputs: + If provided, append these inputs to every request + output_tokens_mean: + The mean length of the output to generate. 
If not using fixed output lengths, this should be set to -1. + output_tokens_stddev: + The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided. + output_tokens_deterministic: + If true, the output tokens will set the minimum and maximum tokens to be equivalent. + + Required Synthetic Prompt Generation Parameters + ----------------------------------------------- + tokenizer: + The tokenizer to use when generating synthetic prompts + + Optional Synthetic Prompt Generation Parameters + ----------------------------------------------- + prompt_tokens_mean: + The mean length of the prompt to generate + prompt_tokens_stddev: + The standard deviation of the length of the prompt to generate + num_of_output_prompts: + The number of synthetic output prompts to generate + random_seed: + Seed used to generate random values + """ + + cls._check_for_valid_args( + input_type, dataset_name, starting_index, length, tokenizer + ) + + if input_type == PromptSource.DATASET: + dataset = cls._get_input_dataset_from_url( + dataset_name, starting_index, length + ) + generic_dataset_json = cls._convert_input_url_dataset_to_generic_json( + dataset + ) + elif input_type == PromptSource.SYNTHETIC: + random.seed(random_seed) + synthetic_dataset = cls._get_input_dataset_from_synthetic( + tokenizer, + prompt_tokens_mean, + prompt_tokens_stddev, + num_of_output_prompts, + ) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + synthetic_dataset + ) + ) + elif input_type == PromptSource.FILE: + input_filename = cast(Path, input_filename) + input_file_dataset = cls._get_input_dataset_from_file(input_filename) + generic_dataset_json = ( + cls._convert_input_synthetic_or_file_dataset_to_generic_json( + input_file_dataset + ) + ) + else: + raise GenAIPerfException("Input source is not recognized.") + + if extra_inputs is None: + extra_inputs = {} + + json_in_pa_format = cls._convert_generic_json_to_output_format( + output_format, + generic_dataset_json, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + cls._write_json_to_file(json_in_pa_format, output_dir) + + return json_in_pa_format + + @classmethod + def _check_for_valid_args( + cls, + input_type: PromptSource, + dataset_name: str, + starting_index: int, + length: int, + tokenizer: Tokenizer, + ) -> None: + try: + cls._check_for_dataset_name_if_input_type_is_url(input_type, dataset_name) + cls._check_for_tokenzier_if_input_type_is_synthetic(input_type, tokenizer) + cls._check_for_valid_starting_index(starting_index) + cls._check_for_valid_length(length) + + except Exception as e: + raise GenAIPerfException(e) + + @classmethod + def _get_input_dataset_from_url( + cls, dataset_name: str, starting_index: int, length: int + ) -> Response: + url = cls._resolve_url(dataset_name) + configured_url = cls._create_configured_url(url, starting_index, length) + dataset = cls._download_dataset(configured_url) + + return dataset + + @classmethod + def _get_input_dataset_from_synthetic( + cls, + tokenizer: Tokenizer, + prompt_tokens_mean: int, + prompt_tokens_stddev: int, + num_of_output_prompts: int, + ) -> Dict[str, Any]: + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "text_input"}] + dataset_json["rows"] = [] + for _ in range(num_of_output_prompts): + synthetic_prompt = cls._create_synthetic_prompt( + tokenizer, + prompt_tokens_mean, + 
prompt_tokens_stddev, + ) + dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}}) + + return dataset_json + + @classmethod + def _resolve_url(cls, dataset_name: str) -> str: + if dataset_name in cls.dataset_url_map: + return cls.dataset_url_map[dataset_name] + else: + raise GenAIPerfException( + f"{dataset_name} does not have a corresponding URL in the dataset_url_map." + ) + + @classmethod + def _create_configured_url(cls, url: str, starting_index: int, length: int) -> str: + starting_index_str = str(starting_index) + length_str = str(length) + configured_url = url + f"&offset={starting_index_str}&length={length_str}" + + return configured_url + + @classmethod + def _download_dataset(cls, configured_url: str) -> Response: + dataset = cls._query_server(configured_url) + + return dataset + + @classmethod + def _convert_input_url_dataset_to_generic_json(cls, dataset: Response) -> Dict: + dataset_json = dataset.json() + try: + cls._check_for_error_in_json_of_dataset(dataset_json) + except Exception as e: + raise GenAIPerfException(e) + + generic_dataset_json = cls._convert_dataset_to_generic_input_json(dataset_json) + + return generic_dataset_json + + @classmethod + def _convert_input_synthetic_or_file_dataset_to_generic_json( + cls, dataset: Dict + ) -> Dict[str, List[Dict]]: + generic_dataset_json = cls._convert_dataset_to_generic_input_json(dataset) + + return generic_dataset_json + + @classmethod + def _convert_dataset_to_generic_input_json( + cls, dataset_json: Dict + ) -> Dict[str, List[Dict]]: + generic_input_json = cls._add_features_to_generic_json({}, dataset_json) + generic_input_json = cls._add_rows_to_generic_json( + generic_input_json, dataset_json + ) + + return generic_input_json + + @classmethod + def _add_features_to_generic_json( + cls, generic_input_json: Dict, dataset_json: Dict + ) -> Dict: + if "features" in dataset_json.keys(): + generic_input_json["features"] = [] + for feature in dataset_json["features"]: + generic_input_json["features"].append(feature["name"]) + + return generic_input_json + + @classmethod + def _add_rows_to_generic_json( + cls, generic_input_json: Dict, dataset_json: Dict + ) -> Dict[str, List[Dict]]: + generic_input_json["rows"] = [] + for row in dataset_json["rows"]: + generic_input_json["rows"].append(row["row"]) + + return generic_input_json + + @classmethod + def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict: + """ + Reads the input prompts from a JSONL file and converts them into the required dataset format. + + Parameters + ---------- + input_filename : Path + The path to the input file containing the prompts in JSONL format. + + Returns + ------- + Dict + The dataset in the required format with the prompts read from the file. + """ + cls.verify_file(input_filename) + input_file_prompts = cls._get_prompts_from_input_file(input_filename) + dataset_json: Dict[str, Any] = {} + dataset_json["features"] = [{"name": "text_input"}] + dataset_json["rows"] = [ + {"row": {"text_input": prompt}} for prompt in input_file_prompts + ] + return dataset_json + + @classmethod + def _get_prompts_from_input_file(cls, input_filename: Path) -> List[str]: + """ + Reads the input prompts from a JSONL file and returns a list of prompts. + + Parameters + ---------- + input_filename : Path + The path to the input file containing the prompts in JSONL format. + + Returns + ------- + List[str] + A list of prompts read from the file. 
+ """ + prompts = [] + with open(input_filename, mode="r", newline=None) as file: + for line in file: + if line.strip(): + prompts.append(json.loads(line).get("text_input", "").strip()) + return prompts + + @classmethod + def verify_file(cls, input_filename: Path) -> None: + if not input_filename.exists(): + raise FileNotFoundError(f"The file '{input_filename}' does not exist.") + + @classmethod + def _convert_generic_json_to_output_format( + cls, + output_format: OutputFormat, + generic_dataset: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS: + output_json = cls._convert_generic_json_to_openai_chat_completions_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + elif output_format == OutputFormat.OPENAI_COMPLETIONS: + output_json = cls._convert_generic_json_to_openai_completions_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + elif output_format == OutputFormat.VLLM: + output_json = cls._convert_generic_json_to_vllm_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + elif output_format == OutputFormat.TENSORRTLLM: + output_json = cls._convert_generic_json_to_trtllm_format( + generic_dataset, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + else: + raise GenAIPerfException( + f"Output format {output_format} is not currently supported" + ) + + return output_json + + @classmethod + def _convert_generic_json_to_openai_chat_completions_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + # TODO (TMA-1757): Implement a way to select a role for `text_input` + ( + system_role_headers, + user_role_headers, + _, + ) = cls._determine_json_feature_roles(dataset_json) + pa_json = cls._populate_openai_chat_completions_output_json( + dataset_json, + system_role_headers, + user_role_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + + return pa_json + + @classmethod + def _convert_generic_json_to_openai_completions_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = 
cls._determine_json_feature_roles(dataset_json) + pa_json = cls._populate_openai_completions_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + + return pa_json + + @classmethod + def _convert_generic_json_to_vllm_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = cls._determine_json_feature_roles(dataset_json) + + pa_json = cls._populate_vllm_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + + return pa_json + + @classmethod + def _convert_generic_json_to_trtllm_format( + cls, + dataset_json: Dict, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + ( + system_role_headers, + user_role_headers, + text_input_headers, + ) = cls._determine_json_feature_roles(dataset_json) + + pa_json = cls._populate_trtllm_output_json( + dataset_json, + system_role_headers, + user_role_headers, + text_input_headers, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + model_name, + model_selection_strategy, + ) + + return pa_json + + @classmethod + def _write_json_to_file(cls, json_in_pa_format: Dict, output_dir: Path) -> None: + filename = output_dir / DEFAULT_INPUT_DATA_JSON + with open(str(filename), "w") as f: + f.write(json.dumps(json_in_pa_format, indent=2)) + + @classmethod + def _determine_json_feature_roles( + cls, dataset_json: Dict + ) -> Tuple[List[str], List[str], List[str]]: + SYSTEM_ROLE_LIST = ["system_prompt"] + USER_ROLE_LIST = ["question", "article"] + TEXT_INPUT_LIST = ["text_input"] + + system_role_headers: List[str] = [] + user_role_headers: List[str] = [] + text_input_headers: List[str] = [] + + if "features" in dataset_json.keys(): + # TODO (TPA-53) remove enumerate if index isnt useful + for index, feature in enumerate(dataset_json["features"]): + if feature in SYSTEM_ROLE_LIST: + system_role_headers.append(feature) + if feature in USER_ROLE_LIST: + user_role_headers.append(feature) + if feature in TEXT_INPUT_LIST: + user_role_headers.append(feature) + + assert ( + system_role_headers is not None + or user_role_headers is not None + or text_input_headers is not None + ) + + return system_role_headers, user_role_headers, text_input_headers + + @classmethod + def _select_model_name(cls, model_name, index, model_selection_strategy): + if model_selection_strategy == ModelSelectionStrategy.ROUND_ROBIN: + return model_name[index % len(model_name)] + elif model_selection_strategy == ModelSelectionStrategy.RANDOM: + return random.choice(model_name) + else: + raise GenAIPerfException( + f"Model selection strategy 
'{model_selection_strategy}' is unsupported" + ) + + @classmethod + def _populate_openai_chat_completions_output_json( + cls, + dataset_json: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + pa_json = cls._create_empty_openai_pa_json() + + for index, entry in enumerate(dataset_json["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + pa_json["data"].append({"payload": []}) + pa_json["data"][index]["payload"].append({"messages": []}) + + for header, content in entry.items(): + new_message = cls._create_new_openai_chat_completions_message( + header, system_role_headers, user_role_headers, content + ) + + pa_json = cls._add_new_message_to_json(pa_json, index, new_message) + + pa_json = cls._add_optional_tags_to_openai_json( + pa_json, + index, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + iter_model_name, + ) + + return pa_json + + @classmethod + def _populate_openai_completions_output_json( + cls, + dataset_json: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + pa_json = cls._create_empty_openai_pa_json() + + for index, entry in enumerate(dataset_json["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + pa_json["data"].append({"payload": []}) + pa_json["data"][index]["payload"].append({"prompt": ""}) + + for header, content in entry.items(): + new_prompt = cls._create_new_prompt( + header, + system_role_headers, + user_role_headers, + text_input_headers, + content, + ) + + pa_json = cls._add_new_prompt_to_json(pa_json, index, new_prompt) + + pa_json = cls._add_optional_tags_to_openai_json( + pa_json, + index, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + iter_model_name, + ) + + return pa_json + + @classmethod + def _populate_vllm_output_json( + cls, + dataset_json: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + pa_json = cls._create_empty_vllm_pa_json() + + for index, entry in enumerate(dataset_json["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + pa_json["data"].append({"text_input": [""]}) + + for header, content in entry.items(): + new_text_input = cls._create_new_text_input( + header, + system_role_headers, + user_role_headers, + text_input_headers, + content, + ) + + pa_json = cls._add_new_text_input_to_json( + pa_json, index, new_text_input + ) + + pa_json = 
cls._add_optional_tags_to_vllm_json( + pa_json, + index, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + iter_model_name, + ) + + return pa_json + + @classmethod + def _populate_trtllm_output_json( + cls, + dataset_json: Dict, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: list = [], + model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN, + ) -> Dict: + pa_json = cls._create_empty_trtllm_pa_json() + default_max_tokens = ( + "max_tokens" not in extra_inputs + or output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN + ) + + for index, entry in enumerate(dataset_json["rows"]): + iter_model_name = cls._select_model_name( + model_name, index, model_selection_strategy + ) + pa_json["data"].append({"text_input": [""]}) + + for header, content in entry.items(): + new_text_input = cls._create_new_text_input( + header, + system_role_headers, + user_role_headers, + text_input_headers, + content, + ) + + pa_json = cls._add_new_text_input_to_json( + pa_json, index, new_text_input + ) + + pa_json = cls._add_required_tags_to_trtllm_json( + pa_json, index, default_max_tokens + ) + pa_json = cls._add_optional_tags_to_trtllm_json( + pa_json, + index, + add_model_name, + add_stream, + extra_inputs, + output_tokens_mean, + output_tokens_stddev, + output_tokens_deterministic, + iter_model_name, + ) + + return pa_json + + @classmethod + def _create_empty_openai_pa_json(cls) -> Dict: + empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_OPENAI_PA_FORMAT) + + return empty_pa_json + + @classmethod + def _create_empty_vllm_pa_json(cls) -> Dict: + empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_VLLM_PA_FORMAT) + + return empty_pa_json + + @classmethod + def _create_empty_trtllm_pa_json(cls) -> Dict: + empty_pa_json = deepcopy(cls.EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT) + + return empty_pa_json + + @classmethod + def _create_new_openai_chat_completions_message( + cls, + header: str, + system_role_headers: List[str], + user_role_headers: List[str], + content: str, + ) -> Optional[Dict]: + # Do not add messages with blank content + if not content: + return {} + + if header in system_role_headers: + new_message = { + "role": "system", + "content": content, + } + elif header in user_role_headers: + new_message = { + "role": "user", + "content": content, + } + else: + new_message = {} + + return new_message + + @classmethod + def _create_new_prompt( + cls, + header: str, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + content: str, + ) -> str: + new_prompt = "" + + if ( + header in system_role_headers + or header in user_role_headers + or header in text_input_headers + ): + new_prompt = content + + return new_prompt + + @classmethod + def _create_new_text_input( + cls, + header: str, + system_role_headers: List[str], + user_role_headers: List[str], + text_input_headers: List[str], + content: str, + ) -> str: + new_text_input = "" + + if ( + header in system_role_headers + or header in user_role_headers + or header in text_input_headers + ): + new_text_input = content + + return new_text_input + + @classmethod + def _add_new_message_to_json( + cls, pa_json: Dict, index: int, new_message: Optional[Dict] + ) -> Dict: + if new_message: + 
pa_json["data"][index]["payload"][0]["messages"].append(new_message) + + return pa_json + + @classmethod + def _add_new_text_input_to_json( + cls, pa_json: Dict, index: int, new_text_input: str + ) -> Dict: + if new_text_input: + if pa_json["data"][index]["text_input"][0]: + pa_json["data"][index]["text_input"][0] = ( + pa_json["data"][index]["text_input"][0] + f" {new_text_input}" + ) + else: + pa_json["data"][index]["text_input"][0] = new_text_input + + return pa_json + + @classmethod + def _add_new_prompt_to_json( + cls, + pa_json: Dict, + index: int, + new_prompt: str, + ) -> Dict: + if new_prompt: + if pa_json["data"][index]["payload"][0]["prompt"]: + pa_json["data"][index]["payload"][0]["prompt"] += f" {new_prompt}" + else: + pa_json["data"][index]["payload"][0]["prompt"] = new_prompt + + return pa_json + + @classmethod + def _add_optional_tags_to_openai_json( + cls, + pa_json: Dict, + index: int, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + row = pa_json["data"][index]["payload"][0] + if add_model_name: + row["model"] = model_name + if add_stream: + row["stream"] = True + if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN: + row["max_tokens"] = int( + random.gauss(output_tokens_mean, output_tokens_stddev) + ) + for key, value in extra_inputs.items(): + row[key] = value + + return pa_json + + @classmethod + def _add_optional_tags_to_vllm_json( + cls, + pa_json: Dict, + index: int, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + row = pa_json["data"][index] + if add_model_name: + row["model"] = model_name + if add_stream: + row["stream"] = [True] + if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN: + number_of_tokens = str( + int(max(0, random.gauss(output_tokens_mean, output_tokens_stddev))) + ) + sampling_parameters = { + "max_tokens": number_of_tokens, + } + if output_tokens_deterministic: + sampling_parameters["min_tokens"] = number_of_tokens + sampling_parameters_str = json.dumps(sampling_parameters) + row["sampling_parameters"] = [sampling_parameters_str] + for key, value in extra_inputs.items(): + row[key] = [value] + if "exclude_input_in_output" not in row: + row["exclude_input_in_output"] = [True] + + return pa_json + + @classmethod + def _add_optional_tags_to_trtllm_json( + cls, + pa_json: Dict, + index: int, + add_model_name: bool, + add_stream: bool, + extra_inputs: Dict, + output_tokens_mean: int, + output_tokens_stddev: int, + output_tokens_deterministic: bool, + model_name: str = "", + ) -> Dict: + row = pa_json["data"][index] + if add_model_name: + row["model"] = model_name + if add_stream: + row["stream"] = [True] + if output_tokens_mean != cls.DEFAULT_OUTPUT_TOKENS_MEAN: + number_of_tokens = int( + random.gauss(output_tokens_mean, output_tokens_stddev) + ) + if output_tokens_deterministic: + row["min_length"] = [number_of_tokens] + row["max_tokens"] = [number_of_tokens] + for key, value in extra_inputs.items(): + row[key] = [value] + + return pa_json + + @classmethod + def _add_required_tags_to_trtllm_json( + cls, + pa_json: Dict, + index: int, + default_max_tokens: bool, + ) -> Dict: + row = pa_json["data"][index] + if default_max_tokens: + row["max_tokens"] = [cls.DEFAULT_TENSORRTLLM_MAX_TOKENS] + + return pa_json + + @classmethod + def 
_check_for_dataset_name_if_input_type_is_url( + cls, input_type: PromptSource, dataset_name: str + ) -> None: + if input_type == PromptSource.DATASET and not dataset_name: + raise GenAIPerfException( + "Input type is dataset, but dataset_name is not specified." + ) + + @classmethod + def _check_for_tokenzier_if_input_type_is_synthetic( + cls, + input_type: PromptSource, + tokenizer: Tokenizer, + ) -> None: + if input_type == PromptSource.SYNTHETIC and not tokenizer: + raise GenAIPerfException( + "Input type is SYNTHETIC, but a tokenizer was not specified." + ) + + @classmethod + def _check_for_valid_starting_index(cls, starting_index: int) -> None: + if not isinstance(starting_index, int): + raise GenAIPerfException( + f"starting_index: {starting_index} must be an integer." + ) + + if starting_index < cls.MINIMUM_STARTING_INDEX: + raise GenAIPerfException( + f"starting_index: {starting_index} must be larger than {cls.MINIMUM_STARTING_INDEX}." + ) + + @classmethod + def _check_for_valid_length(cls, length: int) -> None: + if not isinstance(length, int): + raise GenAIPerfException(f"length: {length} must be an integer.") + + if length < cls.MINIMUM_LENGTH: + raise GenAIPerfException( + f"starting_index: {length} must be larger than {cls.MINIMUM_LENGTH}." + ) + + @classmethod + def _query_server(cls, configured_url: str) -> Response: + try: + response = requests.get(configured_url) + except Exception as e: + error_message = cls._create_error_message(e) + raise GenAIPerfException(error_message) + + return response + + @classmethod + def _create_error_message(cls, exception: Exception) -> str: + url_str = exception.args[0].args[0] + url_start = url_str.find("'") + url_end = url_str.find("'", url_start + 1) + 1 + error_message = f"Invalid URL: {url_str[url_start:url_end]}" + + return error_message + + @classmethod + def _check_for_error_in_json_of_dataset(cls, dataset_json: Dict) -> None: + if "error" in dataset_json: + raise GenAIPerfException(dataset_json["error"]) + + @classmethod + def _create_synthetic_prompt( + cls, + tokenizer: Tokenizer, + prompt_tokens_mean: int, + prompt_tokens_stddev: int, + ) -> str: + return SyntheticPromptGenerator.create_synthetic_prompt( + tokenizer, prompt_tokens_mean, prompt_tokens_stddev + ) diff --git a/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py b/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py new file mode 100644 index 00000000..68b77fdc --- /dev/null +++ b/genai-perf/genai_perf/llm_inputs/synthetic_prompt_generator.py @@ -0,0 +1,125 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
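To make the conversion pipeline in `llm_inputs.py` above concrete, here are the intermediate "generic" dataset shape and the OpenAI-chat-completions payload shape it produces. The values are made up for illustration; only the keys follow the code:

```python
# Illustrative shapes inferred from the conversion methods above.
generic_dataset_json = {
    "features": ["text_input"],
    "rows": [{"text_input": "What is Triton Inference Server?"}],
}

# Result of the OpenAI chat completions conversion with add_model_name=True
# and add_stream=True ("gpt2" is an assumed model name):
pa_json = {
    "data": [
        {
            "payload": [
                {
                    "messages": [
                        {
                            "role": "user",
                            "content": "What is Triton Inference Server?",
                        }
                    ],
                    "model": "gpt2",
                    "stream": True,
                }
            ]
        }
    ]
}
```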
+ +import itertools +import math +import pathlib +import random +import re +from typing import List + +from genai_perf.tokenizer import Tokenizer + + +class SyntheticPromptGenerator: + @classmethod + def create_synthetic_prompt( + cls, + tokenizer: Tokenizer, + prompt_tokens_mean: int = 550, + prompt_tokens_stddev: int = 250, + ) -> str: + """ + Generate a prompt that randomly samples lines from + Washington's farewell address at farewell.txt. + + Args: + prompt_tokens_mean: + The mean length of the prompt to generate + prompt_tokens_stddev: + The standard deviation of the length of the prompt to generate + + Returns: + The prompt. + """ + + num_prompt_tokens = SyntheticPromptGenerator._sample_random_positive_int( + prompt_tokens_mean, prompt_tokens_stddev + ) + + farewell_lines = SyntheticPromptGenerator._create_farewell_lines() + prompt = SyntheticPromptGenerator._create_prompt_from_lines( + num_prompt_tokens, farewell_lines, tokenizer + ) + + return prompt + + @classmethod + def _create_farewell_lines(cls) -> List[str]: + farewell_path = pathlib.Path(__file__).parent.resolve() / "farewell.txt" + with open(farewell_path, "r") as f: + farewell_lines = f.readlines() + random.shuffle(farewell_lines) + + return farewell_lines + + @classmethod + def _create_prompt_from_lines( + cls, + requested_prompt_tokens: int, + source_lines: List[str], + tokenizer: Tokenizer, + ) -> str: + get_token_length = lambda text: len(tokenizer.encode(text)) + + line_iterator = itertools.cycle(source_lines) + + def word_generator(): + while True: + next_line = next(line_iterator) + words = re.split("[ \n]+", next_line) + for word in words: + yield word + + word_iterator = word_generator() + + # Fast add lines + remaining_tokens = requested_prompt_tokens + prompt = "" + num_tokens_in_avg_line = get_token_length(source_lines[0] + source_lines[1]) / 2 + num_lines_to_add_fast = math.floor( + 0.5 * requested_prompt_tokens / num_tokens_in_avg_line + ) + while num_lines_to_add_fast: + for _ in range(num_lines_to_add_fast): + next_line = next(line_iterator) + prompt = prompt + next_line + + curr_tokens = get_token_length(prompt) + remaining_tokens = requested_prompt_tokens - curr_tokens + num_lines_to_add_fast = math.floor( + 0.5 * remaining_tokens / num_tokens_in_avg_line + ) + + # Fast add words + final_line = "" + while get_token_length(final_line) < remaining_tokens - 3: + next_word = next(word_iterator) + final_line += next_word + " " + prompt += final_line + + # Final tweaks + diff = requested_prompt_tokens - get_token_length(prompt) + for _ in range(diff): + prompt = "hi " + prompt + + return prompt + + @classmethod + def _sample_random_positive_int(cls, mean: int, stddev: int) -> int: + random_pos_int = -1 + while random_pos_int <= 0: + random_pos_int = int(random.gauss(mean, stddev)) + + return random_pos_int diff --git a/genai-perf/genai_perf/llm_metrics.py b/genai-perf/genai_perf/llm_metrics.py new file mode 100755 index 00000000..05c1ce59 --- /dev/null +++ b/genai-perf/genai_perf/llm_metrics.py @@ -0,0 +1,619 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import csv +import json +from collections import defaultdict +from enum import Enum, auto +from itertools import tee +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd +from genai_perf.tokenizer import Tokenizer +from genai_perf.utils import load_json, remove_sse_prefix +from rich.console import Console +from rich.table import Table + + +class ResponseFormat(Enum): + OPENAI_CHAT_COMPLETIONS = auto() + OPENAI_COMPLETIONS = auto() + TRITON = auto() + + +class Metrics: + """A base class for all the metrics class that contains common metrics.""" + + metric_labels = [ + "time_to_first_token", + "inter_token_latency", + "request_latency", + "output_token_throughput", + "output_token_throughput_per_request", + "request_throughput", + "output_sequence_length", + "input_sequence_length", + ] + + time_fields = [ + "inter_token_latency", + "time_to_first_token", + "request_latency", + ] + + # TODO (TMA-1678): output_token_throughput_per_request is not on this list + # since the current code treats all the throughput metrics to be displayed + # outside of the statistics table. 
+ throughput_fields = [ + "request_throughput", + "output_token_throughput", + ] + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + ) -> None: + self.request_throughputs = request_throughputs + self.request_latencies = request_latencies + self._base_names = { + "request_throughputs": "request_throughput", + "request_latencies": "request_latency", + } + + def __repr__(self): + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Metrics({','.join(attr_strs)})" + + @property + def data(self) -> dict: + """Returns all the metrics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + def get_base_name(self, metric_name: str) -> str: + """Returns singular name of a given metric.""" + if metric_name in self._base_names: + return self._base_names[metric_name] + else: + raise KeyError(f"No metric named '{metric_name}' exists.") + + +class LLMMetrics(Metrics): + """A simple dataclass that holds core LLM performance metrics.""" + + def __init__( + self, + request_throughputs: List[float] = [], + request_latencies: List[int] = [], + time_to_first_tokens: List[int] = [], + inter_token_latencies: List[int] = [], + output_token_throughputs: List[float] = [], + output_token_throughputs_per_request: List[int] = [], + output_sequence_lengths: List[int] = [], + input_sequence_lengths: List[int] = [], + chunked_inter_token_latencies: List[List[int]] = [[]], + ) -> None: + super().__init__(request_throughputs, request_latencies) + self.time_to_first_tokens = time_to_first_tokens + self.inter_token_latencies = inter_token_latencies + self.output_token_throughputs = output_token_throughputs + self.output_token_throughputs_per_request = output_token_throughputs_per_request + self.output_sequence_lengths = output_sequence_lengths + self.input_sequence_lengths = input_sequence_lengths + + # Keeping chunked ITL (old) as a WAR to preserve visualization. + # Excluded from data. + self._chunked_inter_token_latencies = chunked_inter_token_latencies + + # add base name mapping + self._base_names["time_to_first_tokens"] = "time_to_first_token" + self._base_names["inter_token_latencies"] = "inter_token_latency" + self._base_names["output_token_throughputs"] = "output_token_throughput" + self._base_names["output_token_throughputs_per_request"] = ( + "output_token_throughput_per_request" + ) + self._base_names["output_sequence_lengths"] = "output_sequence_length" + self._base_names["input_sequence_lengths"] = "input_sequence_length" + + +class Statistics: + """A class that aggregates various statistics from given metrics class. + + The Statistics class goes through each metric in the metrics class and + calculates several statistics such as: + - average (arithmetic mean) + - percentiles (p25, p50, p75, p90, p95, p99) + - minimum & maximum + - standard deviation + The class will store each calculated statistics as part of its attribute. 
+ + Example: + + >>> metrics = LLMMetrics(request_throughputs=[2, 4]) + >>> stats = Statistics(metrics) + >>> print(stats.avg_request_throughput) # output: 3 + """ + + def __init__(self, metrics: Metrics): + # iterate through Metrics to calculate statistics and set attributes + self._metrics = metrics + self._stats_dict: Dict = defaultdict(dict) + for attr, data in metrics.data.items(): + if self._should_skip(data, attr): + continue + + attr = metrics.get_base_name(attr) + self._add_units(attr) + self._calculate_mean(data, attr) + if not self._is_throughput_field(attr): + self._calculate_percentiles(data, attr) + self._calculate_minmax(data, attr) + self._calculate_std(data, attr) + + def _should_skip(self, data: List[Union[int, float]], attr: str) -> bool: + """Checks if some metrics should be skipped.""" + # No data points + if len(data) == 0: + return True + # Skip ITL when non-streaming (all zero) + elif attr == "inter_token_latencies" and sum(data) == 0: + return True + return False + + def _calculate_mean(self, data: List[Union[int, float]], attr: str) -> None: + avg = np.mean(data) + setattr(self, "avg_" + attr, avg) + self._stats_dict[attr]["avg"] = float(avg) + + def _calculate_percentiles(self, data: List[Union[int, float]], attr: str) -> None: + p25, p50, p75 = np.percentile(data, [25, 50, 75]) + p90, p95, p99 = np.percentile(data, [90, 95, 99]) + setattr(self, "p25_" + attr, p25) + setattr(self, "p50_" + attr, p50) + setattr(self, "p75_" + attr, p75) + setattr(self, "p90_" + attr, p90) + setattr(self, "p95_" + attr, p95) + setattr(self, "p99_" + attr, p99) + self._stats_dict[attr]["p99"] = float(p99) + self._stats_dict[attr]["p95"] = float(p95) + self._stats_dict[attr]["p90"] = float(p90) + self._stats_dict[attr]["p75"] = float(p75) + self._stats_dict[attr]["p50"] = float(p50) + self._stats_dict[attr]["p25"] = float(p25) + + def _calculate_minmax(self, data: List[Union[int, float]], attr: str) -> None: + min, max = np.min(data), np.max(data) + setattr(self, "min_" + attr, min) + setattr(self, "max_" + attr, max) + self._stats_dict[attr]["max"] = float(max) + self._stats_dict[attr]["min"] = float(min) + + def _calculate_std(self, data: List[Union[int, float]], attr: str) -> None: + std = np.std(data) + setattr(self, "std_" + attr, std) + self._stats_dict[attr]["std"] = float(std) + + def scale_data(self, factor: float = 1 / 1e6) -> None: + for k1, v1 in self.stats_dict.items(): + if self._is_time_field(k1): + for k2, v2 in v1.items(): + if k2 != "unit": + self.stats_dict[k1][k2] = self._scale(v2, factor) + + def _scale(self, metric: float, factor: float = 1 / 1e6) -> float: + """ + Scale metrics from nanoseconds by factor. + Default is nanoseconds to milliseconds. 
+ """ + return metric * factor + + def _add_units(self, key) -> None: + if self._is_time_field(key): + self._stats_dict[key]["unit"] = "ms" + if key == "request_throughput": + self._stats_dict[key]["unit"] = "requests/sec" + if key.startswith("output_token_throughput"): + self._stats_dict[key]["unit"] = "tokens/sec" + if "sequence_length" in key: + self._stats_dict[key]["unit"] = "tokens" + + def __repr__(self) -> str: + attr_strs = [] + for k, v in self.__dict__.items(): + if not k.startswith("_"): + attr_strs.append(f"{k}={v}") + return f"Statistics({','.join(attr_strs)})" + + @property + def data(self) -> dict: + """Return all the aggregated statistics.""" + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + @property + def metrics(self) -> Metrics: + """Return the underlying metrics used to calculate the statistics.""" + return self._metrics + + @property + def stats_dict(self) -> Dict: + return self._stats_dict + + def _is_throughput_field(self, field: str) -> bool: + return field in Metrics.throughput_fields + + def _is_time_field(self, field: str) -> bool: + return field in Metrics.time_fields + + def export_parquet(self, artifact_dir: Path, filename: str) -> None: + max_length = -1 + col_index = 0 + filler_list = [] + df = pd.DataFrame() + + # Data frames require all columns of the same length + # find the max length column + for key, value in self._metrics.data.items(): + max_length = max(max_length, len(value)) + + # Insert None for shorter columns to match longest column + for key, value in self._metrics.data.items(): + if len(value) < max_length: + diff = max_length - len(value) + filler_list = [None] * diff + df.insert(col_index, key, value + filler_list) + diff = 0 + filler_list = [] + col_index = col_index + 1 + + filepath = artifact_dir / f"{filename}.gzip" + df.to_parquet(filepath, compression="gzip") + + +class ProfileDataParser: + """Base profile data parser class that reads the profile data JSON file to + extract core metrics and calculate various performance statistics. + """ + + def __init__(self, filename: Path) -> None: + data = load_json(filename) + self._get_profile_metadata(data) + self._parse_profile_data(data) + + def _get_profile_metadata(self, data: dict) -> None: + self._service_kind = data["service_kind"] + if self._service_kind == "openai": + if data["endpoint"] == "v1/chat/completions": + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif data["endpoint"] == "v1/completions": + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + else: + # TPA-66: add PA metadata to handle this case + # When endpoint field is either empty or custom endpoint, fall + # back to parsing the response to extract the response format. 
+ request = data["experiments"][0]["requests"][0] + response = request["response_outputs"][0]["response"] + if "chat.completion" in response: + self._response_format = ResponseFormat.OPENAI_CHAT_COMPLETIONS + elif "text_completion" in response: + self._response_format = ResponseFormat.OPENAI_COMPLETIONS + else: + raise RuntimeError("Unknown OpenAI response format.") + + elif self._service_kind == "triton": + self._response_format = ResponseFormat.TRITON + else: + raise ValueError(f"Unknown service kind: {self._service_kind}") + + def _parse_profile_data(self, data: dict) -> None: + """Parse through the entire profile data to collect statistics.""" + self._profile_results = {} + for experiment in data["experiments"]: + infer_mode = experiment["experiment"]["mode"] + load_level = experiment["experiment"]["value"] + requests = experiment["requests"] + + metrics = self._parse_requests(requests) + + # aggregate and calculate statistics + statistics = Statistics(metrics) + self._profile_results[(infer_mode, str(load_level))] = statistics + + def _parse_requests(self, requests: dict) -> LLMMetrics: + """Parse each request in profile data to extract core metrics.""" + raise NotImplementedError + + def get_statistics(self, infer_mode: str, load_level: str) -> Statistics: + """Return profile statistics if it exists.""" + if (infer_mode, load_level) not in self._profile_results: + raise KeyError(f"Profile with {infer_mode}={load_level} does not exist.") + return self._profile_results[(infer_mode, load_level)] + + def get_profile_load_info(self) -> List[Tuple[str, str]]: + """Return available (infer_mode, load_level) tuple keys.""" + return [k for k, _ in self._profile_results.items()] + + +class LLMProfileDataParser(ProfileDataParser): + """A class that calculates and aggregates all the LLM performance statistics + across the Perf Analyzer profile results. + + The LLMProfileDataParser class parses profile export JSON file, collects the + core LLM performance metrics, and calculates summary statistics for each + different Perf Analyzer runs/experiments. + + Example: + + >>> ... # run Perf Analyzer with concurrency level 10 + >>> + >>> from transformers import AutoTokenizer + >>> + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> pd = LLMProfileDataParser( + >>> filename="profile_export.json", + >>> tokenizer=tokenizer, + >>> ) + >>> stats = pd.get_statistics(infer_mode="concurrency", level=10) + >>> + >>> print(stats) # output: Statistics(avg_time_to_first_token=...) + >>> stats.pretty_print() # Output: time_to_first_token_s: ... + """ + + def __init__( + self, + filename: Path, + tokenizer: Tokenizer, + ) -> None: + self._tokenizer = tokenizer + super().__init__(filename) + + def _parse_requests(self, requests: dict) -> LLMMetrics: + """Parse each requests in profile export data to extract key metrics.""" + min_req_timestamp, max_res_timestamp = float("inf"), 0 + request_latencies = [] + time_to_first_tokens = [] + inter_token_latencies = [] + output_token_throughputs_per_request = [] + input_sequence_lengths = [] + output_sequence_lengths = [] + chunked_inter_token_latencies = [] + + for request in requests: + req_timestamp = request["timestamp"] + req_inputs = request["request_inputs"] + res_timestamps = request["response_timestamps"] + res_outputs = request["response_outputs"] + + self._preprocess_response(res_timestamps, res_outputs) + + # Skip requests with empty response. This happens sometimes when the + # model returns a single response with empty string. 
+ if not res_timestamps: + continue + + # track entire benchmark duration + min_req_timestamp = min(min_req_timestamp, req_timestamp) + max_res_timestamp = max(max_res_timestamp, res_timestamps[-1]) + + # request latencies + req_latency_ns = res_timestamps[-1] - req_timestamp + request_latencies.append(req_latency_ns) # nanosec + req_latency_s = req_latency_ns / 1e9 # sec + + # time to first token + ttft = res_timestamps[0] - req_timestamp + time_to_first_tokens.append(ttft) + + # number of input tokens + input_seq_len = self._get_input_token_count(req_inputs) + input_sequence_lengths.append(input_seq_len) + + # output token throughput per request + output_token_counts, total_output_token = self._get_output_token_counts( + res_outputs + ) + output_token_throughputs_per_request.append( + total_output_token / req_latency_s + ) + output_sequence_lengths.append(total_output_token) + + # inter token latencies + if total_output_token > 1: + inter_token_latency = (req_latency_ns - ttft) / (total_output_token - 1) + inter_token_latencies.append(round(inter_token_latency)) + + # The new ITL calculation above loses all token-level ITL information + # and as a result breaks ITL vs token position visualization. Keep + # the old version of inter token latency as a WAR to preserve the + # visualization. + chunked_inter_token_latency = [] + for (t1, _), (t2, n2) in self._pairwise( + zip(res_timestamps, output_token_counts) + ): + # TMA-1676: handle empty first/last responses + # if the latter response has zero token (e.g. empty string), + # then set it default to one for the sake of inter token latency + # calculation and to avoid divide by zero. + num_token = 1 if n2 == 0 else n2 + chunked_inter_token_latency.append(round((t2 - t1) / num_token)) + chunked_inter_token_latencies.append(chunked_inter_token_latency) + + # request & output token throughput + benchmark_duration = (max_res_timestamp - min_req_timestamp) / 1e9 # nanosec + request_throughputs = [len(requests) / benchmark_duration] + output_token_throughputs = [sum(output_sequence_lengths) / benchmark_duration] + + return LLMMetrics( + request_throughputs, + request_latencies, + time_to_first_tokens, + inter_token_latencies, + output_token_throughputs, + output_token_throughputs_per_request, + output_sequence_lengths, + input_sequence_lengths, + chunked_inter_token_latencies, + ) + + def _pairwise(self, iterable): + """Generate pairs of consecutive elements from the given iterable.""" + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + def _preprocess_response( + self, res_timestamps: List[int], res_outputs: List[Dict[str, str]] + ) -> None: + """Helper function to preprocess responses of a request.""" + if self._service_kind == "openai": + # PA sometimes receives multiple SSE responses at once (as a single + # response). Handle these responses by merging into a single response. 
+ for i in range(len(res_outputs)): + response = res_outputs[i]["response"] + responses = response.strip().split("\n\n") + if len(responses) > 1: + merged_response = json.loads(remove_sse_prefix(responses[0])) + if ( + merged_response["choices"][0]["delta"].get("content", None) + is None + ): + merged_response["choices"][0]["delta"]["content"] = "" + for r in responses[1:]: + text = self._extract_openai_text_output(r) + merged_response["choices"][0]["delta"]["content"] += text + + res_outputs[i] = {"response": json.dumps(merged_response)} + + # Remove responses without any content + indices_to_remove = [] + for idx, out in enumerate(res_outputs): + if self._is_openai_empty_response(out["response"]): + indices_to_remove.append(idx) + indices_to_remove.sort(reverse=True) + for index in indices_to_remove: + res_timestamps.pop(index) + res_outputs.pop(index) + + def _get_input_token_count(self, req_inputs: dict) -> int: + """Deserialize the request input and return tokenized inputs.""" + if self._service_kind == "triton": + input_text = req_inputs["text_input"] + elif self._service_kind == "openai": + input_text = self._get_openai_input_text(req_inputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + return len(self._tokenizer.encode(input_text)) + + def _get_openai_input_text(self, req_inputs: dict) -> str: + """Tokenize the OpenAI request input texts.""" + payload = json.loads(req_inputs["payload"]) + if self._response_format == ResponseFormat.OPENAI_CHAT_COMPLETIONS: + return payload["messages"][0]["content"] + elif self._response_format == ResponseFormat.OPENAI_COMPLETIONS: + return payload["prompt"] + else: + raise ValueError( + "Failed to parse OpenAI request input in profile export file." + ) + + def _get_output_token_counts( + self, res_outputs: List[Dict] + ) -> Tuple[List[int], int]: + """Return response-level token counts and total token count.""" + if self._service_kind == "triton": + output_texts = self._get_triton_output_tokens(res_outputs) + elif self._service_kind == "openai": + output_texts = self._get_openai_output_tokens(res_outputs) + else: + raise ValueError(f"Unknown service kind: '{self._service_kind}'.") + + full_text_token_count = len(self._tokenizer.encode("".join(output_texts))) + + output_tokens = self._get_response_output_tokens(output_texts) + output_token_counts = list(map(len, output_tokens)) + return output_token_counts, full_text_token_count + + def _get_triton_output_tokens(self, res_outputs: List[Dict]) -> List[str]: + """Return a list of Triton response texts.""" + return [r["text_output"] for r in res_outputs] + + def _get_openai_output_tokens(self, res_outputs: List[Dict]) -> List[str]: + """Return a list of OpenAI response texts.""" + output_texts = [] + for output in res_outputs: + text = self._extract_openai_text_output(output["response"]) + output_texts.append(text) + return output_texts + + def _get_response_output_tokens(self, output_texts: List[str]) -> List[List[int]]: + """Return a list of response output tokens.""" + # Exclamation mark trick forces the llama tokenization to consistently + # start each output with a specific token which allows us to safely skip + # the first token of every tokenized output and get only the ones that + # are returned by the model + encodings = self._tokenizer(["!" 
+ txt for txt in output_texts]) + return [out[1:] for out in encodings.data["input_ids"]] + + def _extract_openai_text_output(self, response: str) -> str: + """Extracts text/content of the OpenAI response object.""" + response = remove_sse_prefix(response) + + if response == "[DONE]": + return "" + + data = json.loads(response) + completions = data["choices"][0] + + text_output = "" + if "object" not in data: + # FIXME: TPA-47 workaround for vLLM not following OpenAI Completions + # API specification when streaming, missing 'object' field: + # https://platform.openai.com/docs/api-reference/completions + text_output = completions.get("text", "") + elif data["object"] == "text_completion": # legacy + text_output = completions.get("text", "") + elif data["object"] == "chat.completion": # non-streaming + text_output = completions["message"].get("content", "") + elif data["object"] == "chat.completion.chunk": # streaming + text_output = completions["delta"].get("content", "") + else: + obj_type = data["object"] + raise ValueError(f"Unknown OpenAI response object type '{obj_type}'.") + return text_output + + def _is_openai_empty_response(self, response: str) -> bool: + """Returns true if the response is an openai response with no content (or empty content)""" + text = self._extract_openai_text_output(response) + if text: + return False + return True diff --git a/genai-perf/genai_perf/logging.py b/genai-perf/genai_perf/logging.py new file mode 100644 index 00000000..f5cab490 --- /dev/null +++ b/genai-perf/genai_perf/logging.py @@ -0,0 +1,99 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
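The `dictConfig` that follows registers a console handler plus explicitly named per-module loggers, while the root logger stays at WARNING, so only module names listed in `LOGGING_CONFIG` get DEBUG-level loggers. A minimal usage sketch, assuming one registered name (`genai_perf.parser`) and one hypothetical, unregistered name (`genai_perf.some_new_module`) for contrast:

```python
import genai_perf.logging as logging

logging.init_logging()  # applies the dictConfig below; main.run() calls this once

logger = logging.getLogger("genai_perf.parser")  # listed in LOGGING_CONFIG
logger.info("parsed CLI args")                   # printed to stdout
logger.debug("details")                          # passes the DEBUG logger, but the
                                                 # console handler is INFO, so hidden

other = logging.getLogger("genai_perf.some_new_module")  # hypothetical, not listed
other.warning("shown via the root logger")               # root: WARNING + console
other.info("suppressed")                                 # below the root level
```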
+ +import logging +import logging.config + +DEFAULT_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s:%(lineno)s - %(message)s" +DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M" + + +def init_logging() -> None: + LOGGING_CONFIG = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": DEFAULT_LOG_FORMAT, + "datefmt": DEFAULT_DATE_FORMAT, + }, + }, + "handlers": { + "console": { + "level": "INFO", + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", # Default is stderr + }, + }, + "loggers": { + "": { # root logger - avoid using + "handlers": ["console"], + "level": "WARNING", + "propagate": False, + }, + "__main__": { # if __name__ == '__main__' + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.parser": { # must use module name for loggers + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.wrapper": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.plots.plot_config_parser": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.plots.plot_manager": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.export_data.json_exporter": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + "genai_perf.export_data.csv_exporter": { + "handlers": ["console"], + "level": "DEBUG", + "propagate": False, + }, + }, + } + logging.config.dictConfig(LOGGING_CONFIG) + + +def getLogger(name): + return logging.getLogger(name) diff --git a/genai-perf/genai_perf/main.py b/genai-perf/genai_perf/main.py new file mode 100755 index 00000000..65b765d8 --- /dev/null +++ b/genai-perf/genai_perf/main.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
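`report_output()` in the entry point below builds its lookup key from whichever load option was used (`--concurrency` or `--request-rate`) and formats the value as a string, because `_parse_profile_data()` stores results under `(infer_mode, str(load_level))` keys. A minimal sketch of the same lookup done directly; the export path is illustrative:

```python
from pathlib import Path

from genai_perf.llm_metrics import LLMProfileDataParser
from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer

# Illustrative path; report_output() gets the real one from the parsed args.
data_parser = LLMProfileDataParser(
    filename=Path("artifacts/profile_export.json"),
    tokenizer=get_tokenizer(DEFAULT_TOKENIZER),
)
print(data_parser.get_profile_load_info())              # e.g. [("concurrency", "1")]
stats = data_parser.get_statistics("concurrency", "1")  # load level is a string key
print(stats.stats_dict["time_to_first_token"]["avg"])   # ns unless scale_data() ran
```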
+ +import os +import sys +import traceback +from argparse import Namespace +from pathlib import Path + +import genai_perf.logging as logging +from genai_perf import parser +from genai_perf.constants import DEFAULT_PARQUET_FILE +from genai_perf.exceptions import GenAIPerfException +from genai_perf.export_data.output_reporter import OutputReporter +from genai_perf.llm_inputs.llm_inputs import LlmInputs +from genai_perf.llm_metrics import LLMProfileDataParser +from genai_perf.plots.plot_config_parser import PlotConfigParser +from genai_perf.plots.plot_manager import PlotManager +from genai_perf.tokenizer import Tokenizer, get_tokenizer + + +def create_artifacts_dirs(args: Namespace) -> None: + # TMA-1911: support plots CLI option + plot_dir = args.artifact_dir / "plots" + os.makedirs(args.artifact_dir, exist_ok=True) + os.makedirs(plot_dir, exist_ok=True) + + +def generate_inputs(args: Namespace, tokenizer: Tokenizer) -> None: + # TODO (TMA-1759): review if add_model_name is always true + input_filename = Path(args.input_file.name) if args.input_file else None + add_model_name = True + try: + extra_input_dict = parser.get_extra_inputs_as_dict(args) + except ValueError as e: + raise GenAIPerfException(e) + + LlmInputs.create_llm_inputs( + input_type=args.prompt_source, + output_format=args.output_format, + dataset_name=args.input_dataset, + model_name=args.model, + model_selection_strategy=args.model_selection_strategy, + input_filename=input_filename, + starting_index=LlmInputs.DEFAULT_STARTING_INDEX, + length=args.num_prompts, + prompt_tokens_mean=args.synthetic_input_tokens_mean, + prompt_tokens_stddev=args.synthetic_input_tokens_stddev, + output_tokens_mean=args.output_tokens_mean, + output_tokens_stddev=args.output_tokens_stddev, + output_tokens_deterministic=args.output_tokens_mean_deterministic, + random_seed=args.random_seed, + num_of_output_prompts=args.num_prompts, + add_model_name=add_model_name, + add_stream=args.streaming, + tokenizer=tokenizer, + extra_inputs=extra_input_dict, + output_dir=args.artifact_dir, + ) + + +def calculate_metrics(args: Namespace, tokenizer: Tokenizer) -> LLMProfileDataParser: + return LLMProfileDataParser( + filename=args.profile_export_file, + tokenizer=tokenizer, + ) + + +def report_output(data_parser: LLMProfileDataParser, args: Namespace) -> None: + if args.concurrency: + infer_mode = "concurrency" + load_level = f"{args.concurrency}" + elif args.request_rate: + infer_mode = "request_rate" + load_level = f"{args.request_rate}" + else: + raise GenAIPerfException("No valid infer mode specified") + + stats = data_parser.get_statistics(infer_mode, load_level) + reporter = OutputReporter(stats, args) + reporter.report_output() + if args.generate_plots: + create_plots(args) + + +def create_plots(args: Namespace) -> None: + # TMA-1911: support plots CLI option + plot_dir = args.artifact_dir / "plots" + PlotConfigParser.create_init_yaml_config( + filenames=[args.profile_export_file], # single run + output_dir=plot_dir, + ) + config_parser = PlotConfigParser(plot_dir / "config.yaml") + plot_configs = config_parser.generate_configs() + plot_manager = PlotManager(plot_configs) + plot_manager.generate_plots() + + +# Separate function that can raise exceptions used for testing +# to assert correct errors and messages. 
+def run(): + try: + # TMA-1900: refactor CLI handler + logging.init_logging() + args, extra_args = parser.parse_args() + if args.subcommand == "compare": + args.func(args) + else: + create_artifacts_dirs(args) + tokenizer = get_tokenizer(args.tokenizer) + generate_inputs(args, tokenizer) + args.func(args, extra_args) + data_parser = calculate_metrics(args, tokenizer) + report_output(data_parser, args) + except Exception as e: + raise GenAIPerfException(e) + + +def main(): + # Interactive use will catch exceptions and log formatted errors rather than + # tracebacks. + try: + run() + except Exception as e: + traceback.print_exc() + logger = logging.getLogger(__name__) + logger.error(e) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/genai-perf/genai_perf/parser.py b/genai-perf/genai_perf/parser.py new file mode 100644 index 00000000..24f98b42 --- /dev/null +++ b/genai-perf/genai_perf/parser.py @@ -0,0 +1,635 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import os +import sys +from pathlib import Path + +import genai_perf.logging as logging +import genai_perf.utils as utils +from genai_perf.constants import ( + CNN_DAILY_MAIL, + DEFAULT_ARTIFACT_DIR, + DEFAULT_COMPARE_DIR, + OPEN_ORCA, +) +from genai_perf.llm_inputs.llm_inputs import ( + LlmInputs, + ModelSelectionStrategy, + OutputFormat, + PromptSource, +) +from genai_perf.plots.plot_config_parser import PlotConfigParser +from genai_perf.plots.plot_manager import PlotManager +from genai_perf.tokenizer import DEFAULT_TOKENIZER + +from . import __version__ + +logger = logging.getLogger(__name__) + +_endpoint_type_map = {"chat": "v1/chat/completions", "completions": "v1/completions"} + + +def _check_model_args( + parser: argparse.ArgumentParser, args: argparse.Namespace +) -> argparse.Namespace: + """ + Check if model name is provided. 
+ """ + if not args.subcommand and not args.model: + parser.error("The -m/--model option is required and cannot be empty.") + args = _convert_str_to_enum_entry( + args, "model_selection_strategy", ModelSelectionStrategy + ) + _generate_formatted_model_name(args) + return args + + +def _generate_formatted_model_name(args: argparse.Namespace) -> None: + if len(args.model) == 1: + args.formatted_model_name = args.model[0] + elif len(args.model) == 0: + args.model = None + args.formatted_model_name = None + else: + args.formatted_model_name = args.model[0] + "_multi" + + +def _check_compare_args( + parser: argparse.ArgumentParser, args: argparse.Namespace +) -> argparse.Namespace: + """ + Check compare subcommand args + """ + if args.subcommand == "compare": + if not args.config and not args.files: + parser.error("Either the --config or --files option must be specified.") + return args + + +def _check_conditional_args( + parser: argparse.ArgumentParser, args: argparse.Namespace +) -> argparse.Namespace: + """ + Check for conditional args and raise an error if they are not set. + """ + + # Endpoint and output format checks + if args.service_kind == "openai": + if args.endpoint_type is None: + parser.error( + "The --endpoint-type option is required when using the 'openai' service-kind." + ) + else: + if args.endpoint_type == "chat": + args.output_format = OutputFormat.OPENAI_CHAT_COMPLETIONS + elif args.endpoint_type == "completions": + args.output_format = OutputFormat.OPENAI_COMPLETIONS + + if args.endpoint is not None: + args.endpoint = args.endpoint.lstrip(" /") + else: + args.endpoint = _endpoint_type_map[args.endpoint_type] + elif args.endpoint_type is not None: + parser.error( + "The --endpoint-type option should only be used when using the 'openai' service-kind." + ) + + if args.service_kind == "triton": + args = _convert_str_to_enum_entry(args, "backend", OutputFormat) + args.output_format = args.backend + + # Output token distribution checks + if args.output_tokens_mean == LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN: + if args.output_tokens_stddev != LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV: + parser.error( + "The --output-tokens-mean option is required when using --output-tokens-stddev." + ) + if args.output_tokens_mean_deterministic: + parser.error( + "The --output-tokens-mean option is required when using --output-tokens-mean-deterministic." + ) + + if args.service_kind != "triton": + if args.output_tokens_mean_deterministic: + parser.error( + "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind." + ) + + return args + + +def _check_load_manager_args(args: argparse.Namespace) -> argparse.Namespace: + """ + Check inference load args + """ + # If no concurrency or request rate is set, default to 1 + if not args.concurrency and not args.request_rate: + args.concurrency = 1 + return args + + +def _set_artifact_paths(args: argparse.Namespace) -> argparse.Namespace: + """ + Set paths for all the artifacts. + """ + if args.artifact_dir == Path(DEFAULT_ARTIFACT_DIR): + # Preprocess Huggingface model names that include '/' in their model name. + if (args.formatted_model_name is not None) and ( + "/" in args.formatted_model_name + ): + filtered_name = "_".join(args.formatted_model_name.split("/")) + logger.info( + f"Model name '{args.formatted_model_name}' cannot be used to create artifact " + f"directory. Instead, '{filtered_name}' will be used." 
+ ) + name = [f"{filtered_name}"] + else: + name = [f"{args.formatted_model_name}"] + + if args.service_kind == "openai": + name += [f"{args.service_kind}-{args.endpoint_type}"] + elif args.service_kind == "triton": + name += [f"{args.service_kind}-{args.backend.to_lowercase()}"] + else: + raise ValueError(f"Unknown service kind '{args.service_kind}'.") + + if args.concurrency: + name += [f"concurrency{args.concurrency}"] + elif args.request_rate: + name += [f"request_rate{args.request_rate}"] + args.artifact_dir = args.artifact_dir / Path("-".join(name)) + + if args.profile_export_file.parent != Path(""): + raise ValueError( + "Please use --artifact-dir option to define intermediary paths to " + "the profile export file." + ) + + args.profile_export_file = args.artifact_dir / args.profile_export_file + return args + + +def _infer_prompt_source(args: argparse.Namespace) -> argparse.Namespace: + if args.input_dataset: + args.prompt_source = PromptSource.DATASET + logger.debug(f"Input source is the following dataset: {args.input_dataset}") + elif args.input_file: + args.prompt_source = PromptSource.FILE + logger.debug(f"Input source is the following file: {args.input_file.name}") + else: + args.prompt_source = PromptSource.SYNTHETIC + logger.debug("Input source is synthetic data") + return args + + +def _convert_str_to_enum_entry(args, option, enum): + """ + Convert string option to corresponding enum entry + """ + attr_val = getattr(args, option) + if attr_val is not None: + setattr(args, f"{option}", utils.get_enum_entry(attr_val, enum)) + return args + + +### Parsers ### + + +def _add_input_args(parser): + input_group = parser.add_argument_group("Input") + + input_group.add_argument( + "--extra-inputs", + action="append", + help="Provide additional inputs to include with every request. " + "You can repeat this flag for multiple inputs. Inputs should be in an input_name:value format." + "Alternatively, a string representing a json formatted dict can be provided.", + ) + + prompt_source_group = input_group.add_mutually_exclusive_group(required=False) + prompt_source_group.add_argument( + "--input-dataset", + type=str.lower, + default=None, + choices=[OPEN_ORCA, CNN_DAILY_MAIL], + required=False, + help="The HuggingFace dataset to use for prompts.", + ) + + prompt_source_group.add_argument( + "--input-file", + type=argparse.FileType("r"), + default=None, + required=False, + help="The input file containing the prompts to use for profiling. " + "Each line should be a JSON object with a 'text_input' field in JSONL format. " + 'Example: {"text_input": "Your prompt here"}', + ) + + input_group.add_argument( + "--num-prompts", + type=int, + default=LlmInputs.DEFAULT_NUM_PROMPTS, + required=False, + help=f"The number of unique prompts to generate as stimulus.", + ) + + input_group.add_argument( + "--output-tokens-mean", + type=int, + default=LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN, + required=False, + help=f"The mean number of tokens in each output. " + "Ensure the --tokenizer value is set correctly. ", + ) + + input_group.add_argument( + "--output-tokens-mean-deterministic", + action="store_true", + required=False, + help=f"When using --output-tokens-mean, this flag can be set to " + "improve precision by setting the minimum number of tokens " + "equal to the requested number of tokens. This is currently " + "supported with the Triton service-kind. 
" + "Note that there is still some variability in the requested number " + "of output tokens, but GenAi-Perf attempts its best effort with your " + "model to get the right number of output tokens. ", + ) + + input_group.add_argument( + "--output-tokens-stddev", + type=int, + default=LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV, + required=False, + help=f"The standard deviation of the number of tokens in each output. " + "This is only used when --output-tokens-mean is provided.", + ) + + input_group.add_argument( + "--random-seed", + type=int, + default=LlmInputs.DEFAULT_RANDOM_SEED, + required=False, + help="The seed used to generate random values.", + ) + + input_group.add_argument( + "--synthetic-input-tokens-mean", + type=int, + default=LlmInputs.DEFAULT_PROMPT_TOKENS_MEAN, + required=False, + help=f"The mean of number of tokens in the generated prompts when using synthetic data.", + ) + + input_group.add_argument( + "--synthetic-input-tokens-stddev", + type=int, + default=LlmInputs.DEFAULT_PROMPT_TOKENS_STDDEV, + required=False, + help=f"The standard deviation of number of tokens in the generated prompts when using synthetic data.", + ) + + +def _add_profile_args(parser): + profile_group = parser.add_argument_group("Profiling") + load_management_group = profile_group.add_mutually_exclusive_group(required=False) + + load_management_group.add_argument( + "--concurrency", + type=int, + required=False, + help="The concurrency value to benchmark.", + ) + + profile_group.add_argument( + "--measurement-interval", + "-p", + type=int, + default="10000", + required=False, + help="The time interval used for each measurement in milliseconds. " + "Perf Analyzer will sample a time interval specified and take " + "measurement over the requests completed within that time interval.", + ) + + load_management_group.add_argument( + "--request-rate", + type=float, + required=False, + help="Sets the request rate for the load generated by PA.", + ) + + profile_group.add_argument( + "-s", + "--stability-percentage", + type=float, + default=999, + required=False, + help="The allowed variation in " + "latency measurements when determining if a result is stable. The " + "measurement is considered as stable if the ratio of max / min " + "from the recent 3 measurements is within (stability percentage) " + "in terms of both infer per second and latency.", + ) + + +def _add_endpoint_args(parser): + endpoint_group = parser.add_argument_group("Endpoint") + + endpoint_group.add_argument( + "-m", + "--model", + nargs="+", + default=[], + help=f"The name of the model(s) to benchmark.", + ) + endpoint_group.add_argument( + "--model-selection-strategy", + type=str, + choices=utils.get_enum_names(ModelSelectionStrategy), + default="round_robin", + required=False, + help=f"When multiple model are specified, this is how a specific model " + "should be assigned to a prompt. round_robin means that ith prompt in the " + "list gets assigned to i mod len(models). random means that assignment is " + "uniformly random", + ) + + endpoint_group.add_argument( + "--backend", + type=str, + choices=utils.get_enum_names(OutputFormat)[2:], + default="tensorrtllm", + required=False, + help=f'When using the "triton" service-kind, ' + "this is the backend of the model. 
" + "For the TENSORRT-LLM backend, you currently must set " + "'exclude_input_in_output' to true in the model config to " + "not echo the input tokens in the output.", + ) + + endpoint_group.add_argument( + "--endpoint", + type=str, + required=False, + help=f"Set a custom endpoint that differs from the OpenAI defaults.", + ) + + endpoint_group.add_argument( + "--endpoint-type", + type=str, + choices=["chat", "completions"], + required=False, + help=f"The endpoint-type to send requests to on the " + 'server. This is only used with the "openai" service-kind.', + ) + + endpoint_group.add_argument( + "--service-kind", + type=str, + choices=["triton", "openai"], + default="triton", + required=False, + help="The kind of service perf_analyzer will " + 'generate load for. In order to use "openai", ' + "you must specify an api via --endpoint-type.", + ) + + endpoint_group.add_argument( + "--streaming", + action="store_true", + required=False, + help=f"An option to enable the use of the streaming API.", + ) + + endpoint_group.add_argument( + "-u", + "--url", + type=str, + required=False, + dest="u", + metavar="URL", + help="URL of the endpoint to target for benchmarking.", + ) + + +def _add_output_args(parser): + output_group = parser.add_argument_group("Output") + output_group.add_argument( + "--artifact-dir", + type=Path, + default=Path(DEFAULT_ARTIFACT_DIR), + help="The directory to store all the (output) artifacts generated by " + "GenAI-Perf and Perf Analyzer.", + ) + output_group.add_argument( + "--generate-plots", + action="store_true", + required=False, + help="An option to enable the generation of plots.", + ) + output_group.add_argument( + "--profile-export-file", + type=Path, + default=Path("profile_export.json"), + help="The path where the perf_analyzer profile export will be " + "generated. By default, the profile export will be to profile_export.json. " + "The genai-perf file will be exported to _genai_perf.csv. " + "For example, if the profile export file is profile_export.json, the genai-perf file will be " + "exported to profile_export_genai_perf.csv.", + ) + + +def _add_other_args(parser): + other_group = parser.add_argument_group("Other") + + other_group.add_argument( + "--tokenizer", + type=str, + default=DEFAULT_TOKENIZER, + required=False, + help="The HuggingFace tokenizer to use to interpret token metrics from prompts and responses.", + ) + + other_group.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + help="An option to enable verbose mode.", + ) + + other_group.add_argument( + "--version", + action="version", + version="%(prog)s " + __version__, + help=f"An option to print the version and exit.", + ) + + +def get_extra_inputs_as_dict(args: argparse.Namespace) -> dict: + request_inputs = {} + if args.extra_inputs: + for input_str in args.extra_inputs: + if input_str.startswith("{") and input_str.endswith("}"): + request_inputs.update(json.loads(input_str)) + else: + semicolon_count = input_str.count(":") + if semicolon_count != 1: + raise ValueError( + f"Invalid input format for --extra-inputs: {input_str}\n" + "Expected input format: 'input_name:value'" + ) + input_name, value = input_str.split(":", 1) + + if not input_name or not value: + raise ValueError( + f"Input name or value is empty in --extra-inputs: {input_str}\n" + "Expected input format: 'input_name:value'" + ) + + is_bool = value.lower() in ["true", "false"] + is_int = value.isdigit() + is_float = value.count(".") == 1 and ( + value[0] == "." 
or value.replace(".", "").isdigit() + ) + + if is_bool: + value = value.lower() == "true" + elif is_int: + value = int(value) + elif is_float: + value = float(value) + + if input_name in request_inputs: + raise ValueError( + f"Input name already exists in request_inputs dictionary: {input_name}" + ) + request_inputs[input_name] = value + + return request_inputs + + +def _parse_compare_args(subparsers) -> argparse.ArgumentParser: + compare = subparsers.add_parser( + "compare", + description="Subcommand to generate plots that compare multiple profile runs.", + ) + compare_group = compare.add_argument_group("Compare") + mx_group = compare_group.add_mutually_exclusive_group(required=False) + mx_group.add_argument( + "--config", + type=Path, + default=None, + help="The path to the YAML file that specifies plot configurations for " + "comparing multiple runs.", + ) + mx_group.add_argument( + "-f", + "--files", + nargs="+", + default=[], + help="List of paths to the profile export JSON files. Users can specify " + "this option instead of the `--config` option if they would like " + "GenAI-Perf to generate default plots as well as initial YAML config file.", + ) + compare.set_defaults(func=compare_handler) + return compare + + +### Handlers ### + + +def create_compare_dir() -> None: + if not os.path.exists(DEFAULT_COMPARE_DIR): + os.mkdir(DEFAULT_COMPARE_DIR) + + +def profile_handler(args, extra_args): + from genai_perf.wrapper import Profiler + + Profiler.run(args=args, extra_args=extra_args) + + +def compare_handler(args: argparse.Namespace): + """Handles `compare` subcommand workflow.""" + if args.files: + create_compare_dir() + output_dir = Path(f"{DEFAULT_COMPARE_DIR}") + PlotConfigParser.create_init_yaml_config(args.files, output_dir) + args.config = output_dir / "config.yaml" + + config_parser = PlotConfigParser(args.config) + plot_configs = config_parser.generate_configs() + plot_manager = PlotManager(plot_configs) + plot_manager.generate_plots() + + +### Entrypoint ### + + +def parse_args(): + argv = sys.argv + + parser = argparse.ArgumentParser( + prog="genai-perf", + description="CLI to profile LLMs and Generative AI models with Perf Analyzer", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.set_defaults(func=profile_handler) + + # Conceptually group args for easier visualization + _add_endpoint_args(parser) + _add_input_args(parser) + _add_profile_args(parser) + _add_output_args(parser) + _add_other_args(parser) + + # Add subcommands + subparsers = parser.add_subparsers( + help="List of subparser commands.", dest="subcommand" + ) + compare_parser = _parse_compare_args(subparsers) + + # Check for passthrough args + if "--" in argv: + passthrough_index = argv.index("--") + logger.info(f"Detected passthrough args: {argv[passthrough_index + 1:]}") + else: + passthrough_index = len(argv) + + args = parser.parse_args(argv[1:passthrough_index]) + args = _infer_prompt_source(args) + args = _check_model_args(parser, args) + args = _check_conditional_args(parser, args) + args = _check_compare_args(compare_parser, args) + args = _check_load_manager_args(args) + args = _set_artifact_paths(args) + + return args, argv[passthrough_index + 1 :] diff --git a/genai-perf/genai_perf/plots/__init__.py b/genai-perf/genai_perf/plots/__init__.py new file mode 100755 index 00000000..086616e4 --- /dev/null +++ b/genai-perf/genai_perf/plots/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/genai-perf/genai_perf/plots/base_plot.py b/genai-perf/genai_perf/plots/base_plot.py new file mode 100755 index 00000000..470e0b94 --- /dev/null +++ b/genai-perf/genai_perf/plots/base_plot.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
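`BasePlot` below leaves `create_plot()` to subclasses and centralizes the dataframe/parquet and html/jpeg export helpers. A sketch of what another plot type could look like, modeled on the `BoxPlot` and `HeatMap` subclasses in this change; the class name and scatter styling here are illustrative only:

```python
from pathlib import Path
from typing import List

import plotly.graph_objects as go
from genai_perf.plots.base_plot import BasePlot
from genai_perf.plots.plot_config import ProfileRunData


class ScatterSketch(BasePlot):
    """Illustrative subclass: one scatter trace per profile run."""

    def __init__(self, data: List[ProfileRunData]) -> None:
        super().__init__(data)

    def create_plot(
        self,
        graph_title: str = "",
        x_label: str = "",
        y_label: str = "",
        width: int = 700,
        height: int = 450,
        filename_root: str = "",
        output_dir: Path = Path(""),
    ) -> None:
        fig = go.Figure()
        for prd in self._profile_data:
            fig.add_trace(
                go.Scatter(x=prd.x_metric, y=prd.y_metric, mode="markers", name=prd.name)
            )
        fig.update_layout(title=graph_title, width=width, height=height)
        fig.update_xaxes(title_text=x_label)
        fig.update_yaxes(title_text=y_label)

        # Same artifact convention as the concrete plots in this change.
        df = self._create_dataframe(x_label, y_label)
        self._generate_parquet(df, output_dir, filename_root)
        self._generate_graph_file(fig, output_dir, filename_root + ".html")
        self._generate_graph_file(fig, output_dir, filename_root + ".jpeg")
```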
+ +from pathlib import Path +from typing import List + +import pandas as pd +from genai_perf.exceptions import GenAIPerfException +from genai_perf.plots.plot_config import ProfileRunData +from plotly.graph_objects import Figure + + +class BasePlot: + """ + Base class for plots + """ + + def __init__(self, data: List[ProfileRunData]) -> None: + self._profile_data = data + + def create_plot( + self, + graph_title: str, + x_label: str, + y_label: str, + width: int, + height: int, + filename_root: str, + output_dir: Path, + ) -> None: + """ + Create plot for specific graph type + """ + raise NotImplementedError + + def _create_dataframe(self, x_label: str, y_label: str) -> pd.DataFrame: + return pd.DataFrame( + { + x_label: [prd.x_metric for prd in self._profile_data], + y_label: [prd.y_metric for prd in self._profile_data], + "Run Name": [prd.name for prd in self._profile_data], + } + ) + + def _generate_parquet(self, df: pd.DataFrame, output_dir: Path, file: str) -> None: + filepath = output_dir / f"{file}.gzip" + df.to_parquet(filepath, compression="gzip") + + def _generate_graph_file(self, fig: Figure, output_dir: Path, file: str) -> None: + if file.endswith("jpeg"): + filepath = output_dir / f"{file}" + fig.write_image(filepath) + elif file.endswith("html"): + filepath = output_dir / f"{file}" + fig.write_html(filepath) + else: + extension = file.split(".")[-1] + raise GenAIPerfException(f"image file type {extension} is not supported") diff --git a/genai-perf/genai_perf/plots/box_plot.py b/genai-perf/genai_perf/plots/box_plot.py new file mode 100755 index 00000000..38aad36d --- /dev/null +++ b/genai-perf/genai_perf/plots/box_plot.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path +from typing import List + +import plotly.graph_objects as go +from genai_perf.plots.base_plot import BasePlot +from genai_perf.plots.plot_config import ProfileRunData + + +class BoxPlot(BasePlot): + """ + Generate a box plot in jpeg and html format. 
+ """ + + def __init__(self, data: List[ProfileRunData]) -> None: + super().__init__(data) + + def create_plot( + self, + graph_title: str = "", + x_label: str = "", + y_label: str = "", + width: int = 700, + height: int = 450, + filename_root: str = "", + output_dir: Path = Path(""), + ) -> None: + fig = go.Figure() + for pd in self._profile_data: + fig.add_trace(go.Box(y=pd.y_metric, name=pd.name)) + + # Update layout and axis labels + fig.update_layout( + title={ + "text": f"{graph_title}", + "xanchor": "center", + "x": 0.5, + }, + width=width, + height=height, + ) + fig.update_traces(boxpoints="all") + fig.update_xaxes(title_text=x_label, showticklabels=False) + fig.update_yaxes(title_text=y_label) + + # Save dataframe as parquet file + df = self._create_dataframe(x_label, y_label) + self._generate_parquet(df, output_dir, filename_root) + + self._generate_graph_file(fig, output_dir, filename_root + ".html") + self._generate_graph_file(fig, output_dir, filename_root + ".jpeg") diff --git a/genai-perf/genai_perf/plots/heat_map.py b/genai-perf/genai_perf/plots/heat_map.py new file mode 100755 index 00000000..7f4dbe16 --- /dev/null +++ b/genai-perf/genai_perf/plots/heat_map.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path +from typing import List + +import plotly.graph_objects as go +from genai_perf.plots.base_plot import BasePlot +from genai_perf.plots.plot_config import ProfileRunData +from plotly.subplots import make_subplots + + +class HeatMap(BasePlot): + """ + Generate a heat map in jpeg and html format. 
+ """ + + def __init__(self, data: List[ProfileRunData]) -> None: + super().__init__(data) + + def create_plot( + self, + graph_title: str = "", + x_label: str = "", + y_label: str = "", + width: int = 700, + height: int = 450, + filename_root: str = "", + output_dir: Path = Path(""), + ) -> None: + N = len(self._profile_data) + + if N <= 3: + n_rows, n_cols = 1, N + else: + n_rows = (N + 2) // 3 + n_cols = 3 + + fig = make_subplots( + rows=n_rows, + cols=n_cols, + x_title=x_label, + y_title=y_label, + subplot_titles=[prd.name for prd in self._profile_data], + ) + + for index, prd in enumerate(self._profile_data): + hm = go.Histogram2d( + x=prd.x_metric, + y=prd.y_metric, + coloraxis="coloraxis", + name=prd.name, + ) + + # Calculate the location where the figure should be added in the subplot + c_row = int(index / n_cols) + 1 + c_col = index % n_cols + 1 + fig.add_trace(hm, c_row, c_col) + + fig.update_layout( + title={ + "text": graph_title, + "xanchor": "center", + "x": 0.5, + }, + width=width, + height=height, + ) + + # Save dataframe as parquet file + df = self._create_dataframe(x_label, y_label) + self._generate_parquet(df, output_dir, filename_root) + + # self._generate_parquet(df, filename_root) + self._generate_graph_file(fig, output_dir, filename_root + ".html") + self._generate_graph_file(fig, output_dir, filename_root + ".jpeg") diff --git a/genai-perf/genai_perf/plots/plot_config.py b/genai-perf/genai_perf/plots/plot_config.py new file mode 100755 index 00000000..2408d059 --- /dev/null +++ b/genai-perf/genai_perf/plots/plot_config.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
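`plot_config.py` below holds plain dataclasses; `PlotConfigParser` materializes them from the YAML configuration shown further down. A hand-built instance for a single box-plot run, with illustrative values:

```python
from pathlib import Path

from genai_perf.plots.plot_config import PlotConfig, PlotType, ProfileRunData

# Illustrative values; in practice PlotConfigParser fills these from config.yaml.
run = ProfileRunData(
    name="artifacts/run1/profile_export",
    x_metric=[],                      # box plots leave the x metric empty
    y_metric=[12.1, 13.4, 11.8],      # e.g. time_to_first_tokens, scaled to ms
)
config = PlotConfig(
    title="Time to First Token",
    data=[run],
    x_label="Time to First Token (ms)",
    y_label="",
    width=700,
    height=450,
    type=PlotType.BOX,
    output=Path("artifacts/plots"),
)
```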
+ +from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto +from pathlib import Path +from typing import List, Sequence, Union + + +class PlotType(Enum): + SCATTER = auto() + BOX = auto() + HEATMAP = auto() + + +@dataclass +class ProfileRunData: + name: str + x_metric: Sequence[Union[int, float]] + y_metric: Sequence[Union[int, float]] + + +@dataclass +class PlotConfig: + title: str + data: List[ProfileRunData] + x_label: str + y_label: str + width: int + height: int + type: PlotType + output: Path diff --git a/genai-perf/genai_perf/plots/plot_config_parser.py b/genai-perf/genai_perf/plots/plot_config_parser.py new file mode 100755 index 00000000..1072bc30 --- /dev/null +++ b/genai-perf/genai_perf/plots/plot_config_parser.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path +from typing import List, Union + +import genai_perf.logging as logging + +# Skip type checking to avoid mypy error +# Issue: https://github.com/python/mypy/issues/10632 +import yaml # type: ignore +from genai_perf.llm_metrics import LLMProfileDataParser, Statistics +from genai_perf.plots.plot_config import PlotConfig, PlotType, ProfileRunData +from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer +from genai_perf.utils import load_yaml, scale + +logger = logging.getLogger(__name__) + + +class PlotConfigParser: + """Parses YAML configuration file to generate PlotConfigs.""" + + def __init__(self, filename: Path) -> None: + self._filename = filename + + def generate_configs(self) -> List[PlotConfig]: + """Load YAML configuration file and convert to PlotConfigs.""" + logger.info( + f"Generating plot configurations by parsing {self._filename}. 
" + "This may take a few seconds.", + ) + configs = load_yaml(self._filename) + + plot_configs = [] + for _, config in configs.items(): + # Collect profile run data + profile_data: List[ProfileRunData] = [] + for filepath in config["paths"]: + stats = self._get_statistics(filepath) + profile_data.append( + ProfileRunData( + name=self._get_run_name(Path(filepath)), + x_metric=self._get_metric(stats, config["x_metric"]), + y_metric=self._get_metric(stats, config["y_metric"]), + ) + ) + + plot_configs.append( + PlotConfig( + title=config["title"], + data=profile_data, + x_label=config["x_label"], + y_label=config["y_label"], + width=config["width"], + height=config["height"], + type=self._get_plot_type(config["type"]), + output=Path(config["output"]), + ) + ) + + return plot_configs + + def _get_statistics(self, filepath: str) -> Statistics: + """Extract a single profile run data.""" + data_parser = LLMProfileDataParser( + filename=Path(filepath), + tokenizer=get_tokenizer(DEFAULT_TOKENIZER), + ) + load_info = data_parser.get_profile_load_info() + + # TMA-1904: Remove single experiment assumption + assert len(load_info) == 1 + infer_mode, load_level = load_info[0] + stats = data_parser.get_statistics(infer_mode, load_level) + return stats + + def _get_run_name(self, filepath: Path) -> str: + """Construct a profile run name.""" + if filepath.parent.name: + return filepath.parent.name + "/" + filepath.stem + return filepath.stem + + def _get_metric(self, stats: Statistics, name: str) -> List[Union[int, float]]: + if not name: # no metric + return [] + elif name == "inter_token_latencies": + itls = stats.metrics.data[name] + return [scale(x, (1 / 1e6)) for x in itls] # ns to ms + elif name == "token_positions": + chunked_itls = getattr(stats.metrics, "_chunked_inter_token_latencies") + token_positions: List[Union[int, float]] = [] + for request_itls in chunked_itls: + token_positions += list(range(1, len(request_itls) + 1)) + return token_positions + elif name == "time_to_first_tokens": + ttfts = stats.metrics.data[name] + return [scale(x, (1 / 1e6)) for x in ttfts] # ns to ms + elif name == "request_latencies": + req_latencies = stats.metrics.data[name] + return [scale(x, (1 / 1e6)) for x in req_latencies] # ns to ms + + return stats.metrics.data[name] + + def _get_plot_type(self, plot_type: str) -> PlotType: + """Returns the plot type as PlotType object.""" + if plot_type == "scatter": + return PlotType.SCATTER + elif plot_type == "box": + return PlotType.BOX + elif plot_type == "heatmap": + return PlotType.HEATMAP + else: + raise ValueError( + "Unknown plot type encountered while parsing YAML configuration. " + "Plot type must be either 'scatter', 'box', or 'heatmap'." 
+ ) + + @staticmethod + def create_init_yaml_config(filenames: List[Path], output_dir: Path) -> None: + config_str = f""" + plot1: + title: Time to First Token + x_metric: "" + y_metric: time_to_first_tokens + x_label: Time to First Token (ms) + y_label: "" + width: {1200 if len(filenames) > 1 else 700} + height: 450 + type: box + paths: {[str(f) for f in filenames]} + output: {output_dir} + + plot2: + title: Request Latency + x_metric: "" + y_metric: request_latencies + x_label: Request Latency (ms) + y_label: "" + width: {1200 if len(filenames) > 1 else 700} + height: 450 + type: box + paths: {[str(f) for f in filenames]} + output: {output_dir} + + plot3: + title: Distribution of Input Sequence Lengths to Output Sequence Lengths + x_metric: input_sequence_lengths + y_metric: output_sequence_lengths + x_label: Input Sequence Length + y_label: Output Sequence Length + width: {1200 if len(filenames) > 1 else 700} + height: 450 + type: heatmap + paths: {[str(f) for f in filenames]} + output: {output_dir} + + plot4: + title: Time to First Token vs Input Sequence Lengths + x_metric: input_sequence_lengths + y_metric: time_to_first_tokens + x_label: Input Sequence Length + y_label: Time to First Token (ms) + width: {1200 if len(filenames) > 1 else 700} + height: 450 + type: scatter + paths: {[str(f) for f in filenames]} + output: {output_dir} + + plot5: + title: Token-to-Token Latency vs Output Token Position + x_metric: token_positions + y_metric: inter_token_latencies + x_label: Output Token Position + y_label: Token-to-Token Latency (ms) + width: {1200 if len(filenames) > 1 else 700} + height: 450 + type: scatter + paths: {[str(f) for f in filenames]} + output: {output_dir} + """ + + filepath = output_dir / "config.yaml" + logger.info(f"Creating initial YAML configuration file to {filepath}") + config = yaml.safe_load(config_str) + with open(str(filepath), "w") as f: + yaml.dump(config, f, sort_keys=False) diff --git a/genai-perf/genai_perf/plots/plot_manager.py b/genai-perf/genai_perf/plots/plot_manager.py new file mode 100755 index 00000000..e548a7de --- /dev/null +++ b/genai-perf/genai_perf/plots/plot_manager.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from typing import List + +import genai_perf.logging as logging +from genai_perf.plots.box_plot import BoxPlot +from genai_perf.plots.heat_map import HeatMap +from genai_perf.plots.plot_config import PlotConfig, PlotType +from genai_perf.plots.scatter_plot import ScatterPlot + +logger = logging.getLogger(__name__) + + +class PlotManager: + """ + Manage details around plots generated + """ + + def __init__(self, plot_configs: List[PlotConfig]) -> None: + self._plot_configs = plot_configs + + def _generate_filename(self, title: str) -> str: + filename = "_".join(title.lower().split()) + return filename + + def generate_plots(self) -> None: + for plot_config in self._plot_configs: + logger.info(f"Generating '{plot_config.title}' plot") + if plot_config.type == PlotType.BOX: + bp = BoxPlot(plot_config.data) + bp.create_plot( + graph_title=plot_config.title, + x_label=plot_config.x_label, + width=plot_config.width, + height=plot_config.height, + filename_root=self._generate_filename(plot_config.title), + output_dir=plot_config.output, + ) + + elif plot_config.type == PlotType.HEATMAP: + hm = HeatMap(plot_config.data) + hm.create_plot( + graph_title=plot_config.title, + x_label=plot_config.x_label, + y_label=plot_config.y_label, + width=plot_config.width, + height=plot_config.height, + filename_root=self._generate_filename(plot_config.title), + output_dir=plot_config.output, + ) + + elif plot_config.type == PlotType.SCATTER: + sp = ScatterPlot(plot_config.data) + sp.create_plot( + graph_title=plot_config.title, + x_label=plot_config.x_label, + y_label=plot_config.y_label, + width=plot_config.width, + height=plot_config.height, + filename_root=self._generate_filename(plot_config.title), + output_dir=plot_config.output, + ) diff --git a/genai-perf/genai_perf/plots/scatter_plot.py b/genai-perf/genai_perf/plots/scatter_plot.py new file mode 100755 index 00000000..35dca8fc --- /dev/null +++ b/genai-perf/genai_perf/plots/scatter_plot.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path +from typing import List + +import plotly.graph_objects as go +from genai_perf.plots.base_plot import BasePlot +from genai_perf.plots.plot_config import ProfileRunData + + +class ScatterPlot(BasePlot): + """ + Generate a scatter plot in jpeg and html format. + """ + + def __init__(self, data: List[ProfileRunData]) -> None: + super().__init__(data) + + def create_plot( + self, + graph_title: str = "", + x_label: str = "", + y_label: str = "", + width: int = 700, + height: int = 450, + filename_root: str = "", + output_dir: Path = Path(""), + ) -> None: + fig = go.Figure() + for pd in self._profile_data: + fig.add_trace( + go.Scatter( + x=pd.x_metric, + y=pd.y_metric, + mode="markers", + name=pd.name, + ) + ) + + fig.update_layout( + title={ + "text": f"{graph_title}", + "xanchor": "center", + "x": 0.5, + }, + width=width, + height=height, + ) + fig.update_xaxes(title_text=f"{x_label}") + fig.update_yaxes(title_text=f"{y_label}") + + # Save dataframe as parquet file + df = self._create_dataframe(x_label, y_label) + self._generate_parquet(df, output_dir, filename_root) + + self._generate_graph_file(fig, output_dir, filename_root + ".html") + self._generate_graph_file(fig, output_dir, filename_root + ".jpeg") diff --git a/genai-perf/genai_perf/test_end_to_end.py b/genai-perf/genai_perf/test_end_to_end.py new file mode 100644 index 00000000..3cc2999f --- /dev/null +++ b/genai-perf/genai_perf/test_end_to_end.py @@ -0,0 +1,92 @@ +import itertools +import os +import subprocess +import sys + +# How to run: +# test_end_to_end.py +# Where target is "nim_chat" or "nim_completions" or "vllm_openai" or "triton_tensorrtllm" +# +# For all cases but vllm_openai, it assumes that the server will be on port 9999 +# +# This script will run a sweep of all combinations of values in the testing matrix +# by appending those options on to the genai-pa base command +# + + +testing_matrix = [ + ["--concurrency 1", "--concurrency 32", "--request-rate 1", "--request-rate 32"], + ["--streaming", ""], +] + +base_commands = { + "nim_chat": "genai-perf -s 999 -p 20000 -m llama-2-7b-chat -u http://localhost:9999 --service-kind openai --endpoint-type chat", + "nim_completions": "genai-perf -s 999 -p 20000 -m llama-2-7b -u http://localhost:9999 --service-kind openai --endpoint-type completions", + "vllm_openai": "genai-perf -s 999 -p 20000 -m mistralai/Mistral-7B-v0.1 --service-kind openai --endpoint-type chat", + "triton_tensorrtllm": "genai-perf -s 999 -p 20000 -m llama-2-7b -u 0.0.0.0:9999 --service-kind triton --backend tensorrtllm", + "triton_vllm": "genai-perf -s 999 -p 20000 -m gpt2_vllm --service-kind triton --backend vllm", +} +testname = "" + +if len(sys.argv) == 2: + # The second element in sys.argv is the input string + testname = sys.argv[1] +else: + options = " ".join(base_commands.keys()) + print(f"This script requires exactly one argument. 
It must be one of {options}") + exit(1) + +base_command = base_commands[testname] + + +def rename_files(files: list, substr: str) -> None: + for f in files: + name, ext = f.rsplit(".", 1) + # Insert the substring and reassemble the filename + new_filename = f"{testname}__{name}__{substr}.{ext}" + try: + os.rename(f, new_filename) + except FileNotFoundError: + # Just ignore the error, since if PA failed these files may not exist + pass + + +def print_summary(): + # FIXME -- print out a few basic metrics. Maybe from the csv? + pass + + +def sanity_check(): + # FIXME -- add in some sanity checking? Throughput isn't 0? + pass + + +# Loop through all combinations +for combination in itertools.product(*testing_matrix): + options_string = " ".join(combination) + command_with_options = f"{base_command} {options_string}" + command_array = command_with_options.split() + + file_options_string = "__".join(combination) + file_options_string = file_options_string.replace(" ", "") + file_options_string = file_options_string.replace("-", "") + output_file = testname + "__" + file_options_string + ".log" + + with open(output_file, "w") as outfile: + print(f"\nCMD: {command_with_options}") + print(f" Output log is {output_file}") + proc = subprocess.run(command_array, stdout=outfile, stderr=subprocess.STDOUT) + + if proc.returncode != 0: + print(f" Command failed with return code: {proc.returncode}") + else: + print(f" Command executed successfully!") + print_summary() + sanity_check() + + files = [ + "profile_export.json", + "profile_export_genai_pa.csv", + "llm_inputs.json", + ] + rename_files(files, file_options_string) diff --git a/genai-perf/genai_perf/tokenizer.py b/genai-perf/genai_perf/tokenizer.py new file mode 100644 index 00000000..052a478e --- /dev/null +++ b/genai-perf/genai_perf/tokenizer.py @@ -0,0 +1,78 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
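+
+# Editor's illustration (not part of the original change): a minimal usage
+# sketch of the wrapper defined below. Downloading the default tokenizer
+# requires network access to Hugging Face.
+#
+#   from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer
+#
+#   tokenizer = get_tokenizer(DEFAULT_TOKENIZER)
+#   token_ids = tokenizer.encode("Profile this prompt")  # List[int]
+#   text = tokenizer.decode(token_ids)  # decodes back to (approximately) the input text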
+ +import contextlib +import io +from typing import List + +from genai_perf.exceptions import GenAIPerfException + +# Silence tokenizer warning on import +with contextlib.redirect_stdout(io.StringIO()) as stdout, contextlib.redirect_stderr( + io.StringIO() +) as stderr: + from transformers import AutoTokenizer, BatchEncoding + from transformers import logging as token_logger + + token_logger.set_verbosity_error() + +DEFAULT_TOKENIZER = "hf-internal-testing/llama-tokenizer" + + +class Tokenizer: + """ + A small wrapper class around Huggingface Tokenizer + """ + + def __init__(self, name: str) -> None: + """ + Initialize by downloading the tokenizer from Huggingface.co + """ + try: + # Silence tokenizer warning on first use + with contextlib.redirect_stdout( + io.StringIO() + ) as stdout, contextlib.redirect_stderr(io.StringIO()) as stderr: + tokenizer = AutoTokenizer.from_pretrained(name) + except Exception as e: + raise GenAIPerfException(e) + + self._tokenizer = tokenizer + + # default tokenizer parameters for __call__, encode, decode methods + self._call_args = {"add_special_tokens": False} + self._encode_args = {"add_special_tokens": False} + self._decode_args = {"skip_special_tokens": True} + + def __call__(self, text, **kwargs) -> BatchEncoding: + self._call_args.update(kwargs) + return self._tokenizer(text, **self._call_args) + + def encode(self, text, **kwargs) -> List[int]: + self._encode_args.update(kwargs) + return self._tokenizer.encode(text, **self._encode_args) + + def decode(self, token_ids, **kwargs) -> str: + self._decode_args.update(kwargs) + return self._tokenizer.decode(token_ids, **self._decode_args) + + def __repr__(self) -> str: + return self._tokenizer.__repr__() + + +def get_tokenizer(tokenizer_model: str) -> Tokenizer: + """ + Return tokenizer for the given model name + """ + return Tokenizer(tokenizer_model) diff --git a/genai-perf/genai_perf/utils.py b/genai-perf/genai_perf/utils.py new file mode 100644 index 00000000..a10befe1 --- /dev/null +++ b/genai-perf/genai_perf/utils.py @@ -0,0 +1,79 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional, Type + +# Skip type checking to avoid mypy error +# Issue: https://github.com/python/mypy/issues/10632 +import yaml # type: ignore + + +def remove_sse_prefix(msg: str) -> str: + prefix = "data: " + if msg.startswith(prefix): + return msg[len(prefix) :].strip() + return msg.strip() + + +def load_yaml(filepath: Path) -> Dict[str, Any]: + with open(str(filepath)) as f: + configs = yaml.safe_load(f) + return configs + + +def load_json(filepath: Path) -> Dict[str, Any]: + with open(str(filepath), encoding="utf-8", errors="ignore") as f: + return json.load(f) + + +def remove_file(file: Path) -> None: + if file.is_file(): + file.unlink() + + +def convert_option_name(name: str) -> str: + return name.replace("_", "-") + + +def get_enum_names(enum: Type[Enum]) -> List: + names = [] + for e in enum: + names.append(e.name.lower()) + return names + + +def get_enum_entry(name: str, enum: Type[Enum]) -> Optional[Enum]: + for e in enum: + if e.name.lower() == name.lower(): + return e + return None + + +def scale(value, factor): + return value * factor diff --git a/genai-perf/genai_perf/wrapper.py b/genai-perf/genai_perf/wrapper.py new file mode 100644 index 00000000..e5f70442 --- /dev/null +++ b/genai-perf/genai_perf/wrapper.py @@ -0,0 +1,141 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
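+
+# Editor's note (illustration only, not part of the original change): build_cmd()
+# below translates the parsed genai-perf arguments into a perf_analyzer
+# invocation. For a hypothetical `--service-kind triton --backend tensorrtllm
+# --concurrency 4` run, the assembled command is roughly of the form:
+#
+#   perf_analyzer -m <formatted_model_name> --async \
+#       --input-data <artifact_dir>/<input data json> \
+#       ...remaining options not listed in skip_args... \
+#       -i grpc --streaming --shape max_tokens:1 --shape text_input:1 \
+#       --concurrency-range 4
+#
+# The exact flags depend on which arguments are set; see add_protocol_args and
+# add_inference_load_args below.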
+ +import subprocess +from argparse import Namespace +from typing import List, Optional + +import genai_perf.logging as logging +import genai_perf.utils as utils +from genai_perf.constants import DEFAULT_GRPC_URL, DEFAULT_INPUT_DATA_JSON +from genai_perf.llm_inputs.llm_inputs import OutputFormat + +logger = logging.getLogger(__name__) + + +class Profiler: + @staticmethod + def add_protocol_args(args: Namespace) -> List[str]: + cmd = [] + if args.service_kind == "triton": + cmd += ["-i", "grpc", "--streaming"] + if args.u is None: # url + cmd += ["-u", f"{DEFAULT_GRPC_URL}"] + if args.output_format == OutputFormat.TENSORRTLLM: + cmd += ["--shape", "max_tokens:1", "--shape", "text_input:1"] + elif args.service_kind == "openai": + cmd += ["-i", "http"] + return cmd + + @staticmethod + def add_inference_load_args(args: Namespace) -> List[str]: + cmd = [] + if args.concurrency: + cmd += ["--concurrency-range", f"{args.concurrency}"] + elif args.request_rate: + cmd += ["--request-rate-range", f"{args.request_rate}"] + return cmd + + @staticmethod + def build_cmd(args: Namespace, extra_args: Optional[List[str]] = None) -> List[str]: + skip_args = [ + "artifact_dir", + "backend", + "concurrency", + "endpoint_type", + "extra_inputs", + "formatted_model_name", + "func", + "generate_plots", + "input_dataset", + "input_file", + "input_format", + "model", + "model_selection_strategy", + "num_prompts", + "output_format", + "output_tokens_mean_deterministic", + "output_tokens_mean", + "output_tokens_stddev", + "prompt_source", + "random_seed", + "request_rate", + # The 'streaming' passed in to this script is to determine if the + # LLM response should be streaming. That is different than the + # 'streaming' that PA takes, which means something else (and is + # required for decoupled models into triton). + "streaming", + "synthetic_input_tokens_mean", + "synthetic_input_tokens_stddev", + "subcommand", + "tokenizer", + ] + + utils.remove_file(args.profile_export_file) + + cmd = [ + f"perf_analyzer", + f"-m", + f"{args.formatted_model_name}", + f"--async", + f"--input-data", + f"{args.artifact_dir / DEFAULT_INPUT_DATA_JSON}", + ] + for arg, value in vars(args).items(): + if arg in skip_args: + pass + elif value is None: + pass + elif value is False: + pass + elif value is True: + if len(arg) == 1: + cmd += [f"-{arg}"] + else: + cmd += [f"--{arg}"] + else: + if len(arg) == 1: + cmd += [f"-{arg}", f"{value}"] + else: + arg = utils.convert_option_name(arg) + cmd += [f"--{arg}", f"{value}"] + + cmd += Profiler.add_protocol_args(args) + cmd += Profiler.add_inference_load_args(args) + + if extra_args is not None: + for arg in extra_args: + cmd += [f"{arg}"] + return cmd + + @staticmethod + def run(args: Namespace, extra_args: Optional[List[str]]) -> None: + cmd = Profiler.build_cmd(args, extra_args) + logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'") + if args and args.verbose: + subprocess.run(cmd, check=True, stdout=None) + else: + subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL) diff --git a/genai-perf/pyproject.toml b/genai-perf/pyproject.toml new file mode 100644 index 00000000..982ee24b --- /dev/null +++ b/genai-perf/pyproject.toml @@ -0,0 +1,96 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[project] +name = "genai-perf" +readme = "README.md" +description = "GenAI Perf Analyzer CLI - CLI tool to simplify profiling LLMs and Generative AI models with Perf Analyzer" +dynamic = ["version"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.10", + "Operating System :: Unix", +] +authors = [] +maintainers = [] +keywords = [] +requires-python = ">=3.8,<4" +dependencies = [ + "numpy<2", + "pytest", + "rich", + "transformers", + "plotly", + "pandas", + "kaleido", + "statsmodels", + "pyarrow", + "fastparquet", + "pytest-mock", + "pyyaml", + "responses", +] + +# CLI Entrypoint +[project.scripts] +genai-perf = "genai_perf.main:main" + +[project.urls] +"Homepage" = "https://github.com/triton-inference-server/" +"Bug Tracker" = "https://github.com/triton-inference-server/server/issues" + +# Build +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.version] +path = "genai_perf/__init__.py" + +# Pre-commit hook tool configs +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +# Linting/formatting +[tool.ruff] +# Same as Black. 
+line-length = 88 +indent-width = 4 diff --git a/genai-perf/tests/__init__.py b/genai-perf/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/genai-perf/tests/test_artifacts.py b/genai-perf/tests/test_artifacts.py new file mode 100644 index 00000000..56b1b38d --- /dev/null +++ b/genai-perf/tests/test_artifacts.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from argparse import Namespace +from pathlib import Path + +import pytest +from genai_perf.main import create_artifacts_dirs + + +@pytest.fixture +def mock_makedirs(mocker): + return mocker.patch("os.makedirs") + + +def test_create_artifacts_dirs_custom_path(mock_makedirs): + artifacts_dir_path = "/genai_perf_artifacts" + mock_args = Namespace(artifact_dir=Path(artifacts_dir_path)) + create_artifacts_dirs(mock_args) + mock_makedirs.assert_any_call( + Path(artifacts_dir_path), exist_ok=True + ), f"Expected os.makedirs to create artifacts directory inside {artifacts_dir_path} path." + mock_makedirs.assert_any_call( + Path(artifacts_dir_path) / "plots", exist_ok=True + ), f"Expected os.makedirs to create plots directory inside {artifacts_dir_path}/plots path." + assert mock_makedirs.call_count == 2 diff --git a/genai-perf/tests/test_cli.py b/genai-perf/tests/test_cli.py new file mode 100644 index 00000000..5cf84c36 --- /dev/null +++ b/genai-perf/tests/test_cli.py @@ -0,0 +1,699 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from pathlib import Path + +import genai_perf.logging as logging +import pytest +from genai_perf import __version__, parser +from genai_perf.llm_inputs.llm_inputs import ( + ModelSelectionStrategy, + OutputFormat, + PromptSource, +) + + +class TestCLIArguments: + # ================================================ + # GENAI-PERF COMMAND + # ================================================ + expected_help_output = ( + "CLI to profile LLMs and Generative AI models with Perf Analyzer" + ) + expected_version_output = f"genai-perf {__version__}" + + @pytest.mark.parametrize( + "args, expected_output", + [ + (["-h"], expected_help_output), + (["--help"], expected_help_output), + (["-m", "abc", "--help"], expected_help_output), + (["-m", "abc", "-h"], expected_help_output), + (["--version"], expected_version_output), + (["-m", "abc", "--version"], expected_version_output), + ], + ) + def test_help_version_arguments_output_and_exit( + self, monkeypatch, args, expected_output, capsys + ): + monkeypatch.setattr("sys.argv", ["genai-perf"] + args) + + with pytest.raises(SystemExit) as excinfo: + _ = parser.parse_args() + + # Check that the exit was successful + assert excinfo.value.code == 0 + + # Capture that the correct message was displayed + captured = capsys.readouterr() + assert expected_output in captured.out + + @pytest.mark.parametrize( + "arg, expected_attributes", + [ + ( + ["--artifact-dir", "test_artifact_dir"], + {"artifact_dir": Path("test_artifact_dir")}, + ), + (["--concurrency", "3"], {"concurrency": 3}), + ( + ["--endpoint-type", "completions", "--service-kind", "openai"], + {"endpoint": "v1/completions"}, + ), + ( + ["--endpoint-type", "chat", "--service-kind", "openai"], + {"endpoint": "v1/chat/completions"}, + ), + ( + [ + "--endpoint-type", + "chat", + "--service-kind", + "openai", + "--endpoint", + "custom/address", + ], + {"endpoint": "custom/address"}, + ), + ( + [ + "--endpoint-type", + "chat", + "--service-kind", + "openai", + "--endpoint", + " /custom/address", + ], + {"endpoint": "custom/address"}, + ), + ( + [ + "--endpoint-type", + "completions", + "--service-kind", + "openai", + "--endpoint", + "custom/address", + ], + {"endpoint": "custom/address"}, + ), + ( + ["--extra-inputs", "test_key:test_value"], + {"extra_inputs": ["test_key:test_value"]}, + ), + ( + [ + "--extra-inputs", + "test_key:5", + "--extra-inputs", + "another_test_key:6", + ], + {"extra_inputs": ["test_key:5", "another_test_key:6"]}, + ), + ( + [ + "--extra-inputs", + '{"name": "Wolverine","hobbies": ["hacking", 
"slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}', + ], + { + "extra_inputs": [ + '{"name": "Wolverine","hobbies": ["hacking", "slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}' + ] + }, + ), + (["--input-dataset", "openorca"], {"input_dataset": "openorca"}), + (["--measurement-interval", "100"], {"measurement_interval": 100}), + ( + ["--model-selection-strategy", "random"], + {"model_selection_strategy": ModelSelectionStrategy.RANDOM}, + ), + (["--num-prompts", "101"], {"num_prompts": 101}), + ( + ["--output-tokens-mean", "6"], + {"output_tokens_mean": 6}, + ), + ( + ["--output-tokens-mean", "6", "--output-tokens-stddev", "7"], + {"output_tokens_stddev": 7}, + ), + ( + ["--output-tokens-mean", "6", "--output-tokens-mean-deterministic"], + {"output_tokens_mean_deterministic": True}, + ), + (["-p", "100"], {"measurement_interval": 100}), + ( + ["--profile-export-file", "test.json"], + { + "profile_export_file": Path( + "artifacts/test_model-triton-tensorrtllm-concurrency1/test.json" + ) + }, + ), + (["--random-seed", "8"], {"random_seed": 8}), + (["--request-rate", "9.0"], {"request_rate": 9.0}), + (["-s", "99.5"], {"stability_percentage": 99.5}), + (["--service-kind", "triton"], {"service_kind": "triton"}), + ( + ["--service-kind", "openai", "--endpoint-type", "chat"], + {"service_kind": "openai", "endpoint": "v1/chat/completions"}, + ), + (["--stability-percentage", "99.5"], {"stability_percentage": 99.5}), + (["--streaming"], {"streaming": True}), + ( + ["--synthetic-input-tokens-mean", "6"], + {"synthetic_input_tokens_mean": 6}, + ), + ( + ["--synthetic-input-tokens-stddev", "7"], + {"synthetic_input_tokens_stddev": 7}, + ), + (["-v"], {"verbose": True}), + (["--verbose"], {"verbose": True}), + (["-u", "test_url"], {"u": "test_url"}), + (["--url", "test_url"], {"u": "test_url"}), + ], + ) + def test_non_file_flags_parsed(self, monkeypatch, arg, expected_attributes, capsys): + logging.init_logging() + combined_args = ["genai-perf", "--model", "test_model"] + arg + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + # Check that the attributes are set correctly + for key, value in expected_attributes.items(): + assert getattr(args, key) == value + + # Check that nothing was printed as a byproduct of parsing the arguments + captured = capsys.readouterr() + assert captured.out == "" + + @pytest.mark.parametrize( + "models, expected_model_list, formatted_name", + [ + ( + ["--model", "test_model_A"], + {"model": ["test_model_A"]}, + {"formatted_model_name": "test_model_A"}, + ), + ( + ["--model", "test_model_A", "test_model_B"], + {"model": ["test_model_A", "test_model_B"]}, + {"formatted_model_name": "test_model_A_multi"}, + ), + ( + ["--model", "test_model_A", "test_model_B", "test_model_C"], + {"model": ["test_model_A", "test_model_B", "test_model_C"]}, + {"formatted_model_name": "test_model_A_multi"}, + ), + ( + ["--model", "test_model_A:math", "test_model_B:embedding"], + {"model": ["test_model_A:math", "test_model_B:embedding"]}, + {"formatted_model_name": "test_model_A:math_multi"}, + ), + ], + ) + def test_multiple_model_args( + self, monkeypatch, models, expected_model_list, formatted_name, capsys + ): + logging.init_logging() + combined_args = ["genai-perf"] + models + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + # Check that models are handled correctly + for key, value in expected_model_list.items(): + assert getattr(args, key) == value + + 
# Check that the formatted_model_name is correctly generated + for key, value in formatted_name.items(): + assert getattr(args, key) == value + + # Check that nothing was printed as a byproduct of parsing the arguments + captured = capsys.readouterr() + assert captured.out == "" + + def test_file_flags_parsed(self, monkeypatch, mocker): + mocked_open = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) + combined_args = [ + "genai-perf", + "--model", + "test_model", + "--input-file", + "fakefile.txt", + ] + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + assert ( + args.input_file == mocked_open.return_value + ), "The file argument should be the mock object" + + @pytest.mark.parametrize( + "arg, expected_path", + [ + ( + ["--service-kind", "openai", "--endpoint-type", "chat"], + "artifacts/test_model-openai-chat-concurrency1", + ), + ( + ["--service-kind", "openai", "--endpoint-type", "completions"], + "artifacts/test_model-openai-completions-concurrency1", + ), + ( + ["--service-kind", "triton", "--backend", "tensorrtllm"], + "artifacts/test_model-triton-tensorrtllm-concurrency1", + ), + ( + ["--service-kind", "triton", "--backend", "vllm"], + "artifacts/test_model-triton-vllm-concurrency1", + ), + ( + [ + "--service-kind", + "triton", + "--backend", + "vllm", + "--concurrency", + "32", + ], + "artifacts/test_model-triton-vllm-concurrency32", + ), + ], + ) + def test_default_profile_export_filepath( + self, monkeypatch, arg, expected_path, capsys + ): + logging.init_logging() + combined_args = ["genai-perf", "--model", "test_model"] + arg + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + assert args.artifact_dir == Path(expected_path) + captured = capsys.readouterr() + assert captured.out == "" + + @pytest.mark.parametrize( + "arg, expected_path, expected_output", + [ + ( + ["--model", "strange/test_model"], + "artifacts/strange_test_model-triton-tensorrtllm-concurrency1", + ( + "Model name 'strange/test_model' cannot be used to create " + "artifact directory. Instead, 'strange_test_model' will be used" + ), + ), + ( + [ + "--model", + "hello/world/test_model", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + ], + "artifacts/hello_world_test_model-openai-chat-concurrency1", + ( + "Model name 'hello/world/test_model' cannot be used to create " + "artifact directory. 
Instead, 'hello_world_test_model' will be used" + ), + ), + ], + ) + def test_model_name_artifact_path( + self, monkeypatch, arg, expected_path, expected_output, capsys + ): + logging.init_logging() + combined_args = ["genai-perf"] + arg + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + assert args.artifact_dir == Path(expected_path) + captured = capsys.readouterr() + assert expected_output in captured.out + + def test_default_load_level(self, monkeypatch, capsys): + logging.init_logging() + monkeypatch.setattr("sys.argv", ["genai-perf", "--model", "test_model"]) + args, _ = parser.parse_args() + assert args.concurrency == 1 + captured = capsys.readouterr() + assert captured.out == "" + + def test_load_level_mutually_exclusive(self, monkeypatch, capsys): + monkeypatch.setattr( + "sys.argv", ["genai-perf", "--concurrency", "3", "--request-rate", "9.0"] + ) + expected_output = ( + "argument --request-rate: not allowed with argument --concurrency" + ) + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + def test_model_not_provided(self, monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["genai-perf"]) + expected_output = "The -m/--model option is required and cannot be empty." + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + def test_pass_through_args(self, monkeypatch): + args = ["genai-perf", "-m", "test_model"] + other_args = ["--", "With", "great", "power"] + monkeypatch.setattr("sys.argv", args + other_args) + _, pass_through_args = parser.parse_args() + + assert pass_through_args == other_args[1:] + + def test_unrecognized_arg(self, monkeypatch, capsys): + monkeypatch.setattr( + "sys.argv", + [ + "genai-perf", + "-m", + "nonexistent_model", + "--wrong-arg", + ], + ) + expected_output = "unrecognized arguments: --wrong-arg" + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + @pytest.mark.parametrize( + "args, expected_output", + [ + ( + ["genai-perf", "-m", "test_model", "--service-kind", "openai"], + "The --endpoint-type option is required when using the 'openai' service-kind.", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint", + "custom/address", + ], + "The --endpoint-type option is required when using the 'openai' service-kind.", + ), + ( + ["genai-perf", "-m", "test_model", "--output-tokens-stddev", "5"], + "The --output-tokens-mean option is required when using --output-tokens-stddev.", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--output-tokens-mean-deterministic", + ], + "The --output-tokens-mean option is required when using --output-tokens-mean-deterministic.", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--output-tokens-mean-deterministic", + ], + "The --output-tokens-mean option is required when using --output-tokens-mean-deterministic.", + ), + ( + [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + "--endpoint-type", + "chat", + "--output-tokens-mean", + "100", + "--output-tokens-mean-deterministic", + ], + "The --output-tokens-mean-deterministic option is only supported with the Triton service-kind", + ), + ], + ) + def test_conditional_errors(self, 
args, expected_output, monkeypatch, capsys): + monkeypatch.setattr("sys.argv", args) + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + @pytest.mark.parametrize( + "args, expected_format", + [ + ( + ["--service-kind", "openai", "--endpoint-type", "chat"], + OutputFormat.OPENAI_CHAT_COMPLETIONS, + ), + ( + ["--service-kind", "openai", "--endpoint-type", "completions"], + OutputFormat.OPENAI_COMPLETIONS, + ), + ( + [ + "--service-kind", + "openai", + "--endpoint-type", + "completions", + "--endpoint", + "custom/address", + ], + OutputFormat.OPENAI_COMPLETIONS, + ), + ( + ["--service-kind", "triton", "--backend", "tensorrtllm"], + OutputFormat.TENSORRTLLM, + ), + (["--service-kind", "triton", "--backend", "vllm"], OutputFormat.VLLM), + ], + ) + def test_inferred_output_format(self, monkeypatch, args, expected_format): + monkeypatch.setattr("sys.argv", ["genai-perf", "-m", "test_model"] + args) + + parsed_args, _ = parser.parse_args() + assert parsed_args.output_format == expected_format + + @pytest.mark.parametrize( + "args, expected_error", + [ + ( + ["--extra-inputs", "hi:"], + "Input name or value is empty in --extra-inputs: hi:\nExpected input format: 'input_name:value'", + ), + ( + ["--extra-inputs", ":a"], + "Input name or value is empty in --extra-inputs: :a\nExpected input format: 'input_name:value'", + ), + ( + ["--extra-inputs", ":a:"], + "Invalid input format for --extra-inputs: :a:\nExpected input format: 'input_name:value'", + ), + ( + ["--extra-inputs", "unknown"], + "Invalid input format for --extra-inputs: unknown\nExpected input format: 'input_name:value'", + ), + ( + ["--extra-inputs", "test_key:5", "--extra-inputs", "test_key:6"], + "Input name already exists in request_inputs dictionary: test_key", + ), + ], + ) + def test_repeated_extra_arg_warning(self, monkeypatch, args, expected_error): + combined_args = ["genai-perf", "-m", "test_model"] + args + monkeypatch.setattr("sys.argv", combined_args) + + parsed_args, _ = parser.parse_args() + + with pytest.raises(ValueError) as exc_info: + _ = parser.get_extra_inputs_as_dict(parsed_args) + + assert str(exc_info.value) == expected_error + + @pytest.mark.parametrize( + "args, expected_prompt_source", + [ + ([], PromptSource.SYNTHETIC), + (["--input-dataset", "openorca"], PromptSource.DATASET), + (["--input-file", "prompt.txt"], PromptSource.FILE), + ( + ["--input-file", "prompt.txt", "--synthetic-input-tokens-mean", "10"], + PromptSource.FILE, + ), + ], + ) + def test_inferred_prompt_source( + self, monkeypatch, mocker, args, expected_prompt_source + ): + _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) + combined_args = ["genai-perf", "--model", "test_model"] + args + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + assert args.prompt_source == expected_prompt_source + + def test_prompt_source_assertions(self, monkeypatch, mocker, capsys): + _ = mocker.patch("builtins.open", mocker.mock_open(read_data="data")) + args = [ + "genai-perf", + "--model", + "test_model", + "--input-dataset", + "openorca", + "--input-file", + "prompt.txt", + ] + monkeypatch.setattr("sys.argv", args) + + expected_output = ( + "argument --input-file: not allowed with argument --input-dataset" + ) + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in 
captured.err + + # ================================================ + # COMPARE SUBCOMMAND + # ================================================ + expected_compare_help_output = ( + "Subcommand to generate plots that compare multiple profile runs." + ) + + @pytest.mark.parametrize( + "args, expected_output", + [ + (["-h"], expected_compare_help_output), + (["--help"], expected_compare_help_output), + ], + ) + def test_compare_help_arguments_output_and_exit( + self, monkeypatch, args, expected_output, capsys + ): + logging.init_logging() + monkeypatch.setattr("sys.argv", ["genai-perf", "compare"] + args) + + with pytest.raises(SystemExit) as excinfo: + _ = parser.parse_args() + + # Check that the exit was successful + assert excinfo.value.code == 0 + + # Capture that the correct message was displayed + captured = capsys.readouterr() + assert expected_output in captured.out + + def test_compare_mutually_exclusive(self, monkeypatch, capsys): + args = ["genai-perf", "compare", "--config", "hello", "--files", "a", "b", "c"] + monkeypatch.setattr("sys.argv", args) + expected_output = "argument -f/--files: not allowed with argument --config" + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + def test_compare_not_provided(self, monkeypatch, capsys): + args = ["genai-perf", "compare"] + monkeypatch.setattr("sys.argv", args) + expected_output = "Either the --config or --files option must be specified." + + with pytest.raises(SystemExit) as excinfo: + parser.parse_args() + + assert excinfo.value.code != 0 + captured = capsys.readouterr() + assert expected_output in captured.err + + @pytest.mark.parametrize( + "args, expected_model", + [ + (["--files", "profile1.json", "profile2.json", "profile3.json"], None), + (["--config", "config.yaml"], None), + ], + ) + def test_compare_model_arg(self, monkeypatch, args, expected_model): + combined_args = ["genai-perf", "compare"] + args + monkeypatch.setattr("sys.argv", combined_args) + args, _ = parser.parse_args() + + assert args.model == expected_model + + @pytest.mark.parametrize( + "extra_inputs_list, expected_dict", + [ + (["test_key:test_value"], {"test_key": "test_value"}), + ( + ["test_key:1", "another_test_key:2"], + {"test_key": 1, "another_test_key": 2}, + ), + ( + [ + '{"name": "Wolverine","hobbies": ["hacking", "slashing"],"address": {"street": "1407 Graymalkin Lane, Salem Center","city": "NY"}}' + ], + { + "name": "Wolverine", + "hobbies": ["hacking", "slashing"], + "address": { + "street": "1407 Graymalkin Lane, Salem Center", + "city": "NY", + }, + }, + ), + ], + ) + def test_get_extra_inputs_as_dict(self, extra_inputs_list, expected_dict): + namespace = argparse.Namespace() + namespace.extra_inputs = extra_inputs_list + actual_dict = parser.get_extra_inputs_as_dict(namespace) + assert actual_dict == expected_dict diff --git a/genai-perf/tests/test_console_exporter.py b/genai-perf/tests/test_console_exporter.py new file mode 100644 index 00000000..2bf41441 --- /dev/null +++ b/genai-perf/tests/test_console_exporter.py @@ -0,0 +1,175 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from genai_perf.export_data.console_exporter import ConsoleExporter +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.llm_metrics import LLMMetrics, Statistics + + +class TestConsoleExporter: + + def test_pretty_print_output(self, capsys) -> None: + config = ExporterConfig() + config.stats = stats + exporter = ConsoleExporter(config) + exporter.export() + + expected_content = ( + " LLM Metrics \n" + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" + "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" + "│ Time to first token (ms) │ 2.00 │ 2.00 │ 3.00 │ 2.99 │ 2.90 │ 2.75 │\n" + "│ Inter token latency (ms) │ 0.50 │ 0.00 │ 1.00 │ 0.99 │ 0.90 │ 0.75 │\n" + "│ Request latency (ms) │ 3.00 │ 3.00 │ 4.00 │ 3.99 │ 3.90 │ 3.75 │\n" + "│ Output sequence length │ 6.50 │ 6.00 │ 7.00 │ 6.99 │ 6.90 │ 6.75 │\n" + "│ Input sequence length │ 7.50 │ 7.00 │ 8.00 │ 7.99 │ 7.90 │ 7.75 │\n" + "└──────────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" + "Output token throughput (per sec): 123.00\n" + "Request throughput (per sec): 456.00\n" + ) + + returned_data = capsys.readouterr().out + + assert returned_data == expected_content + + def test_nonstreaming_llm_output(self, capsys) -> None: + metrics = LLMMetrics( + request_throughputs=[123], + request_latencies=[4, 5, 6], + time_to_first_tokens=[4, 5, 6], # same as request_latency + inter_token_latencies=[], # no ITL + output_token_throughputs=[456], + output_sequence_lengths=[1, 2, 3], + input_sequence_lengths=[5, 6, 7], + ) + stats = Statistics(metrics=metrics) + + config = ExporterConfig() + config.stats = stats.stats_dict + exporter = ConsoleExporter(config) + exporter.export() + + # No TTFT and ITL in the output + expected_content = ( + " LLM Metrics \n" + "┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━┓\n" + "┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃\n" + "┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━┩\n" + "│ Request latency (ms) │ 5.00 │ 4.00 │ 6.00 │ 5.98 │ 5.80 │ 5.50 │\n" + "│ Output sequence length │ 2.00 │ 1.00 │ 3.00 │ 2.98 │ 2.80 │ 2.50 │\n" + "│ Input sequence length │ 6.00 │ 5.00 │ 7.00 │ 6.98 │ 6.80 │ 6.50 │\n" + 
"└────────────────────────┴──────┴──────┴──────┴──────┴──────┴──────┘\n" + "Output token throughput (per sec): 456.00\n" + "Request throughput (per sec): 123.00\n" + ) + + returned_data = capsys.readouterr().out + assert returned_data == expected_content + + +stats = { + "request_throughput": {"unit": "requests/sec", "avg": 456.0}, + "request_latency": { + "unit": "ms", + "avg": 3.0, + "p99": 3.99, + "p95": 3.95, + "p90": 3.90, + "p75": 3.75, + "p50": 3.50, + "p25": 3.25, + "max": 4.0, + "min": 3.0, + "std": 3.50, + }, + "time_to_first_token": { + "unit": "ms", + "avg": 2.0, + "p99": 2.99, + "p95": 2.95, + "p90": 2.90, + "p75": 2.75, + "p50": 2.50, + "p25": 2.25, + "max": 3.00, + "min": 2.00, + "std": 2.50, + }, + "inter_token_latency": { + "unit": "ms", + "avg": 0.50, + "p99": 0.99, + "p95": 0.95, + "p90": 0.90, + "p75": 0.75, + "p50": 0.50, + "p25": 0.25, + "max": 1.00, + "min": 0.00, + "std": 0.50, + }, + "output_token_throughput": {"unit": "tokens/sec", "avg": 123.0}, + "output_token_throughput_per_request": { + "unit": "tokens/sec", + "avg": 300.00, + "p99": 300.00, + "p95": 300.00, + "p90": 300.00, + "p75": 300.00, + "p50": 300.00, + "p25": 300.00, + "max": 300.00, + "min": 300.00, + "std": 300.00, + }, + "output_sequence_length": { + "unit": "tokens", + "avg": 6.5, + "p99": 6.99, + "p95": 6.95, + "p90": 6.90, + "p75": 6.75, + "p50": 6.5, + "p25": 6.25, + "max": 7.0, + "min": 6.0, + "std": 6.5, + }, + "input_sequence_length": { + "unit": "tokens", + "avg": 7.5, + "p99": 7.99, + "p95": 7.95, + "p90": 7.90, + "p75": 7.75, + "p50": 7.5, + "p25": 7.25, + "max": 8.0, + "min": 7.0, + "std": 7.5, + }, +} diff --git a/genai-perf/tests/test_csv_exporter.py b/genai-perf/tests/test_csv_exporter.py new file mode 100644 index 00000000..5372612e --- /dev/null +++ b/genai-perf/tests/test_csv_exporter.py @@ -0,0 +1,167 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +from io import StringIO +from pathlib import Path +from typing import Any, List + +import pytest +from genai_perf.export_data.csv_exporter import CsvExporter +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.llm_metrics import LLMProfileDataParser +from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer + + +class TestCsvExporter: + @pytest.fixture + def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: + """ + This function will mock the open function for specific files. + """ + + written_data = [] + + original_open = open + + def custom_open(filename, *args, **kwargs): + def write(self: Any, content: str) -> int: + written_data.append(content) + return len(content) + + if str(filename) == "triton_profile_export.json": + tmp_file = StringIO(json.dumps(triton_profile_data)) + return tmp_file + elif str(filename) == "profile_export_genai_perf.csv": + tmp_file = StringIO() + tmp_file.write = write.__get__(tmp_file) + return tmp_file + else: + return original_open(filename, *args, **kwargs) + + monkeypatch.setattr("builtins.open", custom_open) + + return written_data + + def test_csv_output(self, mock_read_write: pytest.MonkeyPatch) -> None: + """ + Collect LLM metrics from profile export data and confirm correct values are + printed in csv. + """ + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + stat = pd.get_statistics(infer_mode="concurrency", load_level="10") + + expected_content = [ + "Metric,avg,min,max,p99,p95,p90,p75,p50,p25\r\n", + "Time To First Token (ms),2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00,2.00\r\n", + "Inter Token Latency (ms),1.50,1.00,2.00,1.99,1.95,1.90,1.75,1.50,1.25\r\n", + "Request Latency (ms),8.00,7.00,9.00,8.98,8.90,8.80,8.50,8.00,7.50\r\n", + "Output Sequence Length,4.50,3.00,6.00,5.97,5.85,5.70,5.25,4.50,3.75\r\n", + "Input Sequence Length,3.50,3.00,4.00,3.99,3.95,3.90,3.75,3.50,3.25\r\n", + "\r\n", + "Metric,Value\r\n", + "Output Token Throughput (per sec),900000000.00\r\n", + "Request Throughput (per sec),200000000.00\r\n", + ] + config = ExporterConfig() + config.stats = stat.stats_dict + config.artifact_dir = Path(".") + exporter = CsvExporter(config) + exporter.export() + + returned_data = mock_read_write + + assert returned_data == expected_content + + +triton_profile_data = { + "service_kind": "triton", + "endpoint": "", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " like"}, + {"text_output": " dogs"}, + ], + }, + { + "timestamp": 2, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [4, 7, 11], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " don't"}, + {"text_output": " cook food"}, + ], + }, + ], + }, + { + "experiment": { + "mode": "request_rate", + "value": 2.0, + }, + "requests": [ + { + "timestamp": 5, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [7, 8, 13, 18], + "response_outputs": [ + {"text_output": "cat"}, + {"text_output": " is"}, + {"text_output": " cool"}, + {"text_output": " too"}, + ], + }, + { + "timestamp": 3, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [6, 8, 11], + "response_outputs": [ + {"text_output": "it's"}, 
+ {"text_output": " very"}, + {"text_output": " simple work"}, + ], + }, + ], + }, + ], +} diff --git a/genai-perf/tests/test_data_exporter_factory.py b/genai-perf/tests/test_data_exporter_factory.py new file mode 100644 index 00000000..1a1628ac --- /dev/null +++ b/genai-perf/tests/test_data_exporter_factory.py @@ -0,0 +1,83 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +from argparse import Namespace + +import genai_perf.export_data.data_exporter_factory as factory +from genai_perf.export_data.console_exporter import ConsoleExporter +from genai_perf.export_data.csv_exporter import CsvExporter +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.export_data.json_exporter import JsonExporter +from genai_perf.parser import get_extra_inputs_as_dict + + +class TestOutputReporter: + stats = { + "request_latency": { + "unit": "ms", + "avg": 1, + "p99": 2, + "p95": 3, + "p90": 4, + "p75": 5, + "p50": 6, + "p25": 7, + "max": 8, + "min": 9, + "std": 0, + }, + } + args = { + "model": ["gpt2_vllm"], + "formatted_model_name": "gpt2_vllm", + "model_selection_strategy": "round_robin", + "func": "Should_be_removed", + "output_format": "Should_be_removed", + "profile_export_file": ".", + "artifact_dir": ".", + "extra_inputs": ["max_tokens:200"], + } + args_namespace = Namespace(**args) + + config = ExporterConfig() + config.stats = stats + config.args = args_namespace + config.artifact_dir = args_namespace.artifact_dir + config.extra_inputs = get_extra_inputs_as_dict(args_namespace) + f = factory.DataExporterFactory() + + def test_return_json_exporter(self) -> None: + exporter_list = self.f.create_data_exporters(self.config) + assert any(isinstance(exporter, JsonExporter) for exporter in exporter_list) + + def test_return_csv_exporter(self) -> None: + exporter_list = self.f.create_data_exporters(self.config) + assert any(isinstance(exporter, CsvExporter) for exporter in exporter_list) + + def test_return_console_exporter(self) -> None: + exporter_list = self.f.create_data_exporters(self.config) + assert any(isinstance(exporter, ConsoleExporter) for exporter in exporter_list) diff --git a/genai-perf/tests/test_json_exporter.py b/genai-perf/tests/test_json_exporter.py new file mode 100644 index 00000000..c59c688e --- /dev/null +++ b/genai-perf/tests/test_json_exporter.py @@ -0,0 +1,268 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
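+
+# A minimal sketch of the flow exercised below, assuming the pieces the test
+# uses (parser.parse_args() for the CLI namespace, get_extra_inputs_as_dict()
+# for --extra-inputs, and a JsonExporter that merges the stats with an
+# "input_config" section):
+#
+#   args, _ = parser.parse_args()
+#   config = ExporterConfig()
+#   config.stats = stats_dict
+#   config.args = args
+#   config.extra_inputs = parser.get_extra_inputs_as_dict(args)
+#   config.artifact_dir = args.artifact_dir
+#   exporter = JsonExporter(config)   # merged dict available as _stats_and_args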
+ +import json + +import genai_perf.parser as parser +from genai_perf.export_data.exporter_config import ExporterConfig +from genai_perf.export_data.json_exporter import JsonExporter + + +class TestJsonExporter: + def test_generate_json(self, monkeypatch) -> None: + cli_cmd = [ + "genai-perf", + "-m", + "gpt2_vllm", + "--backend", + "vllm", + "--streaming", + "--extra-inputs", + "max_tokens:256", + "--extra-inputs", + "ignore_eos:true", + ] + monkeypatch.setattr("sys.argv", cli_cmd) + args, _ = parser.parse_args() + config = ExporterConfig() + config.stats = self.stats + config.args = args + config.extra_inputs = parser.get_extra_inputs_as_dict(args) + config.artifact_dir = args.artifact_dir + json_exporter = JsonExporter(config) + assert json_exporter._stats_and_args == json.loads(self.expected_json_output) + + stats = { + "request_throughput": {"unit": "requests/sec", "avg": "7"}, + "request_latency": { + "unit": "ms", + "avg": 1, + "p99": 2, + "p95": 3, + "p90": 4, + "p75": 5, + "p50": 6, + "p25": 7, + "max": 8, + "min": 9, + "std": 0, + }, + "time_to_first_token": { + "unit": "ms", + "avg": 11, + "p99": 12, + "p95": 13, + "p90": 14, + "p75": 15, + "p50": 16, + "p25": 17, + "max": 18, + "min": 19, + "std": 10, + }, + "inter_token_latency": { + "unit": "ms", + "avg": 21, + "p99": 22, + "p95": 23, + "p90": 24, + "p75": 25, + "p50": 26, + "p25": 27, + "max": 28, + "min": 29, + "std": 20, + }, + "output_token_throughput": { + "unit": "tokens/sec", + "avg": 31, + }, + "output_token_throughput_per_request": { + "unit": "tokens/sec", + "avg": 41, + "p99": 42, + "p95": 43, + "p90": 44, + "p75": 45, + "p50": 46, + "p25": 47, + "max": 48, + "min": 49, + "std": 40, + }, + "output_sequence_length": { + "unit": "tokens", + "avg": 51, + "p99": 52, + "p95": 53, + "p90": 54, + "p75": 55, + "p50": 56, + "p25": 57, + "max": 58, + "min": 59, + "std": 50, + }, + "input_sequence_length": { + "unit": "tokens", + "avg": 61, + "p99": 62, + "p95": 63, + "p90": 64, + "p75": 65, + "p50": 66, + "p25": 67, + "max": 68, + "min": 69, + "std": 60, + }, + } + + expected_json_output = """ + { + "request_throughput": { + "unit": "requests/sec", + "avg": "7" + }, + "request_latency": { + "unit": "ms", + "avg": 1, + "p99": 2, + "p95": 3, + "p90": 4, + "p75": 5, + "p50": 6, + "p25": 7, + "max": 8, + "min": 9, + "std": 0 + }, + "time_to_first_token": { + "unit": "ms", + "avg": 11, + "p99": 12, + "p95": 13, + "p90": 14, + "p75": 15, + "p50": 16, + "p25": 17, + "max": 18, + "min": 19, + "std": 10 + }, + "inter_token_latency": { + "unit": "ms", + "avg": 21, + "p99": 22, + "p95": 23, + "p90": 24, + "p75": 25, + "p50": 26, + "p25": 27, + "max": 28, + "min": 29, + "std": 20 + }, + "output_token_throughput": { + "unit": "tokens/sec", + "avg": 31 + }, + "output_token_throughput_per_request": { + "unit": "tokens/sec", + "avg": 41, + "p99": 42, + "p95": 43, + "p90": 44, + "p75": 45, + "p50": 46, + "p25": 47, + "max": 48, + "min": 49, + "std": 40 + }, + "output_sequence_length": { + "unit": "tokens", + "avg": 51, + "p99": 52, + "p95": 53, + "p90": 54, + "p75": 55, + "p50": 56, + "p25": 57, + "max": 58, + "min": 59, + "std": 50 + }, + "input_sequence_length": { + "unit": "tokens", + "avg": 61, + "p99": 62, + "p95": 63, + "p90": 64, + "p75": 65, + "p50": 66, + "p25": 67, + "max": 68, + "min": 69, + "std": 60 + }, + "input_config": { + "model": ["gpt2_vllm"], + "formatted_model_name": "gpt2_vllm", + "model_selection_strategy": "round_robin", + "backend": "vllm", + "endpoint": null, + "endpoint_type": null, + "service_kind": "triton", + 
"streaming": true, + "u": null, + "input_dataset": null, + "input_file": null, + "num_prompts": 100, + "output_tokens_mean": -1, + "output_tokens_mean_deterministic": false, + "output_tokens_stddev": 0, + "random_seed": 0, + "synthetic_input_tokens_mean": 550, + "synthetic_input_tokens_stddev": 0, + "concurrency": 1, + "measurement_interval": 10000, + "request_rate": null, + "stability_percentage": 999, + "generate_plots": false, + "profile_export_file": "artifacts/gpt2_vllm-triton-vllm-concurrency1/profile_export.json", + "artifact_dir": "artifacts/gpt2_vllm-triton-vllm-concurrency1", + "tokenizer": "hf-internal-testing/llama-tokenizer", + "verbose": false, + "subcommand": null, + "prompt_source": "synthetic", + "extra_inputs": { + "max_tokens": 256, + "ignore_eos": true + } + } + } + """ diff --git a/genai-perf/tests/test_library.py b/genai-perf/tests/test_library.py new file mode 100644 index 00000000..09cd13d4 --- /dev/null +++ b/genai-perf/tests/test_library.py @@ -0,0 +1,32 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import genai_perf + + +# Placeholder to add real tests in the future +def test_version(): + print(genai_perf.__version__) diff --git a/genai-perf/tests/test_llm_inputs.py b/genai-perf/tests/test_llm_inputs.py new file mode 100644 index 00000000..c6351918 --- /dev/null +++ b/genai-perf/tests/test_llm_inputs.py @@ -0,0 +1,762 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import random +import statistics +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +import responses +from genai_perf import tokenizer +from genai_perf.constants import CNN_DAILY_MAIL, DEFAULT_INPUT_DATA_JSON, OPEN_ORCA +from genai_perf.exceptions import GenAIPerfException +from genai_perf.llm_inputs.llm_inputs import ( + LlmInputs, + ModelSelectionStrategy, + OutputFormat, + PromptSource, +) +from genai_perf.tokenizer import Tokenizer + +mocked_openorca_data = { + "features": [ + {"feature_idx": 0, "name": "id", "type": {"dtype": "string", "_type": "Value"}}, + { + "feature_idx": 1, + "name": "system_prompt", + "type": {"dtype": "string", "_type": "Value"}, + }, + { + "feature_idx": 2, + "name": "question", + "type": {"dtype": "string", "_type": "Value"}, + }, + { + "feature_idx": 3, + "name": "response", + "type": {"dtype": "string", "_type": "Value"}, + }, + ], + "rows": [ + { + "row_idx": 0, + "row": { + "id": "niv.242684", + "system_prompt": "", + "question": "You will be given a definition of a task first, then some input of the task.\\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\\n\\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\\nOutput:", + "response": '[\\n ["AFC Ajax (amateurs)", "has ground", "Sportpark De Toekomst"],\\n ["Ajax Youth Academy", "plays at", "Sportpark De Toekomst"]\\n]', + }, + "truncated_cells": [], + } + ], + "num_rows_total": 2914896, + "num_rows_per_page": 100, + "partial": True, +} + +TEST_LENGTH = 1 + + +class TestLlmInputs: + # Define service kind, backend or api, and output format combinations + SERVICE_KIND_BACKEND_ENDPOINT_TYPE_FORMATS = [ + ("triton", "vllm", OutputFormat.VLLM), + ("triton", "tensorrtllm", OutputFormat.TENSORRTLLM), + ("openai", "v1/completions", OutputFormat.OPENAI_COMPLETIONS), + ("openai", "v1/chat/completions", OutputFormat.OPENAI_CHAT_COMPLETIONS), + ] + + @pytest.fixture + def default_configured_url(self): + default_configured_url = LlmInputs._create_configured_url( + LlmInputs.OPEN_ORCA_URL, + LlmInputs.DEFAULT_STARTING_INDEX, + LlmInputs.DEFAULT_LENGTH, + ) + + yield default_configured_url + + # TODO (TMA-1754): Add tests that verify json schemas + @pytest.fixture(scope="class") + def default_tokenizer(self): + yield tokenizer.get_tokenizer(tokenizer.DEFAULT_TOKENIZER) + + def test_input_type_url_no_dataset_name(self): + """ + Test for exception when input type is URL and no dataset name + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_dataset_name_if_input_type_is_url( + input_type=PromptSource.DATASET, dataset_name="" + ) + + def test_input_type_synthetic_no_tokenizer(self): + """ + Test for exception when input type is SYNTHETIC and no tokenizer + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_tokenzier_if_input_type_is_synthetic( + input_type=PromptSource.SYNTHETIC, tokenizer=None # type: ignore + ) + + def test_illegal_starting_index(self): + """ + Test for exceptions when 
illegal values are given for starting index + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_valid_starting_index(starting_index="foo") # type: ignore + + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_valid_starting_index(starting_index=-1) + + def test_illegal_length(self): + """ + Test for exceptions when illegal values are given for length + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_valid_length(length="foo") # type: ignore + + with pytest.raises(GenAIPerfException): + _ = LlmInputs._check_for_valid_length(length=0) + + def test_create_configured_url(self): + """ + Test that we are appending and configuring the URL correctly + """ + expected_configured_url = ( + "http://test-url.com" + + f"&offset={LlmInputs.DEFAULT_STARTING_INDEX}" + + f"&length={LlmInputs.DEFAULT_LENGTH}" + ) + configured_url = LlmInputs._create_configured_url( + "http://test-url.com", + LlmInputs.DEFAULT_STARTING_INDEX, + LlmInputs.DEFAULT_LENGTH, + ) + + assert configured_url == expected_configured_url + + def test_download_dataset_illegal_url(self): + """ + Test for exception when URL is bad + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs._download_dataset( + "https://bad-url.zzz", + ) + + def test_llm_inputs_error_in_server_response(self): + """ + Test for exception when length is out of range + """ + with pytest.raises(GenAIPerfException): + _ = LlmInputs.create_llm_inputs( + input_type=PromptSource.DATASET, + dataset_name=OPEN_ORCA, + output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + starting_index=LlmInputs.DEFAULT_STARTING_INDEX, + length=int(LlmInputs.DEFAULT_LENGTH * 100), + ) + + @responses.activate + def test_llm_inputs_with_defaults(self, default_configured_url): + """ + Test that default options work + """ + responses.add( + responses.GET, + f"{default_configured_url}", + json=mocked_openorca_data, + status=200, + ) + + dataset = LlmInputs._download_dataset( + default_configured_url, + ) + dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( + dataset=dataset + ) + + assert dataset_json is not None + assert len(dataset_json["rows"]) == TEST_LENGTH + + # TODO (TPA-114) Refactor LLM inputs and testing + # def test_llm_inputs_with_non_default_length(self): + # """ + # Test that non-default length works + # """ + # configured_url = LlmInputs._create_configured_url( + # LlmInputs.OPEN_ORCA_URL, + # LlmInputs.DEFAULT_STARTING_INDEX, + # (int(LlmInputs.DEFAULT_LENGTH / 2)), + # ) + # dataset = LlmInputs._download_dataset( + # configured_url, + # ) + # dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( + # dataset=dataset + # ) + + # assert dataset_json is not None + # assert len(dataset_json["rows"]) == LlmInputs.DEFAULT_LENGTH / 2 + + # def test_convert_default_json_to_pa_format(self, default_configured_url): + # """ + # Test that conversion to PA JSON format is correct + # """ + # dataset = LlmInputs._download_dataset( + # default_configured_url, + # ) + # dataset_json = LlmInputs._convert_input_url_dataset_to_generic_json( + # dataset=dataset + # ) + # pa_json = LlmInputs._convert_generic_json_to_output_format( + # output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + # generic_dataset=dataset_json, + # add_model_name=False, + # add_stream=False, + # extra_inputs={}, + # output_tokens_mean=LlmInputs.DEFAULT_OUTPUT_TOKENS_MEAN, + # output_tokens_stddev=LlmInputs.DEFAULT_OUTPUT_TOKENS_STDDEV, + # output_tokens_deterministic=False, + # model_name=["test_model_A"], + # ) + + # 
assert pa_json is not None + # assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + # def test_create_openai_llm_inputs_cnn_dailymail(self): + # """ + # Test CNN_DAILYMAIL can be accessed + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.DATASET, + # dataset_name=CNN_DAILY_MAIL, + # output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + # def test_write_to_file(self): + # """ + # Test that write to file is working correctly + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.DATASET, + # dataset_name=OPEN_ORCA, + # output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + # model_name="open_orca", + # add_model_name=True, + # add_stream=True, + # ) + # try: + # with open(DEFAULT_INPUT_DATA_JSON, "r") as f: + # json_str = f.read() + # finally: + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json == json.loads(json_str) + + # def test_create_openai_to_vllm(self): + # """ + # Test conversion of openai to vllm + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.DATASET, + # output_format=OutputFormat.VLLM, + # dataset_name=OPEN_ORCA, + # add_model_name=False, + # add_stream=True, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + # def test_create_openai_to_completions(self): + # """ + # Test conversion of openai to completions + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.DATASET, + # output_format=OutputFormat.OPENAI_COMPLETIONS, + # dataset_name=OPEN_ORCA, + # add_model_name=False, + # add_stream=True, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + # # NIM legacy completion endpoint only supports string and not + # # array of strings. 
Verify that the prompt is of type string + # # not list + # assert isinstance(pa_json["data"][0]["payload"][0]["prompt"], str) + + # def test_create_openai_to_trtllm(self): + # """ + # Test conversion of openai to trtllm + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.DATASET, + # output_format=OutputFormat.TENSORRTLLM, + # dataset_name=OPEN_ORCA, + # add_model_name=False, + # add_stream=True, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == LlmInputs.DEFAULT_LENGTH + + # def test_random_synthetic_no_stddev(self, default_tokenizer): + # """ + # Test that we can produce an exact number of random synthetic tokens + # """ + # random.seed(1) + + # def _subtest(token_length): + # synthetic_prompt = LlmInputs._create_synthetic_prompt( + # tokenizer=default_tokenizer, + # prompt_tokens_mean=token_length, + # prompt_tokens_stddev=0, + # ) + + # actual_token_length = len(default_tokenizer.encode(synthetic_prompt)) + # assert token_length == actual_token_length + + # # Test all of 500-600 to make sure exact + # for i in range(500, 600): + # _subtest(i) + + # # Test some larger values + # _subtest(1500) + # _subtest(10000) + + # def test_random_synthetic_stddev(self, default_tokenizer): + # """ + # Test that we can produce random synthetic tokens within a requested stddev + # """ + # random.seed(1) + + # def _subtest(num_samples, mean, stddev): + # prompt_tokens = [] + # for _ in range(num_samples): + # prompt = LlmInputs._create_synthetic_prompt( + # tokenizer=default_tokenizer, + # prompt_tokens_mean=mean, + # prompt_tokens_stddev=stddev, + # ) + # prompt_tokens.append(len(default_tokenizer.encode(prompt))) + + # assert statistics.mean(prompt_tokens) == pytest.approx(mean, rel=0.1) + # assert statistics.stdev(prompt_tokens) == pytest.approx(stddev, rel=0.2) + + # _subtest(50, 200, 20) + # _subtest(50, 400, 10) + # _subtest(200, 50, 10) + + # def test_random_seed(self, default_tokenizer): + # """ + # Test that when given the same seed, create_llm_inputs will return the same result, + # and that when given a different seed, it will produce a different result + # """ + + # inputs_seed5_a = LlmInputs.create_llm_inputs( + # tokenizer=default_tokenizer, + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.TENSORRTLLM, + # prompt_tokens_mean=300, + # prompt_tokens_stddev=20, + # num_of_output_prompts=5, + # random_seed=5, + # model_name=["test_model_A"], + # ) + + # inputs_seed5_b = LlmInputs.create_llm_inputs( + # tokenizer=default_tokenizer, + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.TENSORRTLLM, + # prompt_tokens_mean=300, + # prompt_tokens_stddev=20, + # num_of_output_prompts=5, + # random_seed=5, + # model_name=["test_model_A"], + # ) + + # inputs_seed10 = LlmInputs.create_llm_inputs( + # tokenizer=default_tokenizer, + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.TENSORRTLLM, + # prompt_tokens_mean=300, + # prompt_tokens_stddev=20, + # num_of_output_prompts=5, + # random_seed=10, + # model_name=["test_model_A"], + # ) + + # assert inputs_seed5_a == inputs_seed5_b + # assert inputs_seed5_a != inputs_seed10 + + # def test_synthetic_to_vllm(self, default_tokenizer): + # """ + # Test generating synthetic prompts and converting to vllm + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.VLLM, + # num_of_output_prompts=5, + # 
add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == 5 + + # def test_synthetic_to_trtllm(self, default_tokenizer): + # """ + # Test generating synthetic prompts and converting to trtllm + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.TENSORRTLLM, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == 5 + + # def test_synthetic_to_openai_chat_completions(self, default_tokenizer): + # """ + # Test generating synthetic prompts and converting to OpenAI chat completions + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.OPENAI_CHAT_COMPLETIONS, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == 5 + + # def test_synthetic_to_openai_completions(self, default_tokenizer): + # """ + # Test generating synthetic prompts and converting to OpenAI completions + # """ + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.OPENAI_COMPLETIONS, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # model_name=["test_model_A"], + # ) + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + # assert pa_json is not None + # assert len(pa_json["data"]) == 5 + + # @pytest.mark.parametrize( + # "output_format", + # [format[2] for format in SERVICE_KIND_BACKEND_ENDPOINT_TYPE_FORMATS], + # ) + # def test_extra_inputs( + # self, default_tokenizer: Tokenizer, output_format: OutputFormat + # ) -> None: + # input_name = "max_tokens" + # input_value = 5 + # request_inputs = {input_name: input_value} + + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=output_format, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # extra_inputs=request_inputs, + # model_name=["test_model_A"], + # ) + + # assert len(pa_json["data"]) == 5 + + # if ( + # output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS + # or output_format == OutputFormat.OPENAI_COMPLETIONS + # ): + # for entry in pa_json["data"]: + # assert "payload" in entry, "Payload is missing in the request" + # payload = entry["payload"] + # for item in payload: + # assert ( + # input_name in item + # ), f"The input name {input_name} is not present in the request" + # assert ( + # item[input_name] == input_value + # ), f"The value of {input_name} is incorrect" + # elif ( + # output_format == OutputFormat.TENSORRTLLM + # or output_format == OutputFormat.VLLM + # ): + # for entry in pa_json["data"]: + # assert ( + # input_name in entry + # ), f"The {input_name} is not present in the request" + # assert entry[input_name] == [ + # input_value + # ], f"The value of {input_name} is incorrect" + # else: + # assert False, f"Unsupported output format: {output_format}" + + # def test_trtllm_default_max_tokens(self, default_tokenizer: Tokenizer) -> None: + # input_name 
= "max_tokens" + # input_value = 256 + + # pa_json = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=OutputFormat.TENSORRTLLM, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # model_name=["test_model_A"], + # ) + + # assert len(pa_json["data"]) == 5 + # for entry in pa_json["data"]: + # assert ( + # input_name in entry + # ), f"The {input_name} is not present in the request" + # assert entry[input_name] == [ + # input_value + # ], f"The value of {input_name} is incorrect" + + # @pytest.mark.parametrize( + # "output_format", + # [format[2] for format in SERVICE_KIND_BACKEND_ENDPOINT_TYPE_FORMATS], + # ) + # def test_output_tokens_mean(self, output_format, default_tokenizer): + # if ( + # output_format != OutputFormat.VLLM + # and output_format != OutputFormat.TENSORRTLLM + # ): + # return + + # output_tokens_mean = 100 + # output_tokens_stddev = 0 + # for deterministic in [True, False]: + # _ = LlmInputs.create_llm_inputs( + # input_type=PromptSource.SYNTHETIC, + # output_format=output_format, + # num_of_output_prompts=5, + # add_model_name=False, + # add_stream=True, + # tokenizer=default_tokenizer, + # output_tokens_mean=output_tokens_mean, + # output_tokens_stddev=output_tokens_stddev, + # output_tokens_deterministic=deterministic, + # model_name=["test_model_A"], + # ) + + # assert os.path.exists( + # DEFAULT_INPUT_DATA_JSON + # ), "llm_inputs.json file is not created" + + # with open(DEFAULT_INPUT_DATA_JSON, "r") as f: + # llm_inputs_data = json.load(f) + + # for entry in llm_inputs_data["data"]: + # if output_format == OutputFormat.VLLM: + # assert ( + # "sampling_parameters" in entry + # ), "sampling_parameters is missing in llm_inputs.json" + # sampling_parameters = json.loads(entry["sampling_parameters"][0]) + # assert ( + # "max_tokens" in sampling_parameters + # ), "max_tokens parameter is missing in sampling_parameters" + # assert sampling_parameters["max_tokens"] == str( + # output_tokens_mean + # ), "max_tokens parameter is not properly set" + # if deterministic: + # assert ( + # "min_tokens" in sampling_parameters + # ), "min_tokens parameter is missing in sampling_parameters" + # assert sampling_parameters["min_tokens"] == str( + # output_tokens_mean + # ), "min_tokens parameter is not properly set" + # else: + # assert ( + # "min_tokens" not in sampling_parameters + # ), "min_tokens parameter is present in sampling_parameters" + # elif output_format == OutputFormat.TENSORRTLLM: + # assert ( + # "max_tokens" in entry + # ), "max_tokens parameter is missing in llm_inputs.json" + # assert ( + # entry["max_tokens"][0] == output_tokens_mean + # ), "max_tokens parameter is not properly set" + # if deterministic: + # assert ( + # "min_length" in entry + # ), "min_length parameter is missing in llm_inputs.json" + # assert ( + # entry["min_length"][0] == output_tokens_mean + # ), "min_length parameter is not properly set" + # else: + # assert ( + # "min_length" not in entry + # ), "min_length parameter is present in llm_inputs.json" + # else: + # assert False, f"Unsupported output format: {output_format}" + + # os.remove(DEFAULT_INPUT_DATA_JSON) + + def test_get_input_file_without_file_existing(self): + with pytest.raises(FileNotFoundError): + LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) + + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"text_input": "single prompt"}\n', + ) + def 
test_get_input_file_with_single_prompt(self, mock_file, mock_exists): + expected_prompts = ["single prompt"] + dataset = LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) + + assert dataset is not None + assert len(dataset["rows"]) == len(expected_prompts) + for i, prompt in enumerate(expected_prompts): + assert dataset["rows"][i]["row"]["text_input"] == prompt + + @patch("pathlib.Path.exists", return_value=True) + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"text_input": "prompt1"}\n{"text_input": "prompt2"}\n{"text_input": "prompt3"}\n', + ) + def test_get_input_file_with_multiple_prompts(self, mock_file, mock_exists): + expected_prompts = ["prompt1", "prompt2", "prompt3"] + dataset = LlmInputs._get_input_dataset_from_file(Path("prompt.txt")) + + assert dataset is not None + assert len(dataset["rows"]) == len(expected_prompts) + for i, prompt in enumerate(expected_prompts): + assert dataset["rows"][i]["row"]["text_input"] == prompt + + @pytest.mark.parametrize( + "seed, model_name_list, index,model_selection_strategy,expected_model", + [ + ( + 1, + ["test_model_A", "test_model_B", "test_model_C"], + 0, + ModelSelectionStrategy.ROUND_ROBIN, + "test_model_A", + ), + ( + 1, + ["test_model_A", "test_model_B", "test_model_C"], + 1, + ModelSelectionStrategy.ROUND_ROBIN, + "test_model_B", + ), + ( + 1, + ["test_model_A", "test_model_B", "test_model_C"], + 2, + ModelSelectionStrategy.ROUND_ROBIN, + "test_model_C", + ), + ( + 1, + ["test_model_A", "test_model_B", "test_model_C"], + 3, + ModelSelectionStrategy.ROUND_ROBIN, + "test_model_A", + ), + ( + 100, + ["test_model_A", "test_model_B", "test_model_C"], + 0, + ModelSelectionStrategy.RANDOM, + "test_model_A", + ), + ( + 100, + ["test_model_A", "test_model_B", "test_model_C"], + 1, + ModelSelectionStrategy.RANDOM, + "test_model_A", + ), + ( + 1652, + ["test_model_A", "test_model_B", "test_model_C"], + 0, + ModelSelectionStrategy.RANDOM, + "test_model_B", + ), + ( + 95, + ["test_model_A", "test_model_B", "test_model_C"], + 0, + ModelSelectionStrategy.RANDOM, + "test_model_C", + ), + ], + ) + def test_select_model_name( + self, seed, model_name_list, index, model_selection_strategy, expected_model + ): + """ + Test that model selection strategy controls the model selected + """ + random.seed(seed) + + actual_model = LlmInputs._select_model_name( + model_name_list, index, model_selection_strategy + ) + assert actual_model == expected_model diff --git a/genai-perf/tests/test_llm_metrics.py b/genai-perf/tests/test_llm_metrics.py new file mode 100644 index 00000000..d221b759 --- /dev/null +++ b/genai-perf/tests/test_llm_metrics.py @@ -0,0 +1,614 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from io import StringIO +from pathlib import Path +from typing import Any, List, Union + +import numpy as np +import pytest +from genai_perf.llm_metrics import LLMMetrics, LLMProfileDataParser, ResponseFormat +from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer + + +def ns_to_sec(ns: int) -> Union[int, float]: + """Convert from nanosecond to second.""" + return ns / 1e9 + + +class TestLLMProfileDataParser: + @pytest.fixture + def mock_read_write(self, monkeypatch: pytest.MonkeyPatch) -> List[str]: + """ + This function will mock the open function for specific files: + + - For "triton_profile_export.json", it will read and return the + contents of self.triton_profile_data + - For "openai_profile_export.json", it will read and return the + contents of self.openai_profile_data + - For "profile_export.csv", it will capture all data written to + the file, and return it as the return value of this function + - For all other files, it will behave like the normal open function + """ + + written_data = [] + + original_open = open + + def custom_open(filename, *args, **kwargs): + def write(self: Any, content: str) -> int: + written_data.append(content) + return len(content) + + if filename == "triton_profile_export.json": + tmp_file = StringIO(json.dumps(self.triton_profile_data)) + return tmp_file + elif filename == "openai_profile_export.json": + tmp_file = StringIO(json.dumps(self.openai_profile_data)) + return tmp_file + elif filename == "empty_profile_export.json": + tmp_file = StringIO(json.dumps(self.empty_profile_data)) + return tmp_file + elif filename == "profile_export.csv": + tmp_file = StringIO() + tmp_file.write = write.__get__(tmp_file) + return tmp_file + else: + return original_open(filename, *args, **kwargs) + + monkeypatch.setattr("builtins.open", custom_open) + + return written_data + + def test_triton_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. 
+ + Metrics + * time to first tokens + - experiment 1: [3 - 1, 4 - 2] = [2, 2] + - experiment 2: [7 - 5, 6 - 3] = [2, 3] + * inter token latencies + - experiment 1: [((8 - 1) - 2)/(3 - 1), ((11 - 2) - 2)/(6 - 1)] + : [2.5, 1.4] + : [2, 1] # rounded + - experiment 2: [((18 - 5) - 2)/(4 - 1), ((11 - 3) - 3)/(6 - 1)] + : [11/3, 1] + : [4, 1] # rounded + * output token throughputs per request + - experiment 1: [3/(8 - 1), 6/(11 - 2)] = [3/7, 6/9] + - experiment 2: [4/(18 - 5), 6/(11 - 3)] = [4/13, 6/8] + * output token throughputs + - experiment 1: [(3 + 6)/(11 - 1)] = [9/10] + - experiment 2: [(4 + 6)/(18 - 3)] = [2/3] + * output sequence lengths + - experiment 1: [3, 6] + - experiment 2: [4, 6] + * input sequence lengths + - experiment 1: [3, 4] + - experiment 2: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + + # experiment 1 metrics & statistics + stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 2] + assert metrics.inter_token_latencies == [2, 1] + ottpr = [3 / ns_to_sec(7), 6 / ns_to_sec(9)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(10)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [3, 6] + assert metrics.input_sequence_lengths == [3, 4] + + # Disable Pylance warnings for dynamically set attributes due to Statistics + # not having strict attributes listed. + assert stat["time_to_first_token"]["avg"] == 2 # type: ignore + assert stat["inter_token_latency"]["avg"] == 1.5 # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == 2 # type: ignore + assert stat["inter_token_latency"]["p50"] == 1.5 # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == 2 # type: ignore + assert stat["inter_token_latency"]["min"] == 1 # type: ignore + min_ottpr = 3 / ns_to_sec(7) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 3 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == 2 # type: ignore + assert stat["inter_token_latency"]["max"] == 2 # type: ignore + max_ottpr = 6 / ns_to_sec(9) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([2, 2]) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([2, 1]) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert 
stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(10) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # experiment 2 statistics + stat_obj = pd.get_statistics(infer_mode="request_rate", load_level="2.0") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [2, 3] + assert metrics.inter_token_latencies == [4, 1] + ottpr = [4 / ns_to_sec(13), 6 / ns_to_sec(8)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [2 / ns_to_sec(3)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [4, 6] + assert metrics.input_sequence_lengths == [3, 4] + + assert stat["time_to_first_token"]["avg"] == pytest.approx(2.5) # type: ignore + assert stat["inter_token_latency"]["avg"] == pytest.approx(2.5) # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == pytest.approx(2.5) # type: ignore + assert stat["inter_token_latency"]["p50"] == pytest.approx(2.5) # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == pytest.approx(2) # type: ignore + assert stat["inter_token_latency"]["min"] == pytest.approx(1) # type: ignore + min_ottpr = 4 / ns_to_sec(13) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 4 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == pytest.approx(3) # type: ignore + assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore + max_ottpr = 6 / ns_to_sec(8) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([2, 3]) * (1) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([4, 1]) * (1) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat["output_sequence_length"]["std"] == np.std([4, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 2 / ns_to_sec(3) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # check non-existing profile data + with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="30") + + def test_openai_llm_profile_data(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Collect LLM metrics from profile export data and check values. 
+ + Metrics + * time to first tokens + - experiment 1: [5 - 1, 7 - 2] = [4, 5] + * inter token latencies + - experiment 1: [((12 - 1) - 4)/(3 - 1), ((15 - 2) - 5)/(6 - 1)] + : [3.5, 1.6] + : [4, 2] # rounded + * output token throughputs per request + - experiment 1: [3/(12 - 1), 6/(15 - 2)] = [3/11, 6/13] + * output token throughputs + - experiment 1: [(3 + 6)/(15 - 1)] = [9/14] + * output sequence lengths + - experiment 1: [3, 6] + * input sequence lengths + - experiment 1: [3, 4] + """ + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + # experiment 1 statistics + stat_obj = pd.get_statistics(infer_mode="concurrency", load_level="10") + metrics = stat_obj.metrics + stat = stat_obj.stats_dict + assert isinstance(metrics, LLMMetrics) + + assert metrics.time_to_first_tokens == [4, 5] + assert metrics.inter_token_latencies == [4, 2] + ottpr = [3 / ns_to_sec(11), 6 / ns_to_sec(13)] + assert metrics.output_token_throughputs_per_request == pytest.approx(ottpr) + ott = [9 / ns_to_sec(14)] + assert metrics.output_token_throughputs == pytest.approx(ott) + assert metrics.output_sequence_lengths == [3, 6] + assert metrics.input_sequence_lengths == [3, 4] + + assert stat["time_to_first_token"]["avg"] == pytest.approx(4.5) # type: ignore + assert stat["inter_token_latency"]["avg"] == pytest.approx(3) # type: ignore + assert stat["output_token_throughput_per_request"]["avg"] == pytest.approx( # type: ignore + np.mean(ottpr) + ) + assert stat["output_sequence_length"]["avg"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["avg"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["p50"] == pytest.approx(4.5) # type: ignore + assert stat["inter_token_latency"]["p50"] == pytest.approx(3) # type: ignore + assert stat["output_token_throughput_per_request"]["p50"] == pytest.approx( # type: ignore + np.percentile(ottpr, 50) + ) + assert stat["output_sequence_length"]["p50"] == 4.5 # type: ignore + assert stat["input_sequence_length"]["p50"] == 3.5 # type: ignore + + assert stat["time_to_first_token"]["min"] == pytest.approx(4) # type: ignore + assert stat["inter_token_latency"]["min"] == pytest.approx(2) # type: ignore + min_ottpr = 3 / ns_to_sec(11) + assert stat["output_token_throughput_per_request"]["min"] == pytest.approx(min_ottpr) # type: ignore + assert stat["output_sequence_length"]["min"] == 3 # type: ignore + assert stat["input_sequence_length"]["min"] == 3 # type: ignore + + assert stat["time_to_first_token"]["max"] == pytest.approx(5) # type: ignore + assert stat["inter_token_latency"]["max"] == pytest.approx(4) # type: ignore + max_ottpr = 6 / ns_to_sec(13) + assert stat["output_token_throughput_per_request"]["max"] == pytest.approx(max_ottpr) # type: ignore + assert stat["output_sequence_length"]["max"] == 6 # type: ignore + assert stat["input_sequence_length"]["max"] == 4 # type: ignore + + assert stat["time_to_first_token"]["std"] == np.std([4, 5]) * (1) # type: ignore + assert stat["inter_token_latency"]["std"] == np.std([4, 2]) * (1) # type: ignore + assert stat["output_token_throughput_per_request"]["std"] == pytest.approx( # type: ignore + np.std(ottpr) + ) + assert stat["output_sequence_length"]["std"] == np.std([3, 6]) # type: ignore + assert stat["input_sequence_length"]["std"] == np.std([3, 4]) # type: ignore + + oott = 9 / ns_to_sec(14) + assert stat["output_token_throughput"]["avg"] == pytest.approx(oott) # type: ignore + + # check non-existing profile data + 
with pytest.raises(KeyError): + pd.get_statistics(infer_mode="concurrency", load_level="40") + + def test_merged_sse_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Test merging the multiple sse response.""" + res_timestamps = [0, 1, 2, 3] + res_outputs = [ + { + "response": 'data: {"choices":[{"delta":{"content":"aaa"}}],"object":"chat.completion.chunk"}\n\n' + }, + { + "response": ( + 'data: {"choices":[{"delta":{"content":"abc"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"1234"}}],"object":"chat.completion.chunk"}\n\n' + 'data: {"choices":[{"delta":{"content":"helloworld"}}],"object":"chat.completion.chunk"}\n\n' + ) + }, + {"response": "data: [DONE]\n\n"}, + ] + expected_response = '{"choices": [{"delta": {"content": "abc1234helloworld"}}], "object": "chat.completion.chunk"}' + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + pd._preprocess_response(res_timestamps, res_outputs) + assert res_outputs[1]["response"] == expected_response + + def test_openai_output_token_counts( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + output_texts = [ + "Ad", + "idas", + " Orig", + "inals", + " are", + " now", + " available", + " in", + " more", + " than", + ] + res_outputs = [] + for text in output_texts: + response = f'data: {{"choices":[{{"delta":{{"content":"{text}"}}}}],"object":"chat.completion.chunk"}}\n\n' + res_outputs.append({"response": response}) + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("openai_profile_export.json"), + tokenizer=tokenizer, + ) + + output_token_counts, total_output_token = pd._get_output_token_counts( + res_outputs + ) + assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 + assert total_output_token == 9 + assert total_output_token != sum(output_token_counts) + + def test_triton_output_token_counts( + self, mock_read_write: pytest.MonkeyPatch + ) -> None: + output_texts = [ + "Ad", + "idas", + " Orig", + "inals", + " are", + " now", + " available", + " in", + " more", + " than", + ] + res_outputs = [] + for text in output_texts: + res_outputs.append({"text_output": text}) + + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + pd = LLMProfileDataParser( + filename=Path("triton_profile_export.json"), + tokenizer=tokenizer, + ) + + output_token_counts, total_output_token = pd._get_output_token_counts( + res_outputs + ) + assert output_token_counts == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] # total 10 + assert total_output_token == 9 + assert total_output_token != sum(output_token_counts) + + def test_llm_metrics_get_base_name(self) -> None: + """Test get_base_name method in LLMMetrics class.""" + # initialize with dummy values + metrics = LLMMetrics( + request_throughputs=[10.12, 11.33], + request_latencies=[3, 44], + time_to_first_tokens=[1, 2, 3], + inter_token_latencies=[4, 5], + output_token_throughputs=[22.13, 9423.02], + output_token_throughputs_per_request=[7, 8, 9], + output_sequence_lengths=[3, 4], + input_sequence_lengths=[12, 34], + ) + assert metrics.get_base_name("time_to_first_tokens") == "time_to_first_token" + assert metrics.get_base_name("inter_token_latencies") == "inter_token_latency" + assert ( + metrics.get_base_name("output_token_throughputs_per_request") + == "output_token_throughput_per_request" + ) + assert ( + metrics.get_base_name("output_sequence_lengths") == "output_sequence_length" + ) + assert ( + 
metrics.get_base_name("input_sequence_lengths") == "input_sequence_length" + ) + with pytest.raises(KeyError): + metrics.get_base_name("hello1234") + + def test_empty_response(self, mock_read_write: pytest.MonkeyPatch) -> None: + """Check if it handles all empty responses.""" + tokenizer = get_tokenizer(DEFAULT_TOKENIZER) + + # Should not throw error + _ = LLMProfileDataParser( + filename=Path("empty_profile_export.json"), + tokenizer=tokenizer, + ) + + empty_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":""},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + openai_profile_data = { + "service_kind": "openai", + "endpoint": "v1/chat/completions", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [3, 5, 8, 12, 13, 14], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" like"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":" dogs"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + { + "timestamp": 2, + "request_inputs": { + "payload": '{"messages":[{"role":"user","content":"This is test too"}],"model":"llama-2-7b","stream":true}', + }, + # the first, and the last two responses will be ignored because they have no "content" + "response_timestamps": [4, 7, 11, 15, 18, 19], + "response_outputs": [ + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"I"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: 
{"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"don\'t"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{"content":"cook food"},"finish_reason":null}]}\n\n' + }, + { + "response": 'data: {"id":"abc","object":"chat.completion.chunk","created":123,"model":"llama-2-7b","choices":[{"index":0,"delta":{},"finish_reason":null}]}\n\n' + }, + {"response": "data: [DONE]\n\n"}, + ], + }, + ], + }, + ], + } + + triton_profile_data = { + "service_kind": "triton", + "endpoint": "", + "experiments": [ + { + "experiment": { + "mode": "concurrency", + "value": 10, + }, + "requests": [ + { + "timestamp": 1, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [3, 5, 8], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " like"}, + {"text_output": " dogs"}, + ], + }, + { + "timestamp": 2, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [4, 7, 11], + "response_outputs": [ + {"text_output": "I"}, + {"text_output": " don't"}, + {"text_output": " cook food"}, + ], + }, + ], + }, + { + "experiment": { + "mode": "request_rate", + "value": 2.0, + }, + "requests": [ + { + "timestamp": 5, + "request_inputs": {"text_input": "This is test"}, + "response_timestamps": [7, 8, 13, 18], + "response_outputs": [ + {"text_output": "cat"}, + {"text_output": " is"}, + {"text_output": " cool"}, + {"text_output": " too"}, + ], + }, + { + "timestamp": 3, + "request_inputs": {"text_input": "This is test too"}, + "response_timestamps": [6, 8, 11], + "response_outputs": [ + {"text_output": "it's"}, + {"text_output": " very"}, + {"text_output": " simple work"}, + ], + }, + ], + }, + ], + } diff --git a/genai-perf/tests/test_plot_configs.py b/genai-perf/tests/test_plot_configs.py new file mode 100644 index 00000000..8a1dfee7 --- /dev/null +++ b/genai-perf/tests/test_plot_configs.py @@ -0,0 +1,112 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from pathlib import Path + +# Skip type checking to avoid mypy error +# Issue: https://github.com/python/mypy/issues/10632 +import yaml # type: ignore +from genai_perf.plots.plot_config import PlotType +from genai_perf.plots.plot_config_parser import PlotConfigParser + + +class TestPlotConfigParser: + yaml_config = """ + plot1: + title: TTFT vs ITL + x_metric: time_to_first_tokens + y_metric: inter_token_latencies + x_label: TTFT (ms) + y_label: ITL (ms) + width: 1000 + height: 3000 + type: box + paths: + - run1/concurrency32.json + - run2/concurrency32.json + - run3/concurrency32.json + output: test_output_1 + + plot2: + title: Input Sequence Length vs Output Sequence Length + x_metric: input_sequence_lengths + y_metric: output_sequence_lengths + x_label: Input Sequence Length + y_label: Output Sequence Length + width: 1234 + height: 5678 + type: scatter + paths: + - run4/concurrency1.json + output: test_output_2 + """ + + def test_generate_configs(self, monkeypatch) -> None: + monkeypatch.setattr( + "genai_perf.plots.plot_config_parser.load_yaml", + lambda _: yaml.safe_load(self.yaml_config), + ) + monkeypatch.setattr(PlotConfigParser, "_get_statistics", lambda *_: {}) + monkeypatch.setattr(PlotConfigParser, "_get_metric", lambda *_: [1, 2, 3]) + + config_parser = PlotConfigParser(Path("test_config.yaml")) + plot_configs = config_parser.generate_configs() + + assert len(plot_configs) == 2 + pc1, pc2 = plot_configs + + # plot config 1 + assert pc1.title == "TTFT vs ITL" + assert pc1.x_label == "TTFT (ms)" + assert pc1.y_label == "ITL (ms)" + assert pc1.width == 1000 + assert pc1.height == 3000 + assert pc1.type == PlotType.BOX + assert pc1.output == Path("test_output_1") + + assert len(pc1.data) == 3 # profile run data + prd1, prd2, prd3 = pc1.data + assert prd1.name == "run1/concurrency32" + assert prd2.name == "run2/concurrency32" + assert prd3.name == "run3/concurrency32" + for prd in pc1.data: + assert prd.x_metric == [1, 2, 3] + assert prd.y_metric == [1, 2, 3] + + # plot config 2 + assert pc2.title == "Input Sequence Length vs Output Sequence Length" + assert pc2.x_label == "Input Sequence Length" + assert pc2.y_label == "Output Sequence Length" + assert pc2.width == 1234 + assert pc2.height == 5678 + assert pc2.type == PlotType.SCATTER + assert pc2.output == Path("test_output_2") + + assert len(pc2.data) == 1 # profile run data + prd = pc2.data[0] + assert prd.name == "run4/concurrency1" + assert prd.x_metric == [1, 2, 3] + assert prd.y_metric == [1, 2, 3] diff --git a/genai-perf/tests/test_tokenizer.py b/genai-perf/tests/test_tokenizer.py new file mode 100644 index 00000000..259389dc --- /dev/null +++ b/genai-perf/tests/test_tokenizer.py @@ -0,0 +1,76 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+from genai_perf.exceptions import GenAIPerfException
+from genai_perf.tokenizer import DEFAULT_TOKENIZER, get_tokenizer
+
+
+class TestTokenizer:
+    def test_default_tokenizer(self):
+        tokenizer_model = DEFAULT_TOKENIZER
+        get_tokenizer(tokenizer_model)
+
+    def test_non_default_tokenizer(self):
+        tokenizer_model = "gpt2"
+        get_tokenizer(tokenizer_model)
+
+    def test_bad_tokenizer(self):
+        with pytest.raises(GenAIPerfException):
+            get_tokenizer("bad_tokenizer")
+
+    def test_default_args(self):
+        tokenizer_model = DEFAULT_TOKENIZER
+        tokenizer = get_tokenizer(tokenizer_model)
+
+        # There are 3 special tokens in the default tokenizer
+        #  - <unk>: 0 (unknown)
+        #  - <s>: 1 (beginning of sentence)
+        #  - </s>: 2 (end of sentence)
+        special_tokens = list(tokenizer._tokenizer.added_tokens_encoder.keys())
+        special_token_ids = list(tokenizer._tokenizer.added_tokens_encoder.values())
+
+        # special tokens are disabled by default
+        text = "This is test."
+        tokens = tokenizer(text)["input_ids"]
+        assert all([s not in tokens for s in special_token_ids])
+
+        tokens = tokenizer.encode(text)
+        assert all([s not in tokens for s in special_token_ids])
+
+        output = tokenizer.decode(tokens)
+        assert all([s not in output for s in special_tokens])
+
+        # check when special tokens are enabled
+        text = "This is test."
+        tokens = tokenizer(text, add_special_tokens=True)["input_ids"]
+        assert any([s in tokens for s in special_token_ids])
+
+        tokens = tokenizer.encode(text, add_special_tokens=True)
+        assert any([s in tokens for s in special_token_ids])
+
+        output = tokenizer.decode(tokens, skip_special_tokens=False)
+        assert any([s in output for s in special_tokens])
diff --git a/genai-perf/tests/test_wrapper.py b/genai-perf/tests/test_wrapper.py
new file mode 100644
index 00000000..184a47f1
--- /dev/null
+++ b/genai-perf/tests/test_wrapper.py
@@ -0,0 +1,153 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import subprocess +from unittest.mock import MagicMock, patch + +import pytest +from genai_perf import parser +from genai_perf.constants import DEFAULT_GRPC_URL +from genai_perf.wrapper import Profiler + + +class TestWrapper: + @pytest.mark.parametrize( + "arg", + [ + ([]), + (["-u", "testurl:1000"]), + (["--url", "testurl:1000"]), + ], + ) + def test_url_exactly_once_triton(self, monkeypatch, arg): + args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + monkeypatch.setattr("sys.argv", args) + args, extra_args = parser.parse_args() + cmd = Profiler.build_cmd(args, extra_args) + cmd_string = " ".join(cmd) + + number_of_url_args = cmd_string.count(" -u ") + cmd_string.count(" --url ") + assert number_of_url_args == 1 + + @pytest.mark.parametrize( + "arg, expected_filepath", + [ + ( + [], + "artifacts/test_model-triton-tensorrtllm-concurrency1/profile_export.json", + ), + ( + ["--artifact-dir", "test_dir"], + "test_dir/profile_export.json", + ), + ( + ["--artifact-dir", "test_dir", "--profile-export-file", "test.json"], + "test_dir/test.json", + ), + ], + ) + def test_profile_export_filepath(self, monkeypatch, arg, expected_filepath): + args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + monkeypatch.setattr("sys.argv", args) + args, extra_args = parser.parse_args() + cmd = Profiler.build_cmd(args, extra_args) + cmd_string = " ".join(cmd) + + expected_pattern = f"--profile-export-file {expected_filepath}" + assert expected_pattern in cmd_string + + @pytest.mark.parametrize( + "arg", + [ + (["--backend", "tensorrtllm"]), + (["--backend", "vllm"]), + ], + ) + def test_service_triton(self, monkeypatch, arg): + args = ["genai-perf", "-m", "test_model", "--service-kind", "triton"] + arg + monkeypatch.setattr("sys.argv", args) + args, extra_args = parser.parse_args() + cmd = Profiler.build_cmd(args, extra_args) + cmd_string = " ".join(cmd) + + # Ensure the correct arguments are appended. 
+ assert cmd_string.count(" -i grpc") == 1 + assert cmd_string.count(" --streaming") == 1 + assert cmd_string.count(f"-u {DEFAULT_GRPC_URL}") == 1 + if arg[1] == "tensorrtllm": + assert cmd_string.count("--shape max_tokens:1") == 1 + assert cmd_string.count("--shape text_input:1") == 1 + + @pytest.mark.parametrize( + "arg", + [ + (["--endpoint-type", "completions"]), + (["--endpoint-type", "chat"]), + ], + ) + def test_service_openai(self, monkeypatch, arg): + args = [ + "genai-perf", + "-m", + "test_model", + "--service-kind", + "openai", + ] + arg + monkeypatch.setattr("sys.argv", args) + args, extra_args = parser.parse_args() + cmd = Profiler.build_cmd(args, extra_args) + cmd_string = " ".join(cmd) + + # Ensure the correct arguments are appended. + assert cmd_string.count(" -i http") == 1 + + @patch("genai_perf.wrapper.subprocess.run") + def test_stdout_verbose(self, mock_subprocess_run): + args = MagicMock() + args.model = "test_model" + args.verbose = True + Profiler.run(args=args, extra_args=None) + + # Check that standard output was not redirected. + for call_args in mock_subprocess_run.call_args_list: + _, kwargs = call_args + assert ( + "stdout" not in kwargs or kwargs["stdout"] is None + ), "With the verbose flag, stdout should not be redirected." + + @patch("genai_perf.wrapper.subprocess.run") + def test_stdout_not_verbose(self, mock_subprocess_run): + args = MagicMock() + args.model = "test_model" + args.verbose = False + Profiler.run(args=args, extra_args=None) + + # Check that standard output was redirected. + for call_args in mock_subprocess_run.call_args_list: + _, kwargs = call_args + assert ( + kwargs["stdout"] is subprocess.DEVNULL + ), "When the verbose flag is not passed, stdout should be redirected to /dev/null." diff --git a/ictx_id_tracker.h b/ictx_id_tracker.h new file mode 100644 index 00000000..8d85067e --- /dev/null +++ b/ictx_id_tracker.h @@ -0,0 +1,51 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#pragma once + + +namespace triton { namespace perfanalyzer { + +/// Interface for object that tracks context IDs +/// +class ICtxIdTracker { + public: + // Reset the tracker using the provided input count + // + virtual void Reset(size_t count) = 0; + + // Restore the given ID into the tracker + // + virtual void Restore(size_t id) = 0; + + // Pick and return a Ctx ID + // + virtual size_t Get() = 0; + + // Returns true if there are Ctx IDs available to Get. + virtual bool IsAvailable() = 0; +}; + +}} // namespace triton::perfanalyzer diff --git a/idle_timer.h b/idle_timer.h new file mode 100644 index 00000000..419789ec --- /dev/null +++ b/idle_timer.h @@ -0,0 +1,115 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
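For illustration only, the ICtxIdTracker interface above could be satisfied by a simple FIFO tracker along the lines of the sketch below. The class name ExampleFifoCtxIdTracker and its use of std::queue are assumptions made for this sketch; the concrete trackers added elsewhere in this change (e.g. fifo_ctx_id_tracker.h) may be implemented differently.

// Illustrative sketch only (not part of this diff): one way to satisfy the
// ICtxIdTracker contract with FIFO reuse of context IDs.
#include <cstddef>
#include <queue>
#include <stdexcept>

#include "ictx_id_tracker.h"

namespace triton { namespace perfanalyzer {

class ExampleFifoCtxIdTracker : public ICtxIdTracker {
 public:
  // Make IDs 0..count-1 available, oldest first.
  void Reset(size_t count) override
  {
    ids_ = {};
    for (size_t i = 0; i < count; i++) {
      ids_.push(i);
    }
  }

  // A finished request hands its context ID back for reuse.
  void Restore(size_t id) override { ids_.push(id); }

  // Hand out the next free context ID.
  size_t Get() override
  {
    if (!IsAvailable()) {
      throw std::runtime_error("no context IDs available");
    }
    size_t id = ids_.front();
    ids_.pop();
    return id;
  }

  bool IsAvailable() override { return !ids_.empty(); }

 private:
  std::queue<size_t> ids_;
};

}}  // namespace triton::perfanalyzer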
+#pragma once
+#include <chrono>
+#include <mutex>
+#include <stdexcept>
+
+namespace triton { namespace perfanalyzer {
+
+#ifndef DOCTEST_CONFIG_DISABLE
+class TestLoadManager;
+#endif
+
+
+/// Class to track idle periods of time
+///
+class IdleTimer {
+ public:
+  void Start()
+  {
+    std::lock_guard<std::mutex> lk(mtx_);
+    StartImpl();
+  }
+
+  void Stop()
+  {
+    std::lock_guard<std::mutex> lk(mtx_);
+    StopImpl();
+  }
+
+  /// Reset the time counter, and restart the timer if it is active
+  ///
+  void Reset()
+  {
+    Restart();
+    idle_ns_ = 0;
+  }
+
+  /// Returns the number of nanoseconds this timer has counted as being idle
+  /// If the timer was already active, then it will first stop (and count the
+  /// pending time), and then start back up
+  ///
+  uint64_t GetIdleTime()
+  {
+    Restart();
+    return idle_ns_;
+  }
+
+ private:
+  std::mutex mtx_;
+  uint64_t idle_ns_{0};
+  bool is_idle_{false};
+  std::chrono::steady_clock::time_point start_time_;
+
+  void Restart()
+  {
+    std::lock_guard<std::mutex> lk(mtx_);
+    if (is_idle_) {
+      StopImpl();
+      StartImpl();
+    }
+  }
+
+  void StartImpl()
+  {
+    if (is_idle_) {
+      throw std::runtime_error("Can't start a timer that is already active\n");
+    }
+
+    is_idle_ = true;
+    start_time_ = std::chrono::steady_clock::now();
+  }
+
+  void StopImpl()
+  {
+    if (!is_idle_) {
+      throw std::runtime_error("Can't stop a timer that isn't active\n");
+    }
+
+    is_idle_ = false;
+    auto end = std::chrono::steady_clock::now();
+    auto duration = end - start_time_;
+    idle_ns_ += duration.count();
+  }
+
+
+#ifndef DOCTEST_CONFIG_DISABLE
+  friend TestLoadManager;
+#endif
+};
+
+}} // namespace triton::perfanalyzer
diff --git a/iinfer_data_manager.h b/iinfer_data_manager.h
new file mode 100644
index 00000000..33dd8ac8
--- /dev/null
+++ b/iinfer_data_manager.h
@@ -0,0 +1,63 @@
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
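A possible usage pattern for the IdleTimer above, shown as a sketch rather than code from this change: a worker brackets only its waiting period with Start()/Stop(), so that GetIdleTime() reports time spent not doing useful work. The helper name ExampleWorkerWait and the sleep are illustrative, and the sketch assumes idle_timer.h is on the include path.

// Usage sketch for the IdleTimer above (illustrative, not part of this diff).
#include <chrono>
#include <iostream>
#include <thread>

#include "idle_timer.h"

// Only the waiting portion of a worker loop is bracketed, so GetIdleTime()
// reflects time spent blocked rather than time spent issuing requests.
void ExampleWorkerWait(triton::perfanalyzer::IdleTimer& timer)
{
  timer.Start();
  std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for a wait
  timer.Stop();
}

int main()
{
  triton::perfanalyzer::IdleTimer timer;
  ExampleWorkerWait(timer);
  std::cout << "idle ns: " << timer.GetIdleTime() << "\n";
  timer.Reset();  // clears the counter (and would restart an active timer)
  return 0;
}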
+#pragma once + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "data_loader.h" +#include "infer_data.h" +#include "model_parser.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +/// Interface for classes that manage infer data preparation for inference +/// +class IInferDataManager { + public: + /// Initialize this object. Must be called before any other functions + /// \return cb::Error object indicating success or failure. + virtual cb::Error Init() = 0; + + /// Populate the target InferData object with input and output objects + /// according to the model's shape + /// \param infer_data The target InferData object. + /// \return cb::Error object indicating success or failure. + virtual cb::Error InitInferData(InferData& infer_data) = 0; + + /// Updates the input and expected output data in the target infer_data for an + /// inference request + /// \param thread_id The ID of the calling thread + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + virtual cb::Error UpdateInferData( + size_t thread_id, int stream_index, int step_index, + InferData& infer_data) = 0; +}; + +}} // namespace triton::perfanalyzer diff --git a/infer_context.cc b/infer_context.cc new file mode 100644 index 00000000..aa868eba --- /dev/null +++ b/infer_context.cc @@ -0,0 +1,356 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
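As a rough sketch of how a consumer drives the IInferDataManager interface above (InferContext later in this diff follows the same order), the helper below is illustrative only: PrepareContextData is not a function added by this change, and it assumes the cb::Error type and RETURN_IF_ERROR macro pulled in through iinfer_data_manager.h's includes.

// Sketch of the call order a consumer of IInferDataManager follows
// (illustrative; mirrors how InferContext uses it later in this diff).
#include <memory>

#include "iinfer_data_manager.h"

namespace pa = triton::perfanalyzer;

// PrepareContextData is an illustrative helper, not part of the diff.
cb::Error
PrepareContextData(
    const std::shared_ptr<pa::IInferDataManager>& manager,
    pa::InferData& infer_data, size_t thread_id)
{
  // 1. One-time setup of backing resources (e.g. shared memory, if used).
  RETURN_IF_ERROR(manager->Init());
  // 2. Create the input/output objects for this inference context.
  RETURN_IF_ERROR(manager->InitInferData(infer_data));
  // 3. Before each request, point the context at the next (stream, step) data.
  RETURN_IF_ERROR(manager->UpdateInferData(
      thread_id, 0 /* stream_index */, 0 /* step_index */, infer_data));
  return cb::Error::Success;
}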
+ +#include "infer_context.h" + +namespace triton { namespace perfanalyzer { + +void +InferContext::Init() +{ + thread_stat_->status_ = infer_data_manager_->InitInferData(infer_data_); + if (!thread_stat_->status_.IsOk()) { + return; + } + + if (streaming_) { + // Decoupled models should not collect client side statistics + thread_stat_->status_ = infer_backend_->StartStream( + async_callback_func_, (!parser_->IsDecoupled())); + if (!thread_stat_->status_.IsOk()) { + return; + } + } +} + +void +InferContext::SendInferRequest(bool delayed) +{ + // Update the inputs if required + if (using_json_data_) { + UpdateJsonData(); + } + SendRequest(request_id_++, delayed); +} + +void +InferContext::SendSequenceInferRequest(uint32_t seq_stat_index, bool delayed) +{ + // Need lock to protect the order of dispatch across worker threads. + // This also helps in reporting the realistic latencies. + std::lock_guard guard( + sequence_manager_->GetMutex(seq_stat_index)); + if (!early_exit && execute_) { + sequence_manager_->SetInferSequenceOptions( + seq_stat_index, infer_data_.options_); + + // Update the inputs if required + if (using_json_data_) { + UpdateSeqJsonData(seq_stat_index); + } + + sequence_manager_->DecrementRemainingQueries(seq_stat_index); + + SendRequest( + request_id_++, delayed, + sequence_manager_->GetSequenceID(seq_stat_index)); + } +} + +void +InferContext::CompleteOngoingSequence(uint32_t seq_stat_index) +{ + std::lock_guard guard( + sequence_manager_->GetMutex(seq_stat_index)); + + if (sequence_manager_->GetRemainingQueries(seq_stat_index) != 0) { + sequence_manager_->SetRemainingQueries(seq_stat_index, 1); + sequence_manager_->SetInferSequenceOptions( + seq_stat_index, infer_data_.options_); + + if (using_json_data_) { + UpdateSeqJsonData(seq_stat_index); + } + sequence_manager_->DecrementRemainingQueries(seq_stat_index); + + bool is_delayed = false; + SendRequest( + request_id_++, is_delayed, + sequence_manager_->GetSequenceID(seq_stat_index)); + } +} + +void +InferContext::SendRequest( + const uint64_t request_id, const bool delayed, const uint64_t sequence_id) +{ + if (!thread_stat_->status_.IsOk()) { + return; + } + + thread_stat_->num_sent_requests_++; + + // Parse the request inputs to save in the profile export file + RequestRecord::RequestInput request_inputs{GetInputs()}; + + if (async_) { + uint64_t unique_request_id{(thread_id_ << 48) | ((request_id << 16) >> 16)}; + infer_data_.options_->request_id_ = std::to_string(unique_request_id); + { + std::lock_guard lock(thread_stat_->mu_); + auto it = async_req_map_ + .emplace(infer_data_.options_->request_id_, RequestRecord()) + .first; + it->second.request_inputs_ = {request_inputs}; + it->second.start_time_ = std::chrono::system_clock::now(); + it->second.sequence_end_ = infer_data_.options_->sequence_end_; + it->second.delayed_ = delayed; + it->second.sequence_id_ = sequence_id; + } + + thread_stat_->idle_timer.Start(); + if (streaming_) { + thread_stat_->status_ = infer_backend_->AsyncStreamInfer( + *(infer_data_.options_), infer_data_.valid_inputs_, + infer_data_.outputs_); + } else { + thread_stat_->status_ = infer_backend_->AsyncInfer( + async_callback_func_, *(infer_data_.options_), + infer_data_.valid_inputs_, infer_data_.outputs_); + } + thread_stat_->idle_timer.Stop(); + + total_ongoing_requests_++; + } else { + std::chrono::time_point start_time_sync, + end_time_sync; + thread_stat_->idle_timer.Start(); + start_time_sync = std::chrono::system_clock::now(); + cb::InferResult* results = nullptr; + 
thread_stat_->status_ = infer_backend_->Infer( + &results, *(infer_data_.options_), infer_data_.valid_inputs_, + infer_data_.outputs_); + thread_stat_->idle_timer.Stop(); + RequestRecord::ResponseOutput response_outputs{}; + if (results != nullptr) { + if (thread_stat_->status_.IsOk()) { + response_outputs = GetOutputs(*results); + thread_stat_->status_ = ValidateOutputs(results); + } + delete results; + } + if (!thread_stat_->status_.IsOk()) { + return; + } + end_time_sync = std::chrono::system_clock::now(); + std::vector> + end_time_syncs{end_time_sync}; + { + // Add the request record to thread request records vector with proper + // locking + std::lock_guard lock(thread_stat_->mu_); + auto total = end_time_sync - start_time_sync; + thread_stat_->request_records_.emplace_back(RequestRecord( + start_time_sync, std::move(end_time_syncs), {request_inputs}, + {response_outputs}, infer_data_.options_->sequence_end_, delayed, + sequence_id, false)); + thread_stat_->status_ = + infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_])); + if (!thread_stat_->status_.IsOk()) { + return; + } + } + } +} + +const RequestRecord::RequestInput +InferContext::GetInputs() +{ + RequestRecord::RequestInput input{}; + for (const auto& request_input : infer_data_.valid_inputs_) { + std::string data_type{request_input->Datatype()}; + const uint8_t* buf{nullptr}; + size_t byte_size{0}; + request_input->RawData(&buf, &byte_size); + + // The first 4 bytes of BYTES data is a 32-bit integer to indicate the size + // of the rest of the data (which we already know based on byte_size). It + // should be ignored here, as it isn't part of the actual request + if (data_type == "BYTES" && byte_size >= 4) { + buf += 4; + byte_size -= 4; + } + input.emplace(request_input->Name(), RecordData(buf, byte_size, data_type)); + } + return input; +} + +const RequestRecord::ResponseOutput +InferContext::GetOutputs(const cb::InferResult& infer_result) +{ + RequestRecord::ResponseOutput output{}; + for (const auto& requested_output : infer_data_.outputs_) { + std::string data_type{requested_output->Datatype()}; + const uint8_t* buf{nullptr}; + size_t byte_size{0}; + infer_result.RawData(requested_output->Name(), &buf, &byte_size); + + // The first 4 bytes of BYTES data is a 32-bit integer to indicate the size + // of the rest of the data (which we already know based on byte_size). 
It + // should be ignored here, as it isn't part of the actual response + if (data_type == "BYTES" && byte_size >= 4) { + buf += 4; + byte_size -= 4; + } + output.emplace( + requested_output->Name(), RecordData(buf, byte_size, data_type)); + } + return output; +} + +void +InferContext::UpdateJsonData() +{ + int step_id = (data_step_id_ * batch_size_) % data_loader_->GetTotalSteps(0); + data_step_id_ += GetNumActiveThreads(); + thread_stat_->status_ = + infer_data_manager_->UpdateInferData(thread_id_, 0, step_id, infer_data_); +} + +void +InferContext::UpdateSeqJsonData(size_t seq_stat_index) +{ + const size_t sequence_length{ + sequence_manager_->GetSequenceLength(seq_stat_index)}; + const size_t remaining_queries{ + sequence_manager_->GetRemainingQueries(seq_stat_index)}; + const uint64_t data_stream_id{ + sequence_manager_->GetDataStreamID(seq_stat_index)}; + const size_t total_steps{data_loader_->GetTotalSteps(data_stream_id)}; + int step_id = (sequence_length - remaining_queries) % total_steps; + thread_stat_->status_ = infer_data_manager_->UpdateInferData( + thread_id_, data_stream_id, step_id, infer_data_); +} + +cb::Error +InferContext::ValidateOutputs(const cb::InferResult* result_ptr) +{ + // Validate output if set + if (!infer_data_.expected_outputs_.empty()) { + for (size_t i = 0; i < infer_data_.expected_outputs_.size(); ++i) { + const uint8_t* buf = nullptr; + size_t byte_size = 0; + for (const auto& expected : infer_data_.expected_outputs_[i]) { + // Request output by validation output's name explicitly, rather than + // relying on the array indices being sorted equally in both arrays. + result_ptr->RawData(expected.name, &buf, &byte_size); + if (!expected.is_valid) { + return cb::Error( + "Expected output can't be invalid", pa::GENERIC_ERROR); + } + if (byte_size < expected.batch1_size) { + return cb::Error( + "Output size doesn't match expected size", pa::GENERIC_ERROR); + } else if (memcmp(buf, expected.data_ptr, expected.batch1_size) != 0) { + return cb::Error( + "Output doesn't match expected output", pa::GENERIC_ERROR); + } else { + buf += expected.batch1_size; + byte_size -= expected.batch1_size; + } + } + if (byte_size != 0) { + return cb::Error( + "Output size doesn't match expected size", pa::GENERIC_ERROR); + } + } + } + return cb::Error::Success; +} + +void +InferContext::AsyncCallbackFuncImpl(cb::InferResult* result) +{ + std::shared_ptr result_ptr(result); + bool is_final_response{true}; + if (thread_stat_->cb_status_.IsOk()) { + // Add the request record to thread request records vector with + // proper locking + std::lock_guard lock(thread_stat_->mu_); + thread_stat_->cb_status_ = result_ptr->RequestStatus(); + if (thread_stat_->cb_status_.IsOk()) { + std::string request_id; + thread_stat_->cb_status_ = result_ptr->Id(&request_id); + const auto& it = async_req_map_.find(request_id); + if (it != async_req_map_.end()) { + bool is_null_response{false}; + thread_stat_->cb_status_ = + result_ptr->IsNullResponse(&is_null_response); + if (thread_stat_->cb_status_.IsOk() == false) { + return; + } + it->second.response_timestamps_.push_back( + std::chrono::system_clock::now()); + it->second.response_outputs_.push_back(GetOutputs(*result)); + num_responses_++; + if (is_null_response == true) { + it->second.has_null_last_response_ = true; + } + thread_stat_->cb_status_ = + result_ptr->IsFinalResponse(&is_final_response); + if (thread_stat_->cb_status_.IsOk() == false) { + return; + } + if (is_final_response) { + has_received_final_response_ = is_final_response; + 
thread_stat_->request_records_.emplace_back( + it->second.start_time_, it->second.response_timestamps_, + it->second.request_inputs_, it->second.response_outputs_, + it->second.sequence_end_, it->second.delayed_, + it->second.sequence_id_, it->second.has_null_last_response_); + infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_])); + thread_stat_->cb_status_ = ValidateOutputs(result); + async_req_map_.erase(request_id); + } + } + } + } + + if (worker_callback_) { + worker_callback_(id_); + } + + if (is_final_response) { + total_ongoing_requests_--; + num_responses_ = 0; + + if (async_callback_finalize_func_ != nullptr) { + async_callback_finalize_func_(id_); + } + } +} + +}} // namespace triton::perfanalyzer diff --git a/infer_context.h b/infer_context.h new file mode 100644 index 00000000..7bacb16d --- /dev/null +++ b/infer_context.h @@ -0,0 +1,222 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include +#include +#include +#include + +#include "client_backend/client_backend.h" +#include "data_loader.h" +#include "idle_timer.h" +#include "iinfer_data_manager.h" +#include "infer_data.h" +#include "perf_utils.h" +#include "request_record.h" +#include "sequence_manager.h" + +namespace triton { namespace perfanalyzer { + +// Holds the running status of the thread. +struct ThreadStat { + ThreadStat() {} + + // The status of the worker thread + cb::Error status_; + // The status of the callback thread for async requests + cb::Error cb_status_; + // TODO REFACTOR TMA-1046 -- This should be in the InferContext class + // The statistics of the InferContext + std::vector contexts_stat_; + + // Tracks the amount of time this thread spent sleeping or waiting + IdleTimer idle_timer; + + // A vector of request records + std::vector request_records_; + // A lock to protect thread data + std::mutex mu_; + // The number of sent requests by this thread. 
+ std::atomic num_sent_requests_{0}; +}; + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockInferContext; +#endif + +/// Sends inference requests to the server +class InferContext { + public: + InferContext( + const size_t thread_id, const uint32_t id, const bool async, + const bool streaming, const bool on_sequence_model, + const bool using_json_data, const int32_t batch_size, + std::shared_ptr thread_stat, + std::shared_ptr data_loader, + std::shared_ptr parser, + std::shared_ptr factory, const bool& execute, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : thread_id_(thread_id), id_(id), async_(async), streaming_(streaming), + on_sequence_model_(on_sequence_model), + using_json_data_(using_json_data), batch_size_(batch_size), + thread_stat_(thread_stat), data_loader_(data_loader), parser_(parser), + factory_(factory), data_step_id_(thread_id), execute_(execute), + infer_data_manager_(infer_data_manager), + sequence_manager_(sequence_manager) + { + thread_stat_->status_ = factory_->CreateClientBackend(&infer_backend_); + infer_data_.options_.reset(new cb::InferOptions(parser_->ModelName())); + infer_data_.options_->model_version_ = parser_->ModelVersion(); + infer_data_.options_->model_signature_name_ = parser_->ModelSignatureName(); + + thread_stat_->contexts_stat_.emplace_back(); + } + + InferContext(InferContext&&) = delete; + InferContext(const InferContext&) = delete; + + // Initialize the context. Must be done before any inferences are sent + void Init(); + + // Send a single inference request to the server + void SendInferRequest(bool delayed = false); + + // Send a single sequence inference request to the server + void SendSequenceInferRequest(uint32_t seq_index, bool delayed = false); + + // Finish the active sequence at the given seq_stat_index + void CompleteOngoingSequence(uint32_t seq_stat_index); + + // Returns the total number of async requests that have been sent by this + // object and have not returned + uint GetNumOngoingRequests() { return total_ongoing_requests_; } + + // Returns the number of responses for the current request + uint64_t GetNumResponsesForCurrentRequest() { return num_responses_; } + + // Register a function that will get called after every async request returns + void RegisterAsyncCallbackFinalize(std::function callback) + { + async_callback_finalize_func_ = callback; + } + + void RegisterWorkerCallback(std::function worker_callback) + { + worker_callback_ = worker_callback; + } + + // TODO REFACTOR TMA-1043 this should be in memory class + void SetNumActiveThreads(size_t num_threads) + { + num_active_threads_ = num_threads; + } + + bool HasReceivedFinalResponse() { return has_received_final_response_; } + + protected: + /// A helper function to issue inference request to the server. + /// \param request_id The unique id to be associated with the request. + /// \param delayed Whether the request fell behind its scheduled time. + /// \param sequence_id Sequence ID of the request. Note that the default of + /// `0` means the request is not a sequence. 
+ virtual void SendRequest( + const uint64_t request_id, const bool delayed, + const uint64_t sequence_id = 0); + + /// Update inputs based on custom json data + void UpdateJsonData(); + + /// Update inputs based on custom json data for the given sequence + void UpdateSeqJsonData(size_t seq_stat_index); + + cb::Error ValidateOutputs(const cb::InferResult* result_ptr); + + // Callback function for handling asynchronous requests + void AsyncCallbackFuncImpl(cb::InferResult* result); + + bool async_{false}; + bool streaming_{false}; + const bool on_sequence_model_{false}; + bool using_json_data_{false}; + const int32_t batch_size_{0}; + + std::shared_ptr thread_stat_; + std::shared_ptr data_loader_; + std::shared_ptr parser_; + std::shared_ptr factory_; + std::shared_ptr infer_data_manager_; + + uint64_t request_id_ = 0; + std::map async_req_map_; + std::atomic total_ongoing_requests_{0}; + size_t data_step_id_; + + // Function pointer to the async callback function implementation + std::function async_callback_func_ = std::bind( + &InferContext::AsyncCallbackFuncImpl, this, std::placeholders::_1); + + // Function pointer to registered async callbacks + std::function async_callback_finalize_func_ = nullptr; + + private: + const RequestRecord::RequestInput GetInputs(); + + const RequestRecord::ResponseOutput GetOutputs( + const cb::InferResult& infer_result); + + const uint32_t id_{0}; + const size_t thread_id_{0}; + + size_t GetNumActiveThreads() { return num_active_threads_; } + + size_t num_active_threads_{0}; + + // The backend to communicate with the server + std::unique_ptr infer_backend_; + InferData infer_data_; + + // FIXME: update build to use C++17 instead of C++14. This is a workaround + // since C++14 doesn't have std::optional, but C++17 does. + const bool execute_placeholder_{false}; + std::reference_wrapper execute_{execute_placeholder_}; + + std::shared_ptr sequence_manager_{nullptr}; + uint64_t num_responses_{0}; + std::function worker_callback_{nullptr}; + bool has_received_final_response_{false}; + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockInferContext; + + public: + InferContext() = default; +#endif +}; + +}} // namespace triton::perfanalyzer diff --git a/infer_data.h b/infer_data.h new file mode 100644 index 00000000..abc52bb8 --- /dev/null +++ b/infer_data.h @@ -0,0 +1,64 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
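The 64-bit request IDs built in InferContext::SendRequest above pack the worker thread ID into the upper 16 bits and the per-thread request counter into the lower 48 bits. The standalone sketch below only demonstrates that encoding with arbitrary values; the unpacking shown is for illustration and is not something the analyzer itself does.

// Sketch of the request-id packing used by InferContext::SendRequest above.
#include <cassert>
#include <cstdint>

int main()
{
  uint64_t thread_id = 5;
  uint64_t request_id = 123456;

  // Same expression as in SendRequest: keep only the low 48 bits of the
  // counter, then place the thread id in the upper 16 bits.
  uint64_t unique_request_id = (thread_id << 48) | ((request_id << 16) >> 16);

  // Unpacking, for illustration only -- the analyzer round-trips the value
  // through InferOptions::request_id_ as a string rather than decoding it.
  assert((unique_request_id >> 48) == thread_id);
  assert((unique_request_id & ((1ULL << 48) - 1)) == request_id);
  return 0;
}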
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "client_backend/client_backend.h" +#include "tensor_data.h" + +namespace triton { namespace perfanalyzer { + +/// Holds all the data needed to send an inference request +struct InferData { + ~InferData() + { + for (const auto input : inputs_) { + delete input; + } + for (const auto output : outputs_) { + delete output; + } + } + + // The vector of pointers to InferInput objects for all possible inputs, + // potentially including optional inputs with no provided data. + std::vector inputs_; + // The vector of pointers to InferInput objects to be + // used for inference request. + std::vector valid_inputs_; + // The vector of pointers to InferRequestedOutput objects + // to be used with the inference request. + std::vector outputs_; + // If not empty, the expected output data in the same order as 'outputs_' + // The outer vector is per-output. The inner vector is for batching of each + // output + std::vector> expected_outputs_; + // The InferOptions object holding the details of the + // inference. + std::unique_ptr options_; +}; + + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager.cc b/infer_data_manager.cc new file mode 100644 index 00000000..fe5e9fcd --- /dev/null +++ b/infer_data_manager.cc @@ -0,0 +1,210 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
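The GetInputs()/GetOutputs() helpers in infer_context.cc above skip a 4-byte length prefix when recording BYTES tensors. The sketch below illustrates that layout with made-up data; it is not code from this change.

// Sketch of the BYTES element layout that InferContext::GetInputs() and
// GetOutputs() strip before recording data: a 4-byte length prefix followed
// by the payload (illustrative only).
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  const char payload[] = "hello";
  uint32_t len = sizeof(payload) - 1;  // 5 payload bytes

  // Serialized BYTES element: [4-byte length][payload bytes]
  std::vector<uint8_t> serialized(sizeof(len) + len);
  std::memcpy(serialized.data(), &len, sizeof(len));
  std::memcpy(serialized.data() + sizeof(len), payload, len);

  // What the request record keeps: skip the prefix and shrink the size,
  // matching the "buf += 4; byte_size -= 4;" adjustment above.
  const uint8_t* buf = serialized.data() + 4;
  size_t byte_size = serialized.size() - 4;
  std::cout << std::string(reinterpret_cast<const char*>(buf), byte_size)
            << "\n";
  return 0;
}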
+ +#include "infer_data_manager.h" + +#include + +namespace triton { namespace perfanalyzer { + +cb::Error +InferDataManager::Init() +{ + RETURN_IF_ERROR(CreateAndPopulateInputs()); + return cb::Error::Success; +} + +cb::Error +InferDataManager::CreateAndPopulateInputs() +{ + // All combinations of thread + input + stream + step + // + for (size_t thread_id = 0; thread_id < max_threads_; thread_id++) { + for (const auto& input : *(parser_->Inputs())) { + const std::string& name = input.first; + const ModelTensor& tensor = input.second; + for (int stream_id = 0; + stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) { + for (int step_id = 0; + step_id < (int)data_loader_->GetTotalSteps(stream_id); + step_id += 1) { + RETURN_IF_ERROR(CreateAndPopulateInput( + thread_id, name, tensor, stream_id, step_id)); + } + } + } + } + return cb::Error::Success; +} + +cb::Error +InferDataManager::CreateAndPopulateInput( + const size_t thread_id, const std::string& name, const ModelTensor& tensor, + int stream_id, int step_id) +{ + std::vector input_datas; + size_t count = 0; + + RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas)); + + if (tensor.is_shape_tensor_) { + RETURN_IF_ERROR( + ValidateShapeTensor(tensor, stream_id, step_id, input_datas)); + } + + std::vector shape; + RETURN_IF_ERROR( + data_loader_->GetInputShape(tensor, stream_id, step_id, &shape)); + if (!shape.empty()) { + if ((parser_->MaxBatchSize() != 0) && (!tensor.is_shape_tensor_)) { + shape.insert(shape.begin(), (int64_t)batch_size_); + } + } + + cb::InferInput* input; + RETURN_IF_ERROR( + CreateInferInput(&input, backend_kind_, name, shape, tensor.datatype_)); + + + // Number of missing pieces of data for optional inputs + int missing_data_cnt = 0; + int total_cnt = input_datas.size(); + + for (size_t i = 0; i < total_cnt; i++) { + if (!input_datas[i].is_valid) { + missing_data_cnt++; + } else { + RETURN_IF_ERROR(input->AppendRaw( + input_datas[i].data_ptr, input_datas[i].batch1_size)); + } + } + + // If all optional inputs had data provided, this is a valid input. But if + // some inferences in the batch provided data for an optional input and + // some inferences did not, this is an invalid case and an error is + // thrown. + if (missing_data_cnt == 0) { + inputs_.insert({{thread_id, name, stream_id, step_id}, input}); + } else if (missing_data_cnt > 0 && missing_data_cnt < total_cnt) { + return cb::Error( + "For batch sizes larger than 1, the same set of inputs must be " + "specified for each batch. 
You cannot use different set of " + "optional inputs for each individual batch."); + } + + return cb::Error::Success; +} + +cb::InferInput* +InferDataManager::GetInput( + const size_t thread_id, const std::string& name, int stream_id, int step_id) +{ + auto input = inputs_.find({thread_id, name, stream_id, step_id}); + if (input == inputs_.end()) { + return nullptr; + } else { + return input->second; + } +} + + +cb::Error +InferDataManager::InitInferDataInput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) +{ + std::vector shape; + RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape)); + if (shape.empty() && (backend_kind_ == cb::BackendKind::TRITON)) { + return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR); + } + + if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) { + shape.insert(shape.begin(), (int64_t)batch_size_); + } + + cb::InferInput* infer_input; + RETURN_IF_ERROR(CreateInferInput( + &infer_input, backend_kind_, name, shape, model_tensor.datatype_)); + infer_data.inputs_.push_back(infer_input); + + + TensorData input_data; + RETURN_IF_ERROR(data_loader_->GetInputData(model_tensor, 0, 0, input_data)); + + // Add optional input to request if data was found + if (input_data.is_valid) { + infer_data.valid_inputs_.push_back(infer_input); + } + + if (!shape.empty()) { + size_t max_count = (parser_->MaxBatchSize() == 0) ? 1 : batch_size_; + for (size_t i = 0; i < max_count; ++i) { + RETURN_IF_ERROR( + infer_input->AppendRaw(input_data.data_ptr, input_data.batch1_size)); + } + } + + AddInferDataParameters(infer_data); + + return cb::Error::Success; +} + +cb::Error +InferDataManager::InitInferDataOutput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) +{ + cb::InferRequestedOutput* requested_output; + RETURN_IF_ERROR(cb::InferRequestedOutput::Create( + &requested_output, backend_kind_, name, model_tensor.datatype_)); + infer_data.outputs_.push_back(requested_output); + + return cb::Error::Success; +} + +cb::Error +InferDataManager::UpdateInputs( + const size_t thread_id, const int stream_index, const int step_index, + InferData& infer_data) +{ + // Reset inputs for this inference request + infer_data.valid_inputs_.clear(); + + for (const auto& input : infer_data.inputs_) { + const auto& name = input->Name(); + + cb::InferInput* tmp_input = + GetInput(thread_id, name, stream_index, step_index); + if (tmp_input != nullptr) { + infer_data.valid_inputs_.push_back(tmp_input); + } + } + return cb::Error::Success; +} + + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager.h b/infer_data_manager.h new file mode 100644 index 00000000..ccde8d2f --- /dev/null +++ b/infer_data_manager.h @@ -0,0 +1,96 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "data_loader.h" +#include "infer_data.h" +#include "infer_data_manager_base.h" +#include "model_parser.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +/// Manages infer data to prepare an inference request and the resulting +/// inference output from triton server +class InferDataManager : public InferDataManagerBase { + public: + InferDataManager( + const size_t max_threads, const int32_t batch_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + : max_threads_(max_threads), + InferDataManagerBase( + batch_size, request_parameters, parser, factory, data_loader) + { + } + + /// Initialize this object. Must be called before any other functions + /// \return cb::Error object indicating success or failure. + cb::Error Init() override; + + protected: + const size_t max_threads_{1}; + std::map, cb::InferInput*> inputs_; + + cb::Error CreateAndPopulateInputs(); + cb::Error CreateAndPopulateInput( + const size_t thread_id, const std::string& name, + const ModelTensor& model_tensor, int stream_id, int step_id); + + cb::InferInput* GetInput( + const size_t thread_id, const std::string& name, int stream_id, + int step_id); + + cb::Error InitInferDataInput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) override; + + cb::Error InitInferDataOutput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) override; + + /// Helper function to update the inputs + /// \param thread_id The ID of the calling thread + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + cb::Error UpdateInputs( + const size_t thread_id, const int stream_index, const int step_index, + InferData& infer_data); + +#ifndef DOCTEST_CONFIG_DISABLE + public: + InferDataManager() = default; +#endif +}; + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager_base.cc b/infer_data_manager_base.cc new file mode 100644 index 00000000..9a06f86b --- /dev/null +++ b/infer_data_manager_base.cc @@ -0,0 +1,189 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
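A sketch of the pre-population scheme used by InferDataManager above: CreateAndPopulateInputs() builds one input object per (thread, input name, stream, step) combination during Init(), and UpdateInputs() then only performs lookups per request. The map and key type below are illustrative stand-ins, not the class's actual members.

// Illustrative stand-in for InferDataManager's keyed lookup of prepared
// inputs (not code from this change).
#include <iostream>
#include <map>
#include <string>
#include <tuple>

using ExampleInputKey = std::tuple<size_t, std::string, int, int>;

int main()
{
  // Stand-in for the map of prepared cb::InferInput* objects.
  std::map<ExampleInputKey, std::string> prepared;

  // Populated up front, as CreateAndPopulateInputs() walks every combination.
  prepared[{0, "INPUT0", 0, 0}] = "thread 0, stream 0, step 0";
  prepared[{0, "INPUT0", 0, 1}] = "thread 0, stream 0, step 1";

  // Per request, the equivalent of GetInput(): a lookup, no data copies.
  auto it = prepared.find(ExampleInputKey{0, "INPUT0", 0, 1});
  if (it != prepared.end()) {
    std::cout << it->second << "\n";
  }
  return 0;
}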
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "infer_data_manager_base.h" + +#include + +namespace triton { namespace perfanalyzer { + +cb::Error +InferDataManagerBase::GetInputData( + const std::string& name, const ModelTensor& tensor, int stream_id, + int step_id, std::vector& input_datas) +{ + size_t max_count = tensor.is_shape_tensor_ ? 
1 : batch_size_; + std::vector shape; + std::vector prev_shape; + + for (size_t count = 0; count < max_count; count++) { + int local_step_id = + (step_id + count) % data_loader_->GetTotalSteps(stream_id); + + TensorData input_data; + + RETURN_IF_ERROR( + data_loader_->GetInputShape(tensor, stream_id, local_step_id, &shape)); + if (!shape.empty()) { + if (count == 0) { + prev_shape = shape; + } else { + if (!std::equal(shape.begin(), shape.end(), prev_shape.begin())) { + return cb::Error( + "can not batch tensors with different shapes together " + "(input '" + + name + "' expected shape " + ShapeVecToString(prev_shape) + + " and received " + ShapeVecToString(shape), + pa::GENERIC_ERROR); + } + } + } + + RETURN_IF_ERROR(data_loader_->GetInputData( + tensor, stream_id, local_step_id, input_data)); + + input_datas.push_back(input_data); + } + + return cb::Error::Success; +} + +cb::Error +InferDataManagerBase::ValidateShapeTensor( + const ModelTensor& tensor, int stream_id, int step_id, + const std::vector& input_datas) +{ + // Validate that steps 1 through N are exactly the same as step 0, since step + // 0 is the only one we send for shape tensors + for (size_t count = 1; count < batch_size_; count++) { + int local_step_id = + (step_id + count) % data_loader_->GetTotalSteps(stream_id); + + TensorData input_data; + RETURN_IF_ERROR(data_loader_->GetInputData( + tensor, stream_id, local_step_id, input_data)); + + if (input_data.batch1_size != input_datas.back().batch1_size) { + return cb::Error( + "The shape tensors should be identical in a batch (mismatch " + "in size)", + pa::GENERIC_ERROR); + } + + for (size_t data_idx = 0; data_idx < input_data.batch1_size; data_idx++) { + if (*(input_data.data_ptr + data_idx) != + *(input_datas.back().data_ptr + data_idx)) { + return cb::Error( + "The shape tensors should be identical in a batch " + "(mismatch in content)", + pa::GENERIC_ERROR); + } + } + } + return cb::Error::Success; +} + +cb::Error +InferDataManagerBase::InitInferData(InferData& infer_data) +{ + // Initialize inputs + for (const auto& input : *(parser_->Inputs())) { + RETURN_IF_ERROR(InitInferDataInput(input.first, input.second, infer_data)); + } + + for (const auto& output : *(parser_->Outputs())) { + RETURN_IF_ERROR( + InitInferDataOutput(output.first, output.second, infer_data)); + } + + return cb::Error::Success; +} + +cb::Error +InferDataManagerBase::UpdateInferData( + size_t thread_id, int stream_index, int step_index, InferData& infer_data) +{ + RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index)); + RETURN_IF_ERROR( + UpdateInputs(thread_id, stream_index, step_index, infer_data)); + RETURN_IF_ERROR( + UpdateValidationOutputs(stream_index, step_index, infer_data)); + return cb::Error::Success; +} + +cb::Error +InferDataManagerBase::UpdateValidationOutputs( + int stream_index, int step_index, InferData& infer_data) +{ + RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index)); + + infer_data.expected_outputs_.clear(); + + for (const auto& output : infer_data.outputs_) { + const auto& model_output = (*(parser_->Outputs()))[output->Name()]; + + TensorData output_data; + const int* set_shape_values = nullptr; + int set_shape_value_cnt = 0; + + std::vector outputs; + for (size_t i = 0; i < batch_size_; ++i) { + RETURN_IF_ERROR(data_loader_->GetOutputData( + output->Name(), stream_index, + (step_index + i) % data_loader_->GetTotalSteps(0), output_data)); + if (!output_data.is_valid) { + break; + } + + outputs.emplace_back(output_data); + // Shape 
tensor only need the first batch element + if (model_output.is_shape_tensor_) { + break; + } + } + if (!outputs.empty()) { + infer_data.expected_outputs_.emplace_back(std::move(outputs)); + } + } + return cb::Error::Success; +} + +cb::Error +InferDataManagerBase::CreateInferInput( + cb::InferInput** infer_input, const cb::BackendKind kind, + const std::string& name, const std::vector& dims, + const std::string& datatype) +{ + return cb::InferInput::Create(infer_input, kind, name, dims, datatype); +} + +void +InferDataManagerBase::AddInferDataParameters(InferData& infer_data) +{ + infer_data.options_->request_parameters_ = request_parameters_; +} + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager_base.h b/infer_data_manager_base.h new file mode 100644 index 00000000..d9249906 --- /dev/null +++ b/infer_data_manager_base.h @@ -0,0 +1,152 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "data_loader.h" +#include "iinfer_data_manager.h" +#include "infer_data.h" +#include "model_parser.h" +#include "perf_utils.h" +#include "tensor_data.h" + +namespace triton { namespace perfanalyzer { + +/// Base class for Infer Data managers +/// +class InferDataManagerBase : public IInferDataManager { + public: + InferDataManagerBase( + const int32_t batch_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + : batch_size_(batch_size), request_parameters_(request_parameters), + parser_(parser), factory_(factory), data_loader_(data_loader), + backend_kind_(factory->Kind()) + { + } + + /// Populate the target InferData object with input and output objects + /// according to the model's shape + /// \param infer_data The target InferData object. + /// \return cb::Error object indicating success or failure. 
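+  //
+  // Rough usage sketch (illustrative only; `manager`, `thread_id`,
+  // `stream_index` and `step_index` are placeholder names, not part of this
+  // interface): a worker prepares its InferData once, then refreshes it
+  // before each request it sends.
+  //
+  //   InferData infer_data;
+  //   RETURN_IF_ERROR(manager->InitInferData(infer_data));
+  //   ...
+  //   RETURN_IF_ERROR(manager->UpdateInferData(
+  //       thread_id, stream_index, step_index, infer_data));
+  //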
+ cb::Error InitInferData(InferData& infer_data) override; + + /// Updates the input data to use for inference request + /// \param thread_id The ID of the calling thread + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + cb::Error UpdateInferData( + size_t thread_id, int stream_index, int step_index, + InferData& infer_data) override; + + protected: + size_t batch_size_; + std::shared_ptr parser_; + std::shared_ptr factory_; + std::shared_ptr data_loader_; + std::unique_ptr backend_; + cb::BackendKind backend_kind_; + std::unordered_map request_parameters_; + + /// Gets the input data for the specified input for the specified batch size + /// + /// \param name The name of the input to get data for + /// \param tensor The ModelTensor of the input to get data for + /// \param stream_id The ID of the stream to get data for + /// \param step_id The ID of the step within the stream + /// \param input_datas The returned vector of TensorDatas + /// \return cb::Error object indicating success or failure. + cb::Error GetInputData( + const std::string& name, const ModelTensor& tensor, int stream_id, + int step_id, std::vector& input_datas); + + /// For the case of an input with is_shape_tensor true, validate that + /// it follows all rules, and throw an error if it does not + /// \param tensor The ModelTensor of the input to validate + /// \param stream_id The ID of the stream to validate + /// \param step_id The ID of the step within the stream + /// \param input_datas vector of TensorDatas to validate + /// \return cb::Error object indicating success or failure. + cb::Error ValidateShapeTensor( + const ModelTensor& tensor, int stream_id, int step_id, + const std::vector& input_datas); + + /// Helper function to update the inputs + /// \param thread_id The ID of the calling thread + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + virtual cb::Error UpdateInputs( + const size_t thread_id, const int stream_index, const int step_index, + InferData& infer_data) = 0; + + /// Updates the expected output data to use for inference request. Empty + /// vector will be returned if there is no expected output associated to the + /// step. + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + cb::Error UpdateValidationOutputs( + int stream_index, int step_index, InferData& infer_data); + + /// Creates inference input object + /// \param infer_input Output parameter storing newly created inference input + /// \param kind Backend kind + /// \param name Name of inference input + /// \param dims Shape of inference input + /// \param datatype Data type of inference input + /// \return cb::Error object indicating success or failure. 
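+  /// The base implementation simply forwards to cb::InferInput::Create for
+  /// the configured backend kind; it is virtual so derived or test data
+  /// managers can substitute their own input objects.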
+ virtual cb::Error CreateInferInput( + cb::InferInput** infer_input, const cb::BackendKind kind, + const std::string& name, const std::vector& dims, + const std::string& datatype); + + virtual cb::Error InitInferDataInput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) = 0; + + virtual cb::Error InitInferDataOutput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) = 0; + + void AddInferDataParameters(InferData& infer_data); + +#ifndef DOCTEST_CONFIG_DISABLE + public: + InferDataManagerBase() = default; +#endif +}; + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager_factory.h b/infer_data_manager_factory.h new file mode 100644 index 00000000..6bf24bef --- /dev/null +++ b/infer_data_manager_factory.h @@ -0,0 +1,88 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
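+
+// Hypothetical call-site sketch (the variable names below are placeholders,
+// not symbols defined in this header): the factory hides the choice between
+// the plain and shared-memory infer data managers.
+//
+//   std::shared_ptr<IInferDataManager> infer_data_manager =
+//       InferDataManagerFactory::CreateInferDataManager(
+//           max_threads, batch_size, shared_memory_type, output_shm_size,
+//           request_parameters, parser, factory, data_loader);
+//   RETURN_IF_ERROR(infer_data_manager->Init());
+//
+// SharedMemoryType::NO_SHARED_MEMORY yields an InferDataManager; any other
+// value yields an InferDataManagerShm.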
+#pragma once + +#include "data_loader.h" +#include "iinfer_data_manager.h" +#include "infer_data_manager.h" +#include "infer_data_manager_shm.h" +#include "model_parser.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +class InferDataManagerFactory { + public: + static std::shared_ptr CreateInferDataManager( + const size_t max_threads, const int32_t batch_size, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + { + if (shared_memory_type == SharedMemoryType::NO_SHARED_MEMORY) { + return CreateInferDataManagerNoShm( + max_threads, batch_size, request_parameters, parser, factory, + data_loader); + } else { + return CreateInferDataManagerShm( + batch_size, shared_memory_type, output_shm_size, request_parameters, + parser, factory, data_loader); + } + } + + private: + static std::shared_ptr CreateInferDataManagerNoShm( + const size_t max_threads, const int32_t batch_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + { + return std::make_shared( + max_threads, batch_size, request_parameters, parser, factory, + data_loader); + } + + static std::shared_ptr CreateInferDataManagerShm( + const int32_t batch_size, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + { + return std::make_shared( + batch_size, shared_memory_type, output_shm_size, request_parameters, + parser, factory, data_loader); + } +}; + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager_shm.cc b/infer_data_manager_shm.cc new file mode 100644 index 00000000..8df7041e --- /dev/null +++ b/infer_data_manager_shm.cc @@ -0,0 +1,384 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "infer_data_manager_shm.h" + +#include + +namespace triton { namespace perfanalyzer { + +InferDataManagerShm::~InferDataManagerShm() +{ + cb::Error err; + if (backend_.get() != nullptr) { + err = backend_->UnregisterAllSharedMemory(); + if (!err.IsOk()) { + std::cerr << "Unable to unregister all shared memory regions" + << std::endl; + } + if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) { + for (auto& region : shared_memory_regions_) { + if (factory_->Kind() != + triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) { + err = backend_->UnmapSharedMemory( + shared_memory_regions_[region.first].data_.get(), + shared_memory_regions_[region.first].byte_size_); + if (!err.IsOk()) { + std::cerr << "Unable to unmap shared memory with key (" + << region.first << "): Starting: " + << static_cast( + shared_memory_regions_[region.first].data_.get()) + << ", size: " + << shared_memory_regions_[region.first].byte_size_ + << std::endl; + } + err = backend_->UnlinkSharedMemoryRegion(region.first); + if (!err.IsOk()) { + std::cerr << "Unable to unlink shared memory with key: " + << region.first << std::endl; + } + } + } + } + } +} + + +cb::Error +InferDataManagerShm::Init() +{ + // TMA-1062 remove the factory from this class and use only the backend + RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_)); + // Calling this function for the clean start + backend_->UnregisterAllSharedMemory(); + + RETURN_IF_ERROR(CreateOutputMemoryRegions()); + RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegions()); + + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::CreateOutputMemoryRegions() +{ + // Allocate the shared memory for outputs + for (const auto& output : *(parser_->Outputs())) { + const std::string& name = output.first; + const ModelTensor& tensor = output.second; + int64_t batch1_bytesize = ByteSize(tensor.shape_, tensor.datatype_); + if (batch1_bytesize < 0) { + batch1_bytesize = output_shm_size_; + } + uint8_t* output_shm_ptr; + size_t alloc_size = batch1_bytesize * batch_size_; + std::string region_name(TensorToRegionName(name)); + RETURN_IF_ERROR(CreateMemoryRegion( + region_name, shared_memory_type_, alloc_size, + reinterpret_cast(&output_shm_ptr))); + } + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::CreateAndPopulateInputMemoryRegions() +{ + // All combinations of input + stream + step + // + for (const auto& input : *(parser_->Inputs())) { + const std::string& name = input.first; + const ModelTensor& tensor = input.second; + for (int stream_id = 0; + stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) { + for (int step_id = 0; + step_id < (int)data_loader_->GetTotalSteps(stream_id); + step_id += 1) { + RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegion( + name, tensor, stream_id, step_id)); + } + } + } + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::CreateAndPopulateInputMemoryRegion( + const std::string& name, const ModelTensor& tensor, int stream_id, + 
int step_id) +{ + std::vector input_datas; + size_t count = 0; + + RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas)); + + if (tensor.is_shape_tensor_) { + RETURN_IF_ERROR( + ValidateShapeTensor(tensor, stream_id, step_id, input_datas)); + } + + size_t alloc_size = 0; + for (size_t i = 0; i < input_datas.size(); i++) { + if (!input_datas[i].is_valid) { + return cb::Error( + "Shared memory support in Perf Analyzer does not support " + "optional inputs at this time"); + } + alloc_size += input_datas[i].batch1_size; + } + + // Generate the shared memory region name + std::string region_name( + TensorToRegionName(name) + "_" + std::to_string(stream_id) + "_" + + std::to_string(step_id)); + uint8_t* input_shm_ptr; + RETURN_IF_ERROR(CreateMemoryRegion( + region_name, shared_memory_type_, alloc_size, + reinterpret_cast(&input_shm_ptr))); + RETURN_IF_ERROR(CopySharedMemory( + input_shm_ptr, input_datas, tensor.is_shape_tensor_, region_name)); + + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::CreateMemoryRegion( + const std::string& shm_region_name, const SharedMemoryType& memory_type, + const size_t byte_size, void** ptr) +{ + if (memory_type == SharedMemoryType::SYSTEM_SHARED_MEMORY) { + if (factory_->Kind() == + triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) { + *ptr = new uint8_t[byte_size]; + RETURN_IF_ERROR( + backend_->RegisterSystemMemory(shm_region_name, *ptr, byte_size)); + + // Set free as the destructor. + shared_memory_regions_.emplace( + std::piecewise_construct, std::forward_as_tuple(shm_region_name), + std::forward_as_tuple(SharedMemoryData( + byte_size, + std::unique_ptr>( + reinterpret_cast(*ptr), + [](uint8_t* memory) { free(memory); })))); + } else { + std::string shm_key("/" + shm_region_name); + int shm_fd_op; + RETURN_IF_ERROR( + backend_->CreateSharedMemoryRegion(shm_key, byte_size, &shm_fd_op)); + RETURN_IF_ERROR(backend_->MapSharedMemory(shm_fd_op, 0, byte_size, ptr)); + + RETURN_IF_ERROR(backend_->RegisterSystemSharedMemory( + shm_region_name, shm_key, byte_size)); + + // No-op destruction + shared_memory_regions_.emplace( + std::piecewise_construct, std::forward_as_tuple(shm_region_name), + std::forward_as_tuple(SharedMemoryData( + byte_size, + std::unique_ptr>( + reinterpret_cast(*ptr), [](uint8_t* memory) {})))); + } + } else if (memory_type == SharedMemoryType::CUDA_SHARED_MEMORY) { +#ifdef TRITON_ENABLE_GPU + cudaError_t cuda_err = cudaMalloc((void**)ptr, byte_size); + if (cuda_err != cudaSuccess) { + return cb::Error( + "unable to allocate memory of " + std::to_string(byte_size) + + " bytes on gpu for output: " + + std::string(cudaGetErrorString(cuda_err)), + pa::GENERIC_ERROR); + } + + if (factory_->Kind() == + triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) { + RETURN_IF_ERROR( + backend_->RegisterCudaMemory(shm_region_name, *ptr, byte_size)); + + // Set cudaFree as the destructor + shared_memory_regions_.emplace( + std::piecewise_construct, std::forward_as_tuple(shm_region_name), + std::forward_as_tuple(SharedMemoryData( + byte_size, + std::unique_ptr>( + reinterpret_cast(*ptr), + [shm_region_name, byte_size](uint8_t* memory) { + cudaError_t cuda_err = cudaFree(memory); + if (cuda_err != cudaSuccess) { + std::cerr + << "Unable to free cuda shared memory for " + << shm_region_name + << ": Starting: " << static_cast(memory) + << ", size: " << byte_size + << " bytes, Details: " << cudaGetErrorString(cuda_err) + << std::endl; + } + })))); + } else { + cudaIpcMemHandle_t cuda_handle; + 
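+      // For the remote backends the raw device pointer is not meaningful in
+      // the server's process, so the allocation is exported as a
+      // cudaIpcMemHandle_t that the server can open to map the same GPU
+      // memory region.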
RETURN_IF_ERROR( + CreateCUDAIPCHandle(&cuda_handle, reinterpret_cast(*ptr))); + RETURN_IF_ERROR(backend_->RegisterCudaSharedMemory( + shm_region_name, cuda_handle, byte_size)); + + // No operation required for deleting the memory + shared_memory_regions_.emplace( + std::piecewise_construct, std::forward_as_tuple(shm_region_name), + std::forward_as_tuple(SharedMemoryData( + byte_size, + std::unique_ptr>( + reinterpret_cast(*ptr), [](uint8_t* memory) {})))); + } +#endif // TRITON_ENABLE_GPU + } else { + return cb::Error( + "CreateMemoryRegion called with invalid memory region type.", + pa::GENERIC_ERROR); + } + + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::CopySharedMemory( + uint8_t* input_shm_ptr, const std::vector& tensor_datas, + bool is_shape_tensor, std::string& region_name) +{ + if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) { + // Populate the region with data + size_t count = 0; + size_t offset = 0; + size_t max_count = is_shape_tensor ? 1 : batch_size_; + while (count < max_count) { + memcpy( + input_shm_ptr + offset, tensor_datas[count].data_ptr, + tensor_datas[count].batch1_size); + offset += tensor_datas[count].batch1_size; + count++; + } + } else { +#ifdef TRITON_ENABLE_GPU + // Populate the region with data + size_t count = 0; + size_t offset = 0; + size_t max_count = is_shape_tensor ? 1 : batch_size_; + while (count < max_count) { + cudaError_t cuda_err = cudaMemcpy( + (void*)(input_shm_ptr + offset), (void*)tensor_datas[count].data_ptr, + tensor_datas[count].batch1_size, cudaMemcpyHostToDevice); + if (cuda_err != cudaSuccess) { + return cb::Error( + "Failed to copy data to cuda shared memory for " + region_name + + " : " + std::string(cudaGetErrorString(cuda_err)), + pa::GENERIC_ERROR); + } + offset += tensor_datas[count].batch1_size; + count++; + } +#endif // TRITON_ENABLE_GPU + } + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::InitInferDataInput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) +{ + std::vector shape; + RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape)); + if (!shape.empty()) { + if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) { + shape.insert(shape.begin(), (int64_t)batch_size_); + } + } else { + return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR); + } + + cb::InferInput* infer_input; + RETURN_IF_ERROR(CreateInferInput( + &infer_input, backend_kind_, name, shape, model_tensor.datatype_)); + infer_data.inputs_.push_back(infer_input); + + // FIXME: TMA-765 - Shared memory mode does not support optional inputs, + // currently, and will be implemented in the associated story. 
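+  // Until that support lands, every input is unconditionally registered as
+  // valid here, unlike the non-shared-memory path, which only adds inputs
+  // whose data was found in the dataset.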
+ infer_data.valid_inputs_.push_back(infer_input); + + std::string region_name( + TensorToRegionName(name) + "_" + std::to_string(0) + "_" + + std::to_string(0)); + RETURN_IF_ERROR(infer_input->SetSharedMemory( + region_name, shared_memory_regions_[region_name].byte_size_)); + + AddInferDataParameters(infer_data); + + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::InitInferDataOutput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) +{ + cb::InferRequestedOutput* requested_output; + RETURN_IF_ERROR(cb::InferRequestedOutput::Create( + &requested_output, backend_kind_, name, model_tensor.datatype_)); + infer_data.outputs_.push_back(requested_output); + + std::string region_name(TensorToRegionName(name)); + RETURN_IF_ERROR(requested_output->SetSharedMemory( + region_name, shared_memory_regions_[region_name].byte_size_)); + + return cb::Error::Success; +} + +cb::Error +InferDataManagerShm::UpdateInputs( + const size_t thread_id, const int stream_index, const int step_index, + InferData& infer_data) +{ + for (const auto& input : infer_data.inputs_) { + RETURN_IF_ERROR(input->Reset()); + const auto& model_input = (*(parser_->Inputs()))[input->Name()]; + + std::string region_name( + TensorToRegionName(input->Name()) + '_' + std::to_string(stream_index) + + "_" + std::to_string(step_index)); + + std::vector shape; + RETURN_IF_ERROR(data_loader_->GetInputShape( + model_input, stream_index, step_index, &shape)); + if (!shape.empty()) { + if ((parser_->MaxBatchSize() != 0) && (!model_input.is_shape_tensor_)) { + shape.insert(shape.begin(), (int64_t)batch_size_); + } + input->SetShape(shape); + } + RETURN_IF_ERROR(input->SetSharedMemory( + region_name, shared_memory_regions_[region_name].byte_size_)); + } + return cb::Error::Success; +} + +}} // namespace triton::perfanalyzer diff --git a/infer_data_manager_shm.h b/infer_data_manager_shm.h new file mode 100644 index 00000000..6a5ac9db --- /dev/null +++ b/infer_data_manager_shm.h @@ -0,0 +1,164 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "data_loader.h" +#include "infer_data.h" +#include "infer_data_manager_base.h" +#include "model_parser.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +namespace { + +#ifdef TRITON_ENABLE_GPU + +#include + +#define RETURN_IF_CUDA_ERR(FUNC) \ + { \ + const cudaError_t result = FUNC; \ + if (result != cudaSuccess) { \ + return cb::Error( \ + "CUDA exception (line " + std::to_string(__LINE__) + \ + "): " + cudaGetErrorName(result) + " (" + \ + cudaGetErrorString(result) + ")", \ + pa::GENERIC_ERROR); \ + } \ + } + +cb::Error +CreateCUDAIPCHandle( + cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0) +{ + // Set the GPU device to the desired GPU + RETURN_IF_CUDA_ERR(cudaSetDevice(device_id)); + + // Create IPC handle for data on the gpu + RETURN_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr)); + + return cb::Error::Success; +} + +#endif // TRITON_ENABLE_GPU + +} // namespace + +/// Holds information about the shared memory locations +struct SharedMemoryData { + SharedMemoryData( + size_t byte_size, + std::unique_ptr> data) + : byte_size_(byte_size), data_(std::move(data)) + { + } + + SharedMemoryData() {} + + // Byte size + size_t byte_size_; + + // Unique pointer holding the shared memory data + std::unique_ptr> data_; +}; + +/// Manages infer data to prepare an inference request and the resulting +/// inference output from triton server +class InferDataManagerShm : public InferDataManagerBase { + public: + InferDataManagerShm( + const int32_t batch_size, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, + const std::unordered_map& + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + : shared_memory_type_(shared_memory_type), + output_shm_size_(output_shm_size), + InferDataManagerBase( + batch_size, request_parameters, parser, factory, data_loader) + { + } + + ~InferDataManagerShm(); + + /// Initialize this object. Must be called before any other functions + /// \return cb::Error object indicating success or failure. + cb::Error Init() override; + + protected: + cb::Error CreateOutputMemoryRegions(); + cb::Error CreateAndPopulateInputMemoryRegions(); + cb::Error CreateAndPopulateInputMemoryRegion( + const std::string& name, const ModelTensor& tensor, int stream_id, + int step_id); + + /// Create a memory region. + /// \return cb::Error object indicating success or failure. 
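+  /// Note: input regions are keyed as
+  /// "<TensorToRegionName(name)>_<stream_id>_<step_id>", while output regions
+  /// use "<TensorToRegionName(name)>" alone; the same keys are used later to
+  /// look up byte sizes in shared_memory_regions_.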
+ cb::Error CreateMemoryRegion( + const std::string& shm_region_name, const SharedMemoryType& memory_type, + const size_t byte_size, void** ptr); + + /// \brief Helper function to handle copying shared memory to the correct + /// memory region + /// \param input_shm_ptr Pointer to the shared memory for a specific input + /// \param input_datas The TensorDatas to be copied + /// \param is_shape_tensor Is the input a shape tensor + /// \param region_name Name of the shared memory region + /// \return cb::Error object indicating success or failure + virtual cb::Error CopySharedMemory( + uint8_t* input_shm_ptr, const std::vector& input_datas, + bool is_shape_tensor, std::string& region_name); + + cb::Error InitInferDataInput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) override; + + cb::Error InitInferDataOutput( + const std::string& name, const ModelTensor& model_tensor, + InferData& infer_data) override; + + /// Helper function to update the inputs + /// \param thread_id The ID of the calling thread + /// \param stream_index The data stream to use for next data + /// \param step_index The step index to use for next data + /// \param infer_data The target InferData object + /// \return cb::Error object indicating success or failure. + virtual cb::Error UpdateInputs( + size_t thread_id, const int stream_index, const int step_index, + InferData& infer_data) override; + + SharedMemoryType shared_memory_type_; + size_t output_shm_size_; + // Map from shared memory key to its starting address and size + std::unordered_map shared_memory_regions_; +}; + +}} // namespace triton::perfanalyzer diff --git a/inference_profiler.cc b/inference_profiler.cc index e6d2a761..57a33942 100644 --- a/inference_profiler.cc +++ b/inference_profiler.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,14 +27,71 @@ #include "inference_profiler.h" #include + #include +#include +#include #include #include +#include #include + #include "client_backend/client_backend.h" +#include "constants.h" #include "doctest.h" namespace triton { namespace perfanalyzer { +cb::Error +ReportPrometheusMetrics(const Metrics& metrics) +{ + const size_t max_num_gpus_in_stdout{16}; + if (metrics.gpu_utilization_per_gpu.size() > max_num_gpus_in_stdout || + metrics.gpu_power_usage_per_gpu.size() > max_num_gpus_in_stdout || + metrics.gpu_memory_used_bytes_per_gpu.size() > max_num_gpus_in_stdout || + metrics.gpu_memory_total_bytes_per_gpu.size() > max_num_gpus_in_stdout) { + std::cout << "Too many GPUs on system to print out individual Prometheus " + "metrics, use the CSV output feature to see metrics." 
+ << std::endl; + return cb::Error::Success; + } + + std::cout << " Avg GPU Utilization:" << std::endl; + for (const auto& gpu_uuid_metric_pair : metrics.gpu_utilization_per_gpu) { + const auto gpu_uuid{gpu_uuid_metric_pair.first}; + const auto metric{gpu_uuid_metric_pair.second}; + std::cout << " " << gpu_uuid << " : " << (metric * 100.0) << "%" + << std::endl; + } + + std::cout << " Avg GPU Power Usage:" << std::endl; + for (const auto& gpu_uuid_metric_pair : metrics.gpu_power_usage_per_gpu) { + const auto gpu_uuid{gpu_uuid_metric_pair.first}; + const auto metric{gpu_uuid_metric_pair.second}; + std::cout << " " << gpu_uuid << " : " << metric << " watts" + << std::endl; + } + + std::cout << " Max GPU Memory Usage:" << std::endl; + for (const auto& gpu_uuid_metric_pair : + metrics.gpu_memory_used_bytes_per_gpu) { + const auto gpu_uuid{gpu_uuid_metric_pair.first}; + const auto metric{gpu_uuid_metric_pair.second}; + std::cout << " " << gpu_uuid << " : " << metric << " bytes" + << std::endl; + } + + std::cout << " Total GPU Memory:" << std::endl; + for (const auto& gpu_uuid_metric_pair : + metrics.gpu_memory_total_bytes_per_gpu) { + const auto gpu_uuid{gpu_uuid_metric_pair.first}; + const auto metric{gpu_uuid_metric_pair.second}; + std::cout << " " << gpu_uuid << " : " << metric << " bytes" + << std::endl; + } + + return cb::Error::Success; +} + namespace { inline uint64_t @@ -50,6 +107,14 @@ EnsembleDurations GetTotalEnsembleDurations(const ServerSideStats& stats) { EnsembleDurations result; + // Calculate avg cache hit latency and cache miss latency for ensemble model + // in case top level response caching is enabled. + const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count; + const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count; + result.total_cache_hit_time_avg_us += + AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt); + result.total_cache_miss_time_avg_us += + AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt); for (const auto& model_stats : stats.composing_models_stat) { if (model_stats.second.composing_models_stat.empty()) { // Cache hit count covers cache hits, not related to compute times @@ -181,7 +246,6 @@ ReportServerSideStats( if (parser->ResponseCacheEnabled()) { const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us); - std::cout << " (overhead " << overhead_avg_us << " usec + " << "queue " << queue_avg_us << " usec + " << "cache hit/miss " << combined_cache_compute_avg_us @@ -226,12 +290,18 @@ ReportServerSideStats( const uint64_t overhead_avg_us = GetOverheadDuration( cumm_avg_us, ensemble_times.total_queue_time_avg_us, ensemble_times.total_combined_cache_compute_time_avg_us); - std::cout << " (overhead " << overhead_avg_us << " usec + " - << "queue " << ensemble_times.total_queue_time_avg_us - << " usec + " - << "cache hit/miss " - << ensemble_times.total_combined_cache_compute_time_avg_us - << " usec)" << std::endl; + // FIXME - Refactor these calculations in case of ensemble top level + // response cache is enabled + if (!parser->TopLevelResponseCachingEnabled()) { + std::cout << " (overhead " << overhead_avg_us << " usec + " + << "queue " << ensemble_times.total_queue_time_avg_us + << " usec + " + << "cache hit/miss " + << ensemble_times.total_combined_cache_compute_time_avg_us + << " usec)" << std::endl; + } else { + std::cout << std::endl; + } std::cout << ident << ident << " Average Cache Hit Latency: " << ensemble_times.total_cache_hit_time_avg_us << " 
usec" << std::endl; @@ -272,7 +342,9 @@ cb::Error ReportClientSideStats( const ClientSideStats& stats, const int64_t percentile, const cb::ProtocolType protocol, const bool verbose, - const bool on_sequence_model, const bool include_lib_stats) + const bool on_sequence_model, const bool include_lib_stats, + const double overhead_pct, const double send_request_rate, + const bool is_decoupled_model) { const uint64_t avg_latency_us = stats.avg_latency_ns / 1000; const uint64_t std_us = stats.std_us; @@ -321,9 +393,16 @@ ReportClientSideStats( } std::cout << " Request count: " << stats.request_count << std::endl; - if (stats.delayed_request_count != 0) { - std::cout << " Delayed Request Count: " << stats.delayed_request_count - << std::endl; + double delay_pct = + ((double)stats.delayed_request_count / stats.request_count) * 100; + if (delay_pct > DELAY_PCT_THRESHOLD) { + std::cout << " " + << "Avg send request rate: " << std::fixed << std::setprecision(2) + << send_request_rate << " infer/sec" << std::endl; + std::cout << " " + << "[WARNING] Perf Analyzer was not able to keep up with the " + "desired request rate. "; + std::cout << delay_pct << "% of the requests were delayed. " << std::endl; } if (on_sequence_model) { std::cout << " Sequence count: " << stats.sequence_count << " (" @@ -331,6 +410,19 @@ ReportClientSideStats( } std::cout << " Throughput: " << stats.infer_per_sec << " infer/sec" << std::endl; + if (is_decoupled_model) { + std::cout << " Response Throughput: " << stats.responses_per_sec + << " infer/sec" << std::endl; + } + + if (verbose) { + std::stringstream client_overhead{""}; + client_overhead << " " + << "Avg client overhead: " << std::fixed + << std::setprecision(2) << overhead_pct << "%"; + std::cout << client_overhead.str() << std::endl; + } + if (percentile == -1) { std::cout << " Avg latency: " << avg_latency_us << " usec" << " (standard deviation " << std_us << " usec)" << std::endl; @@ -351,18 +443,30 @@ Report( const PerfStatus& summary, const int64_t percentile, const cb::ProtocolType protocol, const bool verbose, const bool include_lib_stats, const bool include_server_stats, - const std::shared_ptr& parser) + const std::shared_ptr& parser, + const bool should_collect_metrics, const double overhead_pct_threshold) { std::cout << " Client: " << std::endl; ReportClientSideStats( summary.client_stats, percentile, protocol, verbose, - summary.on_sequence_model, include_lib_stats); + summary.on_sequence_model, include_lib_stats, summary.overhead_pct, + summary.send_request_rate, parser->IsDecoupled()); if (include_server_stats) { std::cout << " Server: " << std::endl; ReportServerSideStats(summary.server_stats, 1, parser); } + if (should_collect_metrics) { + std::cout << " Server Prometheus Metrics: " << std::endl; + ReportPrometheusMetrics(summary.metrics.front()); + } + + if (summary.overhead_pct > overhead_pct_threshold) { + std::cout << "[WARNING] Perf Analyzer is not able to keep up with the " + "desired load. The results may not be accurate." 
+ << std::endl; + } return cb::Error::Success; } @@ -379,14 +483,18 @@ InferenceProfiler::Create( std::unique_ptr* profiler, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, - const bool should_collect_metrics) + const bool should_collect_metrics, const double overhead_pct_threshold, + const bool async_mode, + const std::shared_ptr collector, + const bool should_collect_profile_data) { std::unique_ptr local_profiler(new InferenceProfiler( verbose, stability_threshold, measurement_window_ms, max_trials, (percentile != -1), percentile, latency_threshold_ms_, protocol, parser, profile_backend, std::move(manager), measurement_request_count, - measurement_mode, mpi_driver, metrics_interval_ms, - should_collect_metrics)); + measurement_mode, mpi_driver, metrics_interval_ms, should_collect_metrics, + overhead_pct_threshold, async_mode, collector, + should_collect_profile_data)); *profiler = std::move(local_profiler); return cb::Error::Success; @@ -401,7 +509,10 @@ InferenceProfiler::InferenceProfiler( std::shared_ptr profile_backend, std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, - const uint64_t metrics_interval_ms, const bool should_collect_metrics) + const uint64_t metrics_interval_ms, const bool should_collect_metrics, + const double overhead_pct_threshold, const bool async_mode, + const std::shared_ptr collector, + const bool should_collect_profile_data) : verbose_(verbose), measurement_window_ms_(measurement_window_ms), max_trials_(max_trials), extra_percentile_(extra_percentile), percentile_(percentile), latency_threshold_ms_(latency_threshold_ms_), @@ -409,7 +520,10 @@ InferenceProfiler::InferenceProfiler( manager_(std::move(manager)), measurement_request_count_(measurement_request_count), measurement_mode_(measurement_mode), mpi_driver_(mpi_driver), - should_collect_metrics_(should_collect_metrics) + should_collect_metrics_(should_collect_metrics), + overhead_pct_threshold_(overhead_pct_threshold), async_mode_(async_mode), + collector_(collector), + should_collect_profile_data_(should_collect_profile_data) { load_parameters_.stability_threshold = stability_threshold; load_parameters_.stability_window = 3; @@ -435,25 +549,26 @@ InferenceProfiler::InferenceProfiler( cb::Error InferenceProfiler::Profile( - const size_t concurrent_request_count, std::vector& summary, - bool& meets_threshold, bool& is_stable) + const size_t concurrent_request_count, const size_t request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status{}; - status_summary.concurrency = concurrent_request_count; + perf_status.concurrency = concurrent_request_count; is_stable = false; meets_threshold = true; - RETURN_IF_ERROR(dynamic_cast(manager_.get()) - ->ChangeConcurrencyLevel(concurrent_request_count)); + RETURN_IF_ERROR( + dynamic_cast(manager_.get()) + ->ChangeConcurrencyLevel(concurrent_request_count, request_count)); - err = ProfileHelper(false /* clean_starts */, status_summary, &is_stable); + err = ProfileHelper(perf_status, request_count, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / (1000 * 1000); + perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << 
"Measured latency went over the set limit of " @@ -473,9 +588,11 @@ InferenceProfiler::Profile( } meets_threshold = false; } else { + perf_statuses.push_back(perf_status); err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -490,25 +607,27 @@ InferenceProfiler::Profile( cb::Error InferenceProfiler::Profile( - const double request_rate, std::vector& summary, - bool& meets_threshold, bool& is_stable) + const double request_rate, const size_t request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status{}; - status_summary.request_rate = request_rate; + perf_status.request_rate = request_rate; is_stable = false; meets_threshold = true; RETURN_IF_ERROR(dynamic_cast(manager_.get()) - ->ChangeRequestRate(request_rate)); + ->ChangeRequestRate(request_rate, request_count)); + std::cout << "Request Rate: " << request_rate + << " inference requests per seconds" << std::endl; - err = ProfileHelper(false /*clean_starts*/, status_summary, &is_stable); + err = ProfileHelper(perf_status, request_count, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / (1000 * 1000); + perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << "Measured latency went over the set limit of " @@ -518,9 +637,11 @@ InferenceProfiler::Profile( std::cerr << "Failed to obtain stable measurement." 
<< std::endl; meets_threshold = false; } else { + perf_statuses.push_back(perf_status); err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -535,24 +656,24 @@ InferenceProfiler::Profile( cb::Error InferenceProfiler::Profile( - std::vector& summary, bool& meets_threshold, bool& is_stable) + const size_t request_count, std::vector& perf_statuses, + bool& meets_threshold, bool& is_stable) { cb::Error err; - PerfStatus status_summary; + PerfStatus perf_status{}; - RETURN_IF_ERROR( - dynamic_cast(manager_.get())->InitCustomIntervals()); RETURN_IF_ERROR(dynamic_cast(manager_.get()) - ->GetCustomRequestRate(&status_summary.request_rate)); + ->InitCustomIntervals(request_count)); + RETURN_IF_ERROR(dynamic_cast(manager_.get()) + ->GetCustomRequestRate(&perf_status.request_rate)); is_stable = false; meets_threshold = true; - err = ProfileHelper(true /* clean_starts */, status_summary, &is_stable); + err = ProfileHelper(perf_status, request_count, &is_stable); if (err.IsOk()) { - summary.push_back(status_summary); uint64_t stabilizing_latency_ms = - status_summary.stabilizing_latency_ns / (1000 * 1000); + perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS; if ((stabilizing_latency_ms >= latency_threshold_ms_) && (latency_threshold_ms_ != NO_LIMIT)) { std::cerr << "Measured latency went over the set limit of " @@ -562,9 +683,11 @@ InferenceProfiler::Profile( std::cerr << "Failed to obtain stable measurement." << std::endl; meets_threshold = false; } else { + perf_statuses.push_back(perf_status); err = Report( - status_summary, percentile_, protocol_, verbose_, include_lib_stats_, - include_server_stats_, parser_); + perf_status, percentile_, protocol_, verbose_, include_lib_stats_, + include_server_stats_, parser_, should_collect_metrics_, + overhead_pct_threshold_); if (!err.IsOk()) { std::cerr << err; meets_threshold = false; @@ -579,41 +702,46 @@ InferenceProfiler::Profile( cb::Error InferenceProfiler::ProfileHelper( - const bool clean_starts, PerfStatus& status_summary, bool* is_stable) + PerfStatus& experiment_perf_status, size_t request_count, bool* is_stable) { // Start measurement LoadStatus load_status; size_t completed_trials = 0; std::queue error; - std::deque perf_status; - all_timestamps_.clear(); + std::deque measurement_perf_statuses; + all_request_records_.clear(); previous_window_end_ns_ = 0; + // Start with a fresh empty request records vector in the manager + // + std::vector empty_request_records; + RETURN_IF_ERROR(manager_->SwapRequestRecords(empty_request_records)); + do { - PerfStatus status_summary; + PerfStatus measurement_perf_status; + measurement_perf_status.concurrency = experiment_perf_status.concurrency; + measurement_perf_status.request_rate = experiment_perf_status.request_rate; RETURN_IF_ERROR(manager_->CheckHealth()); - // Needed to obtain stable measurements - if (clean_starts) { - manager_->ResetWorkers(); - } - if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) { - error.push(Measure(status_summary, measurement_window_ms_, false)); + error.push( + Measure(measurement_perf_status, measurement_window_ms_, false)); } else { - error.push(Measure(status_summary, measurement_request_count_, true)); + error.push( + Measure(measurement_perf_status, measurement_request_count_, 
true)); } - perf_status.push_back(status_summary); + measurement_perf_statuses.push_back(measurement_perf_status); if (error.size() > load_parameters_.stability_window) { error.pop(); - perf_status.pop_front(); + measurement_perf_statuses.pop_front(); } if (error.back().IsOk()) { load_status.infer_per_sec.push_back( - status_summary.client_stats.infer_per_sec); - load_status.latencies.push_back(status_summary.stabilizing_latency_ns); + measurement_perf_status.client_stats.infer_per_sec); + load_status.latencies.push_back( + measurement_perf_status.stabilizing_latency_ns); } else { load_status.infer_per_sec.push_back(0); load_status.latencies.push_back(std::numeric_limits::max()); @@ -630,16 +758,18 @@ InferenceProfiler::ProfileHelper( << " infer/sec. "; if (extra_percentile_) { std::cout << "p" << percentile_ << " latency: " - << (status_summary.client_stats.percentile_latency_ns - .find(percentile_) + << (measurement_perf_status.client_stats + .percentile_latency_ns.find(percentile_) ->second / 1000) << " usec" << std::endl; } else { std::cout << "Avg latency: " - << (status_summary.client_stats.avg_latency_ns / 1000) - << " usec (std " << status_summary.client_stats.std_us - << " usec)" << std::endl; + << (measurement_perf_status.client_stats.avg_latency_ns / + 1000) + << " usec (std " + << measurement_perf_status.client_stats.std_us << " usec). " + << std::endl; } } else { std::cout << " Pass [" << (completed_trials + 1) @@ -647,6 +777,12 @@ InferenceProfiler::ProfileHelper( } } + // If request-count is specified, then only measure one window and exit + if (request_count != 0) { + *is_stable = true; + break; + } + *is_stable = DetermineStability(load_status); if (IsDoneProfiling(load_status, is_stable)) { @@ -656,11 +792,19 @@ InferenceProfiler::ProfileHelper( completed_trials++; } while ((!early_exit) && (completed_trials < max_trials_)); + // For async requests, print a warning if the latency threshold is not met. + if (async_mode_ && !*is_stable && DetermineStability(load_status, false)) { + std::cerr << "Warning: Request latency is not stabilizing. " + "Please try lowering the request rate." + << std::endl; + *is_stable = true; + } + if (should_collect_metrics_) { metrics_manager_->StopQueryingMetrics(); } - // return the appropriate error which might have occured in the + // return the appropriate error which might have occurred in the // stability_window for its proper handling. while (!error.empty()) { if (!error.front().IsOk()) { @@ -672,7 +816,8 @@ InferenceProfiler::ProfileHelper( // Only merge the results if the results have stabilized. 
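+  // "Stable" means the last load_parameters_.stability_window measurements
+  // (3 by default) agree within the configured stability_threshold:
+  // throughput is always checked, and latency is checked too unless
+  // check_latency is false. For latency the rule is
+  // max/min <= 1 + threshold, e.g. a threshold of 0.1 tolerates a 10% spread.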
if (*is_stable) { - RETURN_IF_ERROR(MergePerfStatusReports(perf_status, status_summary)); + RETURN_IF_ERROR(MergePerfStatusReports( + measurement_perf_statuses, experiment_perf_status)); } if (early_exit) { @@ -682,7 +827,8 @@ InferenceProfiler::ProfileHelper( } bool -InferenceProfiler::DetermineStability(LoadStatus& load_status) +InferenceProfiler::DetermineStability( + LoadStatus& load_status, bool check_latency) { bool stable = false; if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) { @@ -696,16 +842,17 @@ InferenceProfiler::DetermineStability(LoadStatus& load_status) } } - stable = stable && CheckWindowForStability(idx, load_status); + stable = stable && CheckWindowForStability(idx, load_status, check_latency); } return stable; } bool -InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status) +InferenceProfiler::CheckWindowForStability( + size_t idx, LoadStatus& load_status, bool check_latency) { return IsInferWindowStable(idx, load_status) && - IsLatencyWindowStable(idx, load_status); + (!check_latency || IsLatencyWindowStable(idx, load_status)); } bool @@ -732,6 +879,8 @@ InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status) double max_latency = *latencies_per_sec_measurements.second; double min_latency = *latencies_per_sec_measurements.first; + auto is_stable = + max_latency / min_latency <= 1 + load_parameters_.stability_threshold; return max_latency / min_latency <= 1 + load_parameters_.stability_threshold; } @@ -765,7 +914,8 @@ InferenceProfiler::IsDoneProfiling(LoadStatus& load_status, bool* is_stable) bool InferenceProfiler::CheckWithinThreshold(size_t idx, LoadStatus& load_status) { - return load_status.latencies[idx] < (latency_threshold_ms_ * 1000 * 1000); + return load_status.latencies[idx] < + (latency_threshold_ms_ * NANOS_PER_MILLIS); } cb::Error @@ -850,24 +1000,25 @@ InferenceProfiler::MergeServerSideStats( cb::Error InferenceProfiler::MergePerfStatusReports( - std::deque& perf_status_reports, PerfStatus& summary_status) + std::deque& perf_status_reports, + PerfStatus& experiment_perf_status) { auto& perf_status = perf_status_reports[0]; // Make sure that the perf status reports profiling settings match with each // other. for (size_t i = 1; i < perf_status_reports.size(); i++) { - perf_status.concurrency = summary_status.concurrency; - perf_status.request_rate = summary_status.request_rate; + perf_status.concurrency = experiment_perf_status.concurrency; + perf_status.request_rate = experiment_perf_status.request_rate; if (perf_status_reports[i].on_sequence_model != perf_status.on_sequence_model) { return cb::Error( - "Incosistent sequence setting detected.", pa::GENERIC_ERROR); + "Inconsistent sequence setting detected.", pa::GENERIC_ERROR); } if (perf_status_reports[i].batch_size != perf_status.batch_size) { - return cb::Error("Incosistent batch size detected.", pa::GENERIC_ERROR); + return cb::Error("Inconsistent batch size detected.", pa::GENERIC_ERROR); } if (perf_status_reports[i].server_stats.composing_models_stat.size() != @@ -878,95 +1029,110 @@ InferenceProfiler::MergePerfStatusReports( } } - summary_status.batch_size = perf_status.batch_size; - summary_status.on_sequence_model = perf_status.on_sequence_model; + experiment_perf_status.batch_size = perf_status.batch_size; + experiment_perf_status.on_sequence_model = perf_status.on_sequence_model; // Initialize the client stats for the merged report. 
- summary_status.client_stats.request_count = 0; - summary_status.client_stats.sequence_count = 0; - summary_status.client_stats.delayed_request_count = 0; - summary_status.client_stats.duration_ns = 0; - summary_status.client_stats.avg_latency_ns = 0; - summary_status.client_stats.percentile_latency_ns.clear(); - summary_status.client_stats.latencies.clear(); - summary_status.client_stats.std_us = 0; - summary_status.client_stats.avg_request_time_ns = 0; - summary_status.client_stats.avg_send_time_ns = 0; - summary_status.client_stats.avg_receive_time_ns = 0; - summary_status.client_stats.infer_per_sec = 0; - summary_status.client_stats.sequence_per_sec = 0; - summary_status.client_stats.completed_count = 0; - summary_status.stabilizing_latency_ns = 0; + experiment_perf_status.client_stats.request_count = 0; + experiment_perf_status.client_stats.sequence_count = 0; + experiment_perf_status.client_stats.delayed_request_count = 0; + experiment_perf_status.client_stats.duration_ns = 0; + experiment_perf_status.client_stats.avg_latency_ns = 0; + experiment_perf_status.client_stats.percentile_latency_ns.clear(); + experiment_perf_status.client_stats.latencies.clear(); + experiment_perf_status.client_stats.std_us = 0; + experiment_perf_status.client_stats.avg_request_time_ns = 0; + experiment_perf_status.client_stats.avg_send_time_ns = 0; + experiment_perf_status.client_stats.avg_receive_time_ns = 0; + experiment_perf_status.client_stats.infer_per_sec = 0; + experiment_perf_status.client_stats.sequence_per_sec = 0; + experiment_perf_status.client_stats.completed_count = 0; + experiment_perf_status.stabilizing_latency_ns = 0; + experiment_perf_status.overhead_pct = 0; + experiment_perf_status.send_request_rate = 0.0; std::vector server_side_stats; for (auto& perf_status : perf_status_reports) { // Aggregated Client Stats - summary_status.client_stats.request_count += + experiment_perf_status.client_stats.request_count += perf_status.client_stats.request_count; - summary_status.client_stats.sequence_count += + experiment_perf_status.client_stats.sequence_count += perf_status.client_stats.sequence_count; - summary_status.client_stats.delayed_request_count += + experiment_perf_status.client_stats.delayed_request_count += perf_status.client_stats.delayed_request_count; - summary_status.client_stats.duration_ns += + experiment_perf_status.client_stats.response_count += + perf_status.client_stats.response_count; + experiment_perf_status.client_stats.duration_ns += perf_status.client_stats.duration_ns; server_side_stats.push_back(perf_status.server_stats); - summary_status.client_stats.latencies.insert( - summary_status.client_stats.latencies.end(), + experiment_perf_status.client_stats.latencies.insert( + experiment_perf_status.client_stats.latencies.end(), perf_status.client_stats.latencies.begin(), perf_status.client_stats.latencies.end()); + // Accumulate the overhead percentage and send rate here to remove extra + // traversals over the perf_status_reports + experiment_perf_status.overhead_pct += perf_status.overhead_pct; + experiment_perf_status.send_request_rate += perf_status.send_request_rate; } + // Calculate the average overhead_pct for the experiment. 
+ experiment_perf_status.overhead_pct /= perf_status_reports.size(); + experiment_perf_status.send_request_rate /= perf_status_reports.size(); + if (include_lib_stats_) { for (auto& perf_status : perf_status_reports) { - summary_status.client_stats.completed_count += + experiment_perf_status.client_stats.completed_count += perf_status.client_stats.completed_count; - summary_status.client_stats.avg_request_time_ns += + experiment_perf_status.client_stats.avg_request_time_ns += perf_status.client_stats.avg_request_time_ns * perf_status.client_stats.completed_count; - summary_status.client_stats.avg_send_time_ns += + experiment_perf_status.client_stats.avg_send_time_ns += perf_status.client_stats.avg_send_time_ns * perf_status.client_stats.completed_count; - summary_status.client_stats.avg_receive_time_ns += + experiment_perf_status.client_stats.avg_receive_time_ns += perf_status.client_stats.avg_receive_time_ns * perf_status.client_stats.completed_count; } - if (summary_status.client_stats.completed_count != 0) { - summary_status.client_stats.avg_request_time_ns = - summary_status.client_stats.avg_request_time_ns / - summary_status.client_stats.completed_count; + if (experiment_perf_status.client_stats.completed_count != 0) { + experiment_perf_status.client_stats.avg_request_time_ns = + experiment_perf_status.client_stats.avg_request_time_ns / + experiment_perf_status.client_stats.completed_count; - summary_status.client_stats.avg_send_time_ns = - summary_status.client_stats.avg_send_time_ns / - summary_status.client_stats.completed_count; + experiment_perf_status.client_stats.avg_send_time_ns = + experiment_perf_status.client_stats.avg_send_time_ns / + experiment_perf_status.client_stats.completed_count; - summary_status.client_stats.avg_receive_time_ns = - summary_status.client_stats.avg_receive_time_ns / - summary_status.client_stats.completed_count; + experiment_perf_status.client_stats.avg_receive_time_ns = + experiment_perf_status.client_stats.avg_receive_time_ns / + experiment_perf_status.client_stats.completed_count; } } - RETURN_IF_ERROR( - MergeServerSideStats(server_side_stats, summary_status.server_stats)); + RETURN_IF_ERROR(MergeServerSideStats( + server_side_stats, experiment_perf_status.server_stats)); std::sort( - summary_status.client_stats.latencies.begin(), - summary_status.client_stats.latencies.end()); + experiment_perf_status.client_stats.latencies.begin(), + experiment_perf_status.client_stats.latencies.end()); float client_duration_sec = - (float)summary_status.client_stats.duration_ns / NANOS_PER_SECOND; - summary_status.client_stats.sequence_per_sec = - summary_status.client_stats.sequence_count / client_duration_sec; - summary_status.client_stats.infer_per_sec = - (summary_status.client_stats.request_count * summary_status.batch_size) / + (float)experiment_perf_status.client_stats.duration_ns / NANOS_PER_SECOND; + experiment_perf_status.client_stats.sequence_per_sec = + experiment_perf_status.client_stats.sequence_count / client_duration_sec; + experiment_perf_status.client_stats.infer_per_sec = + (experiment_perf_status.client_stats.request_count * + experiment_perf_status.batch_size) / client_duration_sec; - RETURN_IF_ERROR( - SummarizeLatency(summary_status.client_stats.latencies, summary_status)); + experiment_perf_status.client_stats.responses_per_sec = + experiment_perf_status.client_stats.response_count / client_duration_sec; + RETURN_IF_ERROR(SummarizeLatency( + experiment_perf_status.client_stats.latencies, experiment_perf_status)); if 
(should_collect_metrics_) { // Put all Metric objects in a flat vector so they're easier to merge @@ -981,7 +1147,7 @@ InferenceProfiler::MergePerfStatusReports( Metrics merged_metrics{}; RETURN_IF_ERROR(MergeMetrics(all_metrics, merged_metrics)); - summary_status.metrics.push_back(std::move(merged_metrics)); + experiment_perf_status.metrics.push_back(std::move(merged_metrics)); } return cb::Error::Success; @@ -1004,14 +1170,15 @@ InferenceProfiler::GetServerSideStatus( // Used for measurement cb::Error InferenceProfiler::Measure( - PerfStatus& status_summary, uint64_t measurement_window, - bool is_count_based) + PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based) { std::map start_status; std::map end_status; cb::InferStat start_stat; cb::InferStat end_stat; + manager_->ResetIdleTime(); + // Set current window start time to end of previous window. For first // measurement window, capture start time, server side stats, and client side // stats. @@ -1028,9 +1195,6 @@ InferenceProfiler::Measure( if (include_server_stats_) { RETURN_IF_ERROR(GetServerSideStatus(&start_status)); } - // Need to zero out start stats when window start is 0 - start_status = std::map(); - start_stat = cb::InferStat(); RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&start_stat)); } @@ -1064,7 +1228,7 @@ InferenceProfiler::Measure( previous_window_end_ns_ = window_end_ns; if (should_collect_metrics_) { - metrics_manager_->GetLatestMetrics(status_summary.metrics); + metrics_manager_->GetLatestMetrics(perf_status.metrics); } // Get server status and then print report on difference between @@ -1077,14 +1241,14 @@ InferenceProfiler::Measure( RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&end_stat)); prev_client_side_stats_ = end_stat; - TimestampVector current_timestamps; - RETURN_IF_ERROR(manager_->SwapTimestamps(current_timestamps)); - all_timestamps_.insert( - all_timestamps_.end(), current_timestamps.begin(), - current_timestamps.end()); + std::vector current_request_records; + RETURN_IF_ERROR(manager_->SwapRequestRecords(current_request_records)); + all_request_records_.insert( + all_request_records_.end(), current_request_records.begin(), + current_request_records.end()); RETURN_IF_ERROR(Summarize( - start_status, end_status, start_stat, end_stat, status_summary, + start_status, end_status, start_stat, end_stat, perf_status, window_start_ns, window_end_ns)); return cb::Error::Success; @@ -1099,19 +1263,36 @@ InferenceProfiler::Summarize( { size_t valid_sequence_count = 0; size_t delayed_request_count = 0; + size_t response_count = 0; // Get measurement from requests that fall within the time interval std::pair valid_range{window_start_ns, window_end_ns}; + uint64_t window_duration_ns = valid_range.second - valid_range.first; std::vector latencies; + std::vector valid_requests{}; ValidLatencyMeasurement( - valid_range, valid_sequence_count, delayed_request_count, &latencies); + valid_range, valid_sequence_count, delayed_request_count, &latencies, + response_count, valid_requests); + + if (should_collect_profile_data_) { + CollectData( + summary, window_start_ns, window_end_ns, std::move(valid_requests)); + } RETURN_IF_ERROR(SummarizeLatency(latencies, summary)); RETURN_IF_ERROR(SummarizeClientStat( - start_stat, end_stat, valid_range.second - valid_range.first, - latencies.size(), valid_sequence_count, delayed_request_count, summary)); + start_stat, end_stat, window_duration_ns, latencies.size(), + valid_sequence_count, delayed_request_count, response_count, summary)); 
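The per-window arithmetic used by this summarization path is straightforward; the snippet below reproduces the same formulas (throughput from the request count and batch size, responses per second from the response count, send rate from the sent-request count, and overhead from the measured idle time) with made-up example numbers. It is an illustration, not the perf_analyzer code itself.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr uint64_t NANOS_PER_SECOND = 1000000000ULL;

int main() {
  // Example window measurements (made-up numbers).
  const uint64_t window_duration_ns = 5 * NANOS_PER_SECOND;
  const uint64_t idle_ns = 1 * NANOS_PER_SECOND;  // average worker idle time
  const size_t request_count = 2400;    // requests completed in the window
  const size_t response_count = 4800;   // responses received in the window
  const size_t batch_size = 2;
  const size_t num_sent_requests = 2500;

  const double duration_s =
      static_cast<double>(window_duration_ns) / NANOS_PER_SECOND;

  const double infer_per_sec = (request_count * batch_size) / duration_s;
  const double responses_per_sec = response_count / duration_s;
  const double send_request_rate = num_sent_requests / duration_s;

  // Overhead: the share of the window the workers were *not* idle. If the
  // window bookkeeping makes idle time exceed the window, treat it as 0%.
  const double overhead_pct =
      idle_ns > window_duration_ns
          ? 0.0
          : 100.0 * (window_duration_ns - idle_ns) / window_duration_ns;

  std::cout << infer_per_sec << " infer/sec, " << responses_per_sec
            << " responses/sec, " << send_request_rate << " sent req/sec, "
            << overhead_pct << "% overhead\n";
}
```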
summary.client_stats.latencies = std::move(latencies); + SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary); + + double window_duration_s{ + window_duration_ns / static_cast(NANOS_PER_SECOND)}; + + SummarizeSendRequestRate( + window_duration_s, manager_->GetAndResetNumSentRequests(), summary); + if (include_server_stats_) { RETURN_IF_ERROR(SummarizeServerStats( start_status, end_status, &(summary.server_stats))); @@ -1124,44 +1305,77 @@ void InferenceProfiler::ValidLatencyMeasurement( const std::pair& valid_range, size_t& valid_sequence_count, size_t& delayed_request_count, - std::vector* valid_latencies) + std::vector* valid_latencies, size_t& response_count, + std::vector& valid_requests) { valid_latencies->clear(); valid_sequence_count = 0; + response_count = 0; std::vector erase_indices{}; - for (size_t i = 0; i < all_timestamps_.size(); i++) { - const auto& timestamp = all_timestamps_[i]; - uint64_t request_start_ns = CHRONO_TO_NANOS(std::get<0>(timestamp)); - uint64_t request_end_ns = CHRONO_TO_NANOS(std::get<1>(timestamp)); + for (size_t i = 0; i < all_request_records_.size(); i++) { + const auto& request_record = all_request_records_[i]; + uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_); + uint64_t request_end_ns; + + if (request_record.has_null_last_response_ == false) { + request_end_ns = + CHRONO_TO_NANOS(request_record.response_timestamps_.back()); + } else if (request_record.response_timestamps_.size() > 1) { + size_t last_response_idx{request_record.response_timestamps_.size() - 2}; + request_end_ns = CHRONO_TO_NANOS( + request_record.response_timestamps_[last_response_idx]); + } else { + erase_indices.push_back(i); + continue; + } if (request_start_ns <= request_end_ns) { // Only counting requests that end within the time interval if ((request_end_ns >= valid_range.first) && (request_end_ns <= valid_range.second)) { valid_latencies->push_back(request_end_ns - request_start_ns); + response_count += request_record.response_timestamps_.size(); + if (request_record.has_null_last_response_) { + response_count--; + } erase_indices.push_back(i); - // Just add the sequence_end flag here. 
- if (std::get<2>(timestamp)) { + if (request_record.sequence_end_) { valid_sequence_count++; } - if (std::get<3>(timestamp)) { + if (request_record.delayed_) { delayed_request_count++; } } } } + std::for_each( + erase_indices.begin(), erase_indices.end(), + [this, &valid_requests](size_t i) { + valid_requests.push_back(std::move(this->all_request_records_[i])); + }); + // Iterate through erase indices backwards so that erases from - // `all_timestamps_` happen from the back to the front to avoid using wrong - // indices after subsequent erases + // `all_request_records_` happen from the back to the front to avoid using + // wrong indices after subsequent erases std::for_each(erase_indices.rbegin(), erase_indices.rend(), [this](size_t i) { - this->all_timestamps_.erase(this->all_timestamps_.begin() + i); + this->all_request_records_.erase(this->all_request_records_.begin() + i); }); // Always sort measured latencies as percentile will be reported as default std::sort(valid_latencies->begin(), valid_latencies->end()); } +void +InferenceProfiler::CollectData( + PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns, + std::vector&& request_records) +{ + InferenceLoadMode id{summary.concurrency, summary.request_rate}; + collector_->AddWindow(id, window_start_ns, window_end_ns); + collector_->AddData(id, std::move(request_records)); +} + cb::Error InferenceProfiler::SummarizeLatency( const std::vector& latencies, PerfStatus& summary) @@ -1237,7 +1451,7 @@ InferenceProfiler::SummarizeClientStat( const cb::InferStat& start_stat, const cb::InferStat& end_stat, const uint64_t duration_ns, const size_t valid_request_count, const size_t valid_sequence_count, const size_t delayed_request_count, - PerfStatus& summary) + const size_t response_count, PerfStatus& summary) { summary.on_sequence_model = ((parser_->SchedulerType() == ModelParser::SEQUENCE) || @@ -1246,6 +1460,7 @@ InferenceProfiler::SummarizeClientStat( summary.client_stats.request_count = valid_request_count; summary.client_stats.sequence_count = valid_sequence_count; summary.client_stats.delayed_request_count = delayed_request_count; + summary.client_stats.response_count = response_count; summary.client_stats.duration_ns = duration_ns; float client_duration_sec = (float)summary.client_stats.duration_ns / NANOS_PER_SECOND; @@ -1253,6 +1468,7 @@ InferenceProfiler::SummarizeClientStat( valid_sequence_count / client_duration_sec; summary.client_stats.infer_per_sec = (valid_request_count * summary.batch_size) / client_duration_sec; + summary.client_stats.responses_per_sec = response_count / client_duration_sec; if (include_lib_stats_) { size_t completed_count = @@ -1276,40 +1492,176 @@ InferenceProfiler::SummarizeClientStat( return cb::Error::Success; } +void +InferenceProfiler::SummarizeSendRequestRate( + const double window_duration_s, const size_t num_sent_requests, + PerfStatus& summary) +{ + if (window_duration_s <= 0.0) { + throw std::runtime_error("window_duration_s must be positive"); + } + + summary.send_request_rate = num_sent_requests / window_duration_s; +} + cb::Error -InferenceProfiler::SummarizeServerStatsHelper( +InferenceProfiler::DetermineStatsModelVersion( const cb::ModelIdentifier& model_identifier, - const std::map& start_status, - const std::map& end_status, - ServerSideStats* server_stats) + const std::map& start_stats, + const std::map& end_stats, + int64_t* status_model_version) { - // If model_version is an empty string then look in the end status to find - // the latest (highest valued version) and use 
that as the version. - int64_t status_model_version = -1; - if (model_identifier.second.empty()) { - for (const auto& id : end_status) { - // Model name should match - if (model_identifier.first.compare(id.first.first) == 0) { - int64_t this_version = std::stoll(id.first.second); - status_model_version = std::max(status_model_version, this_version); + // If model_version is unspecified then look in the stats to find the + // version with stats that incremented during the measurement. + // + // If multiple versions had incremented stats, use the highest numbered one + // and print a warning + *status_model_version = -1; + bool multiple_found = false; + bool version_unspecified = model_identifier.second.empty(); + + if (version_unspecified) { + for (const auto& x : end_stats) { + const auto& end_id = x.first; + const auto& end_stat = x.second; + + bool is_correct_model_name = + model_identifier.first.compare(end_id.first) == 0; + + if (is_correct_model_name) { + uint64_t end_queue_count = end_stat.queue_count_; + uint64_t start_queue_count = 0; + + const auto& itr = start_stats.find(end_id); + if (itr != start_stats.end()) { + start_queue_count = itr->second.queue_count_; + } + + if (end_queue_count > start_queue_count) { + int64_t this_version = std::stoll(end_id.second); + if (*status_model_version != -1) { + multiple_found = true; + } + *status_model_version = std::max(*status_model_version, this_version); + } } } } else { - status_model_version = std::stoll(model_identifier.second); + const auto& itr = end_stats.find(model_identifier); + if (itr != end_stats.end()) { + *status_model_version = std::stoll(model_identifier.second); + } } - - if (status_model_version == -1) { + // FIXME - Investigate why composing model version is -1 in case of ensemble + // cache hit. + // + // In case of ensemble models, if top level response caching is + // enabled, the composing models versions are unavailable in case of a cache + // hit. This is due to the scheduler sends cache response and composing models + // do not get executed. It's a valid scenario and shouldn't throw error. + bool model_version_unspecified_and_invalid = + *status_model_version == -1 && + (parser_ == nullptr || !parser_->TopLevelResponseCachingEnabled()); + if (model_version_unspecified_and_invalid) { return cb::Error( - "failed to determine the requested model version", pa::GENERIC_ERROR); + "failed to find the requested model version", pa::GENERIC_ERROR); } + if (multiple_found) { + std::cerr << "WARNING: Multiple versions of model " + << model_identifier.first + << " are loaded in the triton server, and the version to use was " + "unspecified. The stats for that model may be inaccurate." 
+ << std::endl; + } + + return cb::Error::Success; +} + +// Only for unit-testing +#ifndef DOCTEST_CONFIG_DISABLE +cb::Error +InferenceProfiler::SetTopLevelResponseCaching( + bool enable_top_level_response_caching) +{ + parser_ = std::make_shared(cb::BackendKind::TRITON); + if (parser_ == nullptr) { + return cb::Error("Failed to initialize ModelParser"); + } + parser_->SetTopLevelResponseCaching(enable_top_level_response_caching); + return cb::Error::Success; +} +#endif + +cb::Error +InferenceProfiler::SummarizeServerStats( + const std::map& start_status, + const std::map& end_status, + ServerSideStats* server_stats) +{ + RETURN_IF_ERROR(SummarizeServerStats( + std::make_pair(parser_->ModelName(), parser_->ModelVersion()), + start_status, end_status, server_stats)); + return cb::Error::Success; +} + +cb::Error +InferenceProfiler::SummarizeServerStats( + const cb::ModelIdentifier& model_identifier, + const std::map& start_status, + const std::map& end_status, + ServerSideStats* server_stats) +{ + RETURN_IF_ERROR(SummarizeServerStatsHelper( + model_identifier, start_status, end_status, server_stats)); + + // Summarize the composing models, if any. + for (auto composing_model_identifier : + (*parser_->GetComposingModelMap())[model_identifier.first]) { + int64_t model_version; + RETURN_IF_ERROR(DetermineStatsModelVersion( + composing_model_identifier, start_status, end_status, &model_version)); + composing_model_identifier.second = std::to_string(model_version); + auto it = server_stats->composing_models_stat + .emplace(composing_model_identifier, ServerSideStats()) + .first; + RETURN_IF_ERROR(SummarizeServerStats( + composing_model_identifier, start_status, end_status, &(it->second))); + } + + return cb::Error::Success; +} + +cb::Error +InferenceProfiler::SummarizeServerStatsHelper( + const cb::ModelIdentifier& model_identifier, + const std::map& start_status, + const std::map& end_status, + ServerSideStats* server_stats) +{ + int64_t model_version; + RETURN_IF_ERROR(DetermineStatsModelVersion( + model_identifier, start_status, end_status, &model_version)); + const std::pair this_id( - model_identifier.first, std::to_string(status_model_version)); + model_identifier.first, std::to_string(model_version)); const auto& end_itr = end_status.find(this_id); if (end_itr == end_status.end()) { - return cb::Error( - "missing statistics for requested model", pa::GENERIC_ERROR); + // In case of ensemble models, if top level response caching is enabled, + // the composing models statistics are unavailable in case of a cache hit. + // This is due to the scheduler sends cache response and composing models do + // not get executed. It's a valid scenario and shouldn't throw error. 
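For reference, a simplified, self-contained sketch of the version-selection rule added in DetermineStatsModelVersion above: pick the version whose queue count grew during the measurement window, prefer the highest-numbered one when several grew, and warn in that case. The map and struct types here are illustrative stand-ins for the client-backend statistics types.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Illustrative stand-ins for cb::ModelIdentifier and the per-model stats.
using ModelId = std::pair<std::string, std::string>;  // {name, version}
struct Stats { uint64_t queue_count; };

// Returns -1 if no version of `model_name` saw traffic during the window.
int64_t DetermineVersion(
    const std::string& model_name, const std::map<ModelId, Stats>& start,
    const std::map<ModelId, Stats>& end) {
  int64_t version = -1;
  bool multiple = false;
  for (const auto& [id, end_stat] : end) {
    if (id.first != model_name) continue;
    uint64_t start_queue = 0;
    if (auto it = start.find(id); it != start.end())
      start_queue = it->second.queue_count;
    if (end_stat.queue_count > start_queue) {
      if (version != -1) multiple = true;
      version = std::max(version, static_cast<int64_t>(std::stoll(id.second)));
    }
  }
  if (multiple)
    std::cerr << "WARNING: multiple versions of " << model_name
              << " received traffic; stats may be inaccurate\n";
  return version;
}

int main() {
  std::map<ModelId, Stats> start{{{"m", "1"}, {10}}, {{"m", "2"}, {5}}};
  std::map<ModelId, Stats> end{{{"m", "1"}, {10}}, {{"m", "2"}, {9}}};
  std::cout << DetermineVersion("m", start, end) << "\n";  // prints 2
}
```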
+ bool stats_not_found_and_invalid = + model_version == -1 && !parser_->TopLevelResponseCachingEnabled(); + if (stats_not_found_and_invalid) { + return cb::Error( + "missing statistics for requested model", pa::GENERIC_ERROR); + } else { + // Setting server stats 0 for composing model in case of ensemble request + // cache hit since the composing model will not be executed + server_stats->Reset(); + } } else { uint64_t start_infer_cnt = 0; uint64_t start_exec_cnt = 0; @@ -1383,39 +1735,22 @@ InferenceProfiler::SummarizeServerStatsHelper( return cb::Error::Success; } -cb::Error -InferenceProfiler::SummarizeServerStats( - const cb::ModelIdentifier& model_identifier, - const std::map& start_status, - const std::map& end_status, - ServerSideStats* server_stats) +void +InferenceProfiler::SummarizeOverhead( + const uint64_t window_duration_ns, const uint64_t idle_ns, + PerfStatus& summary) { - RETURN_IF_ERROR(SummarizeServerStatsHelper( - model_identifier, start_status, end_status, server_stats)); - - // Summarize the composing models, if any. - for (const auto& composing_model_identifier : - (*parser_->GetComposingModelMap())[model_identifier.first]) { - auto it = server_stats->composing_models_stat - .emplace(composing_model_identifier, ServerSideStats()) - .first; - RETURN_IF_ERROR(SummarizeServerStats( - composing_model_identifier, start_status, end_status, &(it->second))); + // The window start/stop is not instantaneous. It is possible that the PA + // overhead is smaller than the delay in the window start/stop process. Treat + // it as 0% overhead (100% idle) in that case + // + if (idle_ns > window_duration_ns) { + summary.overhead_pct = 0; + } else { + uint64_t overhead_ns = window_duration_ns - idle_ns; + double overhead_pct = double(overhead_ns) / window_duration_ns * 100; + summary.overhead_pct = overhead_pct; } - - return cb::Error::Success; -} - -cb::Error -InferenceProfiler::SummarizeServerStats( - const std::map& start_status, - const std::map& end_status, - ServerSideStats* server_stats) -{ - RETURN_IF_ERROR(SummarizeServerStats( - std::make_pair(parser_->ModelName(), parser_->ModelVersion()), - start_status, end_status, server_stats)); - return cb::Error::Success; } bool diff --git a/inference_profiler.h b/inference_profiler.h index 3172f630..cfd2a3b6 100644 --- a/inference_profiler.h +++ b/inference_profiler.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,6 +35,7 @@ #include #include #include + #include "concurrency_manager.h" #include "constants.h" #include "custom_load_manager.h" @@ -42,12 +43,16 @@ #include "metrics_manager.h" #include "model_parser.h" #include "mpi_utils.h" +#include "periodic_concurrency_manager.h" +#include "profile_data_collector.h" #include "request_rate_manager.h" namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockInferenceProfiler; class TestInferenceProfiler; +class ModelParser; #endif /// Constant parameters that determine the whether stopping criteria has met @@ -60,7 +65,7 @@ struct LoadParams { double stability_threshold; }; -/// Data structure to keep track of real-time load status and determine wether +/// Data structure to keep track of real-time load status and determine whether /// stopping criteria has met for the current phase of testing. struct LoadStatus { // Stores the observations of infer_per_sec and latencies in a vector @@ -115,6 +120,28 @@ struct ServerSideStats { uint64_t cache_miss_time_ns; std::map composing_models_stat; + // This function sets composing model server stats to 0 in case of a cache hit + // when top level response cache is enabled, since composing models are not + // executed and do not have any stats + void Reset() + { + inference_count = 0; + execution_count = 0; + success_count = 0; + queue_count = 0; + compute_input_count = 0; + compute_infer_count = 0; + compute_output_count = 0; + cumm_time_ns = 0; + queue_time_ns = 0; + compute_input_time_ns = 0; + compute_infer_time_ns = 0; + compute_output_time_ns = 0; + cache_hit_count = 0; + cache_hit_time_ns = 0; + cache_miss_count = 0; + cache_miss_time_ns = 0; + } }; /// Holds the statistics recorded at the client side. @@ -125,6 +152,8 @@ struct ClientSideStats { uint64_t sequence_count; // The number of requests that missed their schedule uint64_t delayed_request_count; + // The number of responses + uint64_t response_count; uint64_t duration_ns; uint64_t avg_latency_ns; // a ordered map of percentiles to be reported ( pair) @@ -138,6 +167,7 @@ struct ClientSideStats { uint64_t avg_receive_time_ns; // Per sec stat double infer_per_sec; + double responses_per_sec; double sequence_per_sec; // Completed request count reported by the client library @@ -152,12 +182,17 @@ struct PerfStatus { ServerSideStats server_stats; ClientSideStats client_stats; std::vector metrics{}; + double overhead_pct; bool on_sequence_model; // placeholder for the latency value that is used for conditional checking uint64_t stabilizing_latency_ns; + // Metric for requests sent per second + double send_request_rate{0.0}; }; +cb::Error ReportPrometheusMetrics(const Metrics& metrics); + //============================================================================== /// A InferenceProfiler is a helper class that measures and summarizes the /// inference statistic under different concurrency level. @@ -173,9 +208,9 @@ struct PerfStatus { /// time. /// 2. After given time interval, the profiler gets end status from the server /// and records the end time. -/// 3. The profiler obtains the timestamps recorded by concurrency manager, -/// and uses the timestamps that are recorded between start time and end time -/// to measure client side status and update status_summary. +/// 3. 
The profiler obtains the request records recorded by concurrency manager, +/// and uses the request records that are recorded between start time and end +/// time to measure client side status and update status_summary. /// class InferenceProfiler { public: @@ -183,7 +218,7 @@ class InferenceProfiler { /// \param verbose Whether to print verbose logging. /// \param stability_threshold The range that the measurement is considered as /// stable. i.e. within (1 +/- stability_threshold) * average value of the - /// last 3 measurements. The criterias are "infer per second" and "average + /// last 3 measurements. The criteria are "infer per second" and "average /// latency", or "infer per second" and "percentile latency" if valid /// percentile is set (see 'percentile' below). /// \param measurement_window_ms The duration of each measurement in msec. @@ -209,6 +244,10 @@ class InferenceProfiler { /// \param metrics_interval_ms The interval at which the server-side metrics /// \param should_collect_metrics Whether server-side inference server metrics /// should be collected. + /// \param overhead_pct_threshold User set threshold above which the PA + /// overhead is too significant to provide usable results. + /// \param collector Collector for the profile data from experiments + /// \param should_collect_profile_data Whether to collect profile data. /// \return cb::Error object indicating success or failure. static cb::Error Create( const bool verbose, const double stability_threshold, @@ -220,7 +259,10 @@ class InferenceProfiler { std::unique_ptr* profiler, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, const uint64_t metrics_interval_ms, - const bool should_collect_metrics); + const bool should_collect_metrics, const double overhead_pct_threshold, + const bool async_mode, + const std::shared_ptr collector, + const bool should_collect_profile_data); /// Performs the profiling on the given range with the given search algorithm. /// For profiling using request rate invoke template with double, otherwise @@ -230,25 +272,29 @@ class InferenceProfiler { /// \param step The step size to move along the search range in linear search /// or the precision in binary search. /// \param search_mode The search algorithm to be applied. - /// \param summary Returns the trace of the measurement along the search - /// path. + /// \param request_count The number of requests to generate in each + /// experiment. If 0, then there is no limit, and it will generate until + /// stable. + /// \param summary Returns the trace of the measurement along the search path. /// \return cb::Error object indicating success or failure. 
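The comment above documents the three search modes. The toy driver below sketches how a linear and a binary search over a concurrency range might proceed against an opaque "meets threshold" probe; the names and the exact bisection details are illustrative only, not the perf_analyzer implementation.

```cpp
#include <iostream>

// Hypothetical probe: does this concurrency level meet the latency threshold?
bool MeetsThreshold(int concurrency) { return concurrency <= 12; }

int main() {
  const int start = 1, end = 32, step = 1;

  // Linear search: step up until the threshold is no longer met.
  int best_linear = 0;
  for (int c = start; c <= end && MeetsThreshold(c); c += step) best_linear = c;

  // Binary search: if the upper bound fails, bisect until the interval <= step.
  int lo = start, hi = end;
  if (MeetsThreshold(hi)) {
    lo = hi;  // even the upper bound passes
  } else {
    while (hi - lo > step) {
      int mid = (lo + hi) / 2;
      if (MeetsThreshold(mid)) lo = mid; else hi = mid;
    }
  }
  std::cout << "linear: " << best_linear << ", binary: " << lo << "\n";
}
```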
template cb::Error Profile( const T start, const T end, const T step, const SearchMode search_mode, - std::vector& summary) + const size_t request_count, std::vector& perf_statuses) { cb::Error err; bool meets_threshold, is_stable; if (search_mode == SearchMode::NONE) { - err = Profile(summary, meets_threshold, is_stable); + err = Profile(request_count, perf_statuses, meets_threshold, is_stable); if (!err.IsOk()) { return err; } } else if (search_mode == SearchMode::LINEAR) { T current_value = start; do { - err = Profile(current_value, summary, meets_threshold, is_stable); + err = Profile( + current_value, request_count, perf_statuses, meets_threshold, + is_stable); if (!err.IsOk()) { return err; } @@ -262,11 +308,13 @@ class InferenceProfiler { "Failed to obtain stable measurement.", pa::STABILITY_ERROR); } } else { - err = Profile(start, summary, meets_threshold, is_stable); + err = Profile( + start, request_count, perf_statuses, meets_threshold, is_stable); if (!err.IsOk() || (!meets_threshold)) { return err; } - err = Profile(end, summary, meets_threshold, is_stable); + err = Profile( + end, request_count, perf_statuses, meets_threshold, is_stable); if (!err.IsOk() || (meets_threshold)) { return err; } @@ -275,7 +323,9 @@ class InferenceProfiler { T this_end = end; while ((this_end - this_start) > step) { T current_value = (this_end + this_start) / 2; - err = Profile(current_value, summary, meets_threshold, is_stable); + err = Profile( + current_value, request_count, perf_statuses, meets_threshold, + is_stable); if (!err.IsOk()) { return err; } @@ -289,6 +339,18 @@ class InferenceProfiler { return cb::Error::Success; } + cb::Error ProfilePeriodicConcurrencyMode() + { + auto& manager{dynamic_cast(*manager_)}; + std::vector request_records{manager.RunExperiment()}; + // FIXME - Refactor collector class to not need ID or window in the case of + // periodic concurrency mode + InferenceLoadMode id{1, 0.0}; + collector_->AddWindow(id, 0, UINT64_MAX); + collector_->AddData(id, std::move(request_records)); + return cb::Error::Success; + } + bool IncludeServerStats() { return include_server_stats_; } private: @@ -301,7 +363,10 @@ class InferenceProfiler { std::shared_ptr profile_backend, std::unique_ptr manager, uint64_t measurement_request_count, MeasurementMode measurement_mode, std::shared_ptr mpi_driver, - const uint64_t metrics_interval_ms, const bool should_collect_metrics); + const uint64_t metrics_interval_ms, const bool should_collect_metrics, + const double overhead_pct_threshold, const bool async_mode, + const std::shared_ptr collector, + const bool should_collect_profile_data); /// Actively measure throughput in every 'measurement_window' msec until the /// throughput is stable. Once the throughput is stable, it adds the @@ -312,49 +377,65 @@ class InferenceProfiler { /// measures (we can't get the exact server status right before the first /// request and right after the last request in the measurement window). /// \param concurrent_request_count The concurrency level for the measurement. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the setting meets the threshold. + /// \param perf_statuses Appends the measurements summary at the end of this + /// list. + /// \param request_count The number of requests to generate when profiling. If + /// 0, then there is no limit, and it will generate until stable. + /// \param meets_threshold Returns whether the setting meets the + /// threshold. 
/// \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success or failure. cb::Error Profile( - const size_t concurrent_request_count, std::vector& summary, - bool& meets_threshold, bool& is_stable); + const size_t concurrent_request_count, const size_t request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable); /// Similar to above function, but instead of setting the concurrency, it /// sets the specified request rate for measurements. /// \param request_rate The request rate for inferences. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the setting meets the threshold. + /// \param request_count The number of requests to generate when profiling. If + /// 0, then there is no limit, and it will generate until stable. + /// \param perf_statuses Appends the measurements summary at the end of this + /// list. + /// \param meets_threshold Returns whether the setting meets the + /// threshold. /// \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success or failure. cb::Error Profile( - const double request_rate, std::vector& summary, - bool& meets_threshold, bool& is_stable); + const double request_rate, const size_t request_count, + std::vector& perf_statuses, bool& meets_threshold, + bool& is_stable); - /// Measures throughput and latencies for custom load without controling + /// Measures throughput and latencies for custom load without controlling /// request rate nor concurrency. Requires load manager to be loaded with /// a file specifying the time intervals. - /// \param summary Appends the measurements summary at the end of this list. - /// \param meets_threshold Returns whether the measurement met the threshold. + /// \param request_count The number of requests to generate when profiling. If + /// 0, then there is no limit, and it will generate until stable. + /// \param perf_statuses Appends the measurements summary at the end of this + /// list. + /// \param meets_threshold Returns whether the measurement met the + /// threshold. /// \param is_stable Returns whether the measurement is stable. /// \return cb::Error object indicating success /// or failure. cb::Error Profile( - std::vector& summary, bool& meets_threshold, bool& is_stable); + const size_t request_count, std::vector& perf_statuses, + bool& meets_threshold, bool& is_stable); /// A helper function for profiling functions. - /// \param clean_starts Whether or not to reset load cycle with every - /// measurement trials. /// \param status_summary Returns the summary of the measurement. + /// \param request_count The number of requests to generate when profiling. If + /// 0, then there is no limit, and it will generate until stable. /// \param is_stable Returns whether the measurement stabilized or not. /// \return cb::Error object indicating success or failure. cb::Error ProfileHelper( - const bool clean_starts, PerfStatus& status_summary, bool* is_stable); + PerfStatus& status_summary, size_t request_count, bool* is_stable); /// A helper function to determine if profiling is stable /// \param load_status Stores the observations of infer_per_sec and latencies + /// \param check_latency Whether to check latency for stability /// \return Returns if the threshold and latencies are stable. 
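The stability test documented here reduces to a max/min ratio check over the most recent measurements, with the latency check now optional. The standalone sketch below (not the actual DetermineStability/CheckWindowForStability code) shows the shape of that predicate; the zero-throughput guard is an added assumption for failed windows recorded as 0.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Ratio test over the last `window` samples: stable if max/min stays within
// (1 + threshold).
bool WindowStable(const std::vector<double>& samples, size_t window,
                  double threshold) {
  if (samples.size() < window) return false;
  auto first = samples.end() - window;
  auto [min_it, max_it] = std::minmax_element(first, samples.end());
  if (*min_it <= 0.0) return false;  // e.g. a failed measurement recorded as 0
  return *max_it / *min_it <= 1.0 + threshold;
}

bool IsStable(const std::vector<double>& infer_per_sec,
              const std::vector<double>& latency_ns, size_t window,
              double threshold, bool check_latency = true) {
  return WindowStable(infer_per_sec, window, threshold) &&
         (!check_latency || WindowStable(latency_ns, window, threshold));
}

int main() {
  std::vector<double> ips{950, 1000, 990, 1005};
  std::vector<double> lat{6.0e6, 5.2e6, 5.0e6, 5.1e6};
  std::cout << std::boolalpha
            << IsStable(ips, lat, 3, 0.10) << " "           // both stable
            << IsStable(ips, lat, 3, 0.01, false) << "\n";  // throughput only
}
```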
- bool DetermineStability(LoadStatus& load_status); + bool DetermineStability(LoadStatus& load_status, bool check_latency = true); /// Check if latency at index idx is within the latency threshold /// \param idx index in latency vector @@ -373,8 +454,10 @@ class InferenceProfiler { /// for a single window starting at idx /// \param idx index in latency vector /// \param load_status Stores the observations of infer_per_sec and latencies + /// \param check_latency Whether to check latency for stability /// \return Returns whether inference and latency are stable - bool CheckWindowForStability(size_t idx, LoadStatus& load_status); + bool CheckWindowForStability( + size_t idx, LoadStatus& load_status, bool check_latency); /// Check if observed inferences are within threshold /// for a single window starting at idx @@ -410,7 +493,7 @@ class InferenceProfiler { cb::Error GetServerSideStatus( std::map* model_status); - /// Sumarize the measurement with the provided statistics. + /// Summarize the measurement with the provided statistics. /// \param start_status The model status at the start of the measurement. /// \param end_status The model status at the end of the measurement. /// \param start_stat The accumulated context status at the start. @@ -431,16 +514,28 @@ class InferenceProfiler { /// sequence model. /// \param latencies Returns the vector of request latencies where the /// requests are completed within the measurement window. - void ValidLatencyMeasurement( + /// \param response_count Returns the number of responses + /// \param valid_requests Returns a vector of valid request records + virtual void ValidLatencyMeasurement( const std::pair& valid_range, size_t& valid_sequence_count, size_t& delayed_request_count, - std::vector* latencies); + std::vector* latencies, size_t& response_count, + std::vector& valid_requests); + + /// Add the data from the request records to the Raw Data Collector + /// \param perf_status PerfStatus of the current measurement + /// \param window_start_ns The window start timestamp in nanoseconds. + /// \param window_end_ns The window end timestamp in nanoseconds. + /// \param request_records The request records to collect. + void CollectData( + PerfStatus& perf_status, uint64_t window_start_ns, uint64_t window_end_ns, + std::vector&& request_records); /// \param latencies The vector of request latencies collected. /// \param summary Returns the summary that the latency related fields are /// set. /// \return cb::Error object indicating success or failure. - cb::Error SummarizeLatency( + virtual cb::Error SummarizeLatency( const std::vector& latencies, PerfStatus& summary); /// \param latencies The vector of request latencies collected. @@ -457,24 +552,51 @@ class InferenceProfiler { /// \param valid_sequence_count The number of completed sequences recorded. /// \param delayed_request_count The number of requests that missed their /// schedule. + /// \param response_count The number of responses. /// \param summary Returns the summary that the fields recorded by /// client are set. /// \return cb::Error object indicating success or failure. - cb::Error SummarizeClientStat( + virtual cb::Error SummarizeClientStat( const cb::InferStat& start_stat, const cb::InferStat& end_stat, const uint64_t duration_ns, const size_t valid_request_count, const size_t delayed_request_count, const size_t valid_sequence_count, + const size_t response_count, PerfStatus& summary); + + /// Adds the send request rate metric to the summary object. 
+ /// \param window_duration_s The duration of the window in seconds. + /// \param num_sent_requests The number of requests sent during the last + /// window. + /// \param summary The summary object to be updated with the send request rate + /// metric. + void SummarizeSendRequestRate( + const double window_duration_s, const size_t num_sent_requests, PerfStatus& summary); + /// Given a model_identifier to gather stats for, and a map of ALL stats, + /// determine which version of the model should be gathered /// \param model_identifier A pair of model_name and model_version to identify - /// a specific model. + /// a specific model + /// \param start_stats The stats for all models at the start of the + /// measurement + /// \param end_stats The stats for all models at the end of the measurement + /// \param model_version The determined model version + + cb::Error DetermineStatsModelVersion( + const cb::ModelIdentifier& model_identifier, + const std::map& start_stats, + const std::map& end_stats, + int64_t* model_version); + +#ifndef DOCTEST_CONFIG_DISABLE + cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching); +#endif + /// \param start_status The model status at the start of the measurement. /// \param end_status The model status at the end of the measurement. /// \param server_stats Returns the summary that the fields recorded by server /// are set. /// \return cb::Error object indicating success or failure. - cb::Error SummarizeServerStatsHelper( - const cb::ModelIdentifier& model_identifier, + cb::Error SummarizeServerStats( const std::map& start_status, const std::map& end_status, ServerSideStats* server_stats); @@ -492,16 +614,29 @@ class InferenceProfiler { const std::map& end_status, ServerSideStats* server_stats); + /// \param model_identifier A pair of model_name and model_version to identify + /// a specific model. /// \param start_status The model status at the start of the measurement. /// \param end_status The model status at the end of the measurement. /// \param server_stats Returns the summary that the fields recorded by server /// are set. /// \return cb::Error object indicating success or failure. - cb::Error SummarizeServerStats( + cb::Error SummarizeServerStatsHelper( + const cb::ModelIdentifier& model_identifier, const std::map& start_status, const std::map& end_status, ServerSideStats* server_stats); + /// Calculate the overhead and put the results into the summary + /// + /// \param window_duration_ns The duration of the window + /// \param idle_ns The average worker idle time during the window + /// \param summary The summary object to be updated with overhead stats + /// + void SummarizeOverhead( + const uint64_t window_duration_ns, const uint64_t idle_ns, + PerfStatus& summary); + /// Returns true if all MPI ranks (models) are stable. Should only be run if /// and only if IsMPIRun() returns true. /// \param current_rank_stability The stability of the current rank. @@ -514,7 +649,7 @@ class InferenceProfiler { /// \param perf_status List of perf status reports to be merged. /// \param summary_status Final merged summary status. /// \return cb::Error object indicating success or failure. - cb::Error MergePerfStatusReports( + virtual cb::Error MergePerfStatusReports( std::deque& perf_status, PerfStatus& summary_status); /// Merge individual server side statistics into a single server side report. @@ -522,7 +657,7 @@ class InferenceProfiler { /// merged. /// \param server_side_summary Final merged summary status. 
/// \return cb::Error object indicating success or failure. - cb::Error MergeServerSideStats( + virtual cb::Error MergeServerSideStats( std::vector& server_side_stats, ServerSideStats& server_side_summary); @@ -623,14 +758,15 @@ class InferenceProfiler { std::shared_ptr parser_; std::shared_ptr profile_backend_; std::unique_ptr manager_; + std::shared_ptr collector_; LoadParams load_parameters_; bool include_lib_stats_; bool include_server_stats_; std::shared_ptr mpi_driver_; - /// The timestamps of the requests completed during all measurements - TimestampVector all_timestamps_; + /// The request records of the requests completed during all measurements + std::vector all_request_records_; /// The end time of the previous measurement window uint64_t previous_window_end_ns_; @@ -647,11 +783,24 @@ class InferenceProfiler { /// Whether server-side inference server metrics should be collected. bool should_collect_metrics_{false}; + /// User set threshold above which the PA overhead is too significant to + /// provide usable results. + const double overhead_pct_threshold_{0.0}; + + // Whether to collect profile data. + bool should_collect_profile_data_{false}; + + // Whether the client is operating in async mode. + const bool async_mode_{false}; + #ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockInferenceProfiler; friend TestInferenceProfiler; + friend ModelParser; - protected: + public: InferenceProfiler() = default; #endif }; + }} // namespace triton::perfanalyzer diff --git a/ischeduler.h b/ischeduler.h new file mode 100644 index 00000000..a854b64b --- /dev/null +++ b/ischeduler.h @@ -0,0 +1,42 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include "rate_schedule.h" + +namespace triton { namespace perfanalyzer { + +/// Interface for worker threads that use a schedule +/// +class IScheduler { + public: + /// Provides the schedule that should be followed + /// + virtual void SetSchedule(RateSchedulePtr_t schedule) = 0; +}; + +}} // namespace triton::perfanalyzer diff --git a/iworker.h b/iworker.h new file mode 100644 index 00000000..3a72f4c1 --- /dev/null +++ b/iworker.h @@ -0,0 +1,38 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +namespace triton { namespace perfanalyzer { + +/// Interface for worker threads that generate inference requests +/// +class IWorker { + public: + virtual void Infer() = 0; +}; + +}} // namespace triton::perfanalyzer diff --git a/load_manager.cc b/load_manager.cc index 109fd21c..1f648a7f 100644 --- a/load_manager.cc +++ b/load_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,108 +27,13 @@ #include "load_manager.h" #include -#include "shm_utils.h" - -#ifdef TRITON_ENABLE_GPU -#include - -#define RETURN_IF_CUDA_ERR(FUNC) \ - { \ - const cudaError_t result = FUNC; \ - if (result != cudaSuccess) { \ - return cb::Error( \ - "CUDA exception (line " + std::to_string(__LINE__) + \ - "): " + cudaGetErrorName(result) + " (" + \ - cudaGetErrorString(result) + ")", \ - pa::GENERIC_ERROR); \ - } \ - } -#endif // TRITON_ENABLE_GPU +#include "client_backend/client_backend.h" +#include "infer_data_manager_factory.h" namespace triton { namespace perfanalyzer { -namespace { - -std::string -TensorToRegionName(std::string name) -{ - // Remove slashes from the name, if any. 
- name.erase( - std::remove_if( - name.begin(), name.end(), - [](const char& c) { return ((c == '/') || (c == '\\')); }), - name.end()); - return name; -} - -#ifdef TRITON_ENABLE_GPU -cb::Error -CreateCUDAIPCHandle( - cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0) -{ - // Set the GPU device to the desired GPU - RETURN_IF_CUDA_ERR(cudaSetDevice(device_id)); - - // Create IPC handle for data on the gpu - RETURN_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr)); - - return cb::Error::Success; -} - -#endif // TRITON_ENABLE_GPU - -} // namespace - -LoadManager::~LoadManager() -{ - cb::Error err; - if (using_shared_memory_ && backend_.get() != nullptr) { - err = backend_->UnregisterAllSharedMemory(); - if (!err.IsOk()) { - std::cerr << "Unable to unregister all shared memory regions" - << std::endl; - } - if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) { - for (auto region : shared_memory_regions_) { - err = backend_->UnmapSharedMemory( - shared_memory_regions_[region.first].first, - shared_memory_regions_[region.first].second); - if (!err.IsOk()) { - std::cerr << "Unable to unmap shared memory with key (" - << region.first << "): Starting: " - << static_cast( - shared_memory_regions_[region.first].first) - << ", size: " << shared_memory_regions_[region.first].second - << std::endl; - } - err = backend_->UnlinkSharedMemoryRegion(region.first); - if (!err.IsOk()) { - std::cerr << "Unable to unlink shared memory with key: " - << region.first << std::endl; - } - } - } else if (shared_memory_type_ == SharedMemoryType::CUDA_SHARED_MEMORY) { -#ifdef TRITON_ENABLE_GPU - for (auto region : shared_memory_regions_) { - cudaError_t cuda_err = - cudaFree(shared_memory_regions_[region.first].first); - if (cuda_err != cudaSuccess) { - std::cerr << "Unable to free cuda shared memory for " << region.first - << ": Starting: " - << static_cast( - shared_memory_regions_[region.first].first) - << ", size: " << shared_memory_regions_[region.first].second - << " bytes, Details: " << cudaGetErrorString(cuda_err) - << std::endl; - } - } -#endif // TRITON_ENABLE_GPU - } - } -} - cb::Error LoadManager::CheckHealth() { @@ -154,20 +59,19 @@ LoadManager::CheckHealth() } cb::Error -LoadManager::SwapTimestamps(TimestampVector& new_timestamps) +LoadManager::SwapRequestRecords(std::vector& new_request_records) { - TimestampVector total_timestamp; - // Gather request timestamps with proper locking from all the worker - // threads + std::vector total_request_records; + // Gather request records with proper locking from all the worker threads for (auto& thread_stat : threads_stat_) { std::lock_guard lock(thread_stat->mu_); - total_timestamp.insert( - total_timestamp.end(), thread_stat->request_timestamps_.begin(), - thread_stat->request_timestamps_.end()); - thread_stat->request_timestamps_.clear(); + total_request_records.insert( + total_request_records.end(), thread_stat->request_records_.begin(), + thread_stat->request_records_.end()); + thread_stat->request_records_.clear(); } // Swap the results - total_timestamp.swap(new_timestamps); + total_request_records.swap(new_request_records); return cb::Error::Success; } @@ -177,7 +81,7 @@ LoadManager::CountCollectedRequests() uint64_t num_of_requests = 0; for (auto& thread_stat : threads_stat_) { std::lock_guard lock(thread_stat->mu_); - num_of_requests += thread_stat->request_timestamps_.size(); + num_of_requests += thread_stat->request_records_.size(); } return num_of_requests; } @@ -185,6 +89,11 @@ 
LoadManager::CountCollectedRequests() cb::Error LoadManager::GetAccumulatedClientStat(cb::InferStat* contexts_stat) { + contexts_stat->completed_request_count = 0; + contexts_stat->cumulative_receive_time_ns = 0; + contexts_stat->cumulative_send_time_ns = 0; + contexts_stat->cumulative_total_request_time_ns = 0; + for (auto& thread_stat : threads_stat_) { std::lock_guard lock(thread_stat->mu_); for (auto& context_stat : thread_stat->contexts_stat_) { @@ -201,25 +110,102 @@ LoadManager::GetAccumulatedClientStat(cb::InferStat* contexts_stat) return cb::Error::Success; } +uint64_t +LoadManager::GetIdleTime() +{ + uint64_t total{0}; + size_t num_active_threads = 0; + for (auto& thread_stat : threads_stat_) { + std::lock_guard lock(thread_stat->mu_); + uint64_t idle_time = thread_stat->idle_timer.GetIdleTime(); + if (idle_time) { + total += idle_time; + num_active_threads++; + } + } + + // TODO REFACTOR TMA-1043 InferDataManager should have an API to get + // num_active_threads. This method of determining active threads isn't fully + // accurate + if (num_active_threads) { + total /= num_active_threads; + } + + return total; +} + +void +LoadManager::ResetIdleTime() +{ + for (auto& thread_stat : threads_stat_) { + std::lock_guard lock(thread_stat->mu_); + thread_stat->idle_timer.Reset(); + } +} + +const size_t +LoadManager::GetAndResetNumSentRequests() +{ + size_t num_sent_requests{0}; + + for (auto& thread_stat : threads_stat_) { + num_sent_requests += thread_stat->num_sent_requests_; + thread_stat->num_sent_requests_ = 0; + } + + return num_sent_requests; +} + LoadManager::LoadManager( const bool async, const bool streaming, const int32_t batch_size, - const size_t max_threads, const size_t sequence_length, - const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, - const std::shared_ptr& factory) + const size_t max_threads, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters) : async_(async), streaming_(streaming), batch_size_(batch_size), - max_threads_(max_threads), sequence_length_(sequence_length), - shared_memory_type_(shared_memory_type), - output_shm_size_(output_shm_size), start_sequence_id_(start_sequence_id), - sequence_id_range_(sequence_id_range), parser_(parser), factory_(factory), - using_json_data_(false), using_shared_memory_(false), next_seq_id_(1) + max_threads_(max_threads), parser_(parser), factory_(factory), + using_json_data_(false) { on_sequence_model_ = ((parser_->SchedulerType() == ModelParser::SEQUENCE) || - (parser->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE)); + (parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE)); + + data_loader_.reset(new DataLoader(batch_size_)); + + infer_data_manager_ = InferDataManagerFactory::CreateInferDataManager( + max_threads, batch_size, shared_memory_type, output_shm_size, + request_parameters, parser, factory, data_loader_); +} + +void +LoadManager::InitManager( + const size_t string_length, const std::string& string_data, + const bool zero_input, std::vector& user_data, + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation) +{ + // Note, this is already caught by the CLI, but adding it here for extra + // protection + if (on_sequence_model_ 
&& batch_size_ > 1) { + throw PerfAnalyzerException( + "error: sequence models do not support batching", GENERIC_ERROR); + } + + auto status = + InitManagerInputs(string_length, string_data, zero_input, user_data); + THROW_IF_ERROR(status, "Failed to init manager inputs"); - data_loader_.reset(new DataLoader(batch_size)); + THROW_IF_ERROR( + infer_data_manager_->Init(), "Unable to init infer data manager"); + + sequence_manager_ = MakeSequenceManager( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data_, + data_loader_); + + InitManagerFinalize(); } cb::Error @@ -232,6 +218,8 @@ LoadManager::InitManagerInputs( // Read provided data if (!user_data.empty()) { if (IsDirectory(user_data[0])) { + RETURN_IF_ERROR(data_loader_->ValidateIOExistsInModel( + parser_->Inputs(), parser_->Outputs(), user_data[0])); RETURN_IF_ERROR(data_loader_->ReadDataFromDir( parser_->Inputs(), parser_->Outputs(), user_data[0])); } else { @@ -240,8 +228,6 @@ LoadManager::InitManagerInputs( RETURN_IF_ERROR(data_loader_->ReadDataFromJSON( parser_->Inputs(), parser_->Outputs(), json_file)); } - distribution_ = std::uniform_int_distribution( - 0, data_loader_->GetDataStreamsCount() - 1); std::cout << " Successfully read data for " << data_loader_->GetDataStreamsCount() << " stream/streams"; if (data_loader_->GetDataStreamsCount() == 1) { @@ -261,607 +247,6 @@ LoadManager::InitManagerInputs( return cb::Error::Success; } -cb::Error -LoadManager::InitSharedMemory() -{ - using_shared_memory_ = true; - - // Calling this function for the clean start - backend_->UnregisterAllSharedMemory(); - - // Allocate the shared memory for outputs - for (const auto& output : *(parser_->Outputs())) { - int64_t batch1_bytesize = - ByteSize(output.second.shape_, output.second.datatype_); - if (batch1_bytesize < 0) { - batch1_bytesize = output_shm_size_; - } - uint8_t* output_shm_ptr; - size_t alloc_size = batch1_bytesize * batch_size_; - std::string region_name(TensorToRegionName(output.first)); - if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) { - std::string shm_key("/" + region_name); - int shm_fd_op; - RETURN_IF_ERROR( - backend_->CreateSharedMemoryRegion(shm_key, alloc_size, &shm_fd_op)); - RETURN_IF_ERROR(backend_->MapSharedMemory( - shm_fd_op, 0, alloc_size, (void**)&output_shm_ptr)); - - shared_memory_regions_[region_name] = - std::pair(output_shm_ptr, alloc_size); - - RETURN_IF_ERROR(backend_->RegisterSystemSharedMemory( - region_name, shm_key, alloc_size)); - } else { -#ifdef TRITON_ENABLE_GPU - cudaError_t cuda_err = cudaMalloc((void**)&output_shm_ptr, alloc_size); - if (cuda_err != cudaSuccess) { - return cb::Error( - "unable to allocate memory of " + std::to_string(alloc_size) + - " bytes on gpu for output " + output.first + " : " + - std::string(cudaGetErrorString(cuda_err)), - pa::GENERIC_ERROR); - } - shared_memory_regions_[region_name] = - std::pair(output_shm_ptr, alloc_size); - - cudaIpcMemHandle_t cuda_handle; - RETURN_IF_ERROR(CreateCUDAIPCHandle(&cuda_handle, (void*)output_shm_ptr)); - // Using GPU with device id 0 - RETURN_IF_ERROR(backend_->RegisterCudaSharedMemory( - region_name, cuda_handle, alloc_size)); -#endif // TRITON_ENABLE_GPU - } - } - - for (const auto& input : *(parser_->Inputs())) { - for (int i = 0; i < (int)data_loader_->GetDataStreamsCount(); i++) { - for (int j = 0; j < (int)data_loader_->GetTotalSteps(i); - j += batch_size_) { - // Extract the data for requested batch size - std::vector data_ptrs; - 
std::vector byte_size; - size_t alloc_size = 0; - size_t count = 0; - size_t max_count = input.second.is_shape_tensor_ ? 1 : batch_size_; - std::vector shape; - std::vector prev_shape; - while (count < max_count) { - const uint8_t* data_ptr; - size_t batch1_bytesize; - - RETURN_IF_ERROR(data_loader_->GetInputShape( - input.second, i, (j + count) % data_loader_->GetTotalSteps(i), - &shape)); - if (!shape.empty()) { - if (count == 0) { - prev_shape = shape; - } else { - if (!std::equal(shape.begin(), shape.end(), prev_shape.begin())) { - return cb::Error( - "can not batch tensors with different shapes together " - "(input '" + - input.first + "' expected shape " + - ShapeVecToString(prev_shape) + " and received " + - ShapeVecToString(shape), - pa::GENERIC_ERROR); - } - } - } - - RETURN_IF_ERROR(data_loader_->GetInputData( - input.second, i, (j + count) % data_loader_->GetTotalSteps(i), - &data_ptr, &batch1_bytesize)); - data_ptrs.push_back(data_ptr); - byte_size.push_back(batch1_bytesize); - alloc_size += batch1_bytesize; - count++; - } - - // Validate if the shape tensors specified in the batch are identical. - while (count < batch_size_) { - const uint8_t* data_ptr; - size_t batch1_bytesize; - RETURN_IF_ERROR(data_loader_->GetInputData( - input.second, i, (j + count) % data_loader_->GetTotalSteps(i), - &data_ptr, &batch1_bytesize)); - if (batch1_bytesize != byte_size.back()) { - return cb::Error( - "The shape tensors should be identical in a batch (mismatch in " - "size)", - pa::GENERIC_ERROR); - } - - for (size_t data_idx = 0; data_idx < batch1_bytesize; data_idx++) { - if (*(data_ptr + data_idx) != *(data_ptrs.back() + data_idx)) { - return cb::Error( - "The shape tensors should be identical in a batch (mismatch " - "in content)", - pa::GENERIC_ERROR); - } - } - count++; - } - - // Generate the shared memory region name - std::string region_name( - TensorToRegionName(input.first) + "_" + std::to_string(i) + "_" + - std::to_string(j)); - - uint8_t* input_shm_ptr; - if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) { - std::string shm_key("/" + region_name); - int shm_fd_ip; - RETURN_IF_ERROR(backend_->CreateSharedMemoryRegion( - shm_key, alloc_size, &shm_fd_ip)); - RETURN_IF_ERROR(backend_->MapSharedMemory( - shm_fd_ip, 0, alloc_size, (void**)&input_shm_ptr)); - shared_memory_regions_[region_name] = - std::pair(input_shm_ptr, alloc_size); - - // Populate the region with data - size_t count = 0; - size_t offset = 0; - size_t max_count = input.second.is_shape_tensor_ ? 1 : batch_size_; - while (count < max_count) { - memcpy(input_shm_ptr + offset, data_ptrs[count], byte_size[count]); - offset += byte_size[count]; - count++; - } - - // Register the region with triton - RETURN_IF_ERROR(backend_->RegisterSystemSharedMemory( - region_name, shm_key, alloc_size)); - } else { -#ifdef TRITON_ENABLE_GPU - cudaError_t cuda_err = cudaMalloc((void**)&input_shm_ptr, alloc_size); - if (cuda_err != cudaSuccess) { - return cb::Error( - "unable to allocate memory of " + std::to_string(alloc_size) + - "bytes on gpu for input " + region_name + " : " + - std::string(cudaGetErrorString(cuda_err)), - pa::GENERIC_ERROR); - } - - shared_memory_regions_[region_name] = - std::pair(input_shm_ptr, alloc_size); - - // Populate the region with data - size_t count = 0; - size_t offset = 0; - size_t max_count = input.second.is_shape_tensor_ ? 
1 : batch_size_; - while (count < max_count) { - cudaError_t cuda_err = cudaMemcpy( - (void*)(input_shm_ptr + offset), (void*)data_ptrs[count], - byte_size[count], cudaMemcpyHostToDevice); - if (cuda_err != cudaSuccess) { - return cb::Error( - "Failed to copy data to cuda shared memory for " + - region_name + " : " + - std::string(cudaGetErrorString(cuda_err)), - pa::GENERIC_ERROR); - } - offset += byte_size[count]; - count++; - } - - cudaIpcMemHandle_t cuda_handle; - RETURN_IF_ERROR( - CreateCUDAIPCHandle(&cuda_handle, (void*)input_shm_ptr)); - - // Register the region with triton - RETURN_IF_ERROR(backend_->RegisterCudaSharedMemory( - region_name, cuda_handle, alloc_size)); -#endif // TRITON_ENABLE_GPU - } - } - } - } - return cb::Error::Success; -} - -cb::Error -LoadManager::PrepareInfer(InferContext* ctx) -{ - // Initialize inputs - for (const auto& input : *(parser_->Inputs())) { - const uint8_t* data_ptr{nullptr}; - size_t batch1_bytesize; - // Set input shape before getting the input data - std::vector shape; - RETURN_IF_ERROR(data_loader_->GetInputShape(input.second, 0, 0, &shape)); - if (shape.empty() && (backend_->Kind() == cb::BackendKind::TRITON)) { - return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR); - } - - if ((parser_->MaxBatchSize() != 0) && (!input.second.is_shape_tensor_)) { - shape.insert(shape.begin(), (int64_t)batch_size_); - } - - cb::InferInput* infer_input; - RETURN_IF_ERROR(cb::InferInput::Create( - &infer_input, backend_->Kind(), input.first, shape, - input.second.datatype_)); - ctx->inputs_.push_back(infer_input); - - data_ptr = nullptr; - RETURN_IF_ERROR(data_loader_->GetInputData( - input.second, 0, 0, &data_ptr, &batch1_bytesize)); - - // Add optional input to request if data was found - if (data_ptr != nullptr) { - ctx->valid_inputs_.push_back(infer_input); - } - - if (!shape.empty()) { - size_t max_count = (parser_->MaxBatchSize() == 0) ? 1 : batch_size_; - for (size_t i = 0; i < max_count; ++i) { - RETURN_IF_ERROR(infer_input->AppendRaw(data_ptr, batch1_bytesize)); - } - } - } - - for (const auto& output : *(parser_->Outputs())) { - std::string region_name(TensorToRegionName(output.first)); - - cb::InferRequestedOutput* requested_output; - RETURN_IF_ERROR(cb::InferRequestedOutput::Create( - &requested_output, backend_->Kind(), output.first)); - ctx->outputs_.push_back(requested_output); - } - RETURN_IF_ERROR( - UpdateValidationOutputs(ctx->outputs_, 0, 0, ctx->expected_outputs_)); - - return cb::Error::Success; -} - -cb::Error -LoadManager::PrepareSharedMemoryInfer(InferContext* ctx) -{ - for (const auto& input : *(parser_->Inputs())) { - std::string region_name( - TensorToRegionName(input.first) + "_" + std::to_string(0) + "_" + - std::to_string(0)); - - std::vector shape; - RETURN_IF_ERROR(data_loader_->GetInputShape(input.second, 0, 0, &shape)); - if (!shape.empty()) { - if ((parser_->MaxBatchSize() != 0) && (!input.second.is_shape_tensor_)) { - shape.insert(shape.begin(), (int64_t)batch_size_); - } - } else { - return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR); - } - - cb::InferInput* infer_input; - RETURN_IF_ERROR(cb::InferInput::Create( - &infer_input, backend_->Kind(), input.first, shape, - input.second.datatype_)); - ctx->inputs_.push_back(infer_input); - - // FIXME: TMA-765 - Shared memory mode does not support optional inputs, - // currently, and will be implemented in the associated story. 
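The element type of the shared_memory_regions_ map is assumed below from how its entries are created and read back in this removed shared-memory path: each region name maps to the region's base address and its byte size (the ".second" later passed to SetSharedMemory()). A minimal sketch under that assumption, not the library's actual declaration:

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>

// Region name -> (base address of the mapped region, byte size of the region).
using SharedMemoryRegionMap =
    std::unordered_map<std::string, std::pair<uint8_t*, size_t>>;

// Registration stores the mapped pointer and the allocation size, e.g.
//   regions[region_name] = std::make_pair(input_shm_ptr, alloc_size);
// and binding an input later reads the byte size back out of the map, e.g.
//   infer_input->SetSharedMemory(region_name, regions[region_name].second);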
- ctx->valid_inputs_.push_back(infer_input); - - RETURN_IF_ERROR(infer_input->SetSharedMemory( - region_name, shared_memory_regions_[region_name].second)); - } - - for (const auto& output : *(parser_->Outputs())) { - std::string region_name(TensorToRegionName(output.first)); - - cb::InferRequestedOutput* requested_output; - RETURN_IF_ERROR(cb::InferRequestedOutput::Create( - &requested_output, backend_->Kind(), output.first)); - ctx->outputs_.push_back(requested_output); - - RETURN_IF_ERROR(requested_output->SetSharedMemory( - region_name, shared_memory_regions_[region_name].second)); - } - - return cb::Error::Success; -} - -cb::Error -LoadManager::UpdateInputs( - const std::vector& inputs, - std::vector& valid_inputs, int stream_index, - int step_index) -{ - // Validate update parameters here - size_t data_stream_count = data_loader_->GetDataStreamsCount(); - if (stream_index < 0 || stream_index >= (int)data_stream_count) { - return cb::Error( - "stream_index for retrieving the data should be less than " + - std::to_string(data_stream_count) + ", got " + - std::to_string(stream_index), - pa::GENERIC_ERROR); - } - size_t step_count = data_loader_->GetTotalSteps(stream_index); - if (step_index < 0 || step_index >= (int)step_count) { - return cb::Error( - "step_id for retrieving the data should be less than " + - std::to_string(step_count) + ", got " + std::to_string(step_index), - pa::GENERIC_ERROR); - } - - if (shared_memory_type_ == SharedMemoryType::NO_SHARED_MEMORY) { - RETURN_IF_ERROR(SetInputs(inputs, valid_inputs, stream_index, step_index)); - } else { - RETURN_IF_ERROR(SetInputsSharedMemory(inputs, stream_index, step_index)); - } - - return cb::Error::Success; -} - -cb::Error -LoadManager::UpdateValidationOutputs( - const std::vector& outputs, - int stream_index, int step_index, - std::vector>>& data) -{ - data.clear(); - // Validate update parameters here - size_t data_stream_count = data_loader_->GetDataStreamsCount(); - if (stream_index < 0 || stream_index >= (int)data_stream_count) { - return cb::Error( - "stream_index for retrieving the data should be less than " + - std::to_string(data_stream_count) + ", got " + - std::to_string(stream_index), - pa::GENERIC_ERROR); - } - size_t step_count = data_loader_->GetTotalSteps(stream_index); - if (step_index < 0 || step_index >= (int)step_count) { - return cb::Error( - "step_id for retrieving the data should be less than " + - std::to_string(step_count) + ", got " + std::to_string(step_index), - pa::GENERIC_ERROR); - } - - for (const auto& output : outputs) { - const auto& model_output = (*(parser_->Outputs()))[output->Name()]; - const uint8_t* data_ptr; - size_t batch1_bytesize; - const int* set_shape_values = nullptr; - int set_shape_value_cnt = 0; - - std::vector> output_data; - for (size_t i = 0; i < batch_size_; ++i) { - RETURN_IF_ERROR(data_loader_->GetOutputData( - output->Name(), stream_index, - (step_index + i) % data_loader_->GetTotalSteps(0), &data_ptr, - &batch1_bytesize)); - if (data_ptr == nullptr) { - break; - } - output_data.emplace_back(data_ptr, batch1_bytesize); - // Shape tensor only need the first batch element - if (model_output.is_shape_tensor_) { - break; - } - } - if (!output_data.empty()) { - data.emplace_back(std::move(output_data)); - } - } - return cb::Error::Success; -} - -cb::Error -LoadManager::ValidateOutputs( - const InferContext& ctx, const cb::InferResult* result_ptr) -{ - // Validate output if set - if (!ctx.expected_outputs_.empty()) { - for (size_t i = 0; i < ctx.outputs_.size(); ++i) { - const 
uint8_t* buf = nullptr; - size_t byte_size = 0; - result_ptr->RawData(ctx.outputs_[i]->Name(), &buf, &byte_size); - for (const auto& expected : ctx.expected_outputs_[i]) { - if (byte_size < expected.second) { - return cb::Error( - "Output size doesn't match expected size", pa::GENERIC_ERROR); - } else if (memcmp(buf, expected.first, expected.second) != 0) { - return cb::Error( - "Output doesn't match expected output", pa::GENERIC_ERROR); - } else { - buf += expected.second; - byte_size -= expected.second; - } - } - if (byte_size != 0) { - return cb::Error( - "Output size doesn't match expected size", pa::GENERIC_ERROR); - } - } - } - return cb::Error::Success; -} - -cb::Error -LoadManager::SetInputs( - const std::vector& inputs, - std::vector& valid_inputs, const int stream_index, - const int step_index) -{ - // Reset inputs for this inference request - valid_inputs.clear(); - - for (const auto& input : inputs) { - RETURN_IF_ERROR(input->Reset()); - - const auto& model_input = (*(parser_->Inputs()))[input->Name()]; - - const uint8_t* data_ptr{nullptr}; - size_t batch1_bytesize; - const int* set_shape_values = nullptr; - int set_shape_value_cnt = 0; - - // Number of missing pieces of data for optional inputs - int missing_data_cnt = 0; - - for (size_t i = 0; i < batch_size_; ++i) { - std::vector shape; - RETURN_IF_ERROR(data_loader_->GetInputShape( - model_input, stream_index, - (step_index + i) % data_loader_->GetTotalSteps(stream_index), - &shape)); - if ((parser_->MaxBatchSize() != 0) && (!model_input.is_shape_tensor_)) { - shape.insert(shape.begin(), (int64_t)batch_size_); - } - if (!shape.empty()) { - if (i == 0) { - input->SetShape(shape); - } else { - if (!std::equal(shape.begin(), shape.end(), input->Shape().begin())) { - return cb::Error( - "can not batch tensors with different shapes together " - "(input '" + - input->Name() + "' expected shape " + - ShapeVecToString(input->Shape(), true /* skip_first */) + - " and received " + - ShapeVecToString(shape, true /* skip_first */), - pa::GENERIC_ERROR); - } - } - } - data_ptr = nullptr; - RETURN_IF_ERROR(data_loader_->GetInputData( - model_input, stream_index, - (step_index + i) % data_loader_->GetTotalSteps(0), &data_ptr, - &batch1_bytesize)); - - // Update number of missing pieces of data for optional inputs to - // potentially detect error - if (data_ptr == nullptr) { - missing_data_cnt++; - continue; - } - - if (!model_input.is_shape_tensor_) { - RETURN_IF_ERROR(input->AppendRaw(data_ptr, batch1_bytesize)); - } else { - if (i == 0) { - // Set data only once for shape tensors - RETURN_IF_ERROR(input->AppendRaw(data_ptr, batch1_bytesize)); - set_shape_values = (const int*)data_ptr; - set_shape_value_cnt = batch1_bytesize / sizeof(int); - } else { - // Validate if the shape values are identical in the batch - bool is_identical = true; - if ((size_t)set_shape_value_cnt != (batch1_bytesize / sizeof(int))) { - is_identical = false; - } else { - for (int i = 0; i < set_shape_value_cnt; i++) { - if (*(set_shape_values + i) != *((const int*)data_ptr + i)) { - is_identical = false; - break; - } - } - } - if (!is_identical) { - return cb::Error( - "can not batch shape tensors with different values together " - "(input '" + - input->Name() + "' expected shape values" + - ShapeTensorValuesToString( - set_shape_values, set_shape_value_cnt) + - " and received " + - ShapeTensorValuesToString( - (int*)data_ptr, (batch1_bytesize / sizeof(int))), - pa::GENERIC_ERROR); - } - } - } - } - - // If all optional inputs had data provided, this is a valid 
input. But if - // some inferences in the batch provided data for an optional input and some - // inferences did not, this is an invalid case and an error is thrown. - if (missing_data_cnt == 0) { - valid_inputs.push_back(input); - } else if (missing_data_cnt > 0 && missing_data_cnt < batch_size_) { - return cb::Error( - "For batch sizes larger than 1, the same set of inputs must be " - "specified for each batch. You cannot use different set of optional " - "inputs for each individual batch."); - } - } - return cb::Error::Success; -} - -cb::Error -LoadManager::SetInputsSharedMemory( - const std::vector& inputs, const int stream_index, - const int step_index) -{ - for (const auto& input : inputs) { - RETURN_IF_ERROR(input->Reset()); - const auto& model_input = (*(parser_->Inputs()))[input->Name()]; - - std::string region_name( - TensorToRegionName(input->Name()) + '_' + std::to_string(stream_index) + - "_" + std::to_string(step_index)); - - std::vector shape; - RETURN_IF_ERROR(data_loader_->GetInputShape( - model_input, stream_index, step_index, &shape)); - if (!shape.empty()) { - if ((parser_->MaxBatchSize() != 0) && (!model_input.is_shape_tensor_)) { - shape.insert(shape.begin(), (int64_t)batch_size_); - } - input->SetShape(shape); - } - RETURN_IF_ERROR(input->SetSharedMemory( - region_name, shared_memory_regions_[region_name].second)); - } - return cb::Error::Success; -} - -void -LoadManager::SetInferSequenceOptions( - const uint32_t seq_id, std::unique_ptr& options) -{ - options->sequence_start_ = (sequence_stat_[seq_id]->remaining_queries_ == 0); - - // New sequence must be intialized before setting the id. - if (options->sequence_start_) { - InitNewSequence(seq_id); - } - options->sequence_id_ = sequence_stat_[seq_id]->seq_id_; - options->sequence_end_ = (sequence_stat_[seq_id]->remaining_queries_ == 1); -} - -void -LoadManager::InitNewSequence(int sequence_id) -{ - sequence_stat_[sequence_id]->seq_id_ = - next_seq_id_++ % sequence_id_range_ + start_sequence_id_; - if (!using_json_data_) { - size_t new_length = GetRandomLength(0.2); - sequence_stat_[sequence_id]->remaining_queries_ = - new_length == 0 ? 1 : new_length; - } else { - // Selecting next available data stream based on uniform distribution. 
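A minimal sketch of this stream selection, assuming only the data_loader_ accessors used elsewhere in this file (GetDataStreamsCount() and GetTotalSteps()); in the refactored code this bookkeeping appears to be delegated to SequenceManager, which now receives the data loader:

#include <cstddef>
#include <cstdint>
#include <random>

// Pick one of the loaded data streams uniformly at random, then size the
// sequence by the number of steps that stream provides.
std::default_random_engine rng_generator;
std::uniform_int_distribution<uint64_t> distribution(
    0, data_loader_->GetDataStreamsCount() - 1);
const uint64_t data_stream_id = distribution(rng_generator);
const size_t remaining_queries = data_loader_->GetTotalSteps(data_stream_id);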
- sequence_stat_[sequence_id]->data_stream_id_ = - distribution_(rng_generator_); - sequence_stat_[sequence_id]->remaining_queries_ = - data_loader_->GetTotalSteps( - sequence_stat_[sequence_id]->data_stream_id_); - } -} - -size_t -LoadManager::GetRandomLength(double offset_ratio) -{ - int random_offset = ((2.0 * rand() / double(RAND_MAX)) - 1.0) * offset_ratio * - sequence_length_; - if (int(sequence_length_) + random_offset <= 0) { - return 1; - } - return sequence_length_ + random_offset; -} - void LoadManager::StopWorkerThreads() { @@ -884,6 +269,20 @@ LoadManager::StopWorkerThreads() } cnt++; } + threads_.clear(); +} + +std::shared_ptr +LoadManager::MakeSequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader) +{ + return std::make_shared( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data, + data_loader); } }} // namespace triton::perfanalyzer diff --git a/load_manager.h b/load_manager.h index 866d9770..799bfa75 100644 --- a/load_manager.h +++ b/load_manager.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,110 +27,99 @@ #include #include +#include #include #include + #include "client_backend/client_backend.h" #include "data_loader.h" +#include "iinfer_data_manager.h" +#include "load_worker.h" #include "perf_utils.h" - +#include "sequence_manager.h" namespace triton { namespace perfanalyzer { + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockLoadManager; +#endif + class LoadManager { public: - virtual ~LoadManager(); + virtual ~LoadManager() = default; + + /// Initialize the Manager class to set up shared memory and inputs + /// \param string_length The length of the random strings to be generated + /// for string inputs. + /// \param string_data The string to be used as string inputs for model. + /// \param zero_input Whether to use zero for model inputs. + /// \param user_data The vector containing path/paths to user-provided data + /// that can be a directory or path to a json data file. + /// \param start_sequence_id The starting sequence ID to be used for iterating + /// through valid sequence IDs. + /// \param sequence_id_range The maximum sequence ID to be used for iterating + /// through valid sequence IDs. + /// \param sequence_length The base length of new sequences. + /// \param sequence_length_specified Whether the user specified the sequence + /// length. + /// \param sequence_length_variation The percentage variation in length of + /// sequences using autogenerated data as input. + void InitManager( + const size_t string_length, const std::string& string_data, + const bool zero_input, std::vector& user_data, + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation); /// Check if the load manager is working as expected. /// \return cb::Error object indicating success or failure. 
cb::Error CheckHealth(); - /// Swap the content of the timestamp vector recorded by the load - /// manager with a new timestamp vector - /// \param new_timestamps The timestamp vector to be swapped. + /// Swap the content of the request records vector recorded by the load + /// manager with a new request records vector + /// \param new_request_records The request records vector to be swapped. /// \return cb::Error object indicating success or failure. - cb::Error SwapTimestamps(TimestampVector& new_timestamps); + cb::Error SwapRequestRecords(std::vector& new_request_records); /// Get the sum of all contexts' stat /// \param contexts_stat Returned the accumulated stat from all contexts /// in load manager cb::Error GetAccumulatedClientStat(cb::InferStat* contexts_stat); - /// \return the batch size used for the inference requests - size_t BatchSize() const { return batch_size_; } + /// Returns the amount of valid time each worker thread has averaged in + /// nanoseconds + /// + uint64_t GetIdleTime(); - /// Resets all worker thread states to beginning of schedule. - /// \return cb::Error object indicating success or failure. - virtual cb::Error ResetWorkers() - { - return cb::Error( - "resetting worker threads not supported for this load manager.", - pa::GENERIC_ERROR); - } + /// Resets the counter for tracking valid time + /// + void ResetIdleTime(); - /// Count the number of requests collected until now. - uint64_t CountCollectedRequests(); + /// Calculates and returns the total number of sent requests across all + /// threads. Resets individual number of sent requests per thread. + /// \return The total number of sent requests across all threads. + const size_t GetAndResetNumSentRequests(); - /// Wraps the information required to send an inference to the - /// server - struct InferContext { - explicit InferContext() : inflight_request_cnt_(0) {} - InferContext(InferContext&&) = delete; - InferContext(const InferContext&) = delete; - ~InferContext() - { - for (const auto input : inputs_) { - delete input; - } - for (const auto output : outputs_) { - delete output; - } - } - // The backend to communicate with the server - std::unique_ptr infer_backend_; - // The vector of pointers to InferInput objects for all possible inputs, - // potentially including optional inputs with no provided data. - std::vector inputs_; - // The vector of pointers to InferInput objects to be - // used for inference request. - std::vector valid_inputs_; - // The vector of pointers to InferRequestedOutput objects - // to be used with the inference request. - std::vector outputs_; - // If not empty, the expected output data in the same order as 'outputs_' - std::vector>> - expected_outputs_; - // The InferOptions object holding the details of the - // inference. - std::unique_ptr options_; - // The total number of inference in-flight. - std::atomic inflight_request_cnt_; - }; + /// \return the batch size used for the inference requests + virtual size_t BatchSize() const { return batch_size_; } - /// The properties of an asynchronous request required in - /// the callback to effectively interpret the response. - struct AsyncRequestProperties { - AsyncRequestProperties() : sequence_end_(false), delayed_(true) {} - // The id of in the inference context which was used to - // send this request. - uint32_t ctx_id_; - // The timestamp of when the request was started. - std::chrono::time_point start_time_; - // Whether or not the request is at the end of a sequence. 
- bool sequence_end_; - // Whether or not the request is delayed as per schedule. - bool delayed_; - }; + /// Count the number of requests collected until now. + uint64_t CountCollectedRequests(); protected: LoadManager( const bool async, const bool streaming, const int32_t batch_size, - const size_t max_threads, const size_t sequence_length, - const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, - const std::shared_ptr& factory); + const size_t max_threads, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters); + + /// Complete any subclass-specific manager initialization tasks. + virtual void InitManagerFinalize() {} - /// Helper funtion to retrieve the input data for the inferences + /// Helper function to retrieve the input data for the inferences /// \param string_length The length of the random strings to be generated /// for string inputs. /// \param string_data The string to be used as string inputs for model. @@ -142,155 +131,28 @@ class LoadManager { const size_t string_length, const std::string& string_data, const bool zero_input, std::vector& user_data); - /// Helper function to allocate and prepare shared memory. - /// from shared memory. - /// \return cb::Error object indicating success or failure. - cb::Error InitSharedMemory(); - - /// Helper function to prepare the InferContext for sending inference request. - /// \param ctx The target InferContext object. - /// \return cb::Error object indicating success or failure. - cb::Error PrepareInfer(InferContext* ctx); - - /// Helper function to prepare the InferContext for sending inference - /// request in shared memory. - /// \param ctx The target InferContext object. - /// \return cb::Error object indicating success or failure. - cb::Error PrepareSharedMemoryInfer(InferContext* ctx); - - /// Updates the input data to use for inference request - /// \param inputs The vector of pointers to InferInput objects for all - /// possible inputs, potentially including optional inputs with no provided - /// data - /// \param valid_inputs The vector of pointers to InferInput objects to be - /// used for inference request. - /// \param stream_index The data stream to use for next data - /// \param step_index The step index to use for next data - /// \return cb::Error object indicating success or failure. - cb::Error UpdateInputs( - const std::vector& inputs, - std::vector& valid_inputs, int stream_index, - int step_index); - - /// Updates the expected output data to use for inference request. Empty - /// vector will be returned if there is no expected output associated to the - /// step. - /// \param outputs The vector of outputs to get the expected data - /// \param stream_index The data stream to use for next data - /// \param step_index The step index to use for next data - /// \param data The vector of pointer and size of the expected outputs - /// \return cb::Error object indicating success or failure. 
- cb::Error UpdateValidationOutputs( - const std::vector& outputs, - int stream_index, int step_index, - std::vector>>& data); - - cb::Error ValidateOutputs( - const InferContext& ctx, const cb::InferResult* result_ptr); - - void SetInferSequenceOptions( - const uint32_t seq_id, std::unique_ptr& options); - void InitNewSequence(int sequence_id); - - /// Generate random sequence length based on 'offset_ratio' and - /// 'sequence_length_'. (1 +/- 'offset_ratio') * 'sequence_length_' - /// \param offset_ratio The offset ratio of the generated length - /// \return random sequence length - size_t GetRandomLength(double offset_ratio); - /// Stops all the worker threads generating the request load. void StopWorkerThreads(); - private: - /// Helper function to update the inputs - /// \param inputs The vector of pointers to InferInput objects for all - /// possible inputs, potentially including optional inputs with no provided - /// data - /// \param valid_inputs The vector of pointers to InferInput objects to be - /// used for inference request. - /// \param stream_index The data stream to use for next data - /// \param step_index The step index to use for next data - /// \return cb::Error object indicating success or failure. - cb::Error SetInputs( - const std::vector& inputs, - std::vector& valid_inputs, const int stream_index, - const int step_index); - - /// Helper function to update the shared memory inputs - /// \param inputs The vector of pointers to InferInput objects - /// \param stream_index The data stream to use for next data - /// \param step_index The step index to use for next data - /// \return cb::Error object indicating success or failure. - cb::Error SetInputsSharedMemory( - const std::vector& inputs, const int stream_index, - const int step_index); - protected: bool async_; bool streaming_; size_t batch_size_; size_t max_threads_; - size_t sequence_length_; - SharedMemoryType shared_memory_type_; - size_t output_shm_size_; bool on_sequence_model_; - const uint64_t start_sequence_id_; - const uint64_t sequence_id_range_; - std::shared_ptr parser_; std::shared_ptr factory_; bool using_json_data_; - bool using_shared_memory_; - - std::default_random_engine rng_generator_; - std::uniform_int_distribution distribution_; - std::unique_ptr data_loader_; + std::shared_ptr data_loader_; std::unique_ptr backend_; + std::shared_ptr infer_data_manager_; - // Map from shared memory key to its starting address and size - std::unordered_map> - shared_memory_regions_; - - // Holds the running status of the thread. 
- struct ThreadStat { - ThreadStat() {} - - // The status of the worker thread - cb::Error status_; - // The status of the callback thread for async requests - cb::Error cb_status_; - // The statistics of the InferContext - std::vector contexts_stat_; - // The concurrency level that the worker should produce - size_t concurrency_; - // A vector of request timestamps - // Request latency will be end_time - start_time - TimestampVector request_timestamps_; - // A lock to protect thread data - std::mutex mu_; - }; - - // Holds the status of the inflight sequence - struct SequenceStat { - SequenceStat(uint64_t seq_id) - : seq_id_(seq_id), data_stream_id_(0), remaining_queries_(0) - { - } - // The unique correlation id allocated to the sequence - uint64_t seq_id_; - // The data stream id providing data for the sequence - uint64_t data_stream_id_; - // The number of queries remaining to complete the sequence - size_t remaining_queries_; - // A lock to protect sequence data - std::mutex mtx_; - }; - - std::vector> sequence_stat_; - std::atomic next_seq_id_; + // Track the workers so they all go out of scope at the + // same time + std::vector> workers_; // Worker threads that loads the server with inferences std::vector threads_; @@ -300,6 +162,21 @@ class LoadManager { // Use condition variable to pause/continue worker threads std::condition_variable wake_signal_; std::mutex wake_mutex_; + + std::shared_ptr sequence_manager_{nullptr}; + + virtual std::shared_ptr MakeSequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader); + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockLoadManager; + + public: + LoadManager() = default; +#endif }; }} // namespace triton::perfanalyzer diff --git a/load_worker.cc b/load_worker.cc new file mode 100644 index 00000000..a32976c6 --- /dev/null +++ b/load_worker.cc @@ -0,0 +1,132 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "load_worker.h" + +#include +#include + +#include "client_backend/client_backend.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +bool +LoadWorker::ShouldExit() +{ + bool bad_status = + !thread_stat_->cb_status_.IsOk() || !thread_stat_->status_.IsOk(); + + bool done_with_request_count = + thread_config_->num_requests_ != 0 && + thread_stat_->num_sent_requests_ >= thread_config_->num_requests_; + + return early_exit || bad_status || done_with_request_count; +} + +bool +LoadWorker::HandleExitConditions() +{ + if (ShouldExit()) { + CompleteOngoingSequences(); + thread_stat_->idle_timer.Start(); + WaitForOngoingRequests(); + return true; + } + return false; +} + +void +LoadWorker::CompleteOngoingSequences() +{ + if (on_sequence_model_) { + for (size_t ctx_id = 0; ctx_id < ctxs_.size(); ++ctx_id) { + size_t seq_stat_index = GetSeqStatIndex(ctx_id); + ctxs_[ctx_id]->CompleteOngoingSequence(seq_stat_index); + } + } +} + +void +LoadWorker::WaitForOngoingRequests() +{ + while (GetNumOngoingRequests() != 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } +} + +uint +LoadWorker::GetNumOngoingRequests() +{ + uint num = 0; + for (auto ctx : ctxs_) { + num += ctx->GetNumOngoingRequests(); + } + return num; +} + +void +LoadWorker::CreateContext() +{ + auto ctx = CreateInferContext(); + ctx->Init(); + CreateContextFinalize(ctx); + ctxs_.push_back(ctx); +} + +uint32_t +LoadWorker::GetCtxId() +{ + std::lock_guard lk(cb_mtx_); + return ctx_id_tracker_->Get(); +} + + +void +LoadWorker::RestoreFreeCtxId(uint32_t ctx_id) +{ + if (!async_) { + { + std::lock_guard lock(cb_mtx_); + ctx_id_tracker_->Restore(ctx_id); + } + } +} + +void +LoadWorker::AsyncCallbackFinalize(uint32_t ctx_id) +{ + // avoid competition over 'cb_mtx_' + { + std::lock_guard lk(cb_mtx_); + ctx_id_tracker_->Restore(ctx_id); + notified_ = true; + } + + cb_cv_.notify_all(); +} + +}} // namespace triton::perfanalyzer diff --git a/load_worker.h b/load_worker.h new file mode 100644 index 00000000..dd7e0297 --- /dev/null +++ b/load_worker.h @@ -0,0 +1,159 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include +#include +#include + +#include "ctx_id_tracker_factory.h" +#include "data_loader.h" +#include "infer_context.h" +#include "iworker.h" +#include "model_parser.h" +#include "sequence_manager.h" +#include "thread_config.h" + +namespace triton { namespace perfanalyzer { + +/// Abstract base class for worker threads +/// +class LoadWorker : public IWorker { + protected: + LoadWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, const bool streaming, + const int32_t batch_size, const bool using_json_data, + std::condition_variable& wake_signal, std::mutex& wake_mutex, + bool& execute, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : id_(id), thread_stat_(thread_stat), thread_config_(thread_config), + parser_(parser), data_loader_(data_loader), factory_(factory), + on_sequence_model_(on_sequence_model), async_(async), + streaming_(streaming), batch_size_(batch_size), + using_json_data_(using_json_data), wake_signal_(wake_signal), + wake_mutex_(wake_mutex), execute_(execute), + infer_data_manager_(infer_data_manager), + sequence_manager_(sequence_manager) + { + } + + virtual ~LoadWorker() = default; + + protected: + // Return the total number of async requests that have started and not + // finished + uint GetNumOngoingRequests(); + + void SendInferRequest(uint32_t ctx_id, bool delayed = false) + { + if (ShouldExit()) { + return; + } + + if (on_sequence_model_) { + uint32_t seq_stat_index = GetSeqStatIndex(ctx_id); + ctxs_[ctx_id]->SendSequenceInferRequest(seq_stat_index, delayed); + } else { + ctxs_[ctx_id]->SendInferRequest(delayed); + } + } + + virtual std::shared_ptr CreateInferContext() + { + return std::make_shared( + id_, ctxs_.size(), async_, streaming_, on_sequence_model_, + using_json_data_, batch_size_, thread_stat_, data_loader_, parser_, + factory_, execute_, infer_data_manager_, sequence_manager_); + } + + // Create an inference context and add it to ctxs_ + virtual void CreateContext(); + + // Any code that needs to execute after the Context has been created + virtual void CreateContextFinalize(std::shared_ptr ctx) = 0; + + // Detect the cases where this thread needs to exit + bool ShouldExit(); + + // Detect and handle the case where this thread needs to exit + // Returns true if an exit condition was met + bool HandleExitConditions(); + void CompleteOngoingSequences(); + void WaitForOngoingRequests(); + + virtual uint32_t GetSeqStatIndex(uint32_t ctx_id) = 0; + uint32_t GetCtxId(); + void RestoreFreeCtxId(uint32_t ctx_id); + + void AsyncCallbackFinalize(uint32_t 
ctx_id); + + uint32_t id_; + + std::vector> ctxs_; + std::shared_ptr ctx_id_tracker_; + + // Variables used to signal async request completion + bool notified_ = false; + std::mutex cb_mtx_; + std::condition_variable cb_cv_; + + // TODO REFACTOR TMA-1017 is there a better way to do threading than to pass + // the same cv/mutex into every thread by reference? Used to wake up this + // thread if it has been put to sleep + std::condition_variable& wake_signal_; + std::mutex& wake_mutex_; + + // TODO REFACTOR TMA-1017 is there a better way to communicate this than a + // shared bool reference? Used to pause execution of this thread + bool& execute_; + + // Stats for this thread + std::shared_ptr thread_stat_; + // Configuration for this thread + std::shared_ptr thread_config_; + + std::shared_ptr data_loader_; + const std::shared_ptr parser_; + const std::shared_ptr factory_; + const std::shared_ptr infer_data_manager_; + + const bool on_sequence_model_; + const bool async_; + const bool streaming_; + const int32_t batch_size_; + const bool using_json_data_; + + std::shared_ptr sequence_manager_{nullptr}; +}; + +}} // namespace triton::perfanalyzer diff --git a/main.cc b/main.cc index 697b03c5..bf517629 100644 --- a/main.cc +++ b/main.cc @@ -40,6 +40,7 @@ main(int argc, char* argv[]) analyzer.Run(); } catch (pa::PerfAnalyzerException& e) { + std::cerr << e.what() << std::endl; return e.GetError(); } diff --git a/metrics_manager.cc b/metrics_manager.cc index d7e01ba7..0e1262ce 100644 --- a/metrics_manager.cc +++ b/metrics_manager.cc @@ -25,9 +25,11 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "metrics_manager.h" + #include #include #include + #include "constants.h" #include "perf_analyzer_exception.h" @@ -76,8 +78,8 @@ MetricsManager::QueryMetricsEveryNMilliseconds() const auto& end{std::chrono::system_clock::now()}; const auto& duration{end - start}; - const auto& remainder{std::chrono::milliseconds(metrics_interval_ms_) - - duration}; + const auto& remainder{ + std::chrono::milliseconds(metrics_interval_ms_) - duration}; CheckForMetricIntervalTooShort(remainder, duration); diff --git a/metrics_manager.h b/metrics_manager.h index fa0fe360..ae6b6135 100644 --- a/metrics_manager.h +++ b/metrics_manager.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,6 +32,7 @@ #include #include #include + #include "client_backend/client_backend.h" #include "metrics.h" @@ -54,7 +55,7 @@ class MetricsManager { /// Starts background thread that queries metrics on an interval void StartQueryingMetrics(); - /// Checks if background thread threw exception and propogates it if so + /// Checks if background thread threw exception and propagates it if so void CheckQueryingStatus(); /// Puts the latest-collected metrics from background thread into vector @@ -85,7 +86,7 @@ class MetricsManager { #ifndef DOCTEST_CONFIG_DISABLE friend TestMetricsManager; - protected: + public: MetricsManager() = default; #endif }; diff --git a/mock_concurrency_worker.h b/mock_concurrency_worker.h new file mode 100644 index 00000000..636b9274 --- /dev/null +++ b/mock_concurrency_worker.h @@ -0,0 +1,69 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "concurrency_worker.h" +#include "gmock/gmock.h" + +namespace triton { namespace perfanalyzer { + +class NaggyMockConcurrencyWorker : public ConcurrencyWorker { + public: + NaggyMockConcurrencyWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, + const size_t max_concurrency, const bool using_json_data, + const bool streaming, const int32_t batch_size, + std::condition_variable& wake_signal, std::mutex& wake_mutex, + size_t& active_threads, bool& execute, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : ConcurrencyWorker( + id, thread_stat, thread_config, parser, data_loader, factory, + on_sequence_model, async, max_concurrency, using_json_data, + streaming, batch_size, wake_signal, wake_mutex, active_threads, + execute, infer_data_manager, sequence_manager) + { + ON_CALL(*this, Infer()).WillByDefault([this]() -> void { + ConcurrencyWorker::Infer(); + }); + } + + MOCK_METHOD(void, Infer, (), (override)); + + void EmptyInfer() { thread_config_->is_paused_ = true; } +}; + +// Non-naggy version of Mock (won't warn when using default gmock +// mocked function) +using MockConcurrencyWorker = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_data_loader.h b/mock_data_loader.h new file mode 100644 index 00000000..0eccdabf --- /dev/null +++ b/mock_data_loader.h @@ -0,0 +1,98 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "data_loader.h" +#include "gmock/gmock.h" + +namespace triton { namespace perfanalyzer { + +/// Mock DataLoader class used for testing to allow JSON data to be read +/// from string, rather than file. +/// +class NaggyMockDataLoader : public DataLoader { + public: + NaggyMockDataLoader() { SetupMocks(); } + NaggyMockDataLoader(size_t batch_size) : DataLoader(batch_size) + { + SetupMocks(); + } + + void SetupMocks() + { + ON_CALL(*this, GetTotalSteps(testing::_)) + .WillByDefault([this](size_t stream_id) -> size_t { + return this->DataLoader::GetTotalSteps(stream_id); + }); + ON_CALL(*this, ReadFile(testing::_, testing::_)) + .WillByDefault( + [this]( + const std::string& path, + std::vector* contents) -> cb::Error { + return this->DataLoader::ReadFile(path, contents); + }); + ON_CALL(*this, ReadTextFile(testing::_, testing::_)) + .WillByDefault( + [this]( + const std::string& path, + std::vector* contents) -> cb::Error { + return this->DataLoader::ReadTextFile(path, contents); + }); + } + + MOCK_METHOD(size_t, GetTotalSteps, (size_t), (override)); + MOCK_METHOD(cb::Error, ReadFile, (const std::string&, std::vector*)); + MOCK_METHOD( + cb::Error, ReadTextFile, (const std::string&, std::vector*)); + + cb::Error ReadDataFromJSON( + const std::shared_ptr& inputs, + const std::shared_ptr& outputs, + const std::string& json_file) override + { + return ReadDataFromStr(json_file, inputs, outputs); + } + + cb::Error ReadDataFromStr( + const std::string& str, const std::shared_ptr& inputs, + const std::shared_ptr& outputs) + { + rapidjson::Document d{}; + const unsigned int parseFlags = rapidjson::kParseNanAndInfFlag; + d.Parse(str.c_str()); + + return ParseData(d, inputs, outputs); + }; + + std::vector& step_num_{DataLoader::step_num_}; + size_t& data_stream_cnt_{DataLoader::data_stream_cnt_}; +}; + +// Non-naggy version of Mock Data Loader (won't warn when using default gmock +// mocked function) +using MockDataLoader = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_infer_context.h b/mock_infer_context.h new file mode 100644 index 00000000..e1c15d03 --- /dev/null +++ b/mock_infer_context.h @@ -0,0 +1,69 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "gmock/gmock.h" +#include "infer_context.h" + +namespace triton { namespace perfanalyzer { + +class NaggyMockInferContext : public InferContext { + public: + NaggyMockInferContext() + { + ON_CALL(*this, SendRequest(testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + const uint64_t request_id, const bool delayed, + const uint64_t sequence_id) -> void { + this->InferContext::SendRequest(request_id, delayed, sequence_id); + }); + } + + MOCK_METHOD( + void, SendRequest, (const uint64_t, const bool, const uint64_t), + (override)); + + std::shared_ptr& sequence_manager_{ + InferContext::sequence_manager_}; + std::shared_ptr& data_loader_{InferContext::data_loader_}; + std::shared_ptr& infer_data_manager_{ + InferContext::infer_data_manager_}; + std::shared_ptr& thread_stat_{InferContext::thread_stat_}; + std::reference_wrapper& execute_{InferContext::execute_}; + bool& using_json_data_{InferContext::using_json_data_}; + bool& async_{InferContext::async_}; + bool& streaming_{InferContext::streaming_}; + InferData& infer_data_{InferContext::infer_data_}; + std::unique_ptr& infer_backend_{ + InferContext::infer_backend_}; + std::function& async_callback_func_{ + InferContext::async_callback_func_}; +}; + +using MockInferContext = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_infer_data_manager.h b/mock_infer_data_manager.h new file mode 100644 index 00000000..8f9cd7ec --- /dev/null +++ b/mock_infer_data_manager.h @@ -0,0 +1,150 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "gmock/gmock.h" +#include "infer_data_manager.h" +#include "infer_data_manager_shm.h" +#include "mock_client_backend.h" + +namespace triton { namespace perfanalyzer { + + +class MockInferDataManagerShm : public InferDataManagerShm { + public: + MockInferDataManagerShm( + const int32_t batch_size, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, + std::unordered_map + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + : InferDataManagerShm( + batch_size, shared_memory_type, output_shm_size, request_parameters, + parser, factory, data_loader) + { + } + + // Mocked version of the CopySharedMemory method in loadmanager. 
+ // Tracks the mapping of shared memory label to data + // + cb::Error CopySharedMemory( + uint8_t* input_shm_ptr, const std::vector& input_datas, + bool is_shape_tensor, std::string& region_name) override + { + std::vector vals; + + for (size_t i = 0; i < input_datas.size(); i++) { + int32_t val = *reinterpret_cast(input_datas[i].data_ptr); + vals.push_back(val); + } + mocked_shared_memory_regions.insert(std::make_pair(region_name, vals)); + return cb::Error::Success; + } + + cb::Error CreateInferInput( + cb::InferInput** infer_input, const cb::BackendKind kind, + const std::string& name, const std::vector& dims, + const std::string& datatype) override + { + *infer_input = new cb::MockInferInput(kind, name, dims, datatype); + return cb::Error::Success; + } + + // Tracks the mapping of shared memory label to data + std::map> mocked_shared_memory_regions; +}; + + +class MockInferDataManager : public InferDataManager { + public: + MockInferDataManager() { SetupMocks(); } + + MockInferDataManager( + const size_t max_threads, const int32_t batch_size, + std::unordered_map + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + : InferDataManager( + max_threads, batch_size, request_parameters, parser, factory, + data_loader) + { + SetupMocks(); + } + + void SetupMocks() + { + ON_CALL( + *this, UpdateInferData(testing::_, testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + size_t thread_id, int stream_index, int step_index, + InferData& infer_data) -> cb::Error { + return this->InferDataManager::UpdateInferData( + thread_id, stream_index, step_index, infer_data); + }); + } + + MOCK_METHOD( + cb::Error, UpdateInferData, (size_t, int, int, InferData&), (override)); + + cb::Error CreateInferInput( + cb::InferInput** infer_input, const cb::BackendKind kind, + const std::string& name, const std::vector& dims, + const std::string& datatype) override + { + *infer_input = new cb::MockInferInput(kind, name, dims, datatype); + return cb::Error::Success; + } +}; + +class MockInferDataManagerFactory { + public: + static std::shared_ptr CreateMockInferDataManager( + const size_t max_threads, const int32_t batch_size, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, + std::unordered_map + request_parameters, + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::shared_ptr& data_loader) + { + if (shared_memory_type == SharedMemoryType::NO_SHARED_MEMORY) { + return std::make_shared>( + max_threads, batch_size, request_parameters, parser, factory, + data_loader); + } else { + return std::make_shared>( + batch_size, shared_memory_type, output_shm_size, request_parameters, + parser, factory, data_loader); + } + } +}; + +}} // namespace triton::perfanalyzer diff --git a/mock_inference_profiler.h b/mock_inference_profiler.h index b44d9495..7e08e489 100644 --- a/mock_inference_profiler.h +++ b/mock_inference_profiler.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
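The factory above dispatches on `SharedMemoryType`: `NO_SHARED_MEMORY` yields the plain `MockInferDataManager`, while any shared-memory setting yields the shm variant whose `CopySharedMemory` override only records values in `mocked_shared_memory_regions`. Below is a minimal doctest-style sketch of how a test might exercise that dispatch; it is an illustration only, and it assumes default-constructible `MockModelParser`/`MockDataLoader` fixtures and that passing a null backend factory is harmless at construction time.

```cpp
#include <memory>

#include "doctest.h"
#include "mock_data_loader.h"
#include "mock_infer_data_manager.h"
#include "mock_model_parser.h"

namespace triton { namespace perfanalyzer {

TEST_CASE("factory returns the non-shm mock when shared memory is disabled")
{
  auto parser = std::make_shared<MockModelParser>();
  auto data_loader = std::make_shared<MockDataLoader>();
  std::shared_ptr<cb::ClientBackendFactory> factory;  // not exercised here

  std::shared_ptr<IInferDataManager> manager =
      MockInferDataManagerFactory::CreateMockInferDataManager(
          /*max_threads=*/1, /*batch_size=*/1,
          SharedMemoryType::NO_SHARED_MEMORY, /*output_shm_size=*/0,
          /*request_parameters=*/{}, parser, factory, data_loader);

  // The NO_SHARED_MEMORY branch constructs (a NiceMock of)
  // MockInferDataManager, so the downcast succeeds.
  CHECK(std::dynamic_pointer_cast<MockInferDataManager>(manager) != nullptr);
}

}}  // namespace triton::perfanalyzer
```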
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -30,9 +30,94 @@ namespace triton { namespace perfanalyzer { -class MockInferenceProfiler : public InferenceProfiler { +class NaggyMockInferenceProfiler : public InferenceProfiler { public: - MockInferenceProfiler() = default; + NaggyMockInferenceProfiler() + { + ON_CALL( + *this, ValidLatencyMeasurement( + testing::_, testing::_, testing::_, testing::_, testing::_, + testing::_)) + .WillByDefault( + [this]( + const std::pair& valid_range, + size_t& valid_sequence_count, size_t& delayed_request_count, + std::vector* latencies, size_t& response_count, + std::vector& valid_requests) -> void { + this->InferenceProfiler::ValidLatencyMeasurement( + valid_range, valid_sequence_count, delayed_request_count, + latencies, response_count, valid_requests); + }); + ON_CALL(*this, SummarizeLatency(testing::_, testing::_)) + .WillByDefault( + [this]( + const std::vector& latencies, + PerfStatus& summary) -> cb::Error { + return this->InferenceProfiler::SummarizeLatency( + latencies, summary); + }); + ON_CALL(*this, MergePerfStatusReports(testing::_, testing::_)) + .WillByDefault( + [this]( + std::deque& perf_status, + PerfStatus& summary_status) -> cb::Error { + return this->InferenceProfiler::MergePerfStatusReports( + perf_status, summary_status); + }); + ON_CALL(*this, MergeServerSideStats(testing::_, testing::_)) + .WillByDefault( + [this]( + std::vector& server_side_stats, + ServerSideStats& server_side_summary) -> cb::Error { + return this->InferenceProfiler::MergeServerSideStats( + server_side_stats, server_side_summary); + }); + ON_CALL( + *this, SummarizeClientStat( + testing::_, testing::_, testing::_, testing::_, testing::_, + testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + const cb::InferStat& start_stat, const cb::InferStat& end_stat, + const uint64_t duration_ns, const size_t valid_request_count, + const size_t delayed_request_count, + const size_t valid_sequence_count, const size_t response_count, + PerfStatus& summary) -> cb::Error { + return this->InferenceProfiler::SummarizeClientStat( + start_stat, end_stat, duration_ns, valid_request_count, + delayed_request_count, valid_sequence_count, response_count, + summary); + }); + }; + MOCK_METHOD0(IncludeServerStats, bool()); + MOCK_METHOD( + void, ValidLatencyMeasurement, + ((const std::pair&), size_t&, size_t&, + std::vector*, size_t&, std::vector&), + (override)); + MOCK_METHOD( + cb::Error, SummarizeLatency, (const std::vector&, PerfStatus&), + (override)); + MOCK_METHOD( + cb::Error, MergePerfStatusReports, (std::deque&, PerfStatus&), + (override)); + MOCK_METHOD( + cb::Error, MergeServerSideStats, + (std::vector&, ServerSideStats&), (override)); + MOCK_METHOD( + cb::Error, SummarizeClientStat, + (const cb::InferStat&, const cb::InferStat&, const uint64_t, const size_t, + const size_t, const size_t, const size_t, PerfStatus&), + (override)); + + std::shared_ptr& parser_{InferenceProfiler::parser_}; + std::unique_ptr& manager_{InferenceProfiler::manager_}; + bool& include_lib_stats_{InferenceProfiler::include_lib_stats_}; + std::vector& all_request_records_{ + InferenceProfiler::all_request_records_}; }; + +using MockInferenceProfiler = testing::NiceMock; + }} // namespace triton::perfanalyzer diff --git a/mock_load_manager.h b/mock_load_manager.h new file mode 100644 index 00000000..2088a405 --- /dev/null +++ b/mock_load_manager.h @@ -0,0 +1,37 @@ +// Copyright 2023 (c), 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include "gmock/gmock.h" +#include "load_manager.h" + +namespace triton { namespace perfanalyzer { + +class NaggyMockLoadManager : public LoadManager {}; + +using MockLoadManager = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_model_parser.h b/mock_model_parser.h new file mode 100644 index 00000000..72222a82 --- /dev/null +++ b/mock_model_parser.h @@ -0,0 +1,78 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+#pragma once
+
+#include "model_parser.h"
+
+namespace triton { namespace perfanalyzer {
+
+class MockModelParser : public ModelParser {
+ public:
+  MockModelParser() : ModelParser(clientbackend::BackendKind::TRITON) {}
+
+  MockModelParser(
+      bool is_sequence_model, bool is_decoupled_model,
+      size_t max_batch_size = 64)
+      : ModelParser(clientbackend::BackendKind::TRITON)
+  {
+    if (is_sequence_model) {
+      scheduler_type_ = ModelParser::SEQUENCE;
+    }
+    is_decoupled_ = is_decoupled_model;
+    max_batch_size_ = max_batch_size;
+  }
+
+  // Expose private function
+  cb::Error GetInt(const rapidjson::Value& value, int64_t* integer_value)
+  {
+    return ModelParser::GetInt(value, integer_value);
+  }
+
+  // Expose private function
+  cb::Error DetermineComposingModelMap(
+      const std::vector<cb::ModelIdentifier>& bls_composing_models,
+      const rapidjson::Document& config,
+      std::unique_ptr<cb::ClientBackend>& backend)
+  {
+    return ModelParser::DetermineComposingModelMap(
+        bls_composing_models, config, backend);
+  }
+
+  // Expose private function
+  cb::Error DetermineSchedulerType(
+      const rapidjson::Document& config,
+      std::unique_ptr<cb::ClientBackend>& backend)
+  {
+    return ModelParser::DetermineSchedulerType(config, backend);
+  }
+
+  std::shared_ptr<ComposingModelMap>& composing_models_map_{
+      ModelParser::composing_models_map_};
+  std::shared_ptr<ModelTensorMap>& inputs_{ModelParser::inputs_};
+};
+
+}} // namespace triton::perfanalyzer
diff --git a/mock_profile_data_collector.h b/mock_profile_data_collector.h
new file mode 100644
index 00000000..94467892
--- /dev/null
+++ b/mock_profile_data_collector.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "gmock/gmock.h"
+#include "profile_data_collector.h"
+
+namespace triton { namespace perfanalyzer {
+
+class NaggyMockProfileDataCollector : public ProfileDataCollector {
+ public:
+  NaggyMockProfileDataCollector()
+  {
+    ON_CALL(*this, FindExperiment(testing::_))
+        .WillByDefault(
+            [this](InferenceLoadMode& id) -> std::vector<Experiment>::iterator {
+              return this->ProfileDataCollector::FindExperiment(id);
+            });
+  }
+
+  MOCK_METHOD(
+      std::vector<Experiment>::iterator, FindExperiment, (InferenceLoadMode&),
+      (override));
+
+  std::vector<Experiment>& experiments_{ProfileDataCollector::experiments_};
+};
+
+using MockProfileDataCollector =
+    testing::NiceMock<NaggyMockProfileDataCollector>;
+
+}} // namespace triton::perfanalyzer
diff --git a/mock_profile_data_exporter.h b/mock_profile_data_exporter.h
new file mode 100644
index 00000000..90e96d73
--- /dev/null
+++ b/mock_profile_data_exporter.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once + +#include "gmock/gmock.h" +#include "profile_data_exporter.h" + +namespace triton { namespace perfanalyzer { + +class NaggyMockProfileDataExporter : public ProfileDataExporter { + public: + NaggyMockProfileDataExporter() + { + ON_CALL( + *this, ConvertToJson(testing::_, testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + const std::vector& raw_experiments, + std::string& raw_version, cb::BackendKind& service_kind, + std::string& endpoint) -> void { + return this->ProfileDataExporter::ConvertToJson( + raw_experiments, raw_version, service_kind, endpoint); + }); + + ON_CALL(*this, OutputToFile(testing::_)) + .WillByDefault([this](std::string& file_path) -> void { + this->ProfileDataExporter::OutputToFile(file_path); + }); + + ON_CALL(*this, AddExperiment(testing::_, testing::_, testing::_)) + .WillByDefault( + [this]( + rapidjson::Value& entry, rapidjson::Value& experiment, + const Experiment& raw_experiment) -> void { + this->ProfileDataExporter::AddExperiment( + entry, experiment, raw_experiment); + }); + + ON_CALL(*this, AddServiceKind(testing::_)) + .WillByDefault([this](cb::BackendKind& service_kind) -> void { + this->ProfileDataExporter::AddServiceKind(service_kind); + }); + + ON_CALL(*this, AddEndpoint(testing::_)) + .WillByDefault([this](std::string& endpoint) -> void { + this->ProfileDataExporter::AddEndpoint(endpoint); + }); + + ON_CALL(*this, ClearDocument()).WillByDefault([this]() -> void { + this->ProfileDataExporter::ClearDocument(); + }); + } + + MOCK_METHOD( + void, ConvertToJson, + (const std::vector&, std::string&, cb::BackendKind&, + std::string&), + (override)); + MOCK_METHOD( + void, AddExperiment, + (rapidjson::Value&, rapidjson::Value&, const Experiment&), (override)); + MOCK_METHOD(void, OutputToFile, (std::string&), (override)); + MOCK_METHOD(void, AddServiceKind, (cb::BackendKind&)); + MOCK_METHOD(void, AddEndpoint, (std::string&)); + MOCK_METHOD(void, ClearDocument, ()); + + rapidjson::Document& document_{ProfileDataExporter::document_}; +}; + +using MockProfileDataExporter = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_request_rate_worker.h b/mock_request_rate_worker.h new file mode 100644 index 00000000..0132a9a0 --- /dev/null +++ b/mock_request_rate_worker.h @@ -0,0 +1,79 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "gmock/gmock.h" +#include "request_rate_worker.h" + +namespace triton { namespace perfanalyzer { + +class NaggyMockRequestRateWorker : public RequestRateWorker { + public: + NaggyMockRequestRateWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, const size_t max_threads, + const bool using_json_data, const bool streaming, + const int32_t batch_size, std::condition_variable& wake_signal, + std::mutex& wake_mutex, bool& execute, + std::chrono::steady_clock::time_point& start_time, + const bool serial_sequences, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : RequestRateWorker( + id, thread_stat, thread_config, parser, data_loader, factory, + on_sequence_model, async, max_threads, using_json_data, streaming, + batch_size, wake_signal, wake_mutex, execute, start_time, + serial_sequences, infer_data_manager, sequence_manager) + { + ON_CALL(*this, Infer()).WillByDefault([this]() -> void { + RequestRateWorker::Infer(); + }); + } + + MOCK_METHOD(void, Infer, (), (override)); + + void CreateContext() override { RequestRateWorker::CreateContext(); } + + void SendInferRequest() + { + if (thread_stat_->status_.IsOk()) { + LoadWorker::SendInferRequest(0, false); + } + } + + void EmptyInfer() { thread_config_->is_paused_ = true; } +}; + +// Non-naggy version of Mock (won't warn when using default gmock +// mocked function) +using MockRequestRateWorker = testing::NiceMock; + +}} // namespace triton::perfanalyzer diff --git a/mock_sequence_manager.h b/mock_sequence_manager.h new file mode 100644 index 00000000..522079c1 --- /dev/null +++ b/mock_sequence_manager.h @@ -0,0 +1,91 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include "gmock/gmock.h"
+#include "sequence_manager.h"
+
+namespace triton { namespace perfanalyzer {
+
+class NaggyMockSequenceManager : public SequenceManager {
+ public:
+  NaggyMockSequenceManager() { SetupMocks(); }
+
+  NaggyMockSequenceManager(
+      const uint64_t start_sequence_id, const uint64_t sequence_id_range,
+      const size_t sequence_length, const bool sequence_length_specified,
+      const double sequence_length_variation, const bool using_json_data,
+      std::shared_ptr<DataLoader> data_loader)
+      : SequenceManager(
+            start_sequence_id, sequence_id_range, sequence_length,
+            sequence_length_specified, sequence_length_variation,
+            using_json_data, data_loader)
+  {
+    SetupMocks();
+  }
+
+  void SetupMocks()
+  {
+    ON_CALL(*this, SetInferSequenceOptions(testing::_, testing::_))
+        .WillByDefault([this](
+                           const uint32_t seq_stat_index,
+                           std::unique_ptr<cb::InferOptions>& options) {
+          this->SequenceManager::SetInferSequenceOptions(
+              seq_stat_index, options);
+        });
+    ON_CALL(*this, InitNewSequence(testing::_))
+        .WillByDefault([this](int seq_stat_index) {
+          this->SequenceManager::InitNewSequence(seq_stat_index);
+        });
+    ON_CALL(*this, GetNextSeqId(testing::_))
+        .WillByDefault([this](int seq_stat_index) -> uint64_t {
+          return this->SequenceManager::GetNextSeqId(seq_stat_index);
+        });
+    ON_CALL(*this, GetRandomSequenceLength(testing::_))
+        .WillByDefault([this](double offset_ratio) -> size_t {
+          return this->SequenceManager::GetRandomSequenceLength(offset_ratio);
+        });
+    ON_CALL(*this, GetNewDataStreamId()).WillByDefault([this]() -> size_t {
+      return this->SequenceManager::GetNewDataStreamId();
+    });
+  }
+
+  MOCK_METHOD(
+      void, SetInferSequenceOptions,
+      (const uint32_t, std::unique_ptr<cb::InferOptions>&), (override));
+  MOCK_METHOD(void, InitNewSequence, (int), (override));
+  MOCK_METHOD(uint64_t, GetNextSeqId, (int), (override));
+  MOCK_METHOD(size_t, GetRandomSequenceLength, (double), (override));
+  MOCK_METHOD(uint64_t, GetNewDataStreamId, (), (override));
+
+  std::vector<std::shared_ptr<SequenceStatus>>& sequence_statuses_{
+      SequenceManager::sequence_statuses_};
+  std::atomic<uint64_t>& curr_seq_id_{SequenceManager::curr_seq_id_};
+};
+
+using MockSequenceManager = testing::NiceMock<NaggyMockSequenceManager>;
+
+}} // namespace triton::perfanalyzer
diff --git a/model_parser.cc b/model_parser.cc
index c824e073..8ffea56d 100644
--- a/model_parser.cc
+++ b/model_parser.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
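All of the new mock headers above follow the same "naggy mock that delegates to the real implementation" idiom. The standalone snippet below (plain gmock, not part of the patch) shows the pattern in isolation: `ON_CALL(...).WillByDefault(...)` forwards each mocked call to the base class so behavior is unchanged, the `MOCK_METHOD` override lets tests place `EXPECT_CALL` expectations, the `NiceMock` alias suppresses "uninteresting call" warnings, and protected state is re-exposed through reference members.

```cpp
#include "gmock/gmock.h"

class Base {
 public:
  virtual ~Base() = default;
  virtual int Next(int step) { return counter_ += step; }

 protected:
  int counter_{0};
};

class NaggyMockBase : public Base {
 public:
  NaggyMockBase()
  {
    // Default action: delegate to the real logic so tests only observe calls.
    ON_CALL(*this, Next(testing::_)).WillByDefault([this](int step) -> int {
      return this->Base::Next(step);
    });
  }

  MOCK_METHOD(int, Next, (int), (override));

  // Expose protected state for assertions, like the perf_analyzer mocks do.
  int& counter_ref_{Base::counter_};
};

// "Nice" variant: no warnings when a test does not set explicit expectations.
using MockBase = testing::NiceMock<NaggyMockBase>;
```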
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -34,34 +34,17 @@ cb::Error ModelParser::InitTriton( const rapidjson::Document& metadata, const rapidjson::Document& config, const std::string& model_version, + const std::vector& bls_composing_models, const std::unordered_map>& input_shapes, std::unique_ptr& backend) { model_name_ = metadata["name"].GetString(); model_version_ = model_version; - // Get the scheduler type for the model - scheduler_type_ = NONE; - const auto& ensemble_itr = config.FindMember("ensemble_scheduling"); - if (ensemble_itr != config.MemberEnd()) { - bool is_sequential = false; - RETURN_IF_ERROR(GetEnsembleSchedulerType( - config, model_version, backend, &is_sequential)); - if (is_sequential) { - scheduler_type_ = ENSEMBLE_SEQUENCE; - } else { - scheduler_type_ = ENSEMBLE; - } - } else { - const auto& sequence_itr = config.FindMember("sequence_batching"); - if (sequence_itr != config.MemberEnd()) { - scheduler_type_ = SEQUENCE; - } else { - const auto& dynamic_itr = config.FindMember("dynamic_batching"); - if (dynamic_itr != config.MemberEnd()) { - scheduler_type_ = DYNAMIC; - } - } - } + + RETURN_IF_ERROR( + DetermineComposingModelMap(bls_composing_models, config, backend)); + + RETURN_IF_ERROR(DetermineSchedulerType(config, backend)); max_batch_size_ = 0; const auto bs_itr = config.FindMember("max_batch_size"); @@ -186,6 +169,10 @@ ModelParser::InitTriton( response_cache_enabled_ = cache_itr->value["enable"].GetBool(); } + if (cache_itr != config.MemberEnd()) { + top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool(); + } + return cb::Error::Success; } @@ -282,6 +269,32 @@ ModelParser::InitTFServe( return cb::Error::Success; } +cb::Error +ModelParser::InitOpenAI( + const std::string& model_name, const std::string& model_version, + const int32_t batch_size) +{ + // OpenAI does not return model metadata hence we can not obtain any + // parameters. 
+ model_name_ = model_name; + model_version_ = model_version; + max_batch_size_ = batch_size; + + // OpenAI will take a single json input with a fully formed payload + auto in_it = inputs_->emplace("payload", ModelTensor()).first; + in_it->second.name_ = "payload"; + in_it->second.datatype_ = "JSON"; + in_it->second.shape_.push_back(1); + + // OpenAI will reply with a single json output + auto out_it = outputs_->emplace("response", ModelTensor()).first; + out_it->second.name_ = "response"; + out_it->second.datatype_ = "JSON"; + out_it->second.shape_.push_back(1); + + return cb::Error::Success; +} + cb::Error ModelParser::InitTorchServe( const std::string& model_name, const std::string& model_version, @@ -305,16 +318,43 @@ ModelParser::InitTorchServe( } cb::Error -ModelParser::GetEnsembleSchedulerType( - const rapidjson::Document& config, const std::string& model_version, - std::unique_ptr& backend, bool* is_sequential) +ModelParser::DetermineComposingModelMap( + const std::vector& bls_composing_models, + const rapidjson::Document& config, + std::unique_ptr& backend) +{ + RETURN_IF_ERROR(AddBLSComposingModels(bls_composing_models, config, backend)); + RETURN_IF_ERROR(AddEnsembleComposingModels(config, backend)); + + return cb::Error::Success; +} + +cb::Error +ModelParser::AddBLSComposingModels( + const std::vector& bls_composing_models, + const rapidjson::Document& config, + std::unique_ptr& backend) { - const auto& sequence_itr = config.FindMember("sequence_batching"); - if (sequence_itr != config.MemberEnd()) { - *is_sequential = true; + for (auto model : bls_composing_models) { + (*composing_models_map_)[config["name"].GetString()].insert(model); + + rapidjson::Document composing_model_config; + RETURN_IF_ERROR(backend->ModelConfig( + &composing_model_config, model.first, model.second)); + RETURN_IF_ERROR( + AddEnsembleComposingModels(composing_model_config, backend)); } - if (std::string(config["platform"].GetString()).compare("ensemble") == 0) { + return cb::Error::Success; +} + +cb::Error +ModelParser::AddEnsembleComposingModels( + const rapidjson::Document& config, + std::unique_ptr& backend) +{ + if (config.HasMember("platform") && + std::string(config["platform"].GetString()).compare("ensemble") == 0) { const auto step_itr = config["ensemble_scheduling"].FindMember("step"); for (const auto& step : step_itr->value.GetArray()) { std::string step_model_version; @@ -325,25 +365,76 @@ ModelParser::GetEnsembleSchedulerType( } else { step_model_version = std::to_string(model_version_int); } + (*composing_models_map_)[config["name"].GetString()].emplace( std::string(step["model_name"].GetString()), step_model_version); - rapidjson::Document model_config; + rapidjson::Document composing_model_config; + RETURN_IF_ERROR(backend->ModelConfig( + &composing_model_config, step["model_name"].GetString(), + step_model_version)); + RETURN_IF_ERROR( + AddEnsembleComposingModels(composing_model_config, backend)); + } + } + + return cb::Error::Success; +} + + +cb::Error +ModelParser::DetermineSchedulerType( + const rapidjson::Document& config, + std::unique_ptr& backend) +{ + scheduler_type_ = NONE; + + if (composing_models_map_->size() != 0) { + bool is_sequential = false; + RETURN_IF_ERROR(GetComposingSchedulerType(backend, &is_sequential)); + if (is_sequential) { + scheduler_type_ = ENSEMBLE_SEQUENCE; + } else { + scheduler_type_ = ENSEMBLE; + } + } else { + const auto& sequence_itr = config.FindMember("sequence_batching"); + if (sequence_itr != config.MemberEnd()) { + scheduler_type_ = SEQUENCE; + 
} else { + const auto& dynamic_itr = config.FindMember("dynamic_batching"); + if (dynamic_itr != config.MemberEnd()) { + scheduler_type_ = DYNAMIC; + } + } + } + return cb::Error::Success; +} + +cb::Error +ModelParser::GetComposingSchedulerType( + std::unique_ptr& backend, bool* is_sequential) +{ + for (auto parent_composing_models : *composing_models_map_.get()) { + auto& composing_models = parent_composing_models.second; + for (auto composing_model : composing_models) { + rapidjson::Document config; RETURN_IF_ERROR(backend->ModelConfig( - &model_config, step["model_name"].GetString(), step_model_version)); - RETURN_IF_ERROR(GetEnsembleSchedulerType( - model_config, step_model_version, backend, is_sequential)); + &config, composing_model.first, composing_model.second)); - // Check if composing model has response caching enabled. - const auto cache_itr = model_config.FindMember("response_cache"); + const auto& sequence_itr = config.FindMember("sequence_batching"); + if (sequence_itr != config.MemberEnd()) { + *is_sequential = true; + } + + const auto cache_itr = config.FindMember("response_cache"); // response_cache_enabled_ set globally for reporting purposes if any // composing model has it enabled, so don't overwrite it if already set - if (cache_itr != model_config.MemberEnd() && !response_cache_enabled_) { + if (cache_itr != config.MemberEnd() && !response_cache_enabled_) { response_cache_enabled_ = cache_itr->value["enable"].GetBool(); } } } - return cb::Error::Success; } diff --git a/model_parser.h b/model_parser.h index 536284e0..ac76b3e2 100644 --- a/model_parser.h +++ b/model_parser.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ #pragma once #include + #include "client_backend/client_backend.h" #include "perf_utils.h" @@ -33,6 +34,8 @@ namespace triton { namespace perfanalyzer { #ifndef DOCTEST_CONFIG_DISABLE class TestModelParser; +class MockModelParser; +class InferenceProfiler; #endif struct ModelTensor { @@ -40,6 +43,7 @@ struct ModelTensor { std::string name_; std::string datatype_; std::vector shape_; + // Indicates if this tensor holds shape information for other tensors bool is_shape_tensor_; bool is_optional_; }; @@ -70,7 +74,8 @@ class ModelParser { outputs_(std::make_shared()), composing_models_map_(std::make_shared()), scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false), - response_cache_enabled_(false) + response_cache_enabled_(false), + top_level_response_caching_enabled_(false) { } @@ -79,6 +84,7 @@ class ModelParser { /// \param metadata The metadata of the target model. /// \param config The config of the target model. /// \param model_version The version of target model. + /// \param bls_composing_models A list of BLS composing model identifiers /// \param input_shapes The user provided default shapes which will be use /// if a certain input has wildcard in its dimension. /// \param backend The backend object. 
@@ -86,6 +92,7 @@ class ModelParser { cb::Error InitTriton( const rapidjson::Document& metadata, const rapidjson::Document& config, const std::string& model_version, + const std::vector& bls_composing_models, const std::unordered_map>& input_shapes, std::unique_ptr& backend); @@ -106,6 +113,10 @@ class ModelParser { const std::unordered_map>& input_shapes, std::unique_ptr& backend); + cb::Error InitOpenAI( + const std::string& model_name, const std::string& model_version, + const int32_t batch_size); + cb::Error InitTorchServe( const std::string& model_name, const std::string& model_version, const int32_t batch_size); @@ -142,6 +153,22 @@ class ModelParser { /// model bool ResponseCacheEnabled() const { return response_cache_enabled_; } + /// Returns whether or not top level request caching is enabled for this model + /// \return the truth value of whether top level request caching is enabled + /// for this model + bool TopLevelResponseCachingEnabled() const + { + return top_level_response_caching_enabled_; + } + +/// Only for testing +#ifndef DOCTEST_CONFIG_DISABLE + void SetTopLevelResponseCaching(bool enable_top_level_response_caching) + { + top_level_response_caching_enabled_ = enable_top_level_response_caching; + } +#endif + /// Get the details about the model inputs. /// \return The map with tensor_name and the tensor details /// stored as key-value pair. @@ -153,22 +180,49 @@ class ModelParser { const std::shared_ptr& Outputs() { return outputs_; } /// Get the composing maps for the target model. - /// \return The pointer to the nested map descriping the + /// \return The pointer to the nested map describing the /// nested flow in the target model. const std::shared_ptr& GetComposingModelMap() { return composing_models_map_; } + + protected: + ModelSchedulerType scheduler_type_; + bool is_decoupled_; + private: - cb::Error GetEnsembleSchedulerType( - const rapidjson::Document& config, const std::string& model_version, + /// Populate composing_models_map_ based on any bls composing models passed in + /// via the CLI as well as any ensemble or nested ensemble models + cb::Error DetermineComposingModelMap( + const std::vector& bls_composing_models, + const rapidjson::Document& config, + std::unique_ptr& backend); + + cb::Error AddBLSComposingModels( + const std::vector& bls_composing_models, + const rapidjson::Document& config, + std::unique_ptr& backend); + + cb::Error AddEnsembleComposingModels( + const rapidjson::Document& config, + std::unique_ptr& backend); + + /// Populate scheduler_type_ based on the scheduler type of the parent model + /// as well as any composing models + cb::Error DetermineSchedulerType( + const rapidjson::Document& config, + std::unique_ptr& backend); + + /// Sets is_sequential to true if any of the composing models are sequential + cb::Error GetComposingSchedulerType( std::unique_ptr& backend, bool* is_sequential); /// In the json produced by protobuf, int64 and uint64 values are /// represented as strings. Protobuf doesn't provide an option to /// disable this (sigh) so we need to correctly parse these fields - /// for ModelParser to receive appopriate requests. + /// for ModelParser to receive appropriate requests. /// \param value The rapidjson value object with the int value. /// \param integer_value The output integer pointer. /// \return cb::Error object indicating success or failure. 
@@ -183,15 +237,16 @@ class ModelParser { std::string model_name_; std::string model_version_; std::string model_signature_name_; - ModelSchedulerType scheduler_type_; size_t max_batch_size_; - bool is_decoupled_; bool response_cache_enabled_; + bool top_level_response_caching_enabled_; #ifndef DOCTEST_CONFIG_DISABLE friend TestModelParser; + friend MockModelParser; + friend InferenceProfiler; - private: + public: ModelParser() = default; #endif }; diff --git a/mpi_utils.cc b/mpi_utils.cc index d5d86d26..2923f655 100644 --- a/mpi_utils.cc +++ b/mpi_utils.cc @@ -27,6 +27,7 @@ #include "mpi_utils.h" #include + #include #include diff --git a/perf_analyzer.cc b/perf_analyzer.cc index d95ffd01..c10101e1 100644 --- a/perf_analyzer.cc +++ b/perf_analyzer.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,7 @@ #include "perf_analyzer.h" #include "perf_analyzer_exception.h" +#include "periodic_concurrency_manager.h" #include "report_writer.h" #include "request_rate_manager.h" @@ -64,6 +65,7 @@ PerfAnalyzer::Run() PrerunReport(); Profile(); WriteReport(); + GenerateProfileExport(); Finalize(); } @@ -75,11 +77,13 @@ PerfAnalyzer::CreateAnalyzerObjects() std::shared_ptr factory; FAIL_IF_ERR( cb::ClientBackendFactory::Create( - params_->kind, params_->url, params_->protocol, params_->ssl_options, - params_->trace_options, params_->compression_algorithm, - params_->http_headers, params_->triton_server_path, - params_->model_repository_path, params_->memory_type, - params_->extra_verbose, params_->metrics_url, &factory), + params_->kind, params_->url, params_->endpoint, params_->protocol, + params_->ssl_options, params_->trace_options, + params_->compression_algorithm, params_->http_headers, + params_->triton_server_path, params_->model_repository_path, + params_->extra_verbose, params_->metrics_url, + params_->input_tensor_format, params_->output_tensor_format, + &factory), "failed to create client factory"); FAIL_IF_ERR( @@ -99,10 +103,16 @@ PerfAnalyzer::CreateAnalyzerObjects() backend_->ModelConfig( &model_config, params_->model_name, params_->model_version), "failed to get model config"); + FAIL_IF_ERR( parser_->InitTriton( model_metadata, model_config, params_->model_version, - params_->input_shapes, backend_), + params_->bls_composing_models, params_->input_shapes, backend_), + "failed to create model parser"); + } else if (params_->kind == cb::BackendKind::OPENAI) { + FAIL_IF_ERR( + parser_->InitOpenAI( + params_->model_name, params_->model_version, params_->batch_size), "failed to create model parser"); } else if (params_->kind == cb::BackendKind::TENSORFLOW_SERVING) { rapidjson::Document model_metadata; @@ -156,7 +166,6 @@ PerfAnalyzer::CreateAnalyzerObjects() } std::unique_ptr manager; - if (params_->targeting_concurrency()) { if ((parser_->SchedulerType() == pa::ModelParser::SEQUENCE) || (parser_->SchedulerType() == pa::ModelParser::ENSEMBLE_SEQUENCE)) { @@ -193,7 +202,7 @@ PerfAnalyzer::CreateAnalyzerObjects() } if ((params_->sequence_id_range != 0) && (params_->sequence_id_range < params_->max_concurrency)) { - std::cerr << "sequence id range specified is smallar than the " + std::cerr << "sequence id range specified is smaller than the " << "maximum possible concurrency, sequence id collision may " << "occur." 
<< std::endl; throw pa::PerfAnalyzerException(pa::GENERIC_ERROR); @@ -202,18 +211,22 @@ PerfAnalyzer::CreateAnalyzerObjects() pa::ConcurrencyManager::Create( params_->async, params_->streaming, params_->batch_size, params_->max_threads, params_->max_concurrency, - params_->sequence_length, params_->string_length, - params_->string_data, params_->zero_input, params_->user_data, - params_->shared_memory_type, params_->output_shm_size, - params_->start_sequence_id, params_->sequence_id_range, parser_, - factory, &manager), + params_->shared_memory_type, params_->output_shm_size, parser_, + factory, &manager, params_->request_parameters), "failed to create concurrency manager"); + } else if (params_->is_using_periodic_concurrency_mode) { + manager = std::make_unique( + params_->async, params_->streaming, params_->batch_size, + params_->max_threads, params_->max_concurrency, + params_->shared_memory_type, params_->output_shm_size, parser_, factory, + params_->periodic_concurrency_range, params_->request_period, + params_->request_parameters); } else if (params_->using_request_rate_range) { if ((params_->sequence_id_range != 0) && (params_->sequence_id_range < params_->num_of_sequences)) { std::cerr - << "sequence id range specified is smallar than the " + << "sequence id range specified is smaller than the " << "maximum possible number of sequences, sequence id collision " << "may occur." << std::endl; throw pa::PerfAnalyzerException(pa::GENERIC_ERROR); @@ -221,20 +234,18 @@ PerfAnalyzer::CreateAnalyzerObjects() FAIL_IF_ERR( pa::RequestRateManager::Create( params_->async, params_->streaming, params_->measurement_window_ms, - params_->request_distribution, params_->batch_size, - params_->max_threads, params_->num_of_sequences, - params_->sequence_length, params_->string_length, - params_->string_data, params_->zero_input, params_->user_data, - params_->shared_memory_type, params_->output_shm_size, - params_->start_sequence_id, params_->sequence_id_range, parser_, - factory, &manager), + params_->max_trials, params_->request_distribution, + params_->batch_size, params_->max_threads, + params_->num_of_sequences, params_->shared_memory_type, + params_->output_shm_size, params_->serial_sequences, parser_, + factory, &manager, params_->request_parameters), "failed to create request rate manager"); } else { if ((params_->sequence_id_range != 0) && (params_->sequence_id_range < params_->num_of_sequences)) { std::cerr - << "sequence id range specified is smallar than the " + << "sequence id range specified is smaller than the " << "maximum possible number of sequences, sequence id collision " << "may occur." 
<< std::endl; throw pa::PerfAnalyzerException(pa::GENERIC_ERROR); @@ -242,16 +253,28 @@ PerfAnalyzer::CreateAnalyzerObjects() FAIL_IF_ERR( pa::CustomLoadManager::Create( params_->async, params_->streaming, params_->measurement_window_ms, - params_->request_intervals_file, params_->batch_size, - params_->max_threads, params_->num_of_sequences, - params_->sequence_length, params_->string_length, - params_->string_data, params_->zero_input, params_->user_data, - params_->shared_memory_type, params_->output_shm_size, - params_->start_sequence_id, params_->sequence_id_range, parser_, - factory, &manager), + params_->max_trials, params_->request_intervals_file, + params_->batch_size, params_->max_threads, + params_->num_of_sequences, params_->shared_memory_type, + params_->output_shm_size, params_->serial_sequences, parser_, + factory, &manager, params_->request_parameters), "failed to create custom load manager"); } + manager->InitManager( + params_->string_length, params_->string_data, params_->zero_input, + params_->user_data, params_->start_sequence_id, + params_->sequence_id_range, params_->sequence_length, + params_->sequence_length_specified, params_->sequence_length_variation); + + FAIL_IF_ERR( + pa::ProfileDataCollector::Create(&collector_), + "failed to create profile data collector"); + + FAIL_IF_ERR( + pa::ProfileDataExporter::Create(&exporter_), + "failed to create profile data exporter"); + FAIL_IF_ERR( pa::InferenceProfiler::Create( params_->verbose, params_->stability_threshold, @@ -260,7 +283,8 @@ PerfAnalyzer::CreateAnalyzerObjects() parser_, std::move(backend_), std::move(manager), &profiler_, params_->measurement_request_count, params_->measurement_mode, params_->mpi_driver, params_->metrics_interval_ms, - params_->should_collect_metrics), + params_->should_collect_metrics, params_->overhead_pct_threshold, + params_->async, collector_, !params_->profile_export_file.empty()), "failed to create profiler"); } @@ -271,19 +295,44 @@ PerfAnalyzer::PrerunReport() if (params_->kind == cb::BackendKind::TRITON || params_->using_batch_size) { std::cout << " Batch size: " << params_->batch_size << std::endl; } - if (params_->measurement_mode == pa::MeasurementMode::COUNT_WINDOWS) { - std::cout << " Using \"count_windows\" mode for stabilization" - << std::endl; + + std::cout << " Service Kind: " << BackendKindToString(params_->kind) + << std::endl; + + if (params_->request_count != 0) { + std::cout << " Sending a total of " << params_->request_count + << " requests" << std::endl; } else { - std::cout << " Using \"time_windows\" mode for stabilization" << std::endl; - } - if (params_->measurement_mode == pa::MeasurementMode::TIME_WINDOWS) { - std::cout << " Measurement window: " << params_->measurement_window_ms - << " msec" << std::endl; - } else if (params_->measurement_mode == pa::MeasurementMode::COUNT_WINDOWS) { - std::cout << " Minimum number of samples in each window: " - << params_->measurement_request_count << std::endl; + if (params_->measurement_mode == pa::MeasurementMode::COUNT_WINDOWS) { + std::cout << " Using \"count_windows\" mode for stabilization" + << std::endl; + } else { + std::cout << " Using \"time_windows\" mode for stabilization" + << std::endl; + } + + std::string stabilization_metric = "latency and throughput"; + if (params_->async) { + stabilization_metric = "throughput"; + } + if (params_->percentile == -1) { + std::cout << " Stabilizing using average " << stabilization_metric + << std::endl; + } else { + std::cout << " Stabilizing using p" << 
params_->percentile + << stabilization_metric << std::endl; + } + + if (params_->measurement_mode == pa::MeasurementMode::TIME_WINDOWS) { + std::cout << " Measurement window: " << params_->measurement_window_ms + << " msec" << std::endl; + } else if ( + params_->measurement_mode == pa::MeasurementMode::COUNT_WINDOWS) { + std::cout << " Minimum number of samples in each window: " + << params_->measurement_request_count << std::endl; + } } + if (params_->concurrency_range.end != 1) { std::cout << " Latency limit: " << params_->latency_threshold_ms << " msec" << std::endl; @@ -330,12 +379,6 @@ PerfAnalyzer::PrerunReport() << std::endl; } - if (params_->percentile == -1) { - std::cout << " Stabilizing using average latency" << std::endl; - } else { - std::cout << " Stabilizing using p" << params_->percentile << " latency" - << std::endl; - } std::cout << std::endl; } @@ -348,13 +391,16 @@ PerfAnalyzer::Profile() if (params_->targeting_concurrency()) { err = profiler_->Profile( params_->concurrency_range.start, params_->concurrency_range.end, - params_->concurrency_range.step, params_->search_mode, summary_); + params_->concurrency_range.step, params_->search_mode, + params_->request_count, perf_statuses_); + } else if (params_->is_using_periodic_concurrency_mode) { + err = profiler_->ProfilePeriodicConcurrencyMode(); } else { err = profiler_->Profile( params_->request_rate_range[pa::SEARCH_RANGE::kSTART], params_->request_rate_range[pa::SEARCH_RANGE::kEND], params_->request_rate_range[pa::SEARCH_RANGE::kSTEP], - params_->search_mode, summary_); + params_->search_mode, params_->request_count, perf_statuses_); } params_->mpi_driver->MPIBarrierWorld(); @@ -372,7 +418,7 @@ PerfAnalyzer::Profile() void PerfAnalyzer::WriteReport() { - if (!summary_.size()) { + if (!perf_statuses_.size() || params_->is_using_periodic_concurrency_mode) { return; } @@ -384,7 +430,7 @@ PerfAnalyzer::WriteReport() std::cout << "p" << params_->percentile << " Batch Latency" << std::endl; } - for (pa::PerfStatus& status : summary_) { + for (pa::PerfStatus& status : perf_statuses_) { if (params_->targeting_concurrency()) { std::cout << "Concurrency: " << status.concurrency << ", "; } else { @@ -395,19 +441,31 @@ PerfAnalyzer::WriteReport() << (status.stabilizing_latency_ns / 1000) << " usec" << std::endl; } + bool should_output_metrics{ + params_->should_collect_metrics && params_->verbose_csv}; + std::unique_ptr writer; FAIL_IF_ERR( pa::ReportWriter::Create( - params_->filename, params_->targeting_concurrency(), summary_, + params_->filename, params_->targeting_concurrency(), perf_statuses_, params_->verbose_csv, profiler_->IncludeServerStats(), - params_->percentile, parser_, &writer, - params_->should_collect_metrics), + params_->percentile, parser_, &writer, should_output_metrics), "failed to create report writer"); writer->GenerateReport(); } +void +PerfAnalyzer::GenerateProfileExport() +{ + if (!params_->profile_export_file.empty()) { + exporter_->Export( + collector_->GetData(), collector_->GetVersion(), + params_->profile_export_file, params_->kind, params_->endpoint); + } +} + void PerfAnalyzer::Finalize() { diff --git a/perf_analyzer.h b/perf_analyzer.h index e51102ac..b75fe35f 100644 --- a/perf_analyzer.h +++ b/perf_analyzer.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
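For orientation, the new profile-export pieces created in `CreateAnalyzerObjects()` and consumed by `GenerateProfileExport()` compose as sketched below. This is a simplified sketch reusing only the `Create`/`Export` signatures visible in this patch; error handling, the `pa`/`cb` namespace aliases, and the step where the inference profiler fills the collector are assumed from the surrounding code rather than shown.

```cpp
#include <memory>
#include <string>

#include "profile_data_collector.h"
#include "profile_data_exporter.h"

namespace pa = triton::perfanalyzer;

void
ExportProfileSketch(
    const std::string& profile_export_file, cb::BackendKind kind,
    const std::string& endpoint)
{
  std::shared_ptr<pa::ProfileDataCollector> collector;
  std::shared_ptr<pa::ProfileDataExporter> exporter;
  pa::ProfileDataCollector::Create(&collector);  // return codes ignored here
  pa::ProfileDataExporter::Create(&exporter);

  // ...the inference profiler records per-request data into `collector`
  // while the experiment runs...

  if (!profile_export_file.empty()) {
    exporter->Export(
        collector->GetData(), collector->GetVersion(), profile_export_file,
        kind, endpoint);
  }
}
```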
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,7 +27,9 @@ #include #include + #include + #include "command_line_parser.h" #include "concurrency_manager.h" #include "custom_load_manager.h" @@ -35,6 +37,8 @@ #include "model_parser.h" #include "mpi_utils.h" #include "perf_utils.h" +#include "profile_data_collector.h" +#include "profile_data_exporter.h" // Perf Analyzer provides various metrics to measure the performance of // the inference server. It can either be used to measure the throughput, @@ -179,7 +183,9 @@ class PerfAnalyzer { std::unique_ptr profiler_; std::unique_ptr backend_; std::shared_ptr parser_; - std::vector summary_; + std::vector perf_statuses_; + std::shared_ptr collector_; + std::shared_ptr exporter_; // // Helper methods @@ -191,5 +197,6 @@ class PerfAnalyzer { void PrerunReport(); void Profile(); void WriteReport(); + void GenerateProfileExport(); void Finalize(); }; diff --git a/perf_analyzer_exception.h b/perf_analyzer_exception.h index 4e36747b..a0b8ae70 100644 --- a/perf_analyzer_exception.h +++ b/perf_analyzer_exception.h @@ -42,12 +42,7 @@ class PerfAnalyzerException : public std::exception { { } - virtual const char* what() const throw() - { - std::string msg = - "Perf Error " + std::to_string(error_) + " thrown:\n" + message_; - return msg.c_str(); - } + virtual const char* what() const throw() { return message_.c_str(); } inline int GetError() const { return error_; } diff --git a/perf_utils.cc b/perf_utils.cc index d31ce158..6088c1b6 100644 --- a/perf_utils.cc +++ b/perf_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,12 +25,18 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
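The `perf_analyzer_exception.h` hunk above is a correctness fix, not a cosmetic one: the old `what()` built a local `std::string` and returned `msg.c_str()`, a pointer that dangles as soon as `what()` returns, whereas the new version returns the stored member's buffer. A standalone illustration of the two patterns (generic C++, not perf_analyzer code):

```cpp
#include <exception>
#include <string>

class BadException : public std::exception {
  std::string message_{"boom"};
  int error_{1};

 public:
  const char* what() const noexcept override
  {
    std::string msg =
        "Perf Error " + std::to_string(error_) + " thrown:\n" + message_;
    return msg.c_str();  // dangling: msg is destroyed when what() returns
  }
};

class GoodException : public std::exception {
  std::string message_{"boom"};

 public:
  // Safe: message_ outlives the call, matching the patched implementation.
  const char* what() const noexcept override { return message_.c_str(); }
};
```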
#include "perf_utils.h" + #include +#include +#include #include #include + #include +#include #include #include + #include "client_backend/client_backend.h" #include "doctest.h" @@ -88,79 +94,6 @@ ConvertDTypeFromTFS(const std::string& tf_dtype, std::string* datatype) return cb::Error::Success; } -cb::Error -ReadFile(const std::string& path, std::vector* contents) -{ - std::ifstream in(path, std::ios::in | std::ios::binary); - if (!in) { - return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR); - } - - in.seekg(0, std::ios::end); - - int file_size = in.tellg(); - if (file_size > 0) { - contents->resize(file_size); - in.seekg(0, std::ios::beg); - in.read(&(*contents)[0], contents->size()); - } - - in.close(); - - // If size is invalid, report after ifstream is closed - if (file_size < 0) { - return cb::Error( - "failed to get size for file '" + path + "'", pa::GENERIC_ERROR); - } else if (file_size == 0) { - return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); - } - - return cb::Error::Success; -} - -cb::Error -ReadTextFile(const std::string& path, std::vector* contents) -{ - std::ifstream in(path); - if (!in) { - return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR); - } - - std::string current_string; - while (std::getline(in, current_string)) { - contents->push_back(current_string); - } - in.close(); - - if (contents->size() == 0) { - return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); - } - return cb::Error::Success; -} - -cb::Error -ReadTimeIntervalsFile( - const std::string& path, std::vector* contents) -{ - std::ifstream in(path); - if (!in) { - return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR); - } - - std::string current_string; - while (std::getline(in, current_string)) { - std::chrono::nanoseconds curent_time_interval_ns( - std::stol(current_string) * 1000); - contents->push_back(curent_time_interval_ns); - } - in.close(); - - if (contents->size() == 0) { - return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR); - } - return cb::Error::Success; -} - bool IsDirectory(const std::string& path) { @@ -269,6 +202,27 @@ SerializeExplicitTensor( std::copy( serialized.begin(), serialized.end(), std::back_inserter(*decoded_data)); + } else if (dt.compare("JSON") == 0) { + std::string serialized = ""; + + auto values = tensor.GetArray(); + if (values.Size() != 1) { + return cb::Error( + "JSON format does not yet support multiple json objects in the " + "input"); + } + for (const auto& value : values) { + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + value.Accept(writer); + + std::string element = buffer.GetString(); + uint32_t len = element.size(); + serialized.append(element); + } + std::copy( + serialized.begin(), serialized.end(), + std::back_inserter(*decoded_data)); } else { for (const auto& value : tensor.GetArray()) { if (dt.compare("BOOL") == 0) { @@ -367,6 +321,8 @@ SerializeExplicitTensor( double element(value.GetDouble()); const char* src = reinterpret_cast(&element); decoded_data->insert(decoded_data->end(), src, src + sizeof(double)); + } else { + return cb::Error("Unexpected type " + dt); } } } @@ -407,20 +363,15 @@ ShapeVecToString(const std::vector shape_vec, bool skip_first) } std::string -ShapeTensorValuesToString(const int* data_ptr, const int count) +TensorToRegionName(std::string name) { - bool first = true; - std::string str("["); - for (int i = 0; i < count; i++) { - if (!first) { - str += ","; - } - str += std::to_string(*(data_ptr + 
i)); - first = false; - } - - str += "]"; - return str; + // Remove slashes from the name, if any. + name.erase( + std::remove_if( + name.begin(), name.end(), + [](const char& c) { return ((c == '/') || (c == '\\')); }), + name.end()); + return name; } template <> @@ -445,14 +396,21 @@ ScheduleDistribution(const double request_rate) return [period](std::mt19937& /*gen*/) { return period; }; } -TEST_CASE("testing the ParseProtocol function") +cb::TensorFormat +ParseTensorFormat(const std::string& content_type_str) { - CHECK(ParseProtocol("http") == cb::ProtocolType::HTTP); - CHECK(ParseProtocol("HTTP") == cb::ProtocolType::HTTP); - CHECK(ParseProtocol("grpc") == cb::ProtocolType::GRPC); - CHECK(ParseProtocol("GRPC") == cb::ProtocolType::GRPC); - CHECK(ParseProtocol("abc") == cb::ProtocolType::UNKNOWN); - CHECK(ParseProtocol("") == cb::ProtocolType::UNKNOWN); + std::string content_type_str_lowercase{content_type_str}; + std::transform( + content_type_str.cbegin(), content_type_str.cend(), + content_type_str_lowercase.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (content_type_str_lowercase == "binary") { + return cb::TensorFormat::BINARY; + } else if (content_type_str_lowercase == "json") { + return cb::TensorFormat::JSON; + } else { + return cb::TensorFormat::UNKNOWN; + } } }} // namespace triton::perfanalyzer diff --git a/perf_utils.h b/perf_utils.h index f2285d8a..7166936a 100644 --- a/perf_utils.h +++ b/perf_utils.h @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -53,9 +53,6 @@ constexpr uint64_t NANOS_PER_MILLIS = 1000000; #define CHRONO_TO_MILLIS(TS) (CHRONO_TO_NANOS(TS) / pa::NANOS_PER_MILLIS) //============================================================================== -using TimestampVector = std::vector, - std::chrono::time_point, uint32_t, bool>>; // Will use the characters specified here to construct random strings std::string const character_set = @@ -97,30 +94,6 @@ cb::Error ConvertDTypeFromTFS( // Parse the communication protocol type cb::ProtocolType ParseProtocol(const std::string& str); -// Reads the data from file specified by path into vector of characters -// \param path The complete path to the file to be read -// \param contents The character vector that will contain the data read -// \return error status. Returns Non-Ok if an error is encountered during -// read operation. -cb::Error ReadFile(const std::string& path, std::vector* contents); - -// Reads the string from file specified by path into vector of strings -// \param path The complete path to the file to be read -// \param contents The string vector that will contain the data read -// \return error status. Returns Non-Ok if an error is encountered during -// read operation. -cb::Error ReadTextFile( - const std::string& path, std::vector* contents); - -// Reads the time intervals in microseconds from file specified by path into -// vector of time intervals in nanoseconds. -// \param path The complete path to the file to be read -// \param contents The time interval vector that will contain the data read. -// \return error status. Returns Non-Ok if an error is encountered during -// read operation. 
-cb::Error ReadTimeIntervalsFile( - const std::string& path, std::vector* contents); - // To check whether the path points to a valid system directory bool IsDirectory(const std::string& path); @@ -152,8 +125,8 @@ std::string GetRandomString(const int string_length); std::string ShapeVecToString( const std::vector shape_vec, bool skip_first = false); -// Returns the string containing the shape tensor values -std::string ShapeTensorValuesToString(const int* data_ptr, const int count); +// Remove slashes from tensor name, if any +std::string TensorToRegionName(std::string name); // Returns the request schedule distribution generator with the specified // request rate. @@ -161,4 +134,7 @@ template std::function ScheduleDistribution( const double request_rate); +// Parse the HTTP tensor format +cb::TensorFormat ParseTensorFormat(const std::string& tensor_format_str); + }} // namespace triton::perfanalyzer diff --git a/periodic_concurrency_manager.cc b/periodic_concurrency_manager.cc new file mode 100644 index 00000000..a8375ed6 --- /dev/null +++ b/periodic_concurrency_manager.cc @@ -0,0 +1,122 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
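(Illustration, not part of the patch: the ParseTensorFormat helper added to perf_utils above maps an HTTP content-type string to a tensor format, case-insensitively. A doctest-style check of the expected mapping, mirroring the TEST_CASE/CHECK conventions that the removed ParseProtocol test in this file used, might look like the following sketch.)

TEST_CASE("testing the ParseTensorFormat function")
{
  // Matching is case-insensitive per the implementation above.
  CHECK(ParseTensorFormat("binary") == cb::TensorFormat::BINARY);
  CHECK(ParseTensorFormat("BINARY") == cb::TensorFormat::BINARY);
  CHECK(ParseTensorFormat("json") == cb::TensorFormat::JSON);
  CHECK(ParseTensorFormat("JSON") == cb::TensorFormat::JSON);
  // Anything else falls through to UNKNOWN.
  CHECK(ParseTensorFormat("abc") == cb::TensorFormat::UNKNOWN);
  CHECK(ParseTensorFormat("") == cb::TensorFormat::UNKNOWN);
}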
+ +#include "periodic_concurrency_manager.h" + +namespace triton { namespace perfanalyzer { + +std::vector +PeriodicConcurrencyManager::RunExperiment() +{ + AddConcurrentRequests(concurrency_range_.start); + WaitForRequestsToFinish(); + return GetRequestRecords(); +} + +std::shared_ptr +PeriodicConcurrencyManager::MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) +{ + uint32_t id = workers_.size(); + auto worker = std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, max_concurrency_, using_json_data_, + streaming_, batch_size_, wake_signal_, wake_mutex_, active_threads_, + execute_, infer_data_manager_, sequence_manager_, request_period_, + period_completed_callback_, request_completed_callback_); + return worker; +}; + +void +PeriodicConcurrencyManager::AddConcurrentRequests( + uint64_t num_concurrent_requests) +{ + for (size_t i = 0; i < num_concurrent_requests; i++) { + AddConcurrentRequest(i); + } + num_incomplete_periods_ = num_concurrent_requests; +} + +void +PeriodicConcurrencyManager::AddConcurrentRequest(size_t seq_stat_index_offset) +{ + threads_stat_.emplace_back(std::make_shared()); + threads_config_.emplace_back( + std::make_shared(threads_config_.size())); + threads_config_.back()->concurrency_ = 1; + threads_config_.back()->seq_stat_index_offset_ = seq_stat_index_offset; + workers_.emplace_back( + MakeWorker(threads_stat_.back(), threads_config_.back())); + threads_.emplace_back(&IWorker::Infer, workers_.back()); + active_threads_++; +} + +void +PeriodicConcurrencyManager::PeriodCompletedCallback() +{ + std::lock_guard lock(period_completed_callback_mutex_); + num_incomplete_periods_--; + if (num_incomplete_periods_ == 0) { + steps_completed_++; + uint64_t num_requests_sent{steps_completed_ * concurrency_range_.step}; + if (num_requests_sent < concurrency_range_.end) { + AddConcurrentRequests(concurrency_range_.step); + } + } +} + +void +PeriodicConcurrencyManager::RequestCompletedCallback() +{ + std::lock_guard lock(request_completed_callback_mutex_); + num_completed_requests_++; + if (num_completed_requests_ == concurrency_range_.end) { + all_requests_completed_promise_.set_value(true); + } +} + +void +PeriodicConcurrencyManager::WaitForRequestsToFinish() +{ + std::future all_requests_completed_future{ + all_requests_completed_promise_.get_future()}; + all_requests_completed_future.get(); +} + +std::vector +PeriodicConcurrencyManager::GetRequestRecords() +{ + std::vector request_records{}; + for (const auto& thread_stat : threads_stat_) { + request_records.insert( + request_records.end(), thread_stat->request_records_.cbegin(), + thread_stat->request_records_.cend()); + } + return request_records; +} + +}} // namespace triton::perfanalyzer diff --git a/periodic_concurrency_manager.h b/periodic_concurrency_manager.h new file mode 100644 index 00000000..40a0634b --- /dev/null +++ b/periodic_concurrency_manager.h @@ -0,0 +1,92 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "concurrency_manager.h" +#include "periodic_concurrency_worker.h" + +namespace triton { namespace perfanalyzer { + +/// @brief Concurrency manager for periodically increasing concurrency by a step +/// amount based on the number of responses received (request period) by the +/// latest N (step or start concurrency for first-issued concurrent requests) +/// concurrent requests/workers. +class PeriodicConcurrencyManager : public ConcurrencyManager { + public: + PeriodicConcurrencyManager( + const bool async, const bool streaming, const int32_t batch_size, + const size_t max_threads, const size_t max_concurrency, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, + + const std::shared_ptr& parser, + const std::shared_ptr& factory, + const Range concurrency_range, const uint64_t request_period, + const std::unordered_map& + request_parameters) + : ConcurrencyManager( + async, streaming, batch_size, max_threads, max_concurrency, + shared_memory_type, output_shm_size, parser, factory, + request_parameters), + concurrency_range_(concurrency_range), request_period_(request_period) + { + } + + std::vector RunExperiment(); + + private: + std::shared_ptr MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) override; + + void AddConcurrentRequests(uint64_t num_concurrent_requests); + + void AddConcurrentRequest(size_t seq_stat_index_offset); + + void PeriodCompletedCallback(); + + void RequestCompletedCallback(); + + void WaitForRequestsToFinish(); + + std::vector GetRequestRecords(); + + Range concurrency_range_{1, 1, 1}; + uint64_t request_period_{0}; + uint64_t steps_completed_{0}; + uint64_t num_incomplete_periods_{0}; + uint64_t num_completed_requests_{0}; + std::mutex period_completed_callback_mutex_{}; + std::mutex request_completed_callback_mutex_{}; + std::promise all_requests_completed_promise_{}; + std::function period_completed_callback_{ + std::bind(&PeriodicConcurrencyManager::PeriodCompletedCallback, this)}; + std::function request_completed_callback_{ + std::bind(&PeriodicConcurrencyManager::RequestCompletedCallback, this)}; +}; + +}} // namespace triton::perfanalyzer diff --git a/periodic_concurrency_worker.cc b/periodic_concurrency_worker.cc new file mode 100644 index 00000000..9af3a9d8 --- /dev/null +++ 
b/periodic_concurrency_worker.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "periodic_concurrency_worker.h" + +namespace triton { namespace perfanalyzer { + +void +PeriodicConcurrencyWorker::Infer() +{ + CreateCtxIdTracker(); + ReserveContexts(); + RunInference(); +} + +std::shared_ptr +PeriodicConcurrencyWorker::CreateInferContext() +{ + std::shared_ptr infer_context{std::make_shared( + id_, ctxs_.size(), async_, streaming_, on_sequence_model_, + using_json_data_, batch_size_, thread_stat_, data_loader_, parser_, + factory_, execute_, infer_data_manager_, sequence_manager_)}; + infer_context->RegisterWorkerCallback(worker_callback_); + return infer_context; +} + +void +PeriodicConcurrencyWorker::WorkerCallback(uint32_t infer_context_id) +{ + if (ctxs_.at(infer_context_id)->GetNumResponsesForCurrentRequest() == + request_period_) { + period_completed_callback_(); + } + if (ctxs_.at(infer_context_id)->HasReceivedFinalResponse()) { + bool has_not_completed_period{ + ctxs_.at(infer_context_id)->GetNumResponsesForCurrentRequest() < + request_period_}; + if (has_not_completed_period) { + throw std::runtime_error( + "Request received final response before request period was reached. " + "Request period must be at most the total number of responses " + "received by any request."); + } + request_completed_callback_(); + } +} + +}} // namespace triton::perfanalyzer diff --git a/periodic_concurrency_worker.h b/periodic_concurrency_worker.h new file mode 100644 index 00000000..7242219b --- /dev/null +++ b/periodic_concurrency_worker.h @@ -0,0 +1,80 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "concurrency_worker.h" + +namespace triton { namespace perfanalyzer { + +/// @brief Worker class for periodic concurrency mode. Issues one request only +/// and waits for all responses to come in. Notifies manager when N responses +/// (request period) have been received. Notifies manager when final response +/// has been received. +class PeriodicConcurrencyWorker : public ConcurrencyWorker { + public: + PeriodicConcurrencyWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, + const size_t max_concurrency, const bool using_json_data, + const bool streaming, const int32_t batch_size, + std::condition_variable& wake_signal, std::mutex& wake_mutex, + size_t& active_threads, bool& execute, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager, + uint64_t request_period, std::function period_completed_callback, + std::function request_completed_callback) + : ConcurrencyWorker( + id, thread_stat, thread_config, parser, data_loader, factory, + on_sequence_model, async, max_concurrency, using_json_data, + streaming, batch_size, wake_signal, wake_mutex, active_threads, + execute, infer_data_manager, sequence_manager), + request_period_(request_period), + period_completed_callback_(period_completed_callback), + request_completed_callback_(request_completed_callback) + { + } + + void Infer() override; + + std::shared_ptr CreateInferContext() override; + + void WorkerCallback(uint32_t infer_context_id); + + private: + uint64_t request_period_{0}; + std::function period_completed_callback_{nullptr}; + std::function request_completed_callback_{nullptr}; + std::function worker_callback_{std::bind( + &PeriodicConcurrencyWorker::WorkerCallback, this, std::placeholders::_1)}; +}; + +}} // namespace triton::perfanalyzer diff --git a/profile_data_collector.cc b/profile_data_collector.cc new file mode 100644 index 00000000..8cca26a7 --- /dev/null +++ b/profile_data_collector.cc @@ -0,0 +1,85 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "profile_data_collector.h" + +#include + +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +cb::Error +ProfileDataCollector::Create(std::shared_ptr* collector) +{ + std::shared_ptr local_collector{ + new ProfileDataCollector()}; + *collector = std::move(local_collector); + return cb::Error::Success; +} + +void +ProfileDataCollector::AddWindow( + InferenceLoadMode& id, uint64_t window_start_ns, uint64_t window_end_ns) +{ + auto it = FindExperiment(id); + + if (it == experiments_.end()) { + Experiment new_experiment{}; + new_experiment.mode = id; + new_experiment.window_boundaries.push_back(window_start_ns); + new_experiment.window_boundaries.push_back(window_end_ns); + + experiments_.push_back(new_experiment); + } else { + // Window timestamps are always increasing so it is safe to check only the + // last element + if (it->window_boundaries.back() != window_start_ns) { + it->window_boundaries.push_back(window_start_ns); + } + it->window_boundaries.push_back(window_end_ns); + } +} + +void +ProfileDataCollector::AddData( + InferenceLoadMode& id, std::vector&& request_records) +{ + auto it = FindExperiment(id); + + if (it == experiments_.end()) { + Experiment new_experiment{}; + new_experiment.mode = id; + new_experiment.requests = std::move(request_records); + experiments_.push_back(new_experiment); + } else { + it->requests.insert( + it->requests.end(), std::make_move_iterator(request_records.begin()), + std::make_move_iterator(request_records.end())); + } +} + +}} // namespace triton::perfanalyzer diff --git a/profile_data_collector.h b/profile_data_collector.h new file mode 100644 index 00000000..3a726bbf --- /dev/null +++ b/profile_data_collector.h @@ -0,0 +1,122 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "perf_utils.h" +#include "request_record.h" + +namespace triton { namespace perfanalyzer { + +/// Data structure to hold which inference load mode was used for an experiment. +/// Only one data member will be nonzero, indicating the inference load mode for +/// a particular experiment. +struct InferenceLoadMode { + uint32_t concurrency; + double request_rate; + + InferenceLoadMode() + { + concurrency = 0; + request_rate = 0.0; + } + + InferenceLoadMode(uint64_t c, double rr) + { + concurrency = c; + request_rate = rr; + } + + bool operator==(const InferenceLoadMode& rhs) const + { + return (concurrency == rhs.concurrency) && + (request_rate == rhs.request_rate); + } +}; + +/// Data structure to hold profile export data for an experiment (e.g. +/// concurrency 4 or request rate 50) +struct Experiment { + InferenceLoadMode mode; + std::vector requests; + std::vector window_boundaries; +}; + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockProfileDataCollector; +#endif + +/// Data structure and methods for storing profile export data. +class ProfileDataCollector { + public: + static cb::Error Create(std::shared_ptr* collector); + ~ProfileDataCollector() = default; + + + /// Add a measurement window to the collector + /// @param id Identifier for the experiment + /// @param window_start_ns The window start timestamp in nanoseconds. + /// @param window_end_ns The window end timestamp in nanoseconds. + void AddWindow( + InferenceLoadMode& id, uint64_t window_start_ns, uint64_t window_end_ns); + + /// Add request records to an experiment + /// @param id Identifier for the experiment + /// @param request_records The request information for the current experiment. 
+ void AddData( + InferenceLoadMode& id, std::vector&& request_records); + + /// Get the experiment data for the profile + /// @return Experiment data + std::vector& GetData() { return experiments_; } + + std::string& GetVersion() { return version_; } + + private: + ProfileDataCollector() = default; + + virtual std::vector::iterator FindExperiment( + InferenceLoadMode& id) + { + return std::find_if( + experiments_.begin(), experiments_.end(), + [&id](const Experiment& e) { return e.mode == id; }); + }; + + std::vector experiments_{}; + std::string version_{VERSION}; + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockProfileDataCollector; +#endif +}; +}} // namespace triton::perfanalyzer diff --git a/profile_data_exporter.cc b/profile_data_exporter.cc new file mode 100644 index 00000000..ea79d685 --- /dev/null +++ b/profile_data_exporter.cc @@ -0,0 +1,302 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
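(Illustration, not part of the patch: a minimal sketch of how the new ProfileDataCollector is expected to be driven, assuming the perf_analyzer headers introduced above; the concurrency value and window timestamps are made up.)

#include <memory>
#include <utility>
#include <vector>

#include "profile_data_collector.h"

void CollectExample()
{
  using namespace triton::perfanalyzer;

  std::shared_ptr<ProfileDataCollector> collector;
  ProfileDataCollector::Create(&collector);

  // One experiment is identified by its load mode (here: concurrency 4).
  InferenceLoadMode mode{4, 0.0};

  // Record a measurement window (timestamps in nanoseconds) ...
  collector->AddWindow(mode, 0, 5000000000ULL);

  // ... and move the request records gathered by the workers into it.
  std::vector<RequestRecord> records{};
  collector->AddData(mode, std::move(records));

  // The exporter later reads everything back, grouped per experiment.
  const std::vector<Experiment>& experiments = collector->GetData();
  (void)experiments;
}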
+ + +#include "profile_data_exporter.h" + +#include +#include +#include + +#include "client_backend/client_backend.h" + +namespace triton { namespace perfanalyzer { + +cb::Error +ProfileDataExporter::Create(std::shared_ptr* exporter) +{ + std::shared_ptr local_exporter{ + new ProfileDataExporter()}; + *exporter = std::move(local_exporter); + return cb::Error::Success; +} + +void +ProfileDataExporter::Export( + const std::vector& raw_experiments, std::string& raw_version, + std::string& file_path, cb::BackendKind& service_kind, + std::string& endpoint) +{ + ConvertToJson(raw_experiments, raw_version, service_kind, endpoint); + OutputToFile(file_path); +} + +void +ProfileDataExporter::ConvertToJson( + const std::vector& raw_experiments, std::string& raw_version, + cb::BackendKind& service_kind, std::string& endpoint) +{ + ClearDocument(); + rapidjson::Value experiments(rapidjson::kArrayType); + + for (const auto& raw_experiment : raw_experiments) { + rapidjson::Value entry(rapidjson::kObjectType); + rapidjson::Value experiment(rapidjson::kObjectType); + rapidjson::Value requests(rapidjson::kArrayType); + rapidjson::Value window_boundaries(rapidjson::kArrayType); + + AddExperiment(entry, experiment, raw_experiment); + AddRequests(entry, requests, raw_experiment); + AddWindowBoundaries(entry, window_boundaries, raw_experiment); + + experiments.PushBack(entry, document_.GetAllocator()); + } + + document_.AddMember("experiments", experiments, document_.GetAllocator()); + AddVersion(raw_version); + AddServiceKind(service_kind); + AddEndpoint(endpoint); +} + +void +ProfileDataExporter::ClearDocument() +{ + rapidjson::Document d{}; + document_.Swap(d); + document_.SetObject(); +} + +void +ProfileDataExporter::AddExperiment( + rapidjson::Value& entry, rapidjson::Value& experiment, + const Experiment& raw_experiment) +{ + rapidjson::Value mode; + rapidjson::Value value; + if (raw_experiment.mode.concurrency != 0) { + mode = rapidjson::StringRef("concurrency"); + value.SetUint64(raw_experiment.mode.concurrency); + } else { + mode = rapidjson::StringRef("request_rate"); + value.SetDouble(raw_experiment.mode.request_rate); + } + experiment.AddMember("mode", mode, document_.GetAllocator()); + experiment.AddMember("value", value, document_.GetAllocator()); + entry.AddMember("experiment", experiment, document_.GetAllocator()); +} + +void +ProfileDataExporter::AddRequests( + rapidjson::Value& entry, rapidjson::Value& requests, + const Experiment& raw_experiment) +{ + for (auto& raw_request : raw_experiment.requests) { + rapidjson::Value request(rapidjson::kObjectType); + rapidjson::Value timestamp; + + timestamp.SetUint64(raw_request.start_time_.time_since_epoch().count()); + request.AddMember("timestamp", timestamp, document_.GetAllocator()); + + if (raw_request.sequence_id_ != 0) { + rapidjson::Value sequence_id; + sequence_id.SetUint64(raw_request.sequence_id_); + request.AddMember("sequence_id", sequence_id, document_.GetAllocator()); + } + + rapidjson::Value request_inputs(rapidjson::kObjectType); + AddRequestInputs(request_inputs, raw_request.request_inputs_); + request.AddMember( + "request_inputs", request_inputs, document_.GetAllocator()); + + rapidjson::Value response_timestamps(rapidjson::kArrayType); + AddResponseTimestamps( + response_timestamps, raw_request.response_timestamps_); + request.AddMember( + "response_timestamps", response_timestamps, document_.GetAllocator()); + + rapidjson::Value response_outputs(rapidjson::kArrayType); + AddResponseOutputs(response_outputs, 
raw_request.response_outputs_); + request.AddMember( + "response_outputs", response_outputs, document_.GetAllocator()); + + requests.PushBack(request, document_.GetAllocator()); + } + entry.AddMember("requests", requests, document_.GetAllocator()); +} + +void +ProfileDataExporter::AddResponseTimestamps( + rapidjson::Value& timestamps_json, + const std::vector>& + timestamps) +{ + for (auto& timestamp : timestamps) { + rapidjson::Value timestamp_json; + timestamp_json.SetUint64(timestamp.time_since_epoch().count()); + timestamps_json.PushBack(timestamp_json, document_.GetAllocator()); + } +} + +void +ProfileDataExporter::AddRequestInputs( + rapidjson::Value& request_inputs_json, + const std::vector& request_inputs) +{ + for (const auto& request_input : request_inputs) { + for (const auto& input : request_input) { + const auto& name{input.first}; + const auto& buf{input.second.data_.get()}; + const auto& byte_size{input.second.size_}; + const auto& data_type{input.second.data_type_}; + rapidjson::Value name_json(name.c_str(), document_.GetAllocator()); + rapidjson::Value input_json{}; + // TMA-1777: support other data types + if (buf != nullptr) { + if (data_type == "BYTES" || data_type == "JSON") { + input_json.SetString( + reinterpret_cast(buf), byte_size, + document_.GetAllocator()); + } else if (data_type == "INT32") { + auto* val = reinterpret_cast(buf); + input_json.SetInt(*val); + } else if (data_type == "BOOL") { + bool is_true = (*buf > 0); + input_json.SetBool(is_true); + } else { + std::cerr << "WARNING: data type '" + data_type + + "' is not supported with JSON." + << std::endl; + } + } else { + input_json.SetString("", 0, document_.GetAllocator()); + } + request_inputs_json.AddMember( + name_json, input_json, document_.GetAllocator()); + } + } +} + +void +ProfileDataExporter::AddResponseOutputs( + rapidjson::Value& outputs_json, + const std::vector& response_outputs) +{ + for (const auto& response_output : response_outputs) { + rapidjson::Value response_output_json(rapidjson::kObjectType); + for (const auto& output : response_output) { + const auto& name{output.first}; + const auto& buf{output.second.data_.get()}; + const auto& byte_size{output.second.size_}; + rapidjson::Value name_json(name.c_str(), document_.GetAllocator()); + rapidjson::Value output_json{}; + // TMA-1777: support other data types + if (buf != nullptr) { + output_json.SetString( + reinterpret_cast(buf), byte_size, + document_.GetAllocator()); + } else { + output_json.SetString("", 0, document_.GetAllocator()); + } + response_output_json.AddMember( + name_json, output_json, document_.GetAllocator()); + } + outputs_json.PushBack(response_output_json, document_.GetAllocator()); + } +} + +void +ProfileDataExporter::AddWindowBoundaries( + rapidjson::Value& entry, rapidjson::Value& window_boundaries, + const Experiment& raw_experiment) +{ + for (auto& window : raw_experiment.window_boundaries) { + rapidjson::Value w; + w.SetUint64(window); + window_boundaries.PushBack(w, document_.GetAllocator()); + } + entry.AddMember( + "window_boundaries", window_boundaries, document_.GetAllocator()); +} + +void +ProfileDataExporter::AddVersion(std::string& raw_version) +{ + rapidjson::Value version; + version = rapidjson::StringRef(raw_version.c_str()); + document_.AddMember("version", version, document_.GetAllocator()); +} + +void +ProfileDataExporter::AddServiceKind(cb::BackendKind& kind) +{ + std::string raw_service_kind{""}; + if (kind == cb::BackendKind::TRITON) { + raw_service_kind = "triton"; + } else if (kind == 
cb::BackendKind::TENSORFLOW_SERVING) { + raw_service_kind = "tfserving"; + } else if (kind == cb::BackendKind::TORCHSERVE) { + raw_service_kind = "torchserve"; + } else if (kind == cb::BackendKind::TRITON_C_API) { + raw_service_kind = "triton_c_api"; + } else if (kind == cb::BackendKind::OPENAI) { + raw_service_kind = "openai"; + } else { + std::cerr << "Unknown service kind detected. The 'service_kind' will not " + "be specified." + << std::endl; + } + + rapidjson::Value service_kind; + service_kind.SetString(raw_service_kind.c_str(), document_.GetAllocator()); + document_.AddMember("service_kind", service_kind, document_.GetAllocator()); +} + +void +ProfileDataExporter::AddEndpoint(std::string& raw_endpoint) +{ + rapidjson::Value endpoint; + endpoint = rapidjson::StringRef(raw_endpoint.c_str()); + document_.AddMember("endpoint", endpoint, document_.GetAllocator()); +} + +void +ProfileDataExporter::OutputToFile(std::string& file_path) +{ + FILE* fp = fopen(file_path.c_str(), "w"); + if (fp == nullptr) { + throw PerfAnalyzerException( + "failed to open file for outputting raw profile data", GENERIC_ERROR); + } + char writeBuffer[65536]; + rapidjson::FileWriteStream os(fp, writeBuffer, sizeof(writeBuffer)); + + rapidjson::Writer writer(os); + document_.Accept(writer); + + fclose(fp); +} + +}} // namespace triton::perfanalyzer diff --git a/profile_data_exporter.h b/profile_data_exporter.h new file mode 100644 index 00000000..820148d7 --- /dev/null +++ b/profile_data_exporter.h @@ -0,0 +1,102 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "client_backend/client_backend.h" +#include "profile_data_collector.h" + +namespace triton { namespace perfanalyzer { + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockProfileDataExporter; +#endif + +/// Exports profile data. 
+class ProfileDataExporter { + public: + static cb::Error Create(std::shared_ptr* exporter); + ~ProfileDataExporter() = default; + + /// Export profile data to json file + /// @param raw_experiments All of the raw data for the experiments run by perf + /// analyzer + /// @param raw_version String containing the version number for the json + /// output + /// @param file_path File path to export profile data to. + /// @param service_kind Service that Perf Analyzer generates load for. + /// @param endpoint Endpoint to send the requests. + void Export( + const std::vector& raw_experiments, std::string& raw_version, + std::string& file_path, cb::BackendKind& service_kind, + std::string& endpoint); + + private: + ProfileDataExporter() = default; + /// Convert the raw data collected to json output + /// @param raw_experiments All of the raw data for the experiments run by perf + /// analyzer + /// @param raw_version String containing the version number for the json + /// output + /// @param service_kind Service that Perf Analyzer generates load for. + /// @param endpoint Endpoint to send the requests. + virtual void ConvertToJson( + const std::vector& raw_experiments, std::string& raw_version, + cb::BackendKind& service_kind, std::string& endpoint); + virtual void OutputToFile(std::string& file_path); + virtual void AddExperiment( + rapidjson::Value& entry, rapidjson::Value& experiment, + const Experiment& raw_experiment); + void AddRequests( + rapidjson::Value& entry, rapidjson::Value& requests, + const Experiment& raw_experiment); + void AddRequestInputs( + rapidjson::Value& inputs_json, + const std::vector& inputs); + void AddResponseTimestamps( + rapidjson::Value& timestamps_json, + const std::vector>& + timestamps); + void AddResponseOutputs( + rapidjson::Value& outputs_json, + const std::vector& outputs); + void AddWindowBoundaries( + rapidjson::Value& entry, rapidjson::Value& window_boundaries, + const Experiment& raw_experiment); + void AddVersion(std::string& raw_version); + void AddServiceKind(cb::BackendKind& service_kind); + void AddEndpoint(std::string& endpoint); + void ClearDocument(); + + rapidjson::Document document_{}; + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockProfileDataExporter; +#endif +}; +}} // namespace triton::perfanalyzer diff --git a/rand_ctx_id_tracker.h b/rand_ctx_id_tracker.h new file mode 100644 index 00000000..e850909a --- /dev/null +++ b/rand_ctx_id_tracker.h @@ -0,0 +1,58 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include <random>
+
+#include "ictx_id_tracker.h"
+
+namespace triton { namespace perfanalyzer {
+
+// Context ID tracker that is always available and returns random Context IDs
+//
+class RandCtxIdTracker : public ICtxIdTracker {
+ public:
+  RandCtxIdTracker() = default;
+
+  void Reset(size_t count) override
+  {
+    distribution_ = std::uniform_int_distribution<size_t>(0, count - 1);
+  }
+
+  void Restore(size_t id) override{};
+
+  size_t Get() override { return distribution_(rng_generator_); };
+
+  bool IsAvailable() override { return true; };
+
+ private:
+  std::uniform_int_distribution<size_t> distribution_;
+  std::default_random_engine rng_generator_{};
+
+  size_t max = 0;
+};
+
+}}; // namespace triton::perfanalyzer
diff --git a/rate_schedule.h b/rate_schedule.h
new file mode 100644
index 00000000..d45ecd31
--- /dev/null
+++ b/rate_schedule.h
@@ -0,0 +1,66 @@
+// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
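(Illustration, not part of the patch: the RandCtxIdTracker added above is always available and, after Reset(n), hands out uniformly distributed context IDs in [0, n). A small standalone sketch, assuming the header introduced above:)

#include <cstddef>
#include <iostream>

#include "rand_ctx_id_tracker.h"

int main()
{
  triton::perfanalyzer::RandCtxIdTracker tracker;
  tracker.Reset(4);  // subsequent Get() calls draw IDs from {0, 1, 2, 3}

  for (int i = 0; i < 8; i++) {
    std::cout << tracker.Get() << " ";  // e.g. "2 0 3 1 3 0 0 2 "
  }
  std::cout << std::boolalpha << tracker.IsAvailable() << std::endl;  // true
  return 0;
}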
+ +#pragma once + +#include +#include +#include + +namespace triton { namespace perfanalyzer { + +using NanoIntervals = std::vector; + +/// Defines a schedule, where the consumer should +/// loop through the provided intervals, and then every time it loops back to +/// the start add an additional amount equal to the duration +/// +struct RateSchedule { + NanoIntervals intervals; + std::chrono::nanoseconds duration; + + /// Returns the next timestamp in the schedule + /// + std::chrono::nanoseconds Next() + { + auto next = intervals[index_] + duration * rounds_; + + index_++; + if (index_ >= intervals.size()) { + rounds_++; + index_ = 0; + } + return next; + } + + private: + size_t rounds_ = 0; + size_t index_ = 0; +}; + +using RateSchedulePtr_t = std::shared_ptr; + +}} // namespace triton::perfanalyzer diff --git a/report_writer.cc b/report_writer.cc index 5942e79e..3d9cac6a 100644 --- a/report_writer.cc +++ b/report_writer.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,8 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "report_writer.h" + #include #include + #include "constants.h" #include "perf_analyzer_exception.h" @@ -74,7 +76,11 @@ ReportWriter::GenerateReport() } else { ofs << "Request Rate,"; } - ofs << "Inferences/Second,Client Send,"; + ofs << "Inferences/Second,"; + if (parser_->IsDecoupled()) { + ofs << "Response Throughput,"; + } + ofs << "Client Send,"; if (include_server_stats_) { ofs << "Network+Server Send/Recv,Server Queue," << "Server Compute Input,Server Compute Infer," @@ -92,17 +98,16 @@ ReportWriter::GenerateReport() ofs << ",p" << percentile.first << " latency"; } if (verbose_csv_) { - ofs << ","; if (percentile_ == -1) { - ofs << "Avg latency,"; + ofs << ",Avg latency"; } - ofs << "request/response,"; - ofs << "response wait,"; + ofs << ",request/response"; + ofs << ",response wait"; if (should_output_metrics_) { - ofs << "Avg GPU Utilization,"; - ofs << "Avg GPU Power Usage,"; - ofs << "Max GPU Memory Usage,"; - ofs << "Total GPU Memory"; + ofs << ",Avg GPU Utilization"; + ofs << ",Avg GPU Power Usage"; + ofs << ",Max GPU Memory Usage"; + ofs << ",Total GPU Memory"; } } ofs << std::endl; @@ -121,8 +126,11 @@ ReportWriter::GenerateReport() ofs << status.request_rate << ","; } - ofs << status.client_stats.infer_per_sec << "," - << (status.client_stats.avg_send_time_ns / 1000) << ","; + ofs << status.client_stats.infer_per_sec << ","; + if (parser_->IsDecoupled()) { + ofs << status.client_stats.responses_per_sec << ","; + } + ofs << (status.client_stats.avg_send_time_ns / 1000) << ","; if (include_server_stats_) { uint64_t avg_queue_ns = status.server_stats.queue_count > 0 ? 
(status.server_stats.queue_time_ns / @@ -212,12 +220,11 @@ ReportWriter::GenerateReport() status.client_stats.avg_request_time_ns / 1000; const uint64_t avg_response_wait_time_us = avg_request_time_us - avg_send_time_us - avg_receive_time_us; - ofs << ","; if (percentile_ == -1) { - ofs << avg_latency_us << ","; + ofs << "," << avg_latency_us; } - ofs << std::to_string(avg_send_time_us + avg_receive_time_us) << ","; - ofs << std::to_string(avg_response_wait_time_us) << ","; + ofs << "," << std::to_string(avg_send_time_us + avg_receive_time_us); + ofs << "," << std::to_string(avg_response_wait_time_us); if (should_output_metrics_) { if (status.metrics.size() == 1) { WriteGpuMetrics(ofs, status.metrics[0]); @@ -260,7 +267,7 @@ ReportWriter::GenerateReport() ofs << "Server Cache Hit,"; ofs << "Server Cache Miss,"; } - ofs << "Client Recv"; + ofs << "Client Recv" << std::endl; for (pa::PerfStatus& status : summary_) { auto it = status.server_stats.composing_models_stat.find( @@ -362,6 +369,8 @@ ReportWriter::WriteGpuMetrics(std::ostream& ofs, const Metrics& metric) auto& gpu_power_usage_map = metric.gpu_power_usage_per_gpu; auto& gpu_mem_usage_map = metric.gpu_memory_used_bytes_per_gpu; auto& gpu_total_mem_map = metric.gpu_memory_total_bytes_per_gpu; + // Currently assume GPU metrics will be appended to existing line + ofs << ","; for (auto& entry : gpu_util_map) { ofs << entry.first << ":" << entry.second << ";"; } @@ -377,7 +386,6 @@ ReportWriter::WriteGpuMetrics(std::ostream& ofs, const Metrics& metric) for (auto& entry : gpu_total_mem_map) { ofs << entry.first << ":" << entry.second << ";"; } - ofs << ","; } }} // namespace triton::perfanalyzer diff --git a/report_writer.h b/report_writer.h index 54bc013b..eeb09c9a 100644 --- a/report_writer.h +++ b/report_writer.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ #pragma once #include + #include "client_backend/client_backend.h" #include "inference_profiler.h" #include "metrics.h" @@ -99,7 +100,7 @@ class ReportWriter { #ifndef DOCTEST_CONFIG_DISABLE friend TestReportWriter; - protected: + public: ReportWriter() = default; #endif }; diff --git a/request_rate_manager.cc b/request_rate_manager.cc index c718368b..be12282a 100644 --- a/request_rate_manager.cc +++ b/request_rate_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -38,31 +38,20 @@ RequestRateManager::~RequestRateManager() cb::Error RequestRateManager::Create( const bool async, const bool streaming, - const uint64_t measurement_window_ms, Distribution request_distribution, - const int32_t batch_size, const size_t max_threads, - const uint32_t num_of_sequences, const size_t sequence_length, - const size_t string_length, const std::string& string_data, - const bool zero_input, std::vector& user_data, + const uint64_t measurement_window_ms, const size_t max_trials, + Distribution request_distribution, const int32_t batch_size, + const size_t max_threads, const uint32_t num_of_sequences, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, + const bool serial_sequences, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager) + std::unique_ptr* manager, + const std::unordered_map& + request_parameters) { std::unique_ptr local_manager(new RequestRateManager( async, streaming, request_distribution, batch_size, measurement_window_ms, - max_threads, num_of_sequences, sequence_length, shared_memory_type, - output_shm_size, start_sequence_id, sequence_id_range, parser, factory)); - - local_manager->threads_config_.reserve(max_threads); - - RETURN_IF_ERROR(local_manager->InitManagerInputs( - string_length, string_data, zero_input, user_data)); - - if (local_manager->shared_memory_type_ != - SharedMemoryType::NO_SHARED_MEMORY) { - RETURN_IF_ERROR(local_manager->InitSharedMemory()); - } + max_trials, max_threads, num_of_sequences, shared_memory_type, + output_shm_size, serial_sequences, parser, factory, request_parameters)); *manager = std::move(local_manager); @@ -72,31 +61,39 @@ RequestRateManager::Create( RequestRateManager::RequestRateManager( const bool async, const bool streaming, Distribution request_distribution, int32_t batch_size, const uint64_t measurement_window_ms, - const size_t max_threads, const uint32_t num_of_sequences, - const size_t sequence_length, const SharedMemoryType shared_memory_type, - const size_t output_shm_size, const uint64_t start_sequence_id, - const uint64_t sequence_id_range, + const size_t max_trials, const size_t max_threads, + const uint32_t num_of_sequences, const SharedMemoryType shared_memory_type, + const size_t output_shm_size, const bool serial_sequences, const std::shared_ptr& parser, - const std::shared_ptr& factory) + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters) : LoadManager( - async, streaming, batch_size, max_threads, sequence_length, - shared_memory_type, output_shm_size, start_sequence_id, - sequence_id_range, parser, factory), - request_distribution_(request_distribution), execute_(false) + async, streaming, batch_size, max_threads, shared_memory_type, + output_shm_size, parser, factory, request_parameters), + request_distribution_(request_distribution), execute_(false), + num_of_sequences_(num_of_sequences), serial_sequences_(serial_sequences) +{ + gen_duration_.reset(new std::chrono::nanoseconds( + max_trials * measurement_window_ms * NANOS_PER_MILLIS)); + + threads_config_.reserve(max_threads); +} + +void +RequestRateManager::InitManagerFinalize() { if (on_sequence_model_) { - for (uint64_t i = 0; i < num_of_sequences; i++) { - sequence_stat_.emplace_back(new 
SequenceStat(next_seq_id_++)); - } + sequence_manager_->InitSequenceStatuses(num_of_sequences_); } - gen_duration_.reset( - new std::chrono::nanoseconds(2 * measurement_window_ms * 1000 * 1000)); } cb::Error -RequestRateManager::ChangeRequestRate(const double request_rate) +RequestRateManager::ChangeRequestRate( + const double request_rate, const size_t request_count) { PauseWorkers(); + ConfigureThreads(request_count); // Can safely update the schedule GenerateSchedule(request_rate); ResumeWorkers(); @@ -104,338 +101,205 @@ RequestRateManager::ChangeRequestRate(const double request_rate) return cb::Error::Success; } -cb::Error -RequestRateManager::ResetWorkers() -{ - PauseWorkers(); - ResumeWorkers(); - - return cb::Error::Success; -} - void RequestRateManager::GenerateSchedule(const double request_rate) { + std::chrono::nanoseconds max_duration; std::function distribution; + if (request_distribution_ == Distribution::POISSON) { distribution = ScheduleDistribution(request_rate); + // Poisson distribution needs to generate a schedule for the maximum + // possible duration to make sure that it is as random and as close to the + // desired rate as possible + max_duration = *gen_duration_; } else if (request_distribution_ == Distribution::CONSTANT) { distribution = ScheduleDistribution(request_rate); + // Constant distribution only needs one entry per worker -- that one value + // can be repeated over and over to emulate a full schedule of any length + max_duration = std::chrono::nanoseconds(1); } else { return; } - schedule_.clear(); - schedule_.emplace_back(0); - std::mt19937 schedule_rng; - while (schedule_.back() < *gen_duration_) { - std::chrono::nanoseconds next_timestamp( - schedule_.back() + distribution(schedule_rng)); - schedule_.emplace_back(next_timestamp); - } - std::cout << "Request Rate: " << request_rate - << " inference requests per seconds" << std::endl; + auto worker_schedules = CreateWorkerSchedules(max_duration, distribution); + GiveSchedulesToWorkers(worker_schedules); } -void -RequestRateManager::PauseWorkers() +std::vector +RequestRateManager::CreateWorkerSchedules( + std::chrono::nanoseconds max_duration, + std::function distribution) { - // Pause all the threads - execute_ = false; + std::mt19937 schedule_rng; - if (threads_.empty()) { - while (threads_.size() < max_threads_) { - // Launch new thread for inferencing - threads_stat_.emplace_back(new ThreadStat()); - threads_config_.emplace_back( - new ThreadConfig(threads_.size(), max_threads_)); - - // Worker threads share the responsibility to generate the inferences at - // a particular schedule. - threads_.emplace_back( - &RequestRateManager::Infer, this, threads_stat_.back(), - threads_config_.back()); - } - } + std::vector worker_schedules = + CreateEmptyWorkerSchedules(); + std::vector thread_ids{CalculateThreadIds()}; - // Wait to see all threads are paused. 
- for (auto& thread_config : threads_config_) { - while (!thread_config->is_paused_) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } + std::chrono::nanoseconds next_timestamp(0); + size_t thread_id_index = 0; + size_t worker_index = 0; + + + // Generate schedule until we hit max_duration, but also make sure that all + // worker schedules follow the thread id distribution + // + while (next_timestamp < max_duration || + thread_id_index % thread_ids.size() != 0) { + next_timestamp = next_timestamp + distribution(schedule_rng); + worker_index = thread_ids[thread_id_index]; + thread_id_index = ++thread_id_index % thread_ids.size(); + worker_schedules[worker_index]->intervals.emplace_back(next_timestamp); } + + SetScheduleDurations(worker_schedules); + + return worker_schedules; } -void -RequestRateManager::ResumeWorkers() +std::vector +RequestRateManager::CreateEmptyWorkerSchedules() { - // Reset all the thread counters - for (auto& thread_config : threads_config_) { - thread_config->index_ = thread_config->id_; - thread_config->rounds_ = 0; + std::vector worker_schedules; + for (size_t i = 0; i < workers_.size(); i++) { + worker_schedules.push_back(std::make_shared()); } + return worker_schedules; +} - // Update the start_time_ to point to current time - start_time_ = std::chrono::steady_clock::now(); +std::vector +RequestRateManager::CalculateThreadIds() +{ + std::vector thread_ids{}; + // Determine number of ids to loop over for time stamps + size_t num_ids = 0; + if (on_sequence_model_) { + num_ids = num_of_sequences_; + } else { + num_ids = max_threads_; + } - // Wake up all the threads to begin execution - execute_ = true; - wake_signal_.notify_all(); + for (size_t i = 0; i < num_ids; i++) { + size_t t = i % DetermineNumThreads(); + thread_ids.push_back(t); + } + return thread_ids; } void -RequestRateManager::Infer( - std::shared_ptr thread_stat, - std::shared_ptr thread_config) +RequestRateManager::SetScheduleDurations( + std::vector& schedules) { - std::shared_ptr ctx(new InferContext()); - thread_stat->status_ = factory_->CreateClientBackend(&(ctx->infer_backend_)); - ctx->options_.reset(new cb::InferOptions(parser_->ModelName())); - ctx->options_->model_version_ = parser_->ModelVersion(); - ctx->options_->model_signature_name_ = parser_->ModelSignatureName(); + RateSchedulePtr_t last_schedule = schedules.back(); - thread_stat->contexts_stat_.emplace_back(); + std::chrono::nanoseconds duration = last_schedule->intervals.back(); - if (shared_memory_type_ == SharedMemoryType::NO_SHARED_MEMORY) { - thread_stat->status_ = PrepareInfer(ctx.get()); - } else { - thread_stat->status_ = PrepareSharedMemoryInfer(ctx.get()); + for (auto schedule : schedules) { + duration = std::max(schedule->intervals.back(), duration); } - if (!thread_stat->status_.IsOk()) { - return; + + for (auto schedule : schedules) { + schedule->duration = duration; } +} - uint64_t request_id = 0; - // request_id to start timestamp map - std::shared_ptr> async_req_map( - new std::map()); - - // Callback function for handling asynchronous requests - const auto callback_func = [&](cb::InferResult* result) { - std::shared_ptr result_ptr(result); - if (thread_stat->cb_status_.IsOk()) { - // Add the request timestamp to thread Timestamp vector with - // proper locking - std::lock_guard lock(thread_stat->mu_); - thread_stat->cb_status_ = result_ptr->RequestStatus(); - if (thread_stat->cb_status_.IsOk()) { - std::chrono::time_point end_time_async; - end_time_async = std::chrono::system_clock::now(); - 
std::string request_id; - thread_stat->cb_status_ = result_ptr->Id(&request_id); - const auto& it = async_req_map->find(request_id); - if (it != async_req_map->end()) { - thread_stat->request_timestamps_.emplace_back(std::make_tuple( - it->second.start_time_, end_time_async, it->second.sequence_end_, - it->second.delayed_)); - ctx->infer_backend_->ClientInferStat( - &(thread_stat->contexts_stat_[0])); - thread_stat->cb_status_ = ValidateOutputs(*ctx, result); - async_req_map->erase(request_id); - } else { - return; - } - } - } - ctx->inflight_request_cnt_--; - }; - - if (streaming_) { - // Decoupled models should not collect client side statistics - thread_stat->status_ = ctx->infer_backend_->StartStream( - callback_func, (!parser_->IsDecoupled())); - if (!thread_stat->status_.IsOk()) { - return; + +void +RequestRateManager::GiveSchedulesToWorkers( + const std::vector& worker_schedules) +{ + for (size_t i = 0; i < workers_.size(); i++) { + auto w = std::dynamic_pointer_cast(workers_[i]); + w->SetSchedule(worker_schedules[i]); + } +} + +void +RequestRateManager::PauseWorkers() +{ + // Pause all the threads + execute_ = false; + + // Wait to see all threads are paused. + for (auto& thread_config : threads_config_) { + while (!thread_config->is_paused_) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } +} - // run inferencing until receiving exit signal to maintain server load. - do { - // Should wait till main thread signals execution start - if (!execute_) { - // Ensures the clean measurements after thread is woken up. - while (ctx->inflight_request_cnt_ != 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - // Wait if no request should be sent and it is not exiting - thread_config->is_paused_ = true; - std::unique_lock lock(wake_mutex_); - wake_signal_.wait(lock, [this]() { return early_exit || execute_; }); +void +RequestRateManager::ConfigureThreads(const size_t request_count) +{ + if (threads_.empty()) { + size_t num_of_threads = DetermineNumThreads(); + while (workers_.size() < num_of_threads) { + // Launch new thread for inferencing + threads_stat_.emplace_back(new ThreadStat()); + threads_config_.emplace_back(new ThreadConfig(workers_.size())); + + workers_.push_back( + MakeWorker(threads_stat_.back(), threads_config_.back())); } + // Compute the number of sequences for each thread (take floor) + // and spread the remaining value + size_t avg_num_seqs = num_of_sequences_ / workers_.size(); + size_t num_seqs_add_one = num_of_sequences_ % workers_.size(); + size_t seq_offset = 0; - thread_config->is_paused_ = false; + size_t avg_req_count = request_count / workers_.size(); + size_t req_count_add_one = request_count % workers_.size(); - uint32_t seq_id = 0; - // Sleep if required - std::chrono::steady_clock::time_point now = - std::chrono::steady_clock::now(); - std::chrono::nanoseconds wait_time = - (schedule_[thread_config->index_] + - (thread_config->rounds_ * (*gen_duration_))) - - (now - start_time_); + for (size_t i = 0; i < workers_.size(); i++) { + size_t num_of_seq = avg_num_seqs + (i < num_seqs_add_one ? 
1 : 0); + threads_config_[i]->num_sequences_ = num_of_seq; + threads_config_[i]->seq_stat_index_offset_ = seq_offset; + seq_offset += num_of_seq; - thread_config->index_ = (thread_config->index_ + thread_config->stride_); - // Loop around the schedule to keep running - thread_config->rounds_ += (thread_config->index_ / schedule_.size()); - thread_config->index_ = thread_config->index_ % schedule_.size(); + size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0); + threads_config_[i]->num_requests_ = thread_num_reqs; - bool delayed = false; - if (wait_time.count() < 0) { - delayed = true; - } else { - std::this_thread::sleep_for(wait_time); + threads_.emplace_back(&IWorker::Infer, workers_[i]); } + } +} - // Update the inputs if required - if (using_json_data_ && (!on_sequence_model_)) { - int step_id = (thread_config->non_sequence_data_step_id_ % - data_loader_->GetTotalStepsNonSequence()) * - batch_size_; - thread_config->non_sequence_data_step_id_ += max_threads_; - thread_stat->status_ = - UpdateInputs(ctx->inputs_, ctx->valid_inputs_, 0, step_id); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = UpdateValidationOutputs( - ctx->outputs_, 0, step_id, ctx->expected_outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - } +void +RequestRateManager::ResumeWorkers() +{ + // Update the start_time_ to point to current time + start_time_ = std::chrono::steady_clock::now(); - if (on_sequence_model_) { - // Select one of the sequence at random for this request - seq_id = rand() % sequence_stat_.size(); - // Need lock to protect the order of dispatch across worker threads. - // This also helps in reporting the realistic latencies. - std::lock_guard guard(sequence_stat_[seq_id]->mtx_); - if (!early_exit) { - SetInferSequenceOptions(seq_id, ctx->options_); - - // Update the inputs if required - if (using_json_data_) { - int step_id = data_loader_->GetTotalSteps( - sequence_stat_[seq_id]->data_stream_id_) - - sequence_stat_[seq_id]->remaining_queries_; - thread_stat->status_ = UpdateInputs( - ctx->inputs_, ctx->valid_inputs_, - sequence_stat_[seq_id]->data_stream_id_, step_id); - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = UpdateValidationOutputs( - ctx->outputs_, sequence_stat_[seq_id]->data_stream_id_, step_id, - ctx->expected_outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - } - - Request( - ctx, request_id++, delayed, callback_func, async_req_map, - thread_stat); - sequence_stat_[seq_id]->remaining_queries_--; - } - } else { - Request( - ctx, request_id++, delayed, callback_func, async_req_map, - thread_stat); - } + // Wake up all the threads to begin execution + execute_ = true; + wake_signal_.notify_all(); +} - if (early_exit || (!thread_stat->cb_status_.IsOk())) { - if (on_sequence_model_) { - // Finish off all the ongoing sequences for graceful exit - for (size_t i = thread_config->id_; i < sequence_stat_.size(); - i += thread_config->stride_) { - std::lock_guard guard(sequence_stat_[i]->mtx_); - if (sequence_stat_[i]->remaining_queries_ != 0) { - ctx->options_->sequence_start_ = false; - ctx->options_->sequence_end_ = true; - ctx->options_->sequence_id_ = sequence_stat_[i]->seq_id_; - Request( - ctx, request_id++, false /* delayed */, callback_func, - async_req_map, thread_stat); - sequence_stat_[i]->remaining_queries_ = 0; - } - } - } - if (async_) { - // Loop to ensure all the inflight requests have been completed. 
- while (ctx->inflight_request_cnt_ != 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } - } - // end loop - break; - } - } while (true); +std::shared_ptr +RequestRateManager::MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) +{ + size_t id = workers_.size(); + size_t num_of_threads = DetermineNumThreads(); + return std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, num_of_threads, using_json_data_, streaming_, + batch_size_, wake_signal_, wake_mutex_, execute_, start_time_, + serial_sequences_, infer_data_manager_, sequence_manager_); } -void -RequestRateManager::Request( - std::shared_ptr context, const uint64_t request_id, - const bool delayed, cb::OnCompleteFn callback_func, - std::shared_ptr> - async_req_map, - std::shared_ptr thread_stat) +size_t +RequestRateManager::DetermineNumThreads() { - if (async_) { - context->options_->request_id_ = std::to_string(request_id); - { - std::lock_guard lock(thread_stat->mu_); - auto it = - async_req_map - ->emplace( - context->options_->request_id_, AsyncRequestProperties()) - .first; - it->second.start_time_ = std::chrono::system_clock::now(); - it->second.sequence_end_ = context->options_->sequence_end_; - it->second.delayed_ = delayed; - } - if (streaming_) { - thread_stat->status_ = context->infer_backend_->AsyncStreamInfer( - *(context->options_), context->valid_inputs_, context->outputs_); - } else { - thread_stat->status_ = context->infer_backend_->AsyncInfer( - callback_func, *(context->options_), context->valid_inputs_, - context->outputs_); - } - if (!thread_stat->status_.IsOk()) { - return; - } - context->inflight_request_cnt_++; - } else { - std::chrono::time_point start_time_sync, - end_time_sync; - start_time_sync = std::chrono::system_clock::now(); - cb::InferResult* results = nullptr; - thread_stat->status_ = context->infer_backend_->Infer( - &results, *(context->options_), context->valid_inputs_, - context->outputs_); - if (results != nullptr) { - if (thread_stat->status_.IsOk()) { - thread_stat->status_ = ValidateOutputs(*context, results); - } - delete results; - } - if (!thread_stat->status_.IsOk()) { - return; - } - end_time_sync = std::chrono::system_clock::now(); - { - // Add the request timestamp to thread Timestamp vector with proper - // locking - std::lock_guard lock(thread_stat->mu_); - thread_stat->request_timestamps_.emplace_back(std::make_tuple( - start_time_sync, end_time_sync, context->options_->sequence_end_, - delayed)); - thread_stat->status_ = context->infer_backend_->ClientInferStat( - &(thread_stat->contexts_stat_[0])); - if (!thread_stat->status_.IsOk()) { - return; - } - } + size_t num_of_threads = max_threads_; + if (on_sequence_model_) { + num_of_threads = std::min(max_threads_, num_of_sequences_); } + return num_of_threads; } + }} // namespace triton::perfanalyzer diff --git a/request_rate_manager.h b/request_rate_manager.h index a627ed4e..8c9131bb 100644 --- a/request_rate_manager.h +++ b/request_rate_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
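The rewritten GenerateSchedule/CreateWorkerSchedules path above draws inter-arrival intervals from the configured distribution (exponential for Poisson traffic, a single repeated interval for constant traffic) and deals them out to per-worker schedules in round-robin thread-id order until the target duration is covered and every worker holds the same number of slots. The sketch below shows that idea in a standalone form, assuming plain round-robin assignment and an exponential arrival process; the function name and types are illustrative, not the actual perf_analyzer implementation.

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

using std::chrono::nanoseconds;

// Deal exponentially distributed arrival times across workers, round robin.
std::vector<std::vector<nanoseconds>>
BuildWorkerSchedules(
    double request_rate, size_t num_workers, nanoseconds max_duration)
{
  std::mt19937 rng;
  // Exponential inter-arrival times approximate a Poisson arrival process
  // with the requested rate (requests per second).
  std::exponential_distribution<double> interval(request_rate);

  std::vector<std::vector<nanoseconds>> schedules(num_workers);
  nanoseconds next{0};
  size_t slot = 0;
  // Stop only on a full round-robin pass so every worker receives the same
  // number of timestamps, mirroring the loop condition in the patch above.
  while (next < max_duration || slot % num_workers != 0) {
    next += nanoseconds(static_cast<int64_t>(interval(rng) * 1e9));
    schedules[slot % num_workers].push_back(next);
    ++slot;
  }
  return schedules;
}

Each RequestRateWorker then sleeps until start_time_ plus its next scheduled offset and flags the request as delayed if that timestamp has already passed.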
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,11 +26,16 @@ #pragma once #include -#include + #include "load_manager.h" +#include "request_rate_worker.h" namespace triton { namespace perfanalyzer { +#ifndef DOCTEST_CONFIG_DISABLE +class TestRequestRateManager; +#endif + //============================================================================== /// RequestRateManager is a helper class to send inference requests to /// inference server in accordance with a Poisson distribution. This @@ -59,13 +64,13 @@ class RequestRateManager : public LoadManager { /// request. /// \param streaming Whether to use gRPC streaming API for infer request /// \param measurement_window_ms The time window for measurements. + /// \param max_trials The maximum number of windows that will be measured /// \param request_distribution The kind of distribution to use for drawing /// out intervals between successive requests. /// \param batch_size The batch size used for each request. /// \param max_threads The maximum number of working threads to be spawned. /// \param num_of_sequences The number of concurrent sequences that must be /// maintained on the server. - /// \param sequence_length The base length of each sequence. /// \param string_length The length of the string to create for input. /// \param string_data The data to use for generating string input. /// \param zero_input Whether to fill the input tensors with zero. @@ -74,100 +79,94 @@ class RequestRateManager : public LoadManager { /// \param shared_memory_type The type of shared memory to use for inputs. /// \param output_shm_size The size of the shared memory to allocate for the /// output. + /// \param serial_sequences Enable serial sequence mode. /// \param parser The ModelParser object to get the model details. /// \param factory The ClientBackendFactory object used to create /// client to the server. /// \param manager Returns a new ConcurrencyManager object. + /// \param request_parameters Custom request parameters to send to the server /// \return cb::Error object indicating success or failure. static cb::Error Create( const bool async, const bool streaming, - const uint64_t measurement_window_ms, Distribution request_distribution, - const int32_t batch_size, const size_t max_threads, - const uint32_t num_of_sequences, const size_t sequence_length, - const size_t string_length, const std::string& string_data, - const bool zero_input, std::vector& user_data, + const uint64_t measurement_window_ms, const size_t max_trials, + Distribution request_distribution, const int32_t batch_size, + const size_t max_threads, const uint32_t num_of_sequences, const SharedMemoryType shared_memory_type, const size_t output_shm_size, - const uint64_t start_sequence_id, const uint64_t sequence_id_range, - const std::shared_ptr& parser, + const bool serial_sequences, const std::shared_ptr& parser, const std::shared_ptr& factory, - std::unique_ptr* manager); + std::unique_ptr* manager, + const std::unordered_map& + request_parameters); /// Adjusts the rate of issuing requests to be the same as 'request_rate' - /// \param request_rate The rate at which requests must be issued to the - /// server. + /// \param target_request_rate The rate at which requests must be issued to + /// the server. + /// \param request_count The number of requests to generate when profiling. If + /// 0, then there is no limit, and it will generate until told to stop. 
/// \return cb::Error object indicating success or failure. - cb::Error ChangeRequestRate(const double target_request_rate); - - /// Resets all worker thread states to beginning of schedule. - /// \return cb::Error object indicating success or failure. - cb::Error ResetWorkers() override; + cb::Error ChangeRequestRate( + const double target_request_rate, const size_t request_count = 0); protected: - struct ThreadConfig { - ThreadConfig(uint32_t index, uint32_t stride) - : index_(index), id_(index), stride_(stride), is_paused_(false), - rounds_(0), non_sequence_data_step_id_(index) - { - } - - uint32_t index_; - uint32_t id_; - uint32_t stride_; - bool is_paused_; - uint64_t rounds_; - int non_sequence_data_step_id_; - }; - RequestRateManager( const bool async, const bool streaming, Distribution request_distribution, const int32_t batch_size, const uint64_t measurement_window_ms, - const size_t max_threads, const uint32_t num_of_sequences, - const size_t sequence_length, const SharedMemoryType shared_memory_type, - const size_t output_shm_size, const uint64_t start_sequence_id, - const uint64_t sequence_id_range, - const std::shared_ptr& parser, - const std::shared_ptr& factory); + const size_t max_trials, const size_t max_threads, + const uint32_t num_of_sequences, + const SharedMemoryType shared_memory_type, const size_t output_shm_size, + const bool serial_sequences, const std::shared_ptr& parser, + const std::shared_ptr& factory, + const std::unordered_map& + request_parameters); + + void InitManagerFinalize() override; /// Generates and update the request schedule as per the given request rate. /// \param request_rate The request rate to use for new schedule. void GenerateSchedule(const double request_rate); + std::vector CreateWorkerSchedules( + std::chrono::nanoseconds duration, + std::function distribution); + + std::vector CreateEmptyWorkerSchedules(); + + std::vector CalculateThreadIds(); + + void SetScheduleDurations(std::vector& schedules); + + void GiveSchedulesToWorkers( + const std::vector& worker_schedules); + // Pauses the worker threads void PauseWorkers(); + void ConfigureThreads(const size_t request_count = 0); + // Resets the counters and resumes the worker threads void ResumeWorkers(); - /// Function for worker that sends inference requests. - /// \param thread_stat Worker thread specific data. - /// \param thread_config Worker thread configuration specific data. - void Infer( - std::shared_ptr thread_stat, - std::shared_ptr thread_config); - - /// A helper function to issue inference request to the server. - /// \param context InferContext to use for sending the request. - /// \param request_id The unique id to be associated with the request. - /// \param delayed Whether the request fell behind its scheduled time. - /// \param callback_func The callback function to use with asynchronous - /// request. - /// \param async_req_map The map from ongoing request_id to the - /// request information needed to correctly interpret the details. 
- /// \param thread_stat The runnning status of the worker thread - void Request( - std::shared_ptr context, const uint64_t request_id, - const bool delayed, cb::OnCompleteFn callback_func, - std::shared_ptr> - async_req_map, - std::shared_ptr thread_stat); + // Makes a new worker + virtual std::shared_ptr MakeWorker( + std::shared_ptr, std::shared_ptr); + + size_t DetermineNumThreads(); std::vector> threads_config_; - std::unique_ptr gen_duration_; + std::shared_ptr gen_duration_; Distribution request_distribution_; - std::vector schedule_; std::chrono::steady_clock::time_point start_time_; bool execute_; + const size_t num_of_sequences_{0}; + const bool serial_sequences_{false}; + +#ifndef DOCTEST_CONFIG_DISABLE + friend TestRequestRateManager; + + public: + RequestRateManager() = default; +#endif }; }} // namespace triton::perfanalyzer diff --git a/request_rate_worker.cc b/request_rate_worker.cc new file mode 100644 index 00000000..48ccb361 --- /dev/null +++ b/request_rate_worker.cc @@ -0,0 +1,168 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "request_rate_worker.h" + +#include +#include + +#include "client_backend/client_backend.h" +#include "data_loader.h" +#include "perf_utils.h" + +namespace triton { namespace perfanalyzer { + +void +RequestRateWorker::Infer() +{ + CreateCtxIdTracker(); + CreateContexts(); + + // run inferencing until receiving exit signal to maintain server load. + do { + HandleExecuteOff(); + + bool is_delayed = SleepIfNecessary(); + uint32_t ctx_id = GetCtxId(); + SendInferRequest(ctx_id, is_delayed); + RestoreFreeCtxId(ctx_id); + + if (HandleExitConditions()) { + return; + } + + } while (true); +} + +void +RequestRateWorker::CreateCtxIdTracker() +{ + bool is_concurrency = false; + + ctx_id_tracker_ = CtxIdTrackerFactory::CreateTracker( + is_concurrency, on_sequence_model_, serial_sequences_); +} + +void +RequestRateWorker::CreateContexts() +{ + size_t active_ctx_cnt = + on_sequence_model_ ? 
thread_config_->num_sequences_ : 1; + while (ctxs_.size() < active_ctx_cnt) { + CreateContext(); + } + + ResetFreeCtxIds(); +} + +void +RequestRateWorker::ResetFreeCtxIds() +{ + std::lock_guard lock(cb_mtx_); + ctx_id_tracker_->Reset(ctxs_.size()); +} + +void +RequestRateWorker::SetSchedule(RateSchedulePtr_t schedule) +{ + schedule_ = schedule; +} + +std::chrono::nanoseconds +RequestRateWorker::GetNextTimestamp() +{ + return schedule_->Next(); +} + + +uint32_t +RequestRateWorker::GetSeqStatIndex(uint32_t ctx_id) +{ + return (thread_config_->seq_stat_index_offset_ + ctx_id); +} + +void +RequestRateWorker::HandleExecuteOff() +{ + // Should wait till main thread signals execution start + if (!execute_) { + CompleteOngoingSequences(); + WaitForOngoingRequests(); + + // Reset Ctx IDs because CompleteOngoingSequences() + // has destructive side affects + ResetFreeCtxIds(); + + // Wait if no request should be sent and it is not exiting + thread_config_->is_paused_ = true; + std::unique_lock lock(wake_mutex_); + wake_signal_.wait(lock, [this]() { return early_exit || execute_; }); + } + + thread_config_->is_paused_ = false; +} + +bool +RequestRateWorker::SleepIfNecessary() +{ + WaitForFreeCtx(); + + std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); + std::chrono::nanoseconds next_timestamp = GetNextTimestamp(); + std::chrono::nanoseconds current_timestamp = now - start_time_; + std::chrono::nanoseconds wait_time = next_timestamp - current_timestamp; + + bool delayed = false; + if (wait_time.count() < 0) { + delayed = true; + } else { + thread_stat_->idle_timer.Start(); + std::this_thread::sleep_for(wait_time); + thread_stat_->idle_timer.Stop(); + } + return delayed; +} + +void +RequestRateWorker::WaitForFreeCtx() +{ + if (!ctx_id_tracker_->IsAvailable()) { + notified_ = false; + // wait for signal from callback. + std::unique_lock lk(cb_mtx_); + thread_stat_->idle_timer.Start(); + cb_cv_.wait(lk, [this] { + if (notified_) { + notified_ = false; + return true; + } + return false; + }); + thread_stat_->idle_timer.Stop(); + } +} + +}} // namespace triton::perfanalyzer diff --git a/request_rate_worker.h b/request_rate_worker.h new file mode 100644 index 00000000..e6d1804c --- /dev/null +++ b/request_rate_worker.h @@ -0,0 +1,126 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include + +#include "ischeduler.h" +#include "load_worker.h" +#include "model_parser.h" +#include "sequence_manager.h" +#include "thread_config.h" + +namespace triton { namespace perfanalyzer { + + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockRequestRateWorker; +class TestRequestRateManager; +class TestCustomLoadManager; +#endif + +/// Worker thread for RequestRateManager +/// +/// If the model is non-sequence model, each worker uses only one context +/// to maintain concurrency assigned to worker. +/// If the model is sequence model, each worker has to use multiples contexts +/// to maintain (sequence) concurrency assigned to worker. +/// +class RequestRateWorker : public LoadWorker, public IScheduler { + public: + RequestRateWorker( + uint32_t id, std::shared_ptr thread_stat, + std::shared_ptr thread_config, + const std::shared_ptr parser, + std::shared_ptr data_loader, + const std::shared_ptr factory, + const bool on_sequence_model, const bool async, const size_t num_threads, + const bool using_json_data, const bool streaming, + const int32_t batch_size, std::condition_variable& wake_signal, + std::mutex& wake_mutex, bool& execute, + std::chrono::steady_clock::time_point& start_time, + const bool serial_sequences, + const std::shared_ptr& infer_data_manager, + std::shared_ptr sequence_manager) + : LoadWorker( + id, thread_stat, thread_config, parser, data_loader, factory, + on_sequence_model, async, streaming, batch_size, using_json_data, + wake_signal, wake_mutex, execute, infer_data_manager, + sequence_manager), + num_threads_(num_threads), start_time_(start_time), + serial_sequences_(serial_sequences) + { + } + + void Infer() override; + + /// Provides the schedule that should be followed + /// + void SetSchedule(RateSchedulePtr_t schedule) override; + + private: + RateSchedulePtr_t schedule_; + + const size_t num_threads_; + const bool serial_sequences_; + std::chrono::steady_clock::time_point& start_time_; + + void CreateCtxIdTracker(); + + std::chrono::nanoseconds GetNextTimestamp(); + + uint32_t GetSeqStatIndex(uint32_t ctx_id) override; + + void CreateContexts(); + + void HandleExecuteOff(); + void ResetFreeCtxIds(); + + // Sleep until it is time for the next part of the schedule + // Returns true if the request was delayed + bool SleepIfNecessary(); + + void WaitForFreeCtx(); + + void CreateContextFinalize(std::shared_ptr ctx) override + { + ctx->RegisterAsyncCallbackFinalize(std::bind( + &RequestRateWorker::AsyncCallbackFinalize, this, + std::placeholders::_1)); + + ctx->SetNumActiveThreads(num_threads_); + } + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockRequestRateWorker; + friend TestCustomLoadManager; + friend TestRequestRateManager; + +#endif +}; + + +}} // namespace triton::perfanalyzer diff --git a/request_record.h b/request_record.h new file mode 100644 index 00000000..91b5ca19 --- /dev/null +++ b/request_record.h @@ -0,0 +1,101 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include +#include +#include +#include + +namespace triton { namespace perfanalyzer { + +/// A record containing the data of a single request input or response output +struct RecordData { + RecordData(const uint8_t* buf, size_t size, std::string data_type = "") + { + uint8_t* array = new uint8_t[size]; + std::memcpy(array, buf, size); + data_ = std::shared_ptr(array, [](uint8_t* p) { delete[] p; }); + size_ = size; + data_type_ = data_type; + } + + // Define equality comparison operator so it can be inserted into maps + bool operator==(const RecordData& other) const + { + if (size_ != other.size_) + return false; + // Compare the contents of the arrays + return std::memcmp(data_.get(), other.data_.get(), size_) == 0; + } + + std::shared_ptr data_; + size_t size_; + std::string data_type_; +}; + + +/// A record of an individual request +struct RequestRecord { + using RequestInput = std::unordered_map; + using ResponseOutput = std::unordered_map; + + RequestRecord( + std::chrono::time_point start_time = + std::chrono::time_point(), + std::vector> + response_timestamps = {}, + std::vector request_inputs = {}, + std::vector response_outputs = {}, + bool sequence_end = true, bool delayed = false, uint64_t sequence_id = 0, + bool has_null_last_response = false) + : start_time_(start_time), response_timestamps_(response_timestamps), + request_inputs_(request_inputs), response_outputs_(response_outputs), + sequence_end_(sequence_end), delayed_(delayed), + sequence_id_(sequence_id), + has_null_last_response_(has_null_last_response) + { + } + // The timestamp of when the request was started. + std::chrono::time_point start_time_; + // Collection of response timestamps + std::vector> + response_timestamps_; + + std::vector request_inputs_; + std::vector response_outputs_; + // Whether or not the request is at the end of a sequence. + bool sequence_end_; + // Whether or not the request is delayed as per schedule. 
+ bool delayed_; + // Sequence ID of the request + uint64_t sequence_id_; + // Whether the last response is null + bool has_null_last_response_; +}; + +}} // namespace triton::perfanalyzer diff --git a/sequence_manager.cc b/sequence_manager.cc new file mode 100644 index 00000000..eaf5d6e0 --- /dev/null +++ b/sequence_manager.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
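request_record.h above replaces the old request-timestamp tuples with a richer per-request record: RecordData deep-copies a raw buffer into shared ownership, and RequestRecord bundles the start time, per-response timestamps, captured inputs and outputs, and the sequence/delay flags. The construction below is a hypothetical usage sketch that assumes those declarations (template arguments such as std::unordered_map<std::string, RecordData> are inferred from how the types are used); the values are made up for illustration.

#include <chrono>
#include <cstdint>
#include <vector>

#include "request_record.h"

triton::perfanalyzer::RequestRecord
MakeExampleRecord()
{
  using triton::perfanalyzer::RecordData;
  using triton::perfanalyzer::RequestRecord;

  // RecordData copies the buffer, so the caller's memory can be reused.
  const uint8_t raw[4]{1, 2, 3, 4};
  RequestRecord::RequestInput input{
      {"INPUT0", RecordData(raw, sizeof(raw), "INT32")}};

  const auto start = std::chrono::system_clock::now();
  std::vector<std::chrono::time_point<std::chrono::system_clock>> responses{
      start + std::chrono::milliseconds(5)};

  return RequestRecord(
      start, responses, {input}, /*response_outputs=*/{},
      /*sequence_end=*/true, /*delayed=*/false, /*sequence_id=*/42,
      /*has_null_last_response=*/false);
}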
+ +#include "sequence_manager.h" + +namespace triton { namespace perfanalyzer { + +SequenceManager::SequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader) + : start_sequence_id_(start_sequence_id), + sequence_id_range_(sequence_id_range), sequence_length_(sequence_length), + sequence_length_specified_(sequence_length_specified), + sequence_length_variation_(sequence_length_variation), + using_json_data_(using_json_data), data_loader_(data_loader) +{ + distribution_ = std::uniform_int_distribution( + 0, data_loader_->GetDataStreamsCount() - 1); +} + +void +SequenceManager::InitSequenceStatuses(size_t num_sequence_statuses) +{ + sequence_statuses_.clear(); + for (size_t sequence_status_index{0}; + sequence_status_index < num_sequence_statuses; sequence_status_index++) { + sequence_statuses_.push_back(std::make_shared()); + } +} + +const uint64_t +SequenceManager::GetSequenceID(size_t sequence_status_index) const +{ + return sequence_statuses_.at(sequence_status_index)->seq_id_; +} + +std::mutex& +SequenceManager::GetMutex(size_t sequence_status_index) +{ + return sequence_statuses_.at(sequence_status_index)->mtx_; +} + +const uint64_t +SequenceManager::GetDataStreamID(size_t sequence_status_index) const +{ + return sequence_statuses_.at(sequence_status_index)->data_stream_id_; +} + +const size_t +SequenceManager::GetRemainingQueries(size_t sequence_status_index) const +{ + return sequence_statuses_.at(sequence_status_index)->remaining_queries_; +} + +void +SequenceManager::SetRemainingQueries( + size_t sequence_status_index, size_t remaining_queries) +{ + sequence_statuses_.at(sequence_status_index)->remaining_queries_ = + remaining_queries; +} + +void +SequenceManager::DecrementRemainingQueries(size_t sequence_status_index) +{ + sequence_statuses_.at(sequence_status_index)->remaining_queries_--; +} + +const size_t +SequenceManager::GetNumSequenceStatuses() const +{ + return sequence_statuses_.size(); +} + +void +SequenceManager::SetInferSequenceOptions( + const uint32_t seq_stat_index, std::unique_ptr& options) +{ + options->sequence_start_ = + (sequence_statuses_[seq_stat_index]->remaining_queries_ == 0); + + // New sequence must be initialized before setting the id. + if (options->sequence_start_) { + InitNewSequence(seq_stat_index); + } + options->sequence_id_ = sequence_statuses_[seq_stat_index]->seq_id_; + options->sequence_end_ = + (sequence_statuses_[seq_stat_index]->remaining_queries_ == 1); +} + +const size_t +SequenceManager::GetSequenceLength(size_t sequence_status_index) const +{ + return sequence_statuses_.at(sequence_status_index)->sequence_length_; +} + +void +SequenceManager::InitNewSequence(int seq_stat_index) +{ + sequence_statuses_[seq_stat_index]->seq_id_ = GetNextSeqId(seq_stat_index); + if (!using_json_data_) { + size_t new_length = GetRandomSequenceLength(sequence_length_variation_); + sequence_statuses_[seq_stat_index]->remaining_queries_ = + new_length == 0 ? 1 : new_length; + } else { + // Selecting next available data stream based on uniform distribution. 
+ const uint64_t data_stream_id{GetNewDataStreamId()}; + sequence_statuses_[seq_stat_index]->data_stream_id_ = data_stream_id; + const size_t total_steps{data_loader_->GetTotalSteps(data_stream_id)}; + if (sequence_length_specified_) { + const size_t varied_sequence_length{ + GetRandomSequenceLength(sequence_length_variation_)}; + sequence_statuses_[seq_stat_index]->sequence_length_ = + varied_sequence_length; + } else { + sequence_statuses_[seq_stat_index]->sequence_length_ = total_steps; + } + sequence_statuses_[seq_stat_index]->remaining_queries_ = + sequence_statuses_[seq_stat_index]->sequence_length_; + } +} + +uint64_t +SequenceManager::GetNextSeqId(int seq_stat_index) +{ + uint64_t old_seq_id = sequence_statuses_[seq_stat_index]->seq_id_; + uint64_t next_seq_id = + curr_seq_id_++ % sequence_id_range_ + start_sequence_id_; + + // If the next sequence ID is still in use, reuse the same sequence ID + // that this sequence_status used last time + // + for (uint i = 0; i < sequence_statuses_.size(); i++) { + if (next_seq_id == sequence_statuses_[i]->seq_id_) { + next_seq_id = old_seq_id; + break; + } + } + return next_seq_id; +} + +size_t +SequenceManager::GetRandomSequenceLength(double offset_ratio) +{ + int random_offset = ((2.0 * rand() / double(RAND_MAX)) - 1.0) * offset_ratio / + 100.0 * sequence_length_; + if (int(sequence_length_) + random_offset <= 0) { + return 1; + } + return sequence_length_ + random_offset; +} + +}} // namespace triton::perfanalyzer diff --git a/sequence_manager.h b/sequence_manager.h new file mode 100644 index 00000000..c419a87f --- /dev/null +++ b/sequence_manager.h @@ -0,0 +1,218 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
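GetRandomSequenceLength above perturbs the base sequence length by a uniform draw in plus or minus offset_ratio percent of that length and clamps the result to at least one request. The standalone version below works through the same arithmetic (the function name is illustrative, not the class method).

#include <cstddef>
#include <cstdlib>

size_t
VariedSequenceLength(size_t base_length, double offset_ratio_percent)
{
  // Uniform value in [-1.0, 1.0], scaled to the requested percentage of the
  // base length; e.g. base 20 with 20% variation yields lengths in [16, 24].
  double unit = (2.0 * std::rand() / static_cast<double>(RAND_MAX)) - 1.0;
  int offset =
      static_cast<int>(unit * offset_ratio_percent / 100.0 * base_length);
  int varied = static_cast<int>(base_length) + offset;
  return varied <= 0 ? 1 : static_cast<size_t>(varied);
}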
+#pragma once + +#include +#include +#include +#include +#include + +#include "client_backend/client_backend.h" +#include "data_loader.h" +#include "sequence_status.h" + +namespace triton { namespace perfanalyzer { + +#ifndef DOCTEST_CONFIG_DISABLE +class NaggyMockSequenceManager; +#endif + +/// Manages operations related to preparing requests to sequence models. +/// +class SequenceManager { + public: + /// Constructs the sequence manager object. Involves initializing the + /// distribution for randomly assigning input data streams to new sequences. + /// \param start_sequence_id See associated data member description. + /// \param sequence_id_range See associated data member description. + /// \param sequence_length See associated data member description. + /// \param sequence_length_specified See associated data member description. + /// \param sequence_length_variation See associated data member description. + /// \param using_json_data See associated data member description. + /// \param data_loader See associated data member description. + /// \return The constructed sequence manager object. + /// + SequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader); + + /// Initializes the sequence statuses data structure. + /// \param num_sequence_statuses The number of sequence status objects to + /// create. + /// + void InitSequenceStatuses(size_t num_sequence_statuses); + + /// Gets the sequence ID for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// \return The sequence ID for the specified sequence status object. + /// + const uint64_t GetSequenceID(size_t sequence_status_index) const; + + /// Gets a non-const reference to the mutex for the specified sequence status + /// object. + /// \param sequence_status_index The index of the sequence status object. + /// \return A non-const reference to the mutex for the specified sequence + /// status object. + /// + std::mutex& GetMutex(size_t sequence_status_index); + + /// Gets the data stream ID for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// \return The data stream ID for the specified sequence status object. + /// + const uint64_t GetDataStreamID(size_t sequence_status_index) const; + + /// Gets the remaining queries for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// \return The remaining queries for the specified sequence status object. + /// + const size_t GetRemainingQueries(size_t sequence_status_index) const; + + /// Sets the remaining queries for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// \param remaining_queries The new value of the remaining queries for the + /// specified sequence status object. + /// + void SetRemainingQueries( + size_t sequence_status_index, size_t remaining_queries); + + /// Decrements the remaining queries for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// + void DecrementRemainingQueries(size_t sequence_status_index); + + /// Gets the number of sequence status objects in the sequence statuses data + /// structure. 
+ /// \param sequence_status_index The index of the sequence status object. + /// \return The number of sequence status objects in the sequence statuses + /// data structure. + /// + const size_t GetNumSequenceStatuses() const; + + /// Sets options related to a single request to a sequence model. + /// \param seq_stat_index The index for the sequence status object that is + /// having its options set. + /// \param options The options object for the request that is being prepared. + /// + virtual void SetInferSequenceOptions( + const uint32_t seq_stat_index, + std::unique_ptr& options); + + /// Gets the sequence length for the specified sequence status object. + /// \param sequence_status_index The index of the sequence status object. + /// \return The sequence length for the specified sequence status object. + /// + const size_t GetSequenceLength(size_t sequence_status_index) const; + + private: + /// Initializes values for a sequence status object. + /// \param seq_stat_index The index for the sequence status object that is + /// being initialized. + /// + virtual void InitNewSequence(int seq_stat_index); + + /// Determines an appropriate next sequence ID for a renewed sequence status + /// object. + /// \param seq_stat_index The index for the sequence for which a request is + /// being prepared. + /// \return The potentially new sequence ID to be used by a renewed sequence + /// status object. + /// + virtual uint64_t GetNextSeqId(int seq_stat_index); + + virtual uint64_t GetNewDataStreamId() + { + return distribution_(rng_generator_); + } + + /// Generates a random sequence length based on a threshold. + /// \param offset_ratio The offset ratio/threshold of the generated length. + /// \return A random sequence length. + /// + virtual size_t GetRandomSequenceLength(double offset_ratio); + + /// Data structure holding sequence status objects + /// + std::vector> sequence_statuses_{}; + + /// Current sequence id (for issuing new sequences) + /// + std::atomic curr_seq_id_{0}; + + /// Data loader to be used for various sequence operations. + /// + std::shared_ptr data_loader_{nullptr}; + + /// The starting sequence ID to be used for iterating through valid sequence + /// IDs. + /// + const uint64_t start_sequence_id_{0}; + + /// The maximum sequence ID to be used for iterating through valid sequence + /// IDs. + /// + const uint64_t sequence_id_range_{0}; + + /// The base length of new sequences. + /// + const size_t sequence_length_{0}; + + /// Whether the user specified the sequence length. + /// + const bool sequence_length_specified_{false}; + + /// The percentage variation in length of sequences using autogenerated data + /// as input. + /// + const double sequence_length_variation_{0.0}; + + /// Indicates whether to generate sequence request input data or read it from + /// a JSON file. + /// + const bool using_json_data_{false}; + + /// The distribution for randomly assigning new sequences a data stream in the + /// input data JSON. + /// + std::uniform_int_distribution distribution_; + + /// The random number generator for randomly assigning new sequences a data + /// stream in the input data JSON. 
+ /// + std::default_random_engine rng_generator_{}; + +#ifndef DOCTEST_CONFIG_DISABLE + friend NaggyMockSequenceManager; + + public: + SequenceManager() = default; +#endif +}; + +}} // namespace triton::perfanalyzer diff --git a/sequence_status.h b/sequence_status.h new file mode 100644 index 00000000..16ec3bf4 --- /dev/null +++ b/sequence_status.h @@ -0,0 +1,51 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include + +namespace triton { namespace perfanalyzer { + +// Holds the status of the inflight sequence +struct SequenceStatus { + SequenceStatus(uint64_t seq_id = 0) + : seq_id_(seq_id), data_stream_id_(0), remaining_queries_(0) + { + } + // The unique correlation id allocated to the sequence + uint64_t seq_id_; + // The data stream id providing data for the sequence + uint64_t data_stream_id_; + // The number of queries remaining to complete the sequence + size_t remaining_queries_; + // The length of the sequence + size_t sequence_length_{0}; + // A lock to protect sequence data + std::mutex mtx_; +}; + +}} // namespace triton::perfanalyzer diff --git a/tensor_data.h b/tensor_data.h new file mode 100644 index 00000000..6f5cf719 --- /dev/null +++ b/tensor_data.h @@ -0,0 +1,40 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
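The SequenceManager and SequenceStatus declarations above centralize the per-sequence bookkeeping that the workers previously handled inline. The sketch below shows a hypothetical worker-side call sequence; it assumes cb aliases the triton::perfanalyzer::clientbackend namespace from client_backend.h, and the free function is only illustrative since the real call sites live in the worker classes.

#include <cstdint>
#include <memory>
#include <mutex>

#include "client_backend/client_backend.h"
#include "sequence_manager.h"

namespace cb = triton::perfanalyzer::clientbackend;

void
PrepareSequenceRequest(
    triton::perfanalyzer::SequenceManager& sequence_manager,
    uint32_t seq_stat_index, std::unique_ptr<cb::InferOptions>& options)
{
  // The per-sequence mutex keeps start/end flags and correlation IDs
  // consistent when several workers touch the same sequence slot.
  std::lock_guard<std::mutex> lock(sequence_manager.GetMutex(seq_stat_index));
  sequence_manager.SetInferSequenceOptions(seq_stat_index, options);
  // ... issue the inference request using `options` here ...
  sequence_manager.DecrementRemainingQueries(seq_stat_index);
}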
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +namespace triton { namespace perfanalyzer { + +/// Data for one input or output tensor +/// +struct TensorData { + const uint8_t* data_ptr{nullptr}; + size_t batch1_size{0}; + bool is_valid{false}; + std::string name; +}; + + +}} // namespace triton::perfanalyzer diff --git a/test_command_line_parser.cc b/test_command_line_parser.cc index 51be9758..765def11 100644 --- a/test_command_line_parser.cc +++ b/test_command_line_parser.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,14 +25,17 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #include + #include + #include "command_line_parser.h" #include "doctest.h" +#include "perf_analyzer_exception.h" namespace triton { namespace perfanalyzer { inline void -CHECK_STRING(const char* name, std::string str, const char* val) +CHECK_STRING(const char* name, const std::string& str, const std::string& val) { CHECK_MESSAGE( !str.compare(val), name, " expecting '", val, "', found '", str, "'"); @@ -45,6 +48,12 @@ CHECK_STRING(std::string act, std::string exp) !act.compare(exp), "Expecting: '", exp, "', Found: '", act, "'"); } +std::string +CreateUsageMessage(const std::string& option_name, const std::string& msg) +{ + return "Failed to parse " + option_name + ". " + msg; +} + // Performs a doc test check against all the individual parameters // in a PAParams object. 
// @@ -167,8 +176,81 @@ CHECK_PARAMS(PAParamsPtr act, PAParamsPtr exp) CHECK_STRING(act->filename, act->filename); CHECK(act->mpi_driver != nullptr); CHECK_STRING(act->memory_type, exp->memory_type); + CHECK( + act->is_using_periodic_concurrency_mode == + exp->is_using_periodic_concurrency_mode); + CHECK( + act->periodic_concurrency_range.start == + exp->periodic_concurrency_range.start); + CHECK( + act->periodic_concurrency_range.end == + exp->periodic_concurrency_range.end); + CHECK( + act->periodic_concurrency_range.step == + exp->periodic_concurrency_range.step); + CHECK(act->request_period == exp->request_period); + CHECK(act->request_parameters.size() == exp->request_parameters.size()); + for (auto act_param : act->request_parameters) { + auto exp_param = exp->request_parameters.find(act_param.first); + REQUIRE_MESSAGE( + exp_param != exp->request_parameters.end(), + "Unexpected parameter: ", act_param.first); + + CHECK(act_param.second.value == exp_param->second.value); + CHECK(act_param.second.type == exp_param->second.type); + } } + +#define CHECK_INT_OPTION(option_name, exp_val, msg) \ + SUBCASE("valid value") \ + { \ + int argc = 5; \ + char* argv[argc] = {app_name, "-m", model_name, option_name, "2000"}; \ + CAPTURE(argv[3]); \ + CAPTURE(argv[4]); \ + \ + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); \ + CHECK(!parser.UsageCalled()); \ + CAPTURE(parser.GetUsageMessage()); \ + \ + exp_val = 2000; \ + CAPTURE(exp_val); \ + } \ + \ + SUBCASE("negative value") \ + { \ + int argc = 5; \ + char* argv[argc] = {app_name, "-m", model_name, option_name, "-2000"}; \ + CHECK_THROWS_WITH_AS( \ + act = parser.Parse(argc, argv), msg.c_str(), PerfAnalyzerException); \ + \ + check_params = false; \ + } \ + \ + SUBCASE("floating point value") \ + { \ + int argc = 5; \ + char* argv[argc] = {app_name, "-m", model_name, option_name, "29.5"}; \ + \ + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); \ + CHECK(!parser.UsageCalled()); \ + \ + exp_val = 29; \ + } \ + \ + SUBCASE("missing value") \ + { \ + int argc = 4; \ + char* argv[argc] = {app_name, "-m", model_name, option_name}; \ + \ + CHECK_THROWS_WITH_AS( \ + act = parser.Parse(argc, argv), "", PerfAnalyzerException); \ + \ + check_params = false; \ + } + + TEST_CASE("Testing PerfAnalyzerParameters") { PAParamsPtr params(new PerfAnalyzerParameters{}); @@ -180,7 +262,9 @@ TEST_CASE("Testing PerfAnalyzerParameters") CHECK(params->max_threads_specified == false); CHECK(params->sequence_length == 20); CHECK(params->percentile == -1); + CHECK(params->request_count == 0); CHECK(params->user_data.size() == 0); + CHECK_STRING("endpoint", params->endpoint, ""); CHECK(params->input_shapes.size() == 0); CHECK(params->measurement_window_ms == 5000); CHECK(params->using_concurrency_range == false); @@ -215,7 +299,8 @@ TEST_CASE("Testing PerfAnalyzerParameters") clientbackend::GrpcCompressionAlgorithm::COMPRESS_NONE); CHECK(params->measurement_mode == MeasurementMode::TIME_WINDOWS); CHECK(params->measurement_request_count == 50); - CHECK_STRING("triton_server_path", params->triton_server_path, ""); + CHECK_STRING( + "triton_server_path", params->triton_server_path, "/opt/tritonserver"); CHECK_STRING("model_repository_path", params->model_repository_path, ""); CHECK(params->start_sequence_id == 1); CHECK(params->sequence_id_range == UINT32_MAX); @@ -279,17 +364,160 @@ class TestCLParser : public CLParser { virtual void Usage(const std::string& msg = std::string()) { - usage_called_ = true; - usage_message_ = msg; + throw PerfAnalyzerException(msg, 
GENERIC_ERROR); } }; +void +CheckValidRange( + std::vector& args, char* option_name, TestCLParser& parser, + PAParamsPtr& act, bool& using_range, Range& range) +{ + SUBCASE("start:end provided") + { + args.push_back(option_name); + args.push_back("100:400"); // start:end + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + using_range = true; + range.start = 100; + range.end = 400; + } + + SUBCASE("start:end:step provided") + { + args.push_back(option_name); + args.push_back("100:400:10"); // start:end:step + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + using_range = true; + range.start = 100; + range.end = 400; + range.step = 10; + } +} + +void +CheckInvalidRange( + std::vector& args, char* option_name, TestCLParser& parser, + PAParamsPtr& act, bool& check_params) +{ + std::string expected_msg; + + SUBCASE("too many input values") + { + args.push_back(option_name); + args.push_back("200:100:25:10"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = CreateUsageMessage( + option_name, "The value does not match ."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("invalid start value") + { + args.push_back(option_name); + args.push_back("bad:400:10"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + CreateUsageMessage(option_name, "Invalid value provided: bad:400:10"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("invalid end value") + { + args.push_back(option_name); + args.push_back("100:bad:10"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + CreateUsageMessage(option_name, "Invalid value provided: 100:bad:10"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("invalid step value") + { + args.push_back(option_name); + args.push_back("100:400:bad"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + CreateUsageMessage(option_name, "Invalid value provided: 100:400:bad"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("no input values") + { + args.push_back(option_name); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + // BUG (TMA-1307): Usage message does not contain error. 
Error statement + // "option '--concurrency-range' requires an argument" written directly + // to std::out + // + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), "", PerfAnalyzerException); + + check_params = false; + } +} + + TEST_CASE("Testing Command Line Parser") { char* model_name = "my_model"; char* app_name = "test_perf_analyzer"; - opterr = 0; // Disable error output for GetOpt library + std::string expected_msg; + std::vector args{app_name, "-m", model_name}; + + opterr = 1; // Enable error output for GetOpt library + bool check_params = true; TestCLParser parser; // Command Line parser under test PAParamsPtr act; // Actual options parsed from parser @@ -304,13 +532,13 @@ TEST_CASE("Testing Command Line Parser") int argc = 1; char* argv[argc] = {app_name}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - REQUIRE(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), "-m flag must be specified"); + expected_msg = + CreateUsageMessage("-m (model name)", "The value must be specified."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->model_name = ""; - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("with min parameters") @@ -318,11 +546,8 @@ TEST_CASE("Testing Command Line Parser") int argc = 3; char* argv[argc] = {app_name, "-m", model_name}; - PAParamsPtr act; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); REQUIRE(!parser.UsageCalled()); - - CHECK_PARAMS(act, exp); } SUBCASE("Option : --streaming") @@ -332,16 +557,13 @@ TEST_CASE("Testing Command Line Parser") int argc = 2; char* argv[argc] = {app_name, "--streaming"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - REQUIRE(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "streaming is only allowed with gRPC protocol"); + expected_msg = + CreateUsageMessage("-m (model name)", "The value must be specified."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->model_name = ""; - exp->streaming = true; - // exp->max_threads = 16; - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("with model") @@ -349,18 +571,15 @@ TEST_CASE("Testing Command Line Parser") int argc = 4; char* argv[argc] = {app_name, "-m", model_name, "--streaming"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - REQUIRE(parser.UsageCalled()); - // NOTE: This is not an informative error message, how do I specify a gRPC - // protocol? Error ouput should list missing params. + // protocol? Error output should list missing params. 
// - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "streaming is only allowed with gRPC protocol"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "Streaming is only allowed with gRPC protocol.", + PerfAnalyzerException); - exp->streaming = true; - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("with model last") @@ -368,15 +587,12 @@ TEST_CASE("Testing Command Line Parser") int argc = 4; char* argv[argc] = {app_name, "--streaming", "-m", model_name}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - - REQUIRE(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "streaming is only allowed with gRPC protocol"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "Streaming is only allowed with gRPC protocol.", + PerfAnalyzerException); - exp->streaming = true; - CHECK_PARAMS(act, exp); + check_params = false; } } @@ -392,7 +608,6 @@ TEST_CASE("Testing Command Line Parser") exp->max_threads = 1; exp->max_threads_specified = true; - CHECK_PARAMS(act, exp); } SUBCASE("set to max") @@ -405,7 +620,6 @@ TEST_CASE("Testing Command Line Parser") exp->max_threads = 65535; exp->max_threads_specified = true; - CHECK_PARAMS(act, exp); } SUBCASE("missing value") @@ -413,17 +627,15 @@ TEST_CASE("Testing Command Line Parser") int argc = 4; char* argv[argc] = {app_name, "-m", model_name, "--max-threads"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - REQUIRE(parser.UsageCalled()); - // NOTE: Empty message is not helpful // - CHECK_STRING("Usage Message", parser.GetUsageMessage(), ""); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), "", PerfAnalyzerException); + // BUG: Dumping string "option '--max-threads' requires an argument" // directly to std::out, instead of through usage() // - - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("bad value") @@ -431,17 +643,15 @@ TEST_CASE("Testing Command Line Parser") int argc = 4; char* argv[argc] = {app_name, "-m", model_name, "--max-threads", "bad"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - REQUIRE(parser.UsageCalled()); - // NOTE: Empty message is not helpful // - CHECK_STRING("Usage Message", parser.GetUsageMessage(), ""); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), "", PerfAnalyzerException); + // BUG: Dumping string "option '--max-threads' requires an argument" // directly to std::out, instead of through usage() // - - CHECK_PARAMS(act, exp); + check_params = false; } } @@ -450,14 +660,54 @@ TEST_CASE("Testing Command Line Parser") SUBCASE("set to 2000") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--sequence-length", - "2000"}; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-length", "2000"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); exp->sequence_length = 2000; - CHECK_PARAMS(act, exp); + } + + SUBCASE("set to 0") + { + int argc = 5; + char* argv[argc] = {app_name, "-m", model_name, "--sequence-length", "0"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->sequence_length = 20; + } + } + + SUBCASE("Option : --sequence-length-variation") + { + SUBCASE("non-negative") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-length-variation", "33.3"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->sequence_length_variation = 33.3; + } + + SUBCASE("negative") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, 
"--sequence-length-variation", "-10"}; + + expected_msg = CreateUsageMessage( + "--sequence-length-variation", "The value must be >= 0.0."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; } } @@ -472,7 +722,6 @@ TEST_CASE("Testing Command Line Parser") CHECK(!parser.UsageCalled()); exp->percentile = 25; - CHECK_PARAMS(act, exp); } SUBCASE("set to 225 - overflow check") @@ -480,14 +729,14 @@ TEST_CASE("Testing Command Line Parser") int argc = 5; char* argv[argc] = {app_name, "-m", model_name, "--percentile", "225"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "percentile must be -1 for not reporting or in range (0, 100)"); + expected_msg = CreateUsageMessage( + "--percentile", + "The value must be -1 for not reporting or in range (0, 100)."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->percentile = 225; - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("set to -1 - use average latency") @@ -499,7 +748,6 @@ TEST_CASE("Testing Command Line Parser") CHECK(!parser.UsageCalled()); exp->percentile = -1; - CHECK_PARAMS(act, exp); } } @@ -508,14 +756,13 @@ TEST_CASE("Testing Command Line Parser") SUBCASE("set to `/usr/data`") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--data-directory", - "/usr/data"}; + char* argv[argc] = { + app_name, "-m", model_name, "--data-directory", "/usr/data"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); exp->user_data.push_back("/usr/data"); - CHECK_PARAMS(act, exp); } SUBCASE("call twice") @@ -534,38 +781,151 @@ TEST_CASE("Testing Command Line Parser") exp->user_data.push_back("/usr/data"); exp->user_data.push_back("/another/dir"); - CHECK_PARAMS(act, exp); } } + SUBCASE("Option : --sequence-id-range") + { + SUBCASE("One arg") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-id-range", "53"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->start_sequence_id = 53; + exp->sequence_id_range = UINT32_MAX; + } + SUBCASE("Two args") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-id-range", "53:67"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->start_sequence_id = 53; + exp->sequence_id_range = 14; + } + SUBCASE("Three args") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-id-range", "53:67:92"}; + + expected_msg = CreateUsageMessage( + "--sequence-id-range", "The value does not match ."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + SUBCASE("Not a number") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-id-range", "BAD"}; + + expected_msg = CreateUsageMessage( + "--sequence-id-range", "Invalid value provided: BAD"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; // Usage message called + } + SUBCASE("Not a number 2") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--sequence-id-range", "53:BAD"}; + + expected_msg = CreateUsageMessage( + "--sequence-id-range", "Invalid value provided: 53:BAD"); + CHECK_THROWS_WITH_AS( + 
act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; // Usage message called + } + } + + + SUBCASE("Option : --input-tensor-format") + { + SUBCASE("binary") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--input-tensor-format", "binary"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->input_tensor_format = cb::TensorFormat::BINARY; + } + SUBCASE("json") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--input-tensor-format", "json"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->input_tensor_format = cb::TensorFormat::JSON; + } + SUBCASE("invalid") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--input-tensor-format", "invalid"}; + + expected_msg = CreateUsageMessage( + "--input-tensor-format", + "Unsupported type provided: 'invalid'. The available options are " + "'binary' or 'json'."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + } + + SUBCASE("Option : --shape") { SUBCASE("expected input, single shape") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name:1,2,3"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name:1,2,3"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); exp->input_shapes.emplace( std::string("input_name"), std::vector{1, 2, 3}); - CHECK_PARAMS(act, exp); } SUBCASE("expected input, multiple shapes") { int argc = 9; - char* argv[argc] = {app_name, - "-m", - model_name, - "--shape", - "input_name:1,2,3", - "--shape", - "alpha:10,24", - "--shape", - "beta:10,200,34,15,9000"}; + char* argv[argc] = { + app_name, + "-m", + model_name, + "--shape", + "input_name:1,2,3", + "--shape", + "alpha:10,24", + "--shape", + "beta:10,200,34,15,9000"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); @@ -576,38 +936,36 @@ TEST_CASE("Testing Command Line Parser") std::string("alpha"), std::vector{10, 24}); exp->input_shapes.emplace( std::string("beta"), std::vector{10, 200, 34, 15, 9000}); - CHECK_PARAMS(act, exp); } SUBCASE("using negative dims") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name:-1,2,3"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name:-1,2,3"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), "input shape must be > 0"); + expected_msg = CreateUsageMessage( + "--shape", "The dimensions of input tensor must be > 0."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->input_shapes.emplace( - std::string("input_name"), std::vector{-1, 2, 3}); - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("equals sign, not colon") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name=-1,2,3"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name=-1,2,3"}; - // BUG this should call usages with the message - // "failed to parse input shape. 
There must be a colon after input name - // - CHECK_THROWS_WITH( - act = parser.Parse(argc, argv), - "basic_string::substr: __pos (which is 18) > this->size() (which is " - "17)"); + expected_msg = CreateUsageMessage( + "--shape", "There must be a colon after input name."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; } SUBCASE("missing shape") @@ -615,62 +973,58 @@ TEST_CASE("Testing Command Line Parser") int argc = 5; char* argv[argc] = {app_name, "-m", model_name, "--shape", "input_name"}; - // BUG this should call usages with the message - // "failed to parse input shape. There must be a colon after input name - // - CHECK_THROWS_WITH( - act = parser.Parse(argc, argv), - "basic_string::substr: __pos (which is 11) > this->size() (which is " - "10)"); + expected_msg = CreateUsageMessage( + "--shape", "There must be a colon after input name."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; } SUBCASE("missing colon") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name1,2,3"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name1,2,3"}; - // BUG this should call usages with the message - // "failed to parse input shape. There must be a colon after input name - // - CHECK_THROWS_WITH( - act = parser.Parse(argc, argv), - "basic_string::substr: __pos (which is 16) > this->size() (which is " - "15)"); + expected_msg = CreateUsageMessage( + "--shape", "There must be a colon after input name."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; } SUBCASE("bad shapes - a,b,c") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name:a,b,c"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name:a,b,c"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "failed to parse input shape: input_name:a,b,c"); + expected_msg = CreateUsageMessage( + "--shape", "Invalid value provided: input_name:a,b,c"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->input_shapes.emplace( - std::string("input_name"), std::vector{}); - CHECK_PARAMS(act, exp); + check_params = false; // Usage message called } SUBCASE("bad shapes - [1,2,3]") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--shape", - "input_name:[1,2,3]"}; + char* argv[argc] = { + app_name, "-m", model_name, "--shape", "input_name:[1,2,3]"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "failed to parse input shape: input_name:[1,2,3]"); + expected_msg = CreateUsageMessage( + "--shape", "Invalid value provided: input_name:[1,2,3]"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->input_shapes.emplace( - std::string("input_name"), std::vector{}); - CHECK_PARAMS(act, exp); + check_params = false; // Usage message called } } @@ -681,9 +1035,15 @@ TEST_CASE("Testing Command Line Parser") int argc = 5; char* argv[argc] = {app_name, "-m", model_name, "", "500"}; - SUBCASE("Long form") { argv[3] = "--measurement-interval"; } + SUBCASE("Long form") + { + argv[3] = 
"--measurement-interval"; + } - SUBCASE("Short form") { argv[3] = "-p"; } + SUBCASE("Short form") + { + argv[3] = "-p"; + } CAPTURE(argv[3]); @@ -691,7 +1051,6 @@ TEST_CASE("Testing Command Line Parser") CHECK(!parser.UsageCalled()); exp->measurement_window_ms = 500; - CHECK_PARAMS(act, exp); } SUBCASE("set to -200") @@ -699,21 +1058,25 @@ TEST_CASE("Testing Command Line Parser") int argc = 5; char* argv[argc] = {app_name, "-m", model_name, "", "-200"}; - SUBCASE("Long form") { argv[3] = "--measurement-interval"; } + SUBCASE("Long form") + { + argv[3] = "--measurement-interval"; + } - SUBCASE("Short form") { argv[3] = "-p"; } + SUBCASE("Short form") + { + argv[3] = "-p"; + } CAPTURE(argv[3]); - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(!parser.UsageCalled()); + expected_msg = CreateUsageMessage( + "--measurement-interval (-p)", "The value must be > 0 msec."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - // BUG: may want to actually error out here, and not just use the unsigned - // conversion. This will result in unexpected behavior. The actual value - // becomes 18446744073709551416ULL, which is not what you would want. - // - exp->measurement_window_ms = -200; - CHECK_PARAMS(act, exp); + check_params = false; } SUBCASE("set to non-numeric value") @@ -721,220 +1084,744 @@ TEST_CASE("Testing Command Line Parser") int argc = 5; char* argv[argc] = {app_name, "-m", model_name, "", "foobar"}; - SUBCASE("Long form") { argv[3] = "--measurement-interval"; } + SUBCASE("Long form") + { + argv[3] = "--measurement-interval"; + expected_msg = CreateUsageMessage( + "--measurement-interval", "Invalid value provided: foobar"); + } - SUBCASE("Short form") { argv[3] = "-p"; } + SUBCASE("Short form") + { + argv[3] = "-p"; + expected_msg = + CreateUsageMessage("-p", "Invalid value provided: foobar"); + } CAPTURE(argv[3]); - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "measurement window must be > 0 in msec"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - exp->measurement_window_ms = 0; - CHECK_PARAMS(act, exp); + check_params = false; // Usage message called } } SUBCASE("Option : --concurrency-range") { - SUBCASE("expected use") + char* option_name = "--concurrency-range"; + + SUBCASE("start provided") { - int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100:400:10"}; + args.push_back(option_name); + args.push_back("100"); // start + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); exp->using_concurrency_range = true; exp->concurrency_range.start = 100; - exp->concurrency_range.end = 400; - exp->concurrency_range.step = 10; - CHECK_PARAMS(act, exp); } - SUBCASE("only two options") + CheckValidRange( + args, option_name, parser, act, exp->using_concurrency_range, + exp->concurrency_range); + + CheckInvalidRange(args, option_name, parser, act, check_params); + + SUBCASE("wrong separator") + { + args.push_back(option_name); + args.push_back("100,400,10"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + // BUG (TMA-1307): Should detect this and through an error. 
User will + // enter this and have no clue why the end and step sizes are not used + // correctly. + // + + check_params = false; + } + + SUBCASE("invalid condition - end and latency threshold are 0") + { + args.push_back(option_name); + args.push_back("100:0:25"); + args.push_back("--latency-threshold"); + args.push_back("0"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "The end of the search range and the latency limit can not be both 0 " + "(or 0.0) simultaneously", + PerfAnalyzerException); + + check_params = false; + } + } + + SUBCASE("Option : --periodic-concurrency-range") + { + char* option_name = "--periodic-concurrency-range"; + + // Add required args that specifies where to dump profiled data + args.insert( + args.end(), {"-i", "grpc", "--async", "--streaming", + "--profile-export-file", "profile.json"}); + exp->protocol = cb::ProtocolType::GRPC; + exp->async = true; + exp->streaming = true; + exp->url = "localhost:8001"; // gRPC url + + SUBCASE("start provided") + { + args.push_back(option_name); + args.push_back("100"); // start + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = CreateUsageMessage( + option_name, "Both and values must be provided."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + exp->max_threads = 400; + + CheckValidRange( + args, option_name, parser, act, exp->is_using_periodic_concurrency_mode, + exp->periodic_concurrency_range); + + CheckInvalidRange(args, option_name, parser, act, check_params); + + SUBCASE("more than one load mode") + { + args.push_back(option_name); + args.push_back("100:400"); + args.push_back("--concurrency-range"); + args.push_back("10:40"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + "Cannot specify more then one inference load mode. 
Please choose " + "only one of the following modes: --concurrency-range, " + "--periodic-concurrency-range, --request-rate-range, or " + "--request-intervals."; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("no export file specified") + { + // Remove the export file args + args.pop_back(); + args.pop_back(); + + args.push_back(option_name); + args.push_back("100:400"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + "Must provide --profile-export-file when using the " + "--periodic-concurrency-range option."; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("step is not factor of range size") + { + args.push_back(option_name); + args.push_back("100:400:7"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = CreateUsageMessage( + option_name, + "The value must be a factor of the range size ( - " + ")."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("step is zero") + { + args.push_back(option_name); + args.push_back("10:400:0"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = + CreateUsageMessage(option_name, "The value must be > 0."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + } + + SUBCASE("Option : --request-period") + { + expected_msg = + CreateUsageMessage("--request-period", "The value must be > 0"); + CHECK_INT_OPTION("--request-period", exp->request_period, expected_msg); + + SUBCASE("set to 0") + { + args.push_back("--request-period"); + args.push_back("0"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + } + + SUBCASE("Option : --request-parameter") + { + char* option_name = "--request-parameter"; + + // Add required args that specifies where to dump profiled data + args.insert(args.end(), {"-i", "grpc", "--async", "--streaming"}); + exp->protocol = cb::ProtocolType::GRPC; + exp->async = true; + exp->streaming = true; + exp->url = "localhost:8001"; // gRPC url + + SUBCASE("valid parameter") + { + args.push_back(option_name); + args.push_back("max_tokens:256:int"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + cb::RequestParameter param; + param.value = "256"; + param.type = "int"; + exp->request_parameters["max_tokens"] = param; + } + + SUBCASE("missing type") + { + args.push_back(option_name); + args.push_back("max_tokens:256"); + + int argc = args.size(); + char* argv[argc]; + std::copy(args.begin(), args.end(), argv); + + expected_msg = CreateUsageMessage( + option_name, "The value does not match ."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } + } + + SUBCASE("Option : --latency-threshold") + { + expected_msg = CreateUsageMessage( + "--latency-threshold (-l)", "The value must be >= 0 msecs."); 
+ CHECK_INT_OPTION( + "--latency-threshold", exp->latency_threshold_ms, expected_msg); + + SUBCASE("set to 0") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100:400"}; + char* argv[argc] = { + app_name, "-m", model_name, "--latency-threshold", "0"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); + } + } - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - exp->concurrency_range.end = 400; - CHECK_PARAMS(act, exp); + SUBCASE("Option : --stability-percentage") + { + SUBCASE("valid value") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--stability-percentage", "80"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->stability_threshold = .8f; } - SUBCASE("only one options") + SUBCASE("set to 0") { int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100"}; + char* argv[argc] = { + app_name, "-m", model_name, "--stability-percentage", "0"}; - // QUESTION: What does this mean? Why pass only one? - // REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); CHECK(!parser.UsageCalled()); + } - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - CHECK_PARAMS(act, exp); + SUBCASE("negative value") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--stability-percentage", "-20"}; + + expected_msg = CreateUsageMessage( + "--stability-percentage (-s)", "The value must be >= 0.0."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; } - SUBCASE("no options") + SUBCASE("floating point value") { - int argc = 4; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range"}; + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--stability-percentage", "29.5"}; REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); + CHECK(!parser.UsageCalled()); - // BUG: Usage message does not contain error. 
Error statement - // "option '--concurrency-range' requires an argument" written directly - // to std::out - // - CHECK_STRING("Usage Message", parser.GetUsageMessage(), ""); + exp->stability_threshold = .295f; + } + + SUBCASE("missing value") + { + int argc = 4; + char* argv[argc] = {app_name, "-m", model_name, "--stability-percentage"}; - CHECK_PARAMS(act, exp); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), "", PerfAnalyzerException); + + check_params = false; } } - SUBCASE("too many options") + SUBCASE("Option : --max-trials") { - int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "200:100:25:10"}; + expected_msg = + CreateUsageMessage("--max-trials (-r)", "The value must be > 0."); + CHECK_INT_OPTION("--max-trials", exp->max_trials, expected_msg); - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "option concurrency-range can have maximum of three elements"); - - exp->using_concurrency_range = true; - exp->concurrency_range.start = 200; - exp->concurrency_range.end = 100; - exp->concurrency_range.step = 25; - CHECK_PARAMS(act, exp); - } + SUBCASE("set to 0") + { + int argc = 5; + char* argv[argc] = {app_name, "-m", model_name, "--max-trials", "0"}; - SUBCASE("way too many options") - { - int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "200:100:25:10:20:30"}; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "option concurrency-range can have maximum of three elements"); - - exp->using_concurrency_range = true; - exp->concurrency_range.start = 200; - exp->concurrency_range.end = 100; - exp->concurrency_range.step = 25; - CHECK_PARAMS(act, exp); + check_params = false; + } } - SUBCASE("wrong separator") + SUBCASE("Option : --request-count") { - int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100,400,10"}; + SUBCASE("valid value") + { + int argc = 5; + char* argv[argc] = {app_name, "-m", model_name, "--request-count", "500"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(!parser.UsageCalled()); + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); - // BUG: Should detect this and through an error. User will enter this and - // have no clue why the end and step sizes are not used correctly. 
- // + exp->request_count = 500; + exp->measurement_mode = MeasurementMode::COUNT_WINDOWS; + exp->measurement_request_count = 500; + } + SUBCASE("negative value") + { + int argc = 5; + char* argv[argc] = {app_name, "-m", model_name, "--request-count", "-2"}; + + expected_msg = + CreateUsageMessage("--request-count", "The value must be > 0."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + check_params = false; + } + SUBCASE("less than request rate") + { + int argc = 7; + char* argv[argc] = {app_name, "-m", + model_name, "--request-count", + "2", "--request-rate-range", + "5"}; + + expected_msg = "request-count can not be less than request-rate"; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + check_params = false; + } + SUBCASE("less than concurrency") + { + int argc = 7; + char* argv[argc] = {app_name, "-m", + model_name, "--request-count", + "2", "--concurrency-range", + "5"}; + + expected_msg = "request-count can not be less than concurrency"; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + check_params = false; + } + SUBCASE("multiple request rate") + { + int argc = 7; + char* argv[argc] = {app_name, "-m", + model_name, "--request-count", + "20", "--request-rate-range", + "5:6:1"}; + + expected_msg = + "request-count not supported with multiple request-rate values in " + "one run"; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + check_params = false; + } + SUBCASE("multiple concurrency") + { + int argc = 7; + char* argv[argc] = {app_name, "-m", + model_name, "--request-count", + "20", "--concurrency-range", + "5:6:1"}; + + expected_msg = + "request-count not supported with multiple concurrency values in " + "one run"; + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + check_params = false; + } - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - CHECK_PARAMS(act, exp); + SUBCASE("mode and count are overwritten with non-zero request-count") + { + int argc = 9; + char* argv[argc] = { + app_name, + "-m", + model_name, + "--request-count", + "2000", + "--measurement-mode", + "time_windows", + "measurement-request-count", + "30"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->request_count = 2000; + exp->measurement_mode = MeasurementMode::COUNT_WINDOWS; + exp->measurement_request_count = 2000; + } + SUBCASE("zero value (no override to measurement mode)") + { + int argc = 7; + char* argv[argc] = {app_name, "-m", model_name, + "--request-count", "0", "--measurement-mode", + "time_windows"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->request_count = 0; + exp->measurement_mode = MeasurementMode::TIME_WINDOWS; + } + SUBCASE("zero value (no override to measurement request count)") + { + int argc = 9; + char* argv[argc] = { + app_name, + "-m", + model_name, + "--request-count", + "0", + "--measurement-mode", + "count_windows", + "--measurement-request-count", + "50"}; + + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(!parser.UsageCalled()); + + exp->request_count = 0; + exp->measurement_mode = MeasurementMode::COUNT_WINDOWS; + exp->measurement_request_count = 50; + } } - SUBCASE("bad start value") + SUBCASE("Option : --collect-metrics") { - int argc = 5; - char* 
argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "bad:400:10"}; + SUBCASE("with --service-kind != triton") + { + int argc = 8; + char* argv[argc] = { + app_name, "-m", model_name, "--collect-metrics", + "--service-kind", "tfserving", "-i", "grpc"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "failed to parse concurrency range: bad:400:10"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "Server-side metric collection is only supported with Triton client " + "backend.", + PerfAnalyzerException); - exp->using_concurrency_range = true; - CHECK_PARAMS(act, exp); + check_params = false; + } } - SUBCASE("bad end value") + SUBCASE("Option : --metrics-url") { + // missing --collect-metrics int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100:bad:10"}; + char* argv[argc] = { + app_name, "-m", model_name, "--metrics-url", "localhost:8002/metrics"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "failed to parse concurrency range: 100:bad:10"); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "Must specify --collect-metrics when using the --metrics-url option.", + PerfAnalyzerException); - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - CHECK_PARAMS(act, exp); + check_params = false; } - SUBCASE("bad step value") + SUBCASE("Option : --metrics-interval") { - int argc = 5; - char* argv[argc] = {app_name, "-m", model_name, "--concurrency-range", - "100:400:bad"}; + SUBCASE("missing --collect-metrics") + { + int argc = 5; + char* argv[argc] = { + app_name, "-m", model_name, "--metrics-interval", "1000"}; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "failed to parse concurrency range: 100:400:bad"); - - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - exp->concurrency_range.end = 400; - CHECK_PARAMS(act, exp); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), + "Must specify --collect-metrics when using the --metrics-interval " + "option.", + PerfAnalyzerException); + + check_params = false; + } + + SUBCASE("metrics interval 0") + { + int argc = 6; + char* argv[argc] = { + app_name, "-m", model_name, "--collect-metrics", "--metrics-interval", + "0"}; + + expected_msg = CreateUsageMessage( + "--metrics-interval", "The value must be > 0 msecs."); + CHECK_THROWS_WITH_AS( + act = parser.Parse(argc, argv), expected_msg.c_str(), + PerfAnalyzerException); + + check_params = false; + } } - SUBCASE("invalid condition - end and latency threshold are 0") + SUBCASE("Option : --bls-composing-models") { - int argc = 7; - char* argv[argc] = {app_name, "-m", - model_name, "--concurrency-range", - "100:0:25", "--latency-threshold", - "0"}; + int argc = 5; - REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); - CHECK(parser.UsageCalled()); - CHECK_STRING( - "Usage Message", parser.GetUsageMessage(), - "The end of the search range and the latency limit can not be both 0 " - "(or 0.0) simultaneously"); - - exp->using_concurrency_range = true; - exp->concurrency_range.start = 100; - exp->concurrency_range.end = 0; - exp->concurrency_range.step = 25; - exp->latency_threshold_ms = 0; - CHECK_PARAMS(act, exp); + SUBCASE("one model") + { + char* argv[argc] = { + app_name, "-m", model_name, 
"--bls-composing-models", "a"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + CHECK(act->bls_composing_models.size() == 1); + CHECK_STRING(act->bls_composing_models[0].first, "a"); + CHECK_STRING(act->bls_composing_models[0].second, ""); + } + SUBCASE("lists with no version") + { + SUBCASE("a,b,c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a,b,c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a, b, c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a, b, c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a,b, c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a,b, c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a, b,c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a, b,c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a, b, c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a, b, c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + + CHECK(!parser.UsageCalled()); + REQUIRE(act->bls_composing_models.size() == 3); + CHECK_STRING(act->bls_composing_models[0].first, "a"); + CHECK_STRING(act->bls_composing_models[1].first, "b"); + CHECK_STRING(act->bls_composing_models[2].first, "c"); + CHECK_STRING(act->bls_composing_models[0].second, ""); + CHECK_STRING(act->bls_composing_models[1].second, ""); + CHECK_STRING(act->bls_composing_models[2].second, ""); + } + SUBCASE("list with version") + { + SUBCASE("a:1,b:2,c:1") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", + "a:1,b:2,c:1"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a:1, b:2, c:1") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", + "a:1, b:2, c:1"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a:1, b:2, c:1") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", + "a:1, b:2, c:1"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + SUBCASE("a:1 , b:2, c:1") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", + "a:1 , b:2, c:1"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + CHECK(!parser.UsageCalled()); + REQUIRE(act->bls_composing_models.size() == 3); + CHECK_STRING(act->bls_composing_models[0].first, "a"); + CHECK_STRING(act->bls_composing_models[1].first, "b"); + CHECK_STRING(act->bls_composing_models[2].first, "c"); + CHECK_STRING(act->bls_composing_models[0].second, "1"); + CHECK_STRING(act->bls_composing_models[1].second, "2"); + CHECK_STRING(act->bls_composing_models[2].second, "1"); + } + SUBCASE("list with some versions") + { + SUBCASE("a,b:3,c") + { + char* argv[argc] = { + app_name, "-m", model_name, "--bls-composing-models", "a,b:3,c"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + CHECK(!parser.UsageCalled()); + REQUIRE(act->bls_composing_models.size() == 3); + CHECK_STRING(act->bls_composing_models[0].first, "a"); + CHECK_STRING(act->bls_composing_models[1].first, "b"); + CHECK_STRING(act->bls_composing_models[2].first, "c"); + CHECK_STRING(act->bls_composing_models[0].second, ""); + CHECK_STRING(act->bls_composing_models[1].second, "3"); + CHECK_STRING(act->bls_composing_models[2].second, ""); + } + SUBCASE("multiple versions of the same model") + { + SUBCASE("a:1,b:2,a:2") + { + char* argv[argc] = { + app_name, "-m", model_name, 
"--bls-composing-models", "a:1,b,a:2"}; + REQUIRE_NOTHROW(act = parser.Parse(argc, argv)); + } + CHECK(!parser.UsageCalled()); + REQUIRE(act->bls_composing_models.size() == 3); + CHECK_STRING(act->bls_composing_models[0].first, "a"); + CHECK_STRING(act->bls_composing_models[1].first, "b"); + CHECK_STRING(act->bls_composing_models[2].first, "a"); + CHECK_STRING(act->bls_composing_models[0].second, "1"); + CHECK_STRING(act->bls_composing_models[1].second, ""); + CHECK_STRING(act->bls_composing_models[2].second, "2"); + } } + if (check_params) { + if (act == nullptr) { + std::cerr + << "Error: Attempting to access `act` but was not initialized. Check " + "if the test cases are missing `check_params = false` statement." + << std::endl; + exit(1); + } + CHECK_PARAMS(act, exp); + } optind = 1; // Reset GotOpt index, needed to parse the next command line } }} // namespace triton::perfanalyzer diff --git a/test_concurrency_manager.cc b/test_concurrency_manager.cc new file mode 100644 index 00000000..1941a018 --- /dev/null +++ b/test_concurrency_manager.cc @@ -0,0 +1,941 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include +#include + +#include "command_line_parser.h" +#include "concurrency_manager.h" +#include "doctest.h" +#include "mock_client_backend.h" +#include "mock_concurrency_worker.h" +#include "mock_data_loader.h" +#include "mock_infer_data_manager.h" +#include "mock_model_parser.h" +#include "mock_sequence_manager.h" +#include "sequence_manager.h" +#include "test_load_manager_base.h" +#include "test_utils.h" + +namespace triton { namespace perfanalyzer { + +class TestConcurrencyManager : public TestLoadManagerBase, + public ConcurrencyManager { + public: + TestConcurrencyManager( + PerfAnalyzerParameters params, bool is_sequence_model = false, + bool is_decoupled_model = false, bool use_mock_infer = false) + : use_mock_infer_(use_mock_infer), + TestLoadManagerBase(params, is_sequence_model, is_decoupled_model), + ConcurrencyManager( + params.async, params.streaming, params.batch_size, + params.max_threads, params.max_concurrency, + params.shared_memory_type, params.output_shm_size, GetParser(), + GetFactory(), params.request_parameters) + { + } + + std::shared_ptr MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) override + { + size_t id = workers_.size(); + + auto worker = std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, max_concurrency_, using_json_data_, + streaming_, batch_size_, wake_signal_, wake_mutex_, active_threads_, + execute_, infer_data_manager_, sequence_manager_); + + if (use_mock_infer_) { + EXPECT_CALL(*worker, Infer()) + .WillRepeatedly(testing::Invoke( + worker.get(), &MockConcurrencyWorker::EmptyInfer)); + } + return worker; + } + + + void TestReconfigThreads( + const size_t concurrent_request_count, const size_t num_requests, + std::vector& expected_configs) + { + ConcurrencyManager::ReconfigThreads(concurrent_request_count, num_requests); + + auto expected_size = expected_configs.size(); + + // Check that the correct number of threads are created + // + CHECK(threads_.size() == expected_size); + + // Check that threads_config has correct concurrency and seq stat index + // offset + for (auto i = 0; i < expected_configs.size(); i++) { + CHECK( + threads_config_[i]->concurrency_ == expected_configs[i].concurrency_); + CHECK( + threads_config_[i]->seq_stat_index_offset_ == + expected_configs[i].seq_stat_index_offset_); + CHECK( + threads_config_[i]->num_requests_ == + expected_configs[i].num_requests_); + } + } + + void StopWorkerThreads() { LoadManager::StopWorkerThreads(); } + + /// Test that the correct Infer function is called in the backend + /// + void TestInferType() + { + // FIXME TMA-982: This delay is to avoid deadlock. Investigate why delay is + // needed. 
+ stats_->SetDelays({50}); + + ChangeConcurrencyLevel(params_.max_concurrency); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + CheckInferType(); + } + + /// Test that the correct concurrency is maintained in the load manager + /// + void TestConcurrency( + size_t response_delay, std::chrono::milliseconds sleep_time) + { + stats_->SetDelays({response_delay}); + + ChangeConcurrencyLevel(params_.max_concurrency); + std::this_thread::sleep_for(sleep_time); + + CheckConcurrency(); + } + + /// Test sequence handling + /// + void TestSequences() + { + size_t delay_ms = 10; + stats_->SetDelays({delay_ms}); + + auto stats = cb::InferStat(); + double concurrency1 = params_.max_concurrency / 2; + double concurrency2 = params_.max_concurrency; + int sleep_ms = 500; + + auto sleep_time = std::chrono::milliseconds(sleep_ms); + size_t expected_count1 = sleep_ms * concurrency1 / delay_ms; + size_t expected_count2 = + sleep_ms * concurrency2 / delay_ms + expected_count1; + + // Run and check request rate 1 + // + ChangeConcurrencyLevel(concurrency1); + std::this_thread::sleep_for(sleep_time); + + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + CHECK( + stats.completed_request_count == + doctest::Approx(expected_count1).epsilon(0.10)); + + PauseSequenceWorkers(); + CheckSequences(concurrency1); + + // Make sure that the client and the manager are in agreement on the request + // count in between rates + // + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + int client_total_requests = stats_->num_async_infer_calls + + stats_->num_async_stream_infer_calls + + stats_->num_infer_calls; + CHECK(stats.completed_request_count == client_total_requests); + + ResetStats(); + + // Run and check request rate 2 + // + ChangeConcurrencyLevel(concurrency2); + std::this_thread::sleep_for(sleep_time); + + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + CHECK( + stats.completed_request_count == + doctest::Approx(expected_count2).epsilon(0.10)); + + // Stop all threads and make sure everything is as expected + // + StopWorkerThreads(); + + CheckSequences(concurrency2); + } + + /// Test that tries to find deadlocks and livelocks + /// + void TestTimeouts() + { + TestWatchDog watchdog(1000); + ChangeConcurrencyLevel(params_.max_concurrency); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + StopWorkerThreads(); + watchdog.stop(); + } + + /// Test that idle time is tracked correctly + void TestOverhead() + { + stats_->SetDelays({1}); + ChangeConcurrencyLevel(params_.max_concurrency); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // During a run of 100 ms (100,000,000 ns), make sure that the idle time is + // at least 95% of that + // + auto idle_time_ns = GetIdleTime(); + CHECK(idle_time_ns > 95000000); + StopWorkerThreads(); + } + + std::shared_ptr& parser_{LoadManager::parser_}; + std::shared_ptr& data_loader_{LoadManager::data_loader_}; + std::shared_ptr& sequence_manager_{ + LoadManager::sequence_manager_}; + bool& using_json_data_{LoadManager::using_json_data_}; + bool& execute_{ConcurrencyManager::execute_}; + size_t& batch_size_{LoadManager::batch_size_}; + size_t& max_threads_{LoadManager::max_threads_}; + std::shared_ptr factory_{ + TestLoadManagerBase::factory_}; + std::shared_ptr& infer_data_manager_{ + LoadManager::infer_data_manager_}; + + private: + bool use_mock_infer_{false}; + + void CheckConcurrency() + { + if (params_.max_concurrency < 4) { + CHECK(stats_->num_active_infer_calls == params_.max_concurrency); + } 
else { + CHECK( + stats_->num_active_infer_calls == + doctest::Approx(params_.max_concurrency).epsilon(0.25)); + } + } + + + std::shared_ptr MakeSequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader) override + { + return std::make_shared( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data, + data_loader); + } +}; + +/// Test that the correct Infer function is called in the backend +/// +TEST_CASE("concurrency_infer_type") +{ + PerfAnalyzerParameters params{}; + + params.max_concurrency = 1; + + SUBCASE("async_streaming") + { + params.async = true; + params.streaming = true; + } + SUBCASE("async_no_streaming") + { + params.async = true; + params.streaming = false; + } + SUBCASE("no_async_streaming") + { + params.async = false; + params.streaming = true; + } + SUBCASE("no_async_no_streaming") + { + params.async = false; + params.streaming = false; + } + + + TestConcurrencyManager tcm(params); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + tcm.TestInferType(); +} + +/// Test that the correct concurrency is maintained in the load manager +/// +TEST_CASE("concurrency_concurrency") +{ + PerfAnalyzerParameters params{}; + size_t response_delay{50}; + std::chrono::milliseconds sleep_time{225}; + + SUBCASE("sync, no-streaming, 1 concurrency, 1 thread") + { + params.forced_sync = true; + params.async = false; + params.streaming = false; + params.max_concurrency = 1; + params.max_threads = 1; + } + + SUBCASE("sync, no-streaming, 4 concurrency, 4 threads") + { + params.forced_sync = true; + params.async = false; + params.streaming = false; + params.max_concurrency = 4; + params.max_threads = 4; + } + + SUBCASE("async, no-streaming, 1 concurrency, 1 thread") + { + params.forced_sync = false; + params.async = true; + params.streaming = false; + params.max_concurrency = 1; + params.max_threads = 1; + } + + SUBCASE("async, no-streaming, 4 concurrency, 1 thread") + { + params.forced_sync = false; + params.async = true; + params.streaming = false; + params.max_concurrency = 4; + params.max_threads = 1; + } + + SUBCASE("async, no-streaming, 4 concurrency, 2 threads") + { + params.forced_sync = false; + params.async = true; + params.streaming = false; + params.max_concurrency = 4; + params.max_threads = 2; + } + + SUBCASE("async, no-streaming, 4 concurrency, 4 threads") + { + params.forced_sync = false; + params.async = true; + params.streaming = false; + params.max_concurrency = 4; + params.max_threads = 4; + } + + SUBCASE("async, streaming, 1 concurrency, 1 thread") + { + params.forced_sync = false; + params.async = true; + params.streaming = true; + params.max_concurrency = 1; + params.max_threads = 1; + } + + SUBCASE("async, streaming, 4 concurrency, 1 thread") + { + params.forced_sync = false; + params.async = true; + params.streaming = true; + params.max_concurrency = 4; + params.max_threads = 1; + } + + SUBCASE("async, streaming, 4 concurrency, 2 threads") + { + params.forced_sync = false; + params.async = true; + params.streaming = true; + params.max_concurrency = 4; + params.max_threads = 2; + } + + SUBCASE("async, 
streaming, 4 concurrency, 4 threads") + { + params.forced_sync = false; + params.async = true; + params.streaming = true; + params.max_concurrency = 4; + params.max_threads = 4; + } + + + TestConcurrencyManager tcm(params); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + tcm.TestConcurrency(response_delay, sleep_time); +} + +/// Check that the inference requests for sequences follow all rules and +/// parameters +/// +TEST_CASE("concurrency_sequence") +{ + PerfAnalyzerParameters params = TestLoadManagerBase::GetSequenceTestParams(); + const bool is_sequence_model{true}; + + TestConcurrencyManager tcm(params, is_sequence_model); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + tcm.TestSequences(); +} + +/// Create the case where the sequences do NOT go round robin due to +/// the first request taking longer than the rest. +/// +/// This exposed a bug where we were constantly resetting ctx IDs +/// and issuing over and over again to the first sequence even though +/// it was the only sequence that should NOT be issued because it was +/// still outstanding +/// +TEST_CASE("concurrency_free_ctx_ids") +{ + PerfAnalyzerParameters params{}; + params.async = true; + params.streaming = true; + params.max_concurrency = 6; + + bool is_sequence_model{true}; + + + TestConcurrencyManager tcm(params, is_sequence_model); + + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + // Have the first request (sequence ID 1) take very long, and all the other + // requests are fast + // + tcm.stats_->SetDelays({50, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}); + + std::shared_ptr thread_stat{std::make_shared()}; + std::shared_ptr thread_config{ + std::make_shared(0)}; + thread_config->concurrency_ = 4; + + std::shared_ptr worker{tcm.MakeWorker(thread_stat, thread_config)}; + + std::future infer_future{std::async(&IWorker::Infer, worker)}; + + std::this_thread::sleep_for(std::chrono::milliseconds(15)); + + early_exit = true; + infer_future.get(); + + // The first sequence should only be called two times, once at the very start, + // and once during shutdown + // + CHECK(tcm.stats_->sequence_status.seq_ids_to_count.at(1) == 2); +} + +TEST_CASE("Concurrency - shared memory infer input calls") +{ + PerfAnalyzerParameters params{}; + params.max_concurrency = 4; + bool is_sequence_model{false}; + + const auto& ParameterizeAsyncAndStreaming{[&]() { + SUBCASE("sync non-streaming") + { + params.async = false; + params.streaming = false; + } + SUBCASE("async non-streaming") + { + params.async = true; + params.streaming = false; + } + SUBCASE("async streaming") + { + params.async = true; + params.streaming = true; + } + }}; + + const auto& ParameterizeSequence{[&]() { + SUBCASE("non-sequence") + { + is_sequence_model = false; + ParameterizeAsyncAndStreaming(); + } + SUBCASE("sequence") + { + is_sequence_model = true; + params.num_of_sequences = 1; + ParameterizeAsyncAndStreaming(); + } + }}; + + const auto& 
ParameterizeMemory{[&]() { + SUBCASE("No shared memory") + { + params.shared_memory_type = NO_SHARED_MEMORY; + ParameterizeSequence(); + } + SUBCASE("system shared memory") + { + params.shared_memory_type = SYSTEM_SHARED_MEMORY; + ParameterizeSequence(); + } + SUBCASE("cuda shared memory") + { + params.shared_memory_type = CUDA_SHARED_MEMORY; + ParameterizeSequence(); + } + }}; + + ParameterizeMemory(); + + + const std::string json_str{R"( + { + "data": [ + { + "INPUT0": [2000000000] + }, + { + "INPUT0": [2000000001] + } + ] + } + )"}; + + MockInputPipeline mip = + TestLoadManagerBase::ProcessCustomJsonData(json_str, is_sequence_model); + + + TestConcurrencyManager tcm(params, is_sequence_model); + + tcm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, tcm.factory_, mip.mock_data_loader_); + + std::shared_ptr thread_stat{std::make_shared()}; + std::shared_ptr thread_config{ + std::make_shared(0)}; + thread_config->concurrency_ = 1; + + tcm.parser_ = mip.mock_model_parser_; + tcm.data_loader_ = mip.mock_data_loader_; + tcm.using_json_data_ = true; + tcm.execute_ = true; + tcm.batch_size_ = 1; + tcm.max_threads_ = 1; + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + std::shared_ptr worker{tcm.MakeWorker(thread_stat, thread_config)}; + std::future infer_future{std::async(&IWorker::Infer, worker)}; + + std::this_thread::sleep_for(std::chrono::milliseconds(18)); + + early_exit = true; + infer_future.get(); + + const auto& actual_append_raw_calls{tcm.stats_->num_append_raw_calls}; + const auto& actual_set_shared_memory_calls{ + tcm.stats_->num_set_shared_memory_calls}; + + if (params.shared_memory_type == NO_SHARED_MEMORY) { + CHECK(actual_append_raw_calls > 0); + CHECK(actual_set_shared_memory_calls == 0); + } else { + CHECK(actual_append_raw_calls == 0); + CHECK(actual_set_shared_memory_calls > 0); + } +} + +/// Verify Shared Memory api calls +/// +TEST_CASE("Concurrency - Shared memory methods") +{ + PerfAnalyzerParameters params; + bool is_sequence = false; + bool is_decoupled = false; + bool use_mock_infer = true; + + const std::string json_str{R"( + { + "data": [ + { + "INPUT0": [2123456789] + } + ] + } + )"}; + + MockInputPipeline mip = TestLoadManagerBase::ProcessCustomJsonData(json_str); + + cb::MockClientStats::SharedMemoryStats expected_stats; + + SUBCASE("System shared memory usage") + { + params.shared_memory_type = SYSTEM_SHARED_MEMORY; + TestConcurrencyManager tcm( + params, is_sequence, is_decoupled, use_mock_infer); + + tcm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, tcm.factory_, mip.mock_data_loader_); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + expected_stats.num_unregister_all_shared_memory_calls = 1; + expected_stats.num_register_system_shared_memory_calls = 1; + 
expected_stats.num_create_shared_memory_region_calls = 1; + expected_stats.num_map_shared_memory_calls = 1; + tcm.CheckSharedMemory(expected_stats); + } + + SUBCASE("Cuda shared memory usage") + { + params.shared_memory_type = CUDA_SHARED_MEMORY; + TestConcurrencyManager tcm( + params, is_sequence, is_decoupled, use_mock_infer); + + tcm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, tcm.factory_, mip.mock_data_loader_); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + expected_stats.num_unregister_all_shared_memory_calls = 1; + expected_stats.num_register_cuda_shared_memory_calls = 1; + tcm.CheckSharedMemory(expected_stats); + } + + SUBCASE("No shared memory usage") + { + params.shared_memory_type = NO_SHARED_MEMORY; + TestConcurrencyManager tcm( + params, is_sequence, is_decoupled, use_mock_infer); + tcm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, tcm.factory_, mip.mock_data_loader_); + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + tcm.CheckSharedMemory(expected_stats); + } +} + +TEST_CASE("concurrency_deadlock") +{ + PerfAnalyzerParameters params{}; + params.max_concurrency = 6; + bool is_sequence_model{true}; + bool some_infer_failures{false}; + + const auto& ParameterizeSyncStreaming{[&]() { + SUBCASE("sync") + { + params.async = false; + params.streaming = false; + } + SUBCASE("aync no streaming") + { + params.async = true; + params.streaming = false; + } + SUBCASE("async streaming") + { + params.async = true; + params.streaming = true; + } + }}; + + const auto& ParameterizeConcurrency{[&]() { + SUBCASE("10 concurrency, 10 thread") + { + ParameterizeSyncStreaming(); + params.max_concurrency = 10; + params.max_threads = 10; + } + SUBCASE("10 concurrency, 4 thread") + { + ParameterizeSyncStreaming(); + params.max_concurrency = 10; + params.max_threads = 4; + } + }}; + + const auto& ParameterizeSequence{[&]() { + SUBCASE("non-sequence") + { + ParameterizeConcurrency(); + is_sequence_model = false; + } + SUBCASE("sequence") + { + ParameterizeConcurrency(); + is_sequence_model = true; + } + }}; + + const auto& ParameterizeFailures{[&]() { + SUBCASE("yes_failures") + { + some_infer_failures = true; + ParameterizeSequence(); + } + SUBCASE("no_failures") + { + some_infer_failures = false; + ParameterizeSequence(); + } + }}; + + std::vector delays; + + const auto& ParameterizeDelays{[&]() { + SUBCASE("no_delay") + { + delays = {0}; + ParameterizeFailures(); + } + SUBCASE("random_delay") + { + delays = {1, 5, 20, 4, 3}; + ParameterizeFailures(); + } + }}; + + + ParameterizeDelays(); + + + TestConcurrencyManager tcm(params, is_sequence_model); + + tcm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, 
params.sequence_length_specified,
+      params.sequence_length_variation);
+
+  tcm.stats_->SetDelays(delays);
+
+  // Sometimes have a request fail
+  if (some_infer_failures) {
+    tcm.stats_->SetReturnStatuses({true, true, true, false});
+  }
+
+  tcm.TestTimeouts();
+}
+
+TEST_CASE("concurrency_overhead")
+{
+  PerfAnalyzerParameters params{};
+  SUBCASE("sync, conc 1")
+  {
+    params.async = false;
+    params.max_concurrency = 1;
+  }
+  SUBCASE("sync, conc 4")
+  {
+    params.async = false;
+    params.max_concurrency = 4;
+  }
+  SUBCASE("async, conc 1")
+  {
+    params.async = true;
+    params.max_concurrency = 1;
+  }
+  SUBCASE("async, conc 4")
+  {
+    params.async = true;
+    params.max_concurrency = 4;
+  }
+  TestConcurrencyManager tcm(params, false);
+  tcm.InitManager(
+      params.string_length, params.string_data, params.zero_input,
+      params.user_data, params.start_sequence_id, params.sequence_id_range,
+      params.sequence_length, params.sequence_length_specified,
+      params.sequence_length_variation);
+
+  tcm.TestOverhead();
+}
+
+TEST_CASE(
+    "send_request_rate_concurrency_manager: testing logic around detecting "
+    "send request count")
+{
+  PerfAnalyzerParameters params{};
+
+  SUBCASE("sync")
+  {
+    params.async = false;
+  }
+  SUBCASE("async")
+  {
+    params.async = true;
+  }
+
+  TestConcurrencyManager tcm(params);
+
+  tcm.stats_->SetDelays({10});
+
+  tcm.InitManager(
+      params.string_length, params.string_data, params.zero_input,
+      params.user_data, params.start_sequence_id, params.sequence_id_range,
+      params.sequence_length, params.sequence_length_specified,
+      params.sequence_length_variation);
+
+  tcm.ChangeConcurrencyLevel(4);
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  tcm.StopWorkerThreads();
+
+  const size_t num_sent_requests{tcm.GetAndResetNumSentRequests()};
+
+  CHECK(num_sent_requests == doctest::Approx(40).epsilon(0.1));
+}
+
+TEST_CASE(
+    "reconfigure_threads" *
+    doctest::description(
+        "This test confirms the side-effects of ReconfigThreads(). 
Namely, " + "that the correct number of threads are created and that they are " + "configured properly")) +{ + PerfAnalyzerParameters params{}; + std::vector expected_config_values; + std::vector expected_concurrencies; + std::vector expected_seq_stat_index_offsets; + std::vector expected_num_requests; + + size_t target_concurrency = 0; + size_t target_num_requests = 0; + + SUBCASE("normal") + { + params.max_threads = 10; + target_concurrency = 5; + target_num_requests = 15; + + expected_concurrencies = {1, 1, 1, 1, 1}; + expected_seq_stat_index_offsets = {0, 1, 2, 3, 4}; + expected_num_requests = {3, 3, 3, 3, 3}; + } + SUBCASE("thread_limited") + { + params.max_threads = 5; + target_concurrency = 10; + target_num_requests = 20; + + expected_concurrencies = {2, 2, 2, 2, 2}; + expected_seq_stat_index_offsets = {0, 2, 4, 6, 8}; + expected_num_requests = {4, 4, 4, 4, 4}; + } + SUBCASE("unbalanced") + { + params.max_threads = 6; + target_concurrency = 14; + target_num_requests = 15; + + expected_concurrencies = {3, 3, 2, 2, 2, 2}; + expected_seq_stat_index_offsets = {0, 3, 6, 8, 10, 12}; + expected_num_requests = {3, 3, 3, 2, 2, 2}; + } + SUBCASE("no requests specified") + { + params.max_threads = 2; + target_concurrency = 14; + target_num_requests = 0; + + expected_concurrencies = {7, 7}; + expected_seq_stat_index_offsets = {0, 7}; + expected_num_requests = {0, 0}; + } + + for (auto i = 0; i < expected_concurrencies.size(); i++) { + ThreadConfig tc(i); + tc.concurrency_ = expected_concurrencies[i]; + tc.seq_stat_index_offset_ = expected_seq_stat_index_offsets[i]; + tc.num_requests_ = expected_num_requests[i]; + expected_config_values.push_back(tc); + } + + TestConcurrencyManager tcm(params); + tcm.TestReconfigThreads( + target_concurrency, target_num_requests, expected_config_values); +} + + +}} // namespace triton::perfanalyzer diff --git a/test_ctx_id_tracker.cc b/test_ctx_id_tracker.cc new file mode 100644 index 00000000..8625fbd6 --- /dev/null +++ b/test_ctx_id_tracker.cc @@ -0,0 +1,146 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include +#include + +#include "concurrency_ctx_id_tracker.h" +#include "doctest.h" +#include "fifo_ctx_id_tracker.h" +#include "rand_ctx_id_tracker.h" + +namespace triton { namespace perfanalyzer { + +TEST_CASE("CtxIdTrackers: FIFO") +{ + std::shared_ptr tracker = std::make_shared(); + + // Reset will load up context IDs 0-9 into the queue and return them in order + // on consecutive Get calls + size_t count = 10; + CHECK_FALSE(tracker->IsAvailable()); + tracker->Reset(count); + CHECK(tracker->IsAvailable()); + for (size_t i = 0; i < count; i++) { + CHECK(tracker->Get() == i); + } + + // Manually restoring values should be returned in-order + CHECK_FALSE(tracker->IsAvailable()); + tracker->Restore(7); + CHECK(tracker->IsAvailable()); + tracker->Restore(13); + CHECK(tracker->Get() == 7); + CHECK(tracker->Get() == 13); + + // A reset should throw away any values on the old list + tracker->Reset(10); + tracker->Reset(1); + tracker->Get(); + CHECK(!tracker->IsAvailable()); + + // Calling Get when not available should Throw + CHECK_THROWS_AS(tracker->Get(), const std::exception&); +} + +TEST_CASE("CtxIdTrackers: Conc") +{ + std::shared_ptr tracker = + std::make_shared(); + + // Reset will load up 10 instances of context IDs 0 into the queue and return + // them in order on consecutive Get calls + size_t count = 10; + tracker->Reset(count); + for (size_t i = 0; i < count; i++) { + CHECK(tracker->Get() == 0); + } + + // Manually restoring values should be returned in-order + CHECK_FALSE(tracker->IsAvailable()); + tracker->Restore(7); + tracker->Restore(13); + CHECK(tracker->IsAvailable()); + CHECK(tracker->Get() == 7); + CHECK(tracker->Get() == 13); + + // A reset should throw away any values on the old list + tracker->Reset(10); + tracker->Reset(1); + tracker->Get(); + CHECK(!tracker->IsAvailable()); + + // Calling Get when not available should Throw + CHECK_THROWS_AS(tracker->Get(), const std::exception&); +} + +TEST_CASE("CtxIdTrackers: Rand") +{ + std::shared_ptr tracker = std::make_shared(); + size_t max; + + auto check_range_and_variance = [&]() { + size_t num_trials = 1000; + + std::vector results(max, 0); + for (size_t i = 0; i < num_trials; i++) { + auto x = tracker->Get(); + REQUIRE((x < max && x >= 0)); + results[x]++; + } + + // Confirm that the distribution of the picked CTX IDs is random + double mean = + std::accumulate(results.begin(), results.end(), 0.0) / results.size(); + double variance = 0; + for (size_t i = 0; i < results.size(); i++) { + variance += std::pow(results[i] - mean, 2); + } + variance /= results.size(); + CHECK((variance > 10 && variance < 100)); + }; + + // IsAvailable is always true for this class + CHECK(tracker->IsAvailable()); + + // Reset should define the bounds of random CTX id picking + max = 10; + tracker->Reset(max); + // Restore should have no impact on this class. 
+ tracker->Restore(9999); + check_range_and_variance(); + + + // Reset should RE-define the bounds of random CTX id picking + max = 5; + tracker->Reset(max); + check_range_and_variance(); +} + + +}} // namespace triton::perfanalyzer diff --git a/test_custom_load_manager.cc b/test_custom_load_manager.cc new file mode 100644 index 00000000..ced79af7 --- /dev/null +++ b/test_custom_load_manager.cc @@ -0,0 +1,431 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include +#include +#include +#include + +#include "client_backend/client_backend.h" +#include "constants.h" +#include "custom_load_manager.h" +#include "doctest.h" +#include "mock_request_rate_worker.h" +#include "request_rate_manager.h" +#include "test_load_manager_base.h" + +using nanoseconds = std::chrono::nanoseconds; +using milliseconds = std::chrono::milliseconds; + +namespace triton { namespace perfanalyzer { + +/// Class to test the CustomLoadManager +/// +class TestCustomLoadManager : public TestLoadManagerBase, + public CustomLoadManager { + public: + TestCustomLoadManager() = default; + + TestCustomLoadManager( + PerfAnalyzerParameters params, bool is_sequence_model = false, + bool is_decoupled_model = false, bool use_mock_infer = false) + : use_mock_infer_(use_mock_infer), + TestLoadManagerBase(params, is_sequence_model, is_decoupled_model), + CustomLoadManager( + params.async, params.streaming, "INTERVALS_FILE", params.batch_size, + params.measurement_window_ms, params.max_trials, params.max_threads, + params.num_of_sequences, params.shared_memory_type, + params.output_shm_size, params.serial_sequences, GetParser(), + GetFactory(), params.request_parameters) + { + InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + } + + std::shared_ptr MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) override + { + size_t id = workers_.size(); + auto worker = std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, max_threads_, using_json_data_, streaming_, + batch_size_, wake_signal_, wake_mutex_, execute_, start_time_, + serial_sequences_, infer_data_manager_, sequence_manager_); + + if (use_mock_infer_) { + EXPECT_CALL(*worker, Infer()) + .WillRepeatedly(testing::Invoke( + worker.get(), &MockRequestRateWorker::EmptyInfer)); + } + return worker; + } + + void TestSchedule( + std::vector intervals, PerfAnalyzerParameters params) + { + for (auto i : intervals) { + custom_intervals_.push_back(nanoseconds{i}); + } + nanoseconds measurement_window_nanoseconds{ + params.measurement_window_ms * NANOS_PER_MILLIS}; + nanoseconds max_test_duration{ + measurement_window_nanoseconds * params.max_trials}; + nanoseconds expected_current_timestamp{0}; + size_t intervals_index = 0; + + PauseWorkers(); + ConfigureThreads(); + GenerateSchedule(); + + std::vector expected_timestamps; + std::vector observed_timestamps; + + // Determine what the observed schedule was by getting each worker's + // schedule and then sorting them together + // + for (auto worker : workers_) { + nanoseconds observed_timestamp = + std::dynamic_pointer_cast(worker) + ->GetNextTimestamp(); + while (observed_timestamp <= max_test_duration) { + observed_timestamps.push_back(observed_timestamp); + observed_timestamp = + std::dynamic_pointer_cast(worker) + ->GetNextTimestamp(); + } + } + sort(observed_timestamps.begin(), observed_timestamps.end()); + + // Determine what the schedule "should" be + // + while (expected_current_timestamp < observed_timestamps.back()) { + expected_current_timestamp += custom_intervals_[intervals_index]; + expected_timestamps.push_back(expected_current_timestamp); + intervals_index = (intervals_index + 1) % custom_intervals_.size(); + } + + // Confirm that the expected and observed schedules were the same + // + REQUIRE_MESSAGE( + 
observed_timestamps.size() == expected_timestamps.size(), + "Mismatch in size of schedules"); + + for (size_t i = 0; i < observed_timestamps.size(); i++) { + CHECK(observed_timestamps[i] == expected_timestamps[i]); + } + } + + void TestSequences( + std::vector intervals, bool check_sequences_balanced) + { + auto sleep_time = milliseconds(20); + for (auto i : intervals) { + custom_intervals_.push_back(nanoseconds{i}); + } + + PauseWorkers(); + ConfigureThreads(); + GenerateSchedule(); + ResumeWorkers(); + std::this_thread::sleep_for(sleep_time); + if (check_sequences_balanced) { + CheckSequenceBalance(); + } + StopWorkerThreads(); + } + + std::shared_ptr& parser_{LoadManager::parser_}; + std::shared_ptr& factory_{ + TestLoadManagerBase::factory_}; + + std::string& request_intervals_file_{ + CustomLoadManager::request_intervals_file_}; + NanoIntervals& custom_intervals_{CustomLoadManager::custom_intervals_}; + + cb::Error ReadTimeIntervalsFile( + const std::string& path, NanoIntervals* contents) override + { + return cb::Error::Success; + } + + private: + bool use_mock_infer_; +}; + +TEST_CASE("custom_load_schedule") +{ + PerfAnalyzerParameters params; + params.measurement_window_ms = 1000; + params.max_trials = 10; + bool is_sequence = false; + bool is_decoupled = false; + bool use_mock_infer = true; + std::vector intervals; + + const auto& ParameterizeIntervals{[&]() { + SUBCASE("intervals A") + { + intervals = {100000000, 110000000, 130000000}; + } + SUBCASE("intervals B") + { + intervals = {150000000}; + } + SUBCASE("intervals C") + { + intervals = {100000000, 110000000, 120000000, 130000000, 140000000}; + } + }}; + + const auto& ParameterizeThreads{[&]() { + SUBCASE("threads 1") + { + ParameterizeIntervals(); + params.max_threads = 1; + } + SUBCASE("threads 2") + { + ParameterizeIntervals(); + params.max_threads = 2; + } + SUBCASE("threads 4") + { + ParameterizeIntervals(); + params.max_threads = 4; + } + SUBCASE("threads 7") + { + ParameterizeIntervals(); + params.max_threads = 7; + } + }}; + + const auto& ParameterizeTrials{[&]() { + SUBCASE("trials 3") + { + ParameterizeThreads(); + params.max_trials = 3; + } + SUBCASE("trials 10") + { + ParameterizeThreads(); + params.max_trials = 10; + } + SUBCASE("trials 20") + { + ParameterizeThreads(); + params.max_trials = 20; + } + }}; + + const auto& ParameterizeMeasurementWindow{[&]() { + SUBCASE("window 1000") + { + ParameterizeTrials(); + params.measurement_window_ms = 1000; + } + SUBCASE("window 10000") + { + ParameterizeTrials(); + params.measurement_window_ms = 10000; + } + SUBCASE("window 500") + { + ParameterizeTrials(); + params.measurement_window_ms = 500; + } + }}; + + const auto& ParameterizeSequences{[&]() { + SUBCASE("sequences off") + { + ParameterizeMeasurementWindow(); + is_sequence = false; + } + SUBCASE("3 sequences") + { + ParameterizeMeasurementWindow(); + is_sequence = true; + params.num_of_sequences = 3; + } + SUBCASE("6 sequences") + { + ParameterizeMeasurementWindow(); + is_sequence = true; + params.num_of_sequences = 6; + } + SUBCASE("9 sequences") + { + ParameterizeMeasurementWindow(); + is_sequence = true; + params.num_of_sequences = 9; + } + }}; + + ParameterizeSequences(); + TestCustomLoadManager tclm(params, is_sequence, is_decoupled, use_mock_infer); + tclm.TestSchedule(intervals, params); +} + +TEST_CASE("custom_load_sequences") +{ + PerfAnalyzerParameters params; + + // This is needed so we can confirm that all sequences are being requested + // equally when serial_sequences is on. 
Otherwise we would keep creating new + // sequences and wouldn't be able to track it properly. + // + params.sequence_length = 1000; + bool is_sequence_model = true; + bool check_sequences_balanced = false; + std::vector intervals; + + const auto& ParameterizeIntervals{[&]() { + SUBCASE("intervals A") + { + intervals = {100000, 110000, 130000}; + } + SUBCASE("intervals B") + { + intervals = {150000}; + } + SUBCASE("intervals C") + { + intervals = {100000, 110000, 120000, 130000, 140000}; + } + }}; + + const auto& ParameterizeSerialSequences{[&]() { + SUBCASE("serial_sequences") + { + ParameterizeIntervals(); + params.serial_sequences = true; + check_sequences_balanced = true; + } + SUBCASE("not serial_sequences") + { + ParameterizeIntervals(); + params.serial_sequences = false; + check_sequences_balanced = false; + } + }}; + + const auto& ParameterizeNumSequences{[&]() { + SUBCASE("2 sequences") + { + ParameterizeSerialSequences(); + params.num_of_sequences = 2; + } + SUBCASE("3 sequences") + { + ParameterizeSerialSequences(); + params.num_of_sequences = 3; + } + SUBCASE("5 sequences") + { + ParameterizeSerialSequences(); + params.num_of_sequences = 5; + } + SUBCASE("6 sequences") + { + ParameterizeSerialSequences(); + params.num_of_sequences = 6; + } + SUBCASE("9 sequences") + { + ParameterizeSerialSequences(); + params.num_of_sequences = 9; + } + }}; + + + const auto& ParameterizeThreads{[&]() { + SUBCASE("threads 1") + { + ParameterizeNumSequences(); + params.max_threads = 1; + } + SUBCASE("threads 2") + { + ParameterizeNumSequences(); + params.max_threads = 2; + } + SUBCASE("threads 4") + { + ParameterizeNumSequences(); + params.max_threads = 4; + } + SUBCASE("threads 7") + { + ParameterizeNumSequences(); + params.max_threads = 7; + } + }}; + + ParameterizeThreads(); + + TestCustomLoadManager tclm(params, is_sequence_model); + tclm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + tclm.TestSequences(intervals, check_sequences_balanced); +} + + +TEST_CASE("testing the GetCustomRequestRate function") +{ + TestCustomLoadManager tclm{}; + double request_rate{0.0}; + + SUBCASE("custom_intervals_ empty") + { + cb::Error result{tclm.GetCustomRequestRate(&request_rate)}; + + CHECK(result.Err() == GENERIC_ERROR); + CHECK(result.Message() == "The custom intervals vector is empty"); + } + + SUBCASE("custom_intervals_ populated") + { + tclm.custom_intervals_.push_back(nanoseconds(100000000)); + tclm.custom_intervals_.push_back(nanoseconds(110000000)); + tclm.custom_intervals_.push_back(nanoseconds(130000000)); + + cb::Error result{tclm.GetCustomRequestRate(&request_rate)}; + + CHECK(result.Err() == SUCCESS); + CHECK(request_rate == doctest::Approx(8.0)); + } +} + +}} // namespace triton::perfanalyzer diff --git a/test_dataloader.cc b/test_dataloader.cc new file mode 100644 index 00000000..656571cb --- /dev/null +++ b/test_dataloader.cc @@ -0,0 +1,1445 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "data_loader.h" +#include "doctest.h" +#include "mock_data_loader.h" + +namespace triton { namespace perfanalyzer { + +/// Helper class for testing the DataLoader +/// +class TestDataLoader { + public: + // Static function to create a generic ModelTensor + // + static ModelTensor CreateTensor(std::string name) + { + ModelTensor t; + t.name_ = name; + t.datatype_ = "INT32"; + t.shape_ = {1}; + t.is_shape_tensor_ = false; + t.is_optional_ = false; + return t; + } +}; + +TEST_CASE("dataloader: no data") +{ + MockDataLoader dataloader; + CHECK(dataloader.GetDataStreamsCount() == 0); + cb::Error status = dataloader.ValidateIndexes(0, 0); + CHECK(status.IsOk() == false); +} + +TEST_CASE("dataloader: ValidateIndexes") +{ + MockDataLoader dataloader; + + // Pretend we loaded 2 streams, one with 1 step, one with 3 steps + dataloader.data_stream_cnt_ = 2; + dataloader.step_num_.push_back(1); + dataloader.step_num_.push_back(3); + + CHECK_EQ(dataloader.GetDataStreamsCount(), 2); + + // Step in range for stream 0 + cb::Error status = dataloader.ValidateIndexes(0, 0); + CHECK(status.IsOk() == true); + + // Step out of range for stream 0 + status = dataloader.ValidateIndexes(0, 1); + CHECK(status.IsOk() == false); + + // Step in range for stream 1 + status = dataloader.ValidateIndexes(1, 2); + CHECK(status.IsOk() == true); + + // Step out of range for stream 1 + status = dataloader.ValidateIndexes(1, 3); + CHECK(status.IsOk() == false); + + // Stream out of range + status = dataloader.ValidateIndexes(2, 0); + CHECK(status.IsOk() == false); +} + +TEST_CASE("dataloader: GetTotalSteps") +{ + MockDataLoader dataloader; + + // Pretend we loaded 2 streams, one with 1 step, one with 3 steps + dataloader.data_stream_cnt_ = 2; + dataloader.step_num_.push_back(1); + dataloader.step_num_.push_back(3); + + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + CHECK_EQ(dataloader.GetTotalSteps(1), 3); + + // It will return 0 if out of range + CHECK_EQ(dataloader.GetTotalSteps(2), 0); +} + +TEST_CASE("dataloader: GetInputData missing data") +{ + MockDataLoader dataloader; + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + + TensorData data; + + cb::Error status = dataloader.GetInputData(input1, 0, 0, data); + REQUIRE(status.IsOk() == false); + CHECK_EQ(status.Message(), "unable to find data for input 'INPUT1'."); +} + 
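// Editorial sketch (illustration only, not part of the patch): the
// ValidateIndexes and GetTotalSteps tests above pin down the (stream, step)
// indexing contract -- an index pair is in range only when the stream exists
// and the step is below that stream's step count. The helper below is a
// hypothetical stand-in for that check, not the real DataLoader code, and it
// assumes <vector>/<cstddef> are already available via the existing includes.
namespace editorial_sketch {
inline bool
IsValidStreamStep(
    size_t stream, size_t step, const std::vector<size_t>& steps_per_stream)
{
  // The stream must exist, and the step must fall inside that stream.
  return stream < steps_per_stream.size() && step < steps_per_stream[stream];
}
// With steps_per_stream = {1, 3}, as in the tests above:
//   IsValidStreamStep(0, 0, {1, 3}) -> true   IsValidStreamStep(0, 1, {1, 3}) -> false
//   IsValidStreamStep(1, 2, {1, 3}) -> true   IsValidStreamStep(2, 0, {1, 3}) -> false
}  // namespace editorial_sketch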
+TEST_CASE("dataloader: ParseData: Bad Json") +{ + std::string json_str{"bad json text"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + CHECK(status.IsOk() == false); + CHECK_EQ( + status.Message(), + "failed to parse the specified json file for reading provided data"); +} + +TEST_CASE("dataloader: ParseData: Misc error cases") +{ + std::string expected_message; + std::string json_str; + + SUBCASE("No data") + { + json_str = R"({ "notdata" : 5})"; + expected_message = "The json file doesn't contain data field"; + } + SUBCASE("Not string b64") + { + json_str = R"({"data": [{ "INPUT1": {"b64": 5} }]})"; + expected_message = + "the value of b64 field should be of type string ( Location stream id: " + "0, step id: 0)"; + } + SUBCASE("Not b64 or array") + { + json_str = R"({"data": [{ "INPUT1": {"not_b64": "AAAAAQ=="} }]})"; + expected_message = + "missing content field. ( Location stream id: 0, step id: 0)"; + } + SUBCASE("Malformed input (boolean type)") + { + json_str = R"({"data": [{ "INPUT1": null }]})"; + expected_message = "Input data file is malformed."; + } + SUBCASE("Inconsistent elements in data array") + { + json_str = R"({"data": [ + [{ "INPUT1": [2] },{ "INPUT1": [3] }], + { "INPUT1": [1] } + ]})"; + expected_message = + "Inconsistency in input-data provided. Can not have a combination of " + "objects and arrays inside of the Data array"; + } + SUBCASE("Not integer shape") + { + json_str = R"({"data": [{ + "INPUT1": { "shape": ["a"], "content": [1,2,3,4,5,6] } + }]})"; + expected_message = "shape values must be integers."; + } + SUBCASE("Content not array") + { + json_str = R"({"data": [{ + "INPUT1": { "content": 6 } + }]})"; + expected_message = + "The tensor values are not supported. 
Expected an array or b64 string " + "( Location stream id: 0, step id: 0)"; + } + SUBCASE("Missing non-optional input") + { + json_str = R"({"data": [{ + "NOT_INPUT1": { "content": 6 } + }]})"; + expected_message = + "missing tensor INPUT1 ( Location stream id: 0, step id: 0)"; + } + SUBCASE("Invalid input") + { + json_str = R"({"data": + [{ + "INPUT1": [2], + "INVALID_INPUT": [2] + }] + })"; + expected_message = + "The input or output 'INVALID_INPUT' is not found in the model " + "configuration"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + CHECK(status.IsOk() == false); + CHECK_EQ(status.Message(), expected_message); +} + +TEST_CASE( + "dataloader: ParseData: Mismatching Shapes" * + doctest::description( + "When the shape is provided and it is incompatible with the actual " + "model shape, then an error should be thrown")) +{ + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + + std::string expected_message; + std::string json_str; + + SUBCASE("Mismatching fixed shape") + { + input1.shape_ = {3}; + expected_message = + "The supplied shape of [1] for input \"INPUT1\" is incompatible with " + "the " + "model's input shape of [3]"; + + SUBCASE("content json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1], "content": [1] } }]})"; + } + SUBCASE("b64 json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1], "b64": "AAAAAQ=="} }]})"; + } + } + SUBCASE("Mismatching dynamic dimensions") + { + input1.shape_ = {-1}; + expected_message = + "The supplied shape of [1,1] for input \"INPUT1\" is incompatible with " + "the model's input shape of [-1]"; + + SUBCASE("content json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1,1], "content": [1] } }]})"; + } + SUBCASE("b64 json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1,1], "b64": "AAAAAQ=="} }]})"; + } + } + SUBCASE("Mismatching multiple dimensions") + { + input1.shape_ = {-1, 2}; + expected_message = + "The supplied shape of [1,1] for input \"INPUT1\" is incompatible with " + "the model's input shape of [-1,2]"; + + SUBCASE("content json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1,1], "content": [1] } }]})"; + } + SUBCASE("b64 json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [1,1], "b64": "AAAAAQ=="} }]})"; + } + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + inputs->insert(std::make_pair(input1.name_, input1)); + + std::shared_ptr outputs = std::make_shared(); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk() == false); + CHECK_EQ(status.Message(), expected_message); +} + + +TEST_CASE( + "dataloader: ParseData: Mismatch Input Data and Fixed Shape" * + doctest::description( + "When the size of the provided Input is not in line with the Tensor's " + "shape, then an error should be thrown")) +{ + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {3}; + + std::string expected_message; + std::string json_str; + + SUBCASE("Normal json") + { + json_str = R"({"data": [{ "INPUT1": [1,2] }]})"; + expected_message = + "mismatch in the data provided for INPUT1. 
Expected: 12 bytes, Got: 8 " + "bytes"; + } + SUBCASE("content json") + { + json_str = R"({"data": [{ "INPUT1": { "content": [1,2] } }]})"; + expected_message = + "mismatch in the data provided for INPUT1. Expected: 12 bytes, Got: 8 " + "bytes"; + } + SUBCASE("b64 json") + { + json_str = R"({"data": [{ "INPUT1": {"b64": "AAAAAQ=="} }]})"; + expected_message = + "mismatch in the data provided for INPUT1. Expected: 12 bytes, Got: 4 " + "bytes"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + inputs->insert(std::make_pair(input1.name_, input1)); + + std::shared_ptr outputs = std::make_shared(); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk() == false); + CHECK_EQ(status.Message(), expected_message); +} + +TEST_CASE( + "dataloader: ParseData: Mismatch Input Data and Dynamic Shape" * + doctest::description( + "When the size of the provided Input is not in line with the Tensor's " + "shape, then an error should be thrown")) +{ + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {-1}; + + std::string expected_message; + std::string json_str; + + SUBCASE("content json") + { + json_str = + R"({"data": [{ "INPUT1": { "shape": [3], "content": [1,2] } }]})"; + expected_message = + "mismatch in the data provided for INPUT1. Expected: 12 bytes, Got: 8 " + "bytes"; + } + SUBCASE("b64 json") + { + json_str = R"({"data": [{ "INPUT1": {"shape": [3], "b64": "AAAAAQ=="} }]})"; + expected_message = + "mismatch in the data provided for INPUT1. Expected: 12 bytes, Got: 4 " + "bytes"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + inputs->insert(std::make_pair(input1.name_, input1)); + + std::shared_ptr outputs = std::make_shared(); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk() == false); + CHECK_EQ(status.Message(), expected_message); +} + +TEST_CASE( + "dataloader: ParseData: Mismatch Input and Output" * + doctest::description( + "When the size of the provided Input and validation Output data are " + "different, then an error should be thrown")) +{ + std::string json_str; + + SUBCASE("Normal json") + { + json_str = R"({ + "data": [ + { "INPUT1": [1] }, + { "INPUT1": [2] }, + { "INPUT1": [3] } + ], + "validation_data": [ + { "OUTPUT1": [7] } + ]})"; + } + SUBCASE("content json") + { + json_str = R"({ + "data": [ + { "INPUT1": { "content": [1] } }, + { "INPUT1": { "content": [2] } }, + { "INPUT1": { "content": [3] } } + ], + "validation_data": [ + { "OUTPUT1": { "content": [7] } } + ]})"; + } + SUBCASE("b64 json") + { + json_str = R"({ + "data": [ + { "INPUT1": {"b64": "AAAAAQ=="} }, + { "INPUT1": {"b64": "AgAAAA=="} }, + { "INPUT1": {"b64": "AwAAAA=="} } + ], + "validation_data": [ + { "OUTPUT1": {"b64": "BAAAAA=="} } + ]})"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + CHECK(status.IsOk() == false); + CHECK_EQ( + status.Message(), + "The 'validation_data' field doesn't align with 'data' field in the json " + "file"); +} + +TEST_CASE("dataloader: ParseData: Valid Data") +{ + std::string json_str; + + SUBCASE("Normal json") + { + json_str = R"({ + "data": [ + { "INPUT1": [1] }, + { "INPUT1": [2] }, + { "INPUT1": [3] } + ], + "validation_data": [ + { "OUTPUT1": [4] }, + { "OUTPUT1": [5] }, + { "OUTPUT1": [6] } + ]})"; + } + SUBCASE("Content 
json") + { + json_str = R"({ + "data": [ + { "INPUT1": { "content": [1] } }, + { "INPUT1": { "content": [2] } }, + { "INPUT1": { "content": [3] } } + ], + "validation_data": [ + { "OUTPUT1": { "content": [4] } }, + { "OUTPUT1": { "content": [5] } }, + { "OUTPUT1": { "content": [6] } } + ]})"; + } + SUBCASE("b64 json") + { + // Note that these encoded values decode to the numbers 1,2,3,4,5,6, which + // is the same data as the normal json case above + json_str = R"({ + "data": [ + { "INPUT1": {"b64": "AAAAAQ=="} }, + { "INPUT1": {"b64": "AgAAAA=="} }, + { "INPUT1": {"b64": "AwAAAA=="} } + ], + "validation_data": [ + { "OUTPUT1": {"b64": "BAAAAA=="} }, + { "OUTPUT1": {"b64": "BQAAAA=="} }, + { "OUTPUT1": {"b64": "BgAAAA=="} } + ]})"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 3); + + // Confirm the correct data is in the dataloader + // + TensorData data; + std::vector shape; + + dataloader.GetInputShape(input1, 0, 1, &shape); + CHECK_EQ(shape.size(), 1); + CHECK_EQ(shape[0], 1); + + status = dataloader.GetInputData(input1, 0, 1, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + auto input_data = *reinterpret_cast(data.data_ptr); + CHECK_EQ(input_data, 2); + CHECK_EQ(data.batch1_size, 4); + + status = dataloader.GetOutputData("OUTPUT1", 0, 2, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + auto output_data = *reinterpret_cast(data.data_ptr); + CHECK_EQ(output_data, 6); + CHECK_EQ(data.batch1_size, 4); +} + +TEST_CASE("dataloader: ParseData: Multiple Streams Invalid Cases") +{ + // Mismatch because one stream with wrong number of steps + std::string mismatch_case1a{R"({ + "data": [ { "INPUT1": [1,2] } ], + "validation_data": [ { "OUTPUT1": [4] }, { "OUTPUT1": [5] } ] + })"}; + std::string mismatch_case1b{R"({ + "data": [ { "INPUT1": [1,2] }, { "INPUT1": [2,3] } ], + "validation_data": [ { "OUTPUT1": [4] } ] + })"}; + + // Mismatch because wrong number of streams (3 output streams for 2 input + // streams) + std::string mismatch_case2{R"({ + "data": [ + [ { "INPUT1": [1,2] }, { "INPUT1": [2,3] } ], + [ { "INPUT1": [10,11] } ] + ], + "validation_data": [ + [ { "OUTPUT1": [4] }, { "OUTPUT1": [5] } ], + [ { "OUTPUT1": [40] } ], + [ { "OUTPUT1": [60] } ] + ]})"}; + + // Mismatch because same number of streams but wrong number of steps + std::string mismatch_case3a{R"({ + "data": [ + [ { "INPUT1": [1,2] } ], + [ { "INPUT1": [10,11] } ] + ], + "validation_data": [ + [ { "OUTPUT1": [4] }, { "OUTPUT1": [5] } ], + [ { "OUTPUT1": [40] } ] + ]})"}; + std::string mismatch_case3b{R"({ + "data": [ + [ { "INPUT1": [1,2] } ], + [ { "INPUT1": [10,11] } ] + ], + "validation_data": [ + [ { "OUTPUT1": [4] } ], + [ { "OUTPUT1": [40] }, { "OUTPUT1": [50] } ] + ]})"}; + + auto test_lambda = [&](std::string json_data) { + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = + std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + input1.shape_ = {2}; + 
inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + MockDataLoader dataloader; + cb::Error status = dataloader.ReadDataFromStr(json_data, inputs, outputs); + CHECK(status.IsOk() == false); + CHECK_EQ( + status.Message(), + "The 'validation_data' field doesn't align with 'data' field in the " + "json file"); + }; + + test_lambda(mismatch_case1a); + test_lambda(mismatch_case1b); + test_lambda(mismatch_case2); + test_lambda(mismatch_case3a); + test_lambda(mismatch_case3b); +} + +TEST_CASE("dataloader: ParseData: Multiple Streams Valid") +{ + std::string json_str{R"({ + "data": [ + [ { "INPUT1": [1,2] }, { "INPUT1": [2,3] }], + [ { "INPUT1": [10,11] } ] + ], + "validation_data": [ + [ { "OUTPUT1": [4] }, { "OUTPUT1": [5] } ], + [ { "OUTPUT1": [40] } ] + ] + })"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + input1.shape_ = {2}; + inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 2); + CHECK_EQ(dataloader.GetTotalSteps(0), 2); + CHECK_EQ(dataloader.GetTotalSteps(1), 1); + + // Confirm the correct data is in the dataloader + // + TensorData data; + + status = dataloader.GetInputData(input1, 0, 1, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + const int32_t* input_data = reinterpret_cast(data.data_ptr); + CHECK(data.is_valid); + CHECK_EQ(input_data[0], 2); + CHECK_EQ(input_data[1], 3); + // 2 elements of int32 data is 8 bytes + CHECK_EQ(data.batch1_size, 8); + + status = dataloader.GetOutputData("OUTPUT1", 1, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + const int32_t* output_data = reinterpret_cast(data.data_ptr); + CHECK_EQ(output_data[0], 40); + CHECK_EQ(data.batch1_size, 4); +} + +TEST_CASE( + "dataloader: ParseData: Missing Shape" * + doctest::description( + "When a tensor's shape is dynamic (-1), then it needs to be provided " + "via --shape option (which is not visible to this testing), or via a " + "shape option in the json. If not, an error is thrown")) +{ + std::string json_str{R"({"data": [{ "INPUT1": [1,2,3] } ]})"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {-1}; + + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + CHECK_EQ(status.IsOk(), false); + CHECK_EQ( + status.Message(), + "The variable-sized tensor \"INPUT1\" with model shape [-1] needs to " + "have its shape fully defined. 
See the --shape option."); +} + + +TEST_CASE( + "dataloader: ParseData: Supplied Shape is valid" * + doctest::description("Supply the dynamic shape for an input")) +{ + std::string json_str; + + SUBCASE("Normal json") + { + json_str = R"({"data": [{ + "INPUT1": { "shape": [3,2], "content": [1,2,3,4,5,6] } + }]})"; + } + SUBCASE("b64 json") + { + // This b64 encoding is the same as the unencoded case of [1,2,3,4,5,6] + json_str = R"({"data": [{ + "INPUT1": { "shape": [3,2], "b64": "AAAAAQAAAAIAAAADAAAABAAAAAUAAAAG" } + }]})"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {-1, -1}; + + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk()); + + std::vector shape; + dataloader.GetInputShape(input1, 0, 0, &shape); + CHECK_EQ(shape.size(), 2); + CHECK_EQ(shape[0], 3); + CHECK_EQ(shape[1], 2); +} + + +TEST_CASE( + "dataloader: ParseData: Supplied Shape is zero" * + doctest::description( + "Zero is a legal shape value and should be handled correctly. " + "GetInputData differentiates between an empty valid result and an " + "invalid result via the is_valid bit in the returned struct")) +{ + std::string json_str{R"({"data": [{ + "INPUT1": { "shape": [0,2], "content": [] } + }]})"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {-1, 2}; + + ModelTensor input2 = TestDataLoader::CreateTensor("INPUT2"); + input2.is_optional_ = true; + + inputs->insert(std::make_pair(input1.name_, input1)); + inputs->insert(std::make_pair(input2.name_, input2)); + + cb::Error status = dataloader.ReadDataFromStr(json_str, inputs, outputs); + REQUIRE(status.IsOk()); + + std::vector shape; + dataloader.GetInputShape(input1, 0, 0, &shape); + CHECK_EQ(shape.size(), 2); + CHECK_EQ(shape[0], 0); + CHECK_EQ(shape[1], 2); + + // Confirm that the zero-shape input IS valid, but with size=0 and ptr=null + TensorData data; + status = dataloader.GetInputData(input1, 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + CHECK(data.data_ptr == nullptr); + CHECK(data.batch1_size == 0); + + // Confirm that the unspecified input is NOT valid + status = dataloader.GetInputData(input2, 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + CHECK(data.data_ptr == nullptr); + CHECK(data.batch1_size == 0); +} + + +TEST_CASE( + "dataloader: ParseData: Multiple Calls simple" * + doctest::description( + "ParseData can be called multiple times (due to " + "multiple input-data files). 
The data should " + "accumulate in stream 0 when input data has no nested arrays")) +{ + std::string json_str1{R"({"data": [{ "INPUT1": [1] }]})"}; + std::string json_str2{R"({"data": [{ "INPUT1": [2] },{ "INPUT1": [22]}]})"}; + std::string json_str3{ + R"({"data": [{ "INPUT1": [3] }], "validation_data": [{ "OUTPUT1": [30] }]})"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str1, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + + status = dataloader.ReadDataFromStr(json_str2, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 3); + + status = dataloader.ReadDataFromStr(json_str3, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 4); + + // Confirm the correct data is in the dataloader + // + TensorData data; + + status = dataloader.GetInputData(input1, 0, 3, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + const int32_t* input_data = reinterpret_cast(data.data_ptr); + CHECK_EQ(input_data[0], 3); + CHECK_EQ(data.batch1_size, 4); + + // Confirm that only one of the 4 steps has output data + // + status = dataloader.GetOutputData("OUTPUT1", 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + status = dataloader.GetOutputData("OUTPUT1", 0, 1, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + status = dataloader.GetOutputData("OUTPUT1", 0, 2, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + status = dataloader.GetOutputData("OUTPUT1", 0, 3, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + CHECK(data.data_ptr != nullptr); + CHECK(data.batch1_size == 4); +} + +TEST_CASE( + "dataloader: ParseData: Multiple Calls array" * + doctest::description( + "ParseData can be called multiple times (due to " + "multiple input-data files). 
The data should " + "accumulate as multiple streams when input data has nested arrays")) +{ + std::string json_str1{R"({"data": [[{ "INPUT1": [1] }]]})"}; + std::string json_str2{ + R"({"data": [[{ "INPUT1": [2] },{ "INPUT1": [20] }]]})"}; + std::string json_str3{ + R"({"data": [[{ "INPUT1": [3] }]], "validation_data": [[{ "OUTPUT1": [30] }]]})"}; + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str1, inputs, outputs); + REQUIRE(status.IsOk()); + status = dataloader.ReadDataFromStr(json_str2, inputs, outputs); + REQUIRE(status.IsOk()); + status = dataloader.ReadDataFromStr(json_str3, inputs, outputs); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 3); + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + CHECK_EQ(dataloader.GetTotalSteps(1), 2); + CHECK_EQ(dataloader.GetTotalSteps(2), 1); + + // Confirm the correct data is in the dataloader + // + TensorData data; + + status = dataloader.GetInputData(input1, 1, 1, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + const int32_t* input_data = reinterpret_cast(data.data_ptr); + CHECK_EQ(input_data[0], 20); + CHECK_EQ(data.batch1_size, 4); + + // Confirm that only one of the 3 streams has output data + // + status = dataloader.GetOutputData("OUTPUT1", 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + status = dataloader.GetOutputData("OUTPUT1", 1, 0, data); + REQUIRE(status.IsOk()); + CHECK(!data.is_valid); + status = dataloader.GetOutputData("OUTPUT1", 2, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + CHECK(data.data_ptr != nullptr); + CHECK(data.batch1_size == 4); +} + +TEST_CASE( + "dataloader: ParseData: Multiple Calls mixed" * + doctest::description( + "ParseData can be called multiple times (due to " + "multiple input-data files). An error should be thrown if there is a " + "mixture of nested vs no-nested arrays in the input data")) +{ + std::string json_str_not_nested{R"({"data": [{ "INPUT1": [2] }]})"}; + std::string json_str_nested{R"({"data": [[{ "INPUT1": [1] }]]})"}; + std::string json_str1, json_str2; + + SUBCASE("Nested then not-nested") + { + json_str1 = json_str_nested; + json_str2 = json_str_not_nested; + } + SUBCASE("Not-nested then nested") + { + json_str1 = json_str_not_nested; + json_str2 = json_str_nested; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = dataloader.ReadDataFromStr(json_str1, inputs, outputs); + REQUIRE(status.IsOk()); + status = dataloader.ReadDataFromStr(json_str2, inputs, outputs); + REQUIRE(!status.IsOk()); + CHECK( + status.Message() == + "Inconsistency in input-data provided. 
Can not have a combination of " + "objects and arrays inside of the Data array"); +} + +TEST_CASE( + "dataloader: GenerateData: Is Shape Tensor" * + doctest::description("It is illegal to generate data for any Tensor with " + "is_shape_tensor=True")) +{ + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.is_shape_tensor_ = true; + inputs->insert(std::make_pair(input1.name_, input1)); + + bool zero_input = true; + size_t string_length = 5; + std::string string_data = "FOOBAR"; + cb::Error status = + dataloader.GenerateData(inputs, zero_input, string_length, string_data); + CHECK(status.IsOk() == false); + CHECK_EQ( + status.Message(), + "can not generate data for shape tensor 'INPUT1', user-provided data is " + "needed."); +} + + +TEST_CASE( + "dataloader: GenerateData: Non-BYTES" * + doctest::description( + "Calling GenerateData for non-BYTES datatype should result in a single " + "stream with one step. If the zero input flag is set, all of that data " + "will be 0. Else it will be random")) +{ + bool zero_input; + size_t string_length = 5; + std::string string_data = "FOOBAR"; + + SUBCASE("zero_input true") + { + zero_input = true; + } + SUBCASE("zero_input false") + { + zero_input = false; + } + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {3}; + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = + dataloader.GenerateData(inputs, zero_input, string_length, string_data); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + + TensorData data; + + status = dataloader.GetInputData(input1, 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + const int32_t* input_data = reinterpret_cast(data.data_ptr); + if (zero_input) { + CHECK_EQ(input_data[0], 0); + CHECK_EQ(input_data[1], 0); + CHECK_EQ(input_data[2], 0); + } else { + CHECK_NE(input_data[0], 0); + CHECK_NE(input_data[1], 0); + CHECK_NE(input_data[2], 0); + } + // 3 elements of int32 data is 12 bytes + CHECK_EQ(data.batch1_size, 12); +} + +TEST_CASE( + "dataloader: GenerateData: BYTES" * + doctest::description( + "Calling GenerateData for BYTES datatype should result in a single " + "stream with one step. The zero-input flag is ignored. If string_data " + "is not null, it will be used. 
Else it will be a random string of " + "length string_length")) +{ + bool zero_input = false; + size_t string_length = 5; + std::string string_data; + + SUBCASE("valid string_data") + { + string_data = "FOOBAR"; + } + SUBCASE("empty string_data") + { + string_data = ""; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.datatype_ = "BYTES"; + input1.shape_ = {3}; + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = + dataloader.GenerateData(inputs, zero_input, string_length, string_data); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + + TensorData data; + + status = dataloader.GetInputData(input1, 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + // For string data, the result should be a 32-bit number indicating the data + // length, and then 1 byte per letter + // + // For "FOOBAR", the length would be 10 bytes: + // 4 bytes to indicate the string length (the number 6) + // 1 byte for each letter + // + // For empty string, the string length would instead be the value in + // string_length (5 in this case), and the characters would be random for + // each entry in the batch. Thus, the data length would be 9 bytes + // + // For a shape of [3], this data would be repeated 3 times + + if (string_data.empty()) { + // 3 elements of 9 bytes is 27 + CHECK_EQ(data.batch1_size, 27); + + const char* char_data = reinterpret_cast(data.data_ptr); + + // Check all 3 entries in the "batch" of shape [3] + for (size_t i = 0; i < 3; i++) { + size_t start_index = 9 * i; + + // The first 4 bytes are an int32 indicating the number of characters + const int32_t* int32_data = + reinterpret_cast(&char_data[start_index]); + CHECK_EQ(int32_data[0], 5); + + // All of the characters should be in the specified character_set + for (size_t j = start_index + 4; j < start_index + 9; j++) { + CHECK_NE(character_set.find(char_data[j]), std::string::npos); + } + } + + } else { + // 3 elements of 10 bytes is 30 + CHECK_EQ(data.batch1_size, 30); + + const int32_t* int32_data = reinterpret_cast(data.data_ptr); + const char* char_data = reinterpret_cast(data.data_ptr); + CHECK_EQ(int32_data[0], 6); + CHECK_EQ(char_data[4], 'F'); + CHECK_EQ(char_data[5], 'O'); + CHECK_EQ(char_data[6], 'O'); + CHECK_EQ(char_data[7], 'B'); + CHECK_EQ(char_data[8], 'A'); + CHECK_EQ(char_data[9], 'R'); + + // The data would repeat two more times for shape of [3] + for (size_t i = 10; i < 30; i++) { + CHECK_EQ(char_data[i - 10], char_data[i]); + } + } +} + +TEST_CASE("dataloader: GenerateData: Dynamic shape") +{ + bool zero_input = false; + size_t string_length = 5; + std::string string_data; + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + input1.shape_ = {-1}; + + std::string expected_message = + "input INPUT1 contains dynamic shape, provide shapes to send along with " + "the request"; + + SUBCASE("BYTES") + { + input1.datatype_ = "BYTES"; + } + SUBCASE("non-BYTES") + { + input1.datatype_ = "INT32"; + } + + MockDataLoader dataloader; + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = + dataloader.GenerateData(inputs, zero_input, string_length, string_data); + REQUIRE(status.IsOk() == false); + CHECK_EQ(status.Message(), expected_message); +} + 
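// --- Illustrative sketch (editorial addition, not part of this change) ---
// The GenerateData BYTES assertions above imply a simple length-prefixed
// layout: each BYTES element is a 4-byte length followed by the raw
// characters, repeated once per element of the shape. The helper below is
// hypothetical and only demonstrates that layout; it is not the data
// loader's actual implementation.

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Serialize one BYTES element as [4-byte length][characters] (host byte
// order), the layout the GetInputData checks above decode with
// reinterpret_cast.
std::vector<char> SerializeBytesElement(const std::string& s)
{
  std::vector<char> out(sizeof(uint32_t) + s.size());
  const uint32_t len = static_cast<uint32_t>(s.size());
  std::memcpy(out.data(), &len, sizeof(len));
  std::memcpy(out.data() + sizeof(len), s.data(), s.size());
  return out;
}

// "FOOBAR" -> 4 + 6 = 10 bytes; a shape of [3] repeats the element three
// times, giving the 30-byte batch1_size the BYTES test above expects.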
+TEST_CASE( + "dataloader: ReadDataFromDir: Error reading input file" * + doctest::description( + "When there is an error reading an input data file, the error should " + "bubble up to the return value of ReadDataFromDir")) +{ + MockDataLoader dataloader; + + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + + std::string dir{"fake/path"}; + + SUBCASE("BYTES (string) data") + { + input1.datatype_ = "BYTES"; + } + SUBCASE("Raw Binary data") + { + input1.datatype_ = "INT32"; + } + + inputs->insert(std::make_pair(input1.name_, input1)); + cb::Error status = dataloader.ReadDataFromDir(inputs, outputs, dir); + CHECK(status.IsOk() == false); +} + +TEST_CASE( + "dataloader: ReadDataFromDir: Error reading output file" * + doctest::description( + "When there is an error reading an output data file, an error is NOT " + "raised from ReadDataFromDir, and instead GetOutputData will return " + "nullptr with a batch1_size of 0")) +{ + MockDataLoader dataloader; + + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + std::string dir{"fake/path"}; + + SUBCASE("BYTES (string) data") + { + output1.datatype_ = "BYTES"; + } + SUBCASE("Raw Binary data") + { + output1.datatype_ = "INT32"; + } + + outputs->insert(std::make_pair(output1.name_, output1)); + cb::Error status = dataloader.ReadDataFromDir(inputs, outputs, dir); + CHECK(status.IsOk() == true); + + TensorData data; + + dataloader.GetOutputData("OUTPUT1", 0, 0, data); + CHECK(!data.is_valid); + CHECK(data.data_ptr == nullptr); + CHECK(data.batch1_size == 0); +} + +TEST_CASE( + "dataloader: ReadDataFromDir: Mismatching Input Data" * + doctest::description("Successfully reading input files but having a " + "mismatch will result in an error being thrown")) +{ + MockDataLoader dataloader; + + std::string datatype; + std::string expected_error_message; + + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + std::string dir{"mocked_out"}; + + SUBCASE("BYTES (string) data") + { + datatype = "BYTES"; + std::vector string_data; + + SUBCASE("Dynamic shape") + { + input1.shape_ = {-1}; + expected_error_message = + "input INPUT1 contains dynamic shape, provide shapes to send along " + "with the request"; + } + SUBCASE("Supplied shape") + { + input1.shape_ = {1}; + string_data = {"InStr", "ExtraStr"}; + + expected_error_message = + "provided data for input INPUT1 has 2 elements, expect 1"; + } + + EXPECT_CALL(dataloader, ReadTextFile(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(string_data), + testing::Return(cb::Error::Success))); + } + SUBCASE("Raw Binary data") + { + datatype = "INT32"; + std::vector char_data; + + SUBCASE("Dynamic shape") + { + input1.shape_ = {-1}; + expected_error_message = + "input INPUT1 contains dynamic shape, provide shapes to send along " + "with the request"; + } + SUBCASE("Supplied shape") + { + // An INT32 of shape {1} will be 4 bytes. However, we are supplying 5 + // bytes via char_data. 
+ input1.shape_ = {1}; + char_data = {'0', '0', '0', '7', '5'}; + expected_error_message = + "provided data for input INPUT1 has byte size 5, expect 4"; + } + + EXPECT_CALL(dataloader, ReadFile(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(char_data), + testing::Return(cb::Error::Success))); + } + + input1.datatype_ = datatype; + inputs->insert(std::make_pair(input1.name_, input1)); + + cb::Error status = dataloader.ReadDataFromDir(inputs, outputs, dir); + REQUIRE(status.IsOk() == false); + CHECK(status.Message() == expected_error_message); +} + +// FIXME TMA-1210 -- the output data is not being ignored here and no error is +// thrown, despite the mismatch +// TEST_CASE( +// "dataloader: ReadDataFromDir: Mismatching Output Data" * +// doctest::description("Successfully reading output files but having a " +// "mismatch will result in the data being ignored")) +//{ +// MockDataLoader dataloader; +// +// std::string datatype; +// +// std::shared_ptr inputs = std::make_shared(); +// std::shared_ptr outputs = +// std::make_shared(); +// +// ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); +// ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); +// +// std::string dir{"mocked_out"}; +// +// std::vector char_data{'0', '0', '0', '7', '5'}; +// +// std::vector string_data{"InStr", "ExtraStr"}; +// +// SUBCASE("BYTES (string) data") +// { +// datatype = "BYTES"; +// EXPECT_CALL(dataloader, ReadTextFile(testing::_, testing::_)) +// .WillOnce(testing::DoAll( +// testing::SetArgPointee<1>(string_data), +// testing::Return(cb::Error::Success))); +// +// SUBCASE("Dynamic shape") { output1.shape_ = {-1}; } +// SUBCASE("Supplied shape") { output1.shape_ = {1}; } +// } +// SUBCASE("Raw Binary data") +// { +// datatype = "INT32"; +// EXPECT_CALL(dataloader, ReadFile(testing::_, testing::_)) +// .WillOnce(testing::DoAll( +// testing::SetArgPointee<1>(char_data), +// testing::Return(cb::Error::Success))); +// +// SUBCASE("Dynamic shape") { input1.shape_ = {-1}; } +// SUBCASE("Supplied shape") { input1.shape_ = {1}; } +// } +// +// output1.datatype_ = datatype; +// outputs->insert(std::make_pair(output1.name_, output1)); +// +// cb::Error status = dataloader.ReadDataFromDir(inputs, outputs, dir); +// REQUIRE(status.IsOk() == true); +// +// // Confirm that the data is not in the dataloader +// const uint8_t* data_ptr{nullptr}; +// size_t batch1_size; +// +// dataloader.GetOutputData("OUTPUT1", 0, 0, &data_ptr, &batch1_size); +// CHECK(data_ptr == nullptr); +// CHECK(batch1_size == 0); +//} + +TEST_CASE( + "dataloader: ReadDataFromDir: Valid Data" * + doctest::description("Successfully reading files will always result in a " + "single stream with a single step")) +{ + MockDataLoader dataloader; + + std::string datatype; + + std::shared_ptr inputs = std::make_shared(); + std::shared_ptr outputs = std::make_shared(); + + ModelTensor input1 = TestDataLoader::CreateTensor("INPUT1"); + ModelTensor output1 = TestDataLoader::CreateTensor("OUTPUT1"); + + std::string dir{"mocked_out"}; + + std::vector input_char_data{'0', '0', '0', '7'}; + std::vector output_char_data{'0', '0', '0', '3'}; + + std::vector input_string_data{"InStr"}; + std::vector output_string_data{"OutStr"}; + + std::vector expected_input; + std::vector expected_output; + + SUBCASE("BYTES (string) data") + { + datatype = "BYTES"; + + expected_input = {'\5', '\0', '\0', '\0', 'I', 'n', 'S', 't', 'r'}; + expected_output = {'\6', '\0', '\0', '\0', 'O', 'u', 't', 'S', 't', 'r'}; + + 
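    // The expected vectors above follow the same length-prefixed layout as
    // the GenerateData BYTES test: a 4-byte length (5 for "InStr", 6 for
    // "OutStr") followed by the raw characters.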
EXPECT_CALL(dataloader, ReadTextFile(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(input_string_data), + testing::Return(cb::Error::Success))) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(output_string_data), + testing::Return(cb::Error::Success))); + } + SUBCASE("Raw Binary data") + { + datatype = "INT32"; + + expected_input = input_char_data; + expected_output = output_char_data; + + EXPECT_CALL(dataloader, ReadFile(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(input_char_data), + testing::Return(cb::Error::Success))) + .WillOnce(testing::DoAll( + testing::SetArgPointee<1>(output_char_data), + testing::Return(cb::Error::Success))); + } + + input1.datatype_ = datatype; + output1.datatype_ = datatype; + + inputs->insert(std::make_pair(input1.name_, input1)); + outputs->insert(std::make_pair(output1.name_, output1)); + + cb::Error status = dataloader.ReadDataFromDir(inputs, outputs, dir); + REQUIRE(status.IsOk()); + CHECK_EQ(dataloader.GetDataStreamsCount(), 1); + CHECK_EQ(dataloader.GetTotalSteps(0), 1); + + // Validate input and output data + TensorData data; + + status = dataloader.GetInputData(input1, 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + const char* input_data = reinterpret_cast(data.data_ptr); + REQUIRE(data.batch1_size == expected_input.size()); + for (size_t i = 0; i < data.batch1_size; i++) { + CHECK(input_data[i] == expected_input[i]); + } + + status = dataloader.GetOutputData("OUTPUT1", 0, 0, data); + REQUIRE(status.IsOk()); + CHECK(data.is_valid); + + const char* output_data = reinterpret_cast(data.data_ptr); + REQUIRE(data.batch1_size == expected_output.size()); + for (size_t i = 0; i < data.batch1_size; i++) { + CHECK(output_data[i] == expected_output[i]); + } +} + +}} // namespace triton::perfanalyzer diff --git a/test_idle_timer.cc b/test_idle_timer.cc new file mode 100644 index 00000000..18f9d751 --- /dev/null +++ b/test_idle_timer.cc @@ -0,0 +1,94 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include "doctest.h" +#include "idle_timer.h" + +namespace triton { namespace perfanalyzer { + +TEST_CASE("idle_timer: basic usage") +{ + IdleTimer timer; + CHECK(timer.GetIdleTime() == 0); + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + timer.Stop(); + CHECK(timer.GetIdleTime() > 0); + timer.Reset(); + CHECK(timer.GetIdleTime() == 0); +} + +TEST_CASE("idle_timer: GetIdleTime when inactive") +{ + IdleTimer timer; + CHECK(timer.GetIdleTime() == 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + CHECK(timer.GetIdleTime() == 0); + CHECK_NOTHROW(timer.Start()); +} + +TEST_CASE("idle_timer: GetIdleTime when active") +{ + IdleTimer timer; + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + CHECK(timer.GetIdleTime() > 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + CHECK(timer.GetIdleTime() > 0); + CHECK_NOTHROW(timer.Stop()); +} + +TEST_CASE("idle_timer: reset when active") +{ + IdleTimer timer; + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + timer.Stop(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + timer.Reset(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + CHECK(timer.GetIdleTime() > 0); +} + +TEST_CASE("idle_timer: double start") +{ + IdleTimer timer; + timer.Start(); + CHECK_THROWS_AS(timer.Start(), const std::exception&); +} + +TEST_CASE("idle_timer: stop without start") +{ + IdleTimer timer; + CHECK_THROWS_AS(timer.Stop(), const std::exception&); +} + + +}} // namespace triton::perfanalyzer diff --git a/test_infer_context.cc b/test_infer_context.cc new file mode 100644 index 00000000..951fb2b1 --- /dev/null +++ b/test_infer_context.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "client_backend/mock_client_backend.h" +#include "doctest.h" +#include "gmock/gmock.h" +#include "infer_context.h" +#include "mock_data_loader.h" +#include "mock_infer_context.h" +#include "mock_infer_data_manager.h" +#include "mock_sequence_manager.h" + +namespace triton { namespace perfanalyzer { + +/// Tests the round robin ordering of json input data +/// +TEST_CASE("update_seq_json_data: testing the UpdateSeqJsonData function") +{ + std::shared_ptr mock_sequence_manager{ + std::make_shared()}; + + EXPECT_CALL( + *mock_sequence_manager, SetInferSequenceOptions(testing::_, testing::_)) + .Times(6) + .WillRepeatedly(testing::Return()); + + mock_sequence_manager->InitSequenceStatuses(1); + + std::shared_ptr mock_data_loader{ + std::make_shared()}; + + EXPECT_CALL(*mock_data_loader, GetTotalSteps(testing::_)) + .Times(6) + .WillRepeatedly(testing::Return(3)); + + std::shared_ptr mock_infer_data_manager{ + std::make_shared()}; + + testing::Sequence seq; + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 0, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 1, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 2, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 0, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 1, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + *mock_infer_data_manager, + UpdateInferData(testing::_, testing::_, 2, testing::_)) + .InSequence(seq) + .WillOnce(testing::Return(cb::Error::Success)); + + std::shared_ptr mic{std::make_shared()}; + + EXPECT_CALL(*mic, SendRequest(testing::_, testing::_, testing::_)) + .Times(6) + .WillRepeatedly(testing::Return()); + + mic->sequence_manager_ = mock_sequence_manager; + mic->data_loader_ = mock_data_loader; + mic->infer_data_manager_ = mock_infer_data_manager; + mic->thread_stat_ = std::make_shared(); + bool execute{true}; + mic->execute_ = execute; + mic->using_json_data_ = true; + + size_t seq_stat_index{0}; + bool delayed{false}; + + mic->SendSequenceInferRequest(seq_stat_index, delayed); + mic->SendSequenceInferRequest(seq_stat_index, delayed); + mic->SendSequenceInferRequest(seq_stat_index, delayed); + mic->SendSequenceInferRequest(seq_stat_index, delayed); + mic->SendSequenceInferRequest(seq_stat_index, delayed); + mic->SendSequenceInferRequest(seq_stat_index, delayed); + + // Destruct gmock objects to determine gmock-related test failure + mock_sequence_manager.reset(); + mock_data_loader.reset(); + mock_infer_data_manager.reset(); + 
mic.reset(); + REQUIRE(testing::Test::HasFailure() == false); +} + +TEST_CASE("send_request: testing the SendRequest function") +{ + MockInferContext mock_infer_context{}; + + SUBCASE("testing logic relevant to request record sequence ID") + { + mock_infer_context.thread_stat_ = std::make_shared(); + mock_infer_context.thread_stat_->contexts_stat_.emplace_back(); + mock_infer_context.async_ = true; + mock_infer_context.streaming_ = true; + mock_infer_context.infer_data_.options_ = + std::make_unique("my_model"); + std::shared_ptr mock_client_stats{ + std::make_shared()}; + mock_infer_context.infer_backend_ = + std::make_unique(mock_client_stats); + + const uint64_t request_id{5}; + const bool delayed{false}; + const uint64_t sequence_id{2}; + + mock_infer_context.infer_data_.options_->request_id_ = + std::to_string(request_id); + + cb::MockInferResult* mock_infer_result{ + new cb::MockInferResult(*mock_infer_context.infer_data_.options_)}; + + cb::OnCompleteFn& stream_callback{mock_infer_context.async_callback_func_}; + + EXPECT_CALL( + dynamic_cast( + *mock_infer_context.infer_backend_), + AsyncStreamInfer(testing::_, testing::_, testing::_)) + .WillOnce( + [&mock_infer_result, &stream_callback]( + const cb::InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) + -> cb::Error { + stream_callback(mock_infer_result); + return cb::Error::Success; + }); + + mock_infer_context.SendRequest(request_id, delayed, sequence_id); + + CHECK(mock_infer_context.thread_stat_->request_records_.size() == 1); + CHECK( + mock_infer_context.thread_stat_->request_records_[0].sequence_id_ == + sequence_id); + } +} + +}} // namespace triton::perfanalyzer diff --git a/test_inference_profiler.cc b/test_inference_profiler.cc index e491b755..40813ce5 100644 --- a/test_inference_profiler.cc +++ b/test_inference_profiler.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,8 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "doctest.h" +#include "inference_profiler.h" #include "mock_inference_profiler.h" - +#include "mock_load_manager.h" +#include "mock_model_parser.h" namespace triton { namespace perfanalyzer { @@ -35,12 +37,15 @@ class TestInferenceProfiler : public InferenceProfiler { static void ValidLatencyMeasurement( const std::pair& valid_range, size_t& valid_sequence_count, size_t& delayed_request_count, - std::vector* latencies, TimestampVector& all_timestamps) + std::vector* latencies, size_t& response_count, + std::vector& valid_requests, + std::vector& all_request_records) { InferenceProfiler inference_profiler{}; - inference_profiler.all_timestamps_ = all_timestamps; + inference_profiler.all_request_records_ = all_request_records; inference_profiler.ValidLatencyMeasurement( - valid_range, valid_sequence_count, delayed_request_count, latencies); + valid_range, valid_sequence_count, delayed_request_count, latencies, + response_count, valid_requests); } static std::tuple GetMeanAndStdDev( @@ -50,6 +55,14 @@ class TestInferenceProfiler : public InferenceProfiler { return inference_profiler.GetMeanAndStdDev(latencies); } + void SummarizeSendRequestRate( + const double window_duration_s, const size_t num_sent_requests, + PerfStatus& summary) + { + InferenceProfiler::SummarizeSendRequestRate( + window_duration_s, num_sent_requests, summary); + } + static bool TestCheckWithinThreshold( LoadStatus& ls, LoadParams& lp, uint64_t latency_threshold_ms) { @@ -68,16 +81,17 @@ class TestInferenceProfiler : public InferenceProfiler { ip.load_parameters_.stability_threshold = lp.stability_threshold; ip.load_parameters_.stability_window = lp.stability_window; - return ip.CheckWindowForStability(idx, ls); + return ip.CheckWindowForStability(idx, ls, true); }; - static bool TestDetermineStability(LoadStatus& ls, LoadParams& lp) + static bool TestDetermineStability( + LoadStatus& ls, LoadParams& lp, bool check_latency = true) { InferenceProfiler ip; ip.load_parameters_.stability_threshold = lp.stability_threshold; ip.load_parameters_.stability_window = lp.stability_window; - return ip.DetermineStability(ls); + return ip.DetermineStability(ls, check_latency); } static bool TestIsDoneProfiling( @@ -129,55 +143,96 @@ class TestInferenceProfiler : public InferenceProfiler { InferenceProfiler::GetMetricFirstPerGPU( input_metric_maps, output_metric_map); } + + void SummarizeOverhead( + const uint64_t window_duration_ns, const uint64_t idle_ns, + PerfStatus& summary) + { + InferenceProfiler::SummarizeOverhead(window_duration_ns, idle_ns, summary); + } + + + cb::Error DetermineStatsModelVersion( + const cb::ModelIdentifier& model_identifier, + const std::map& start_stats, + const std::map& end_stats, + int64_t* model_version) + { + return InferenceProfiler::DetermineStatsModelVersion( + model_identifier, start_stats, end_stats, model_version); + } + + cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching) + { + return InferenceProfiler::SetTopLevelResponseCaching( + enable_top_level_response_caching); + } }; + TEST_CASE("testing the ValidLatencyMeasurement function") { size_t valid_sequence_count{}; size_t delayed_request_count{}; std::vector latencies{}; + size_t response_count{}; + std::vector valid_requests{}; const std::pair window{4, 17}; using time_point = std::chrono::time_point; using ns = std::chrono::nanoseconds; - TimestampVector all_timestamps{ + std::vector all_request_records{ // request ends before window starts, this should not be possible to exist // in the vector of requests, 
but if it is, we exclude it: not included in // current window - std::make_tuple(time_point(ns(1)), time_point(ns(2)), 0, false), + RequestRecord( + time_point(ns(1)), std::vector{time_point(ns(2))}, {}, {}, + 0, false, 0, false), // request starts before window starts and ends inside window: included in // current window - std::make_tuple(time_point(ns(3)), time_point(ns(5)), 0, false), + RequestRecord( + time_point(ns(3)), std::vector{time_point(ns(5))}, {}, {}, + 0, false, 0, false), // requests start and end inside window: included in current window - std::make_tuple(time_point(ns(6)), time_point(ns(9)), 0, false), - std::make_tuple(time_point(ns(10)), time_point(ns(14)), 0, false), + RequestRecord( + time_point(ns(6)), std::vector{time_point(ns(9))}, {}, {}, + 0, false, 0, false), + RequestRecord( + time_point(ns(10)), std::vector{time_point(ns(14))}, {}, + {}, 0, false, 0, false), // request starts before window ends and ends after window ends: not // included in current window - std::make_tuple(time_point(ns(15)), time_point(ns(20)), 0, false), + RequestRecord( + time_point(ns(15)), std::vector{time_point(ns(20))}, {}, + {}, 0, false, 0, false), // request starts after window ends: not included in current window - std::make_tuple(time_point(ns(21)), time_point(ns(27)), 0, false)}; + RequestRecord( + time_point(ns(21)), std::vector{time_point(ns(27))}, {}, + {}, 0, false, 0, false)}; TestInferenceProfiler::ValidLatencyMeasurement( window, valid_sequence_count, delayed_request_count, &latencies, - all_timestamps); + response_count, valid_requests, all_request_records); - const auto& convert_timestamp_to_latency{ - [](std::tuple< - std::chrono::time_point, - std::chrono::time_point, uint32_t, bool> - t) { - return CHRONO_TO_NANOS(std::get<1>(t)) - - CHRONO_TO_NANOS(std::get<0>(t)); - }}; + const auto& convert_request_record_to_latency{[](RequestRecord t) { + return CHRONO_TO_NANOS(t.response_timestamps_.back()) - + CHRONO_TO_NANOS(t.start_time_); + }}; CHECK(latencies.size() == 3); - CHECK(latencies[0] == convert_timestamp_to_latency(all_timestamps[1])); - CHECK(latencies[1] == convert_timestamp_to_latency(all_timestamps[2])); - CHECK(latencies[2] == convert_timestamp_to_latency(all_timestamps[3])); + CHECK( + latencies[0] == + convert_request_record_to_latency(all_request_records[1])); + CHECK( + latencies[1] == + convert_request_record_to_latency(all_request_records[2])); + CHECK( + latencies[2] == + convert_request_record_to_latency(all_request_records[3])); } TEST_CASE("test_check_window_for_stability") @@ -225,6 +280,30 @@ TEST_CASE("test_check_window_for_stability") lp.stability_threshold = 0.1; CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); } + SUBCASE("test stability window of 5") + { + ls.infer_per_sec = {500.0, 520.0, 510.0, 505.0, 515.0}; + ls.latencies = {100, 104, 108, 102, 106}; + lp.stability_window = 5; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); + } + SUBCASE("test not stable in 5 but stable in 3") + { + ls.infer_per_sec = {1.0, 1000.0, 510.0, 505.0, 515.0}; + ls.latencies = {100, 104, 108, 102, 106}; + lp.stability_window = 5; + lp.stability_threshold = 0.1; + CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == false); + } + SUBCASE("test stability window of 2") + { + ls.infer_per_sec = {500.0, 1000.0, 1.0, 505.0, 515.0}; + ls.latencies = {100, 104, 108, 102, 106}; + lp.stability_window = 2; + lp.stability_threshold = 0.1; + 
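    // With a stability window of 2, only the last two measurements (505 and
    // 515 infer/sec) should be examined, so the earlier outliers (1000.0 and
    // 1.0) do not prevent the window from being reported as stable.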
CHECK(TestInferenceProfiler::TestCheckWindowForStability(ls, lp) == true); + } } TEST_CASE("test check within threshold") @@ -271,6 +350,16 @@ TEST_CASE("test_determine_stability") ls.infer_per_sec = {500.0, 520.0, 510.0}; CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp) == true); } + + SUBCASE("test determine stability without latency check") + { + ls.infer_per_sec = {500.0, 520.0, 510.0}; + ls.latencies = {100, 106, 112}; + lp.stability_window = 3; + lp.stability_threshold = 0.1; + uint64_t latency_threshold_ms = 1; + CHECK(TestInferenceProfiler::TestDetermineStability(ls, lp, false) == true); + } } TEST_CASE("test_is_done_profiling") @@ -450,8 +539,8 @@ TEST_CASE("testing the GetMetricAveragePerGPU function") SUBCASE("all GPUs present") { - const std::map metric_1{{"gpu0", 0.45}, - {"gpu1", 0.23}}, + const std::map metric_1{ + {"gpu0", 0.45}, {"gpu1", 0.23}}, metric_2{{"gpu0", 0.52}, {"gpu1", 0.27}}, metric_3{{"gpu0", 0.56}, {"gpu1", 0.30}}; @@ -468,8 +557,8 @@ TEST_CASE("testing the GetMetricAveragePerGPU function") SUBCASE("missing one GPU from one metric") { - const std::map metric_1{{"gpu0", 0.45}, - {"gpu1", 0.23}}, + const std::map metric_1{ + {"gpu0", 0.45}, {"gpu1", 0.23}}, metric_2{{"gpu0", 0.52}}, metric_3{{"gpu0", 0.56}, {"gpu1", 0.30}}; const std::vector< @@ -563,4 +652,441 @@ TEST_CASE("testing the GetMetricFirstPerGPU function") } } +TEST_CASE("test the ReportPrometheusMetrics function") +{ + Metrics metrics{}; + std::stringstream captured_cout; + std::streambuf* old_cout{std::cout.rdbuf(captured_cout.rdbuf())}; + + SUBCASE("regular output") + { + metrics.gpu_utilization_per_gpu["gpu0"] = 0.45; + metrics.gpu_utilization_per_gpu["gpu1"] = 0.52; + + metrics.gpu_power_usage_per_gpu["gpu0"] = 70.0; + metrics.gpu_power_usage_per_gpu["gpu1"] = 84.5; + + metrics.gpu_memory_used_bytes_per_gpu["gpu0"] = 10000; + metrics.gpu_memory_used_bytes_per_gpu["gpu1"] = 12000; + + metrics.gpu_memory_total_bytes_per_gpu["gpu0"] = 100000; + metrics.gpu_memory_total_bytes_per_gpu["gpu1"] = 100000; + + cb::Error result{ReportPrometheusMetrics(metrics)}; + + std::cout.rdbuf(old_cout); + + CHECK(result.Err() == SUCCESS); + CHECK( + captured_cout.str() == + " Avg GPU Utilization:\n" + " gpu0 : 45%\n" + " gpu1 : 52%\n" + " Avg GPU Power Usage:\n" + " gpu0 : 70 watts\n" + " gpu1 : 84.5 watts\n" + " Max GPU Memory Usage:\n" + " gpu0 : 10000 bytes\n" + " gpu1 : 12000 bytes\n" + " Total GPU Memory:\n" + " gpu0 : 100000 bytes\n" + " gpu1 : 100000 bytes\n"); + } + + SUBCASE("too many GPUs") + { + const size_t num_gpus{17}; + for (size_t gpu_idx{0}; gpu_idx < num_gpus; gpu_idx++) { + const auto& gpu_key{"gpu" + std::to_string(gpu_idx)}; + metrics.gpu_utilization_per_gpu[gpu_key] = 0.5; + metrics.gpu_power_usage_per_gpu[gpu_key] = 75.5; + metrics.gpu_memory_used_bytes_per_gpu[gpu_key] = 12500; + metrics.gpu_memory_total_bytes_per_gpu[gpu_key] = 150000; + } + + cb::Error result{ReportPrometheusMetrics(metrics)}; + + std::cout.rdbuf(old_cout); + + CHECK(result.Err() == SUCCESS); + CHECK( + captured_cout.str() == + "Too many GPUs on system to print out individual Prometheus metrics, " + "use the CSV output feature to see metrics.\n"); + } +} + +TEST_CASE("InferenceProfiler: Test SummarizeOverhead") +{ + TestInferenceProfiler tip{}; + PerfStatus status; + SUBCASE("normal") + { + tip.SummarizeOverhead(100, 63, status); + CHECK(status.overhead_pct == doctest::Approx(37)); + } + SUBCASE("normal 2") + { + tip.SummarizeOverhead(234, 56, status); + CHECK(status.overhead_pct == doctest::Approx(76.068)); + } 
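  // The expected values above imply overhead_pct = 100 * (window - idle) /
  // window, e.g. 100 * (234 - 56) / 234 ~= 76.068. The overflow case below
  // checks that the percentage is clamped to 0 when idle time exceeds the
  // window duration.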
+ SUBCASE("overflow") + { + tip.SummarizeOverhead(100, 101, status); + CHECK(status.overhead_pct == doctest::Approx(0)); + } +} + +TEST_CASE( + "summarize_send_request_rate: testing the SummarizeSendRequestRate " + "function") +{ + TestInferenceProfiler tip{}; + PerfStatus perf_status; + + SUBCASE("invalid zero window duration") + { + double window_duration_s{0.0}; + size_t num_sent_requests{0}; + CHECK_THROWS_WITH_AS( + tip.SummarizeSendRequestRate( + window_duration_s, num_sent_requests, perf_status), + "window_duration_s must be positive", std::runtime_error); + } + + SUBCASE("invalid negative window duration") + { + double window_duration_s{-1.0}; + size_t num_sent_requests{0}; + CHECK_THROWS_WITH_AS( + tip.SummarizeSendRequestRate( + window_duration_s, num_sent_requests, perf_status), + "window_duration_s must be positive", std::runtime_error); + } + + SUBCASE("regular case") + { + double window_duration_s{2.0}; + size_t num_sent_requests{100}; + tip.SummarizeSendRequestRate( + window_duration_s, num_sent_requests, perf_status); + CHECK(perf_status.send_request_rate == doctest::Approx(50)); + } +} + +TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()") +{ + TestInferenceProfiler tip{}; + cb::ModelIdentifier model_identifier; + cb::ModelStatistics old_stats; + cb::ModelStatistics new_stats; + old_stats.queue_count_ = 1; + new_stats.queue_count_ = 2; + + int64_t expected_model_version; + bool expect_warning = false; + bool expect_exception = false; + + std::map start_stats_map; + std::map end_stats_map; + + SUBCASE("One entry - unspecified - valid and in start") + { + model_identifier = {"ModelA", ""}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + expected_model_version = 3; + } + SUBCASE("One entry - unspecified - valid and not in start") + { + model_identifier = {"ModelA", ""}; + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + expected_model_version = 3; + } + SUBCASE("One entry - unspecified - invalid") + { + model_identifier = {"ModelA", ""}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + expect_exception = true; + expected_model_version = -1; + } + SUBCASE("One entry - match") + { + model_identifier = {"ModelA", "3"}; + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + expected_model_version = 3; + } + SUBCASE("One entry - miss") + { + model_identifier = {"ModelA", "2"}; + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + expect_exception = true; + expected_model_version = -1; + } + SUBCASE("Two entries - unspecified case 1") + { + model_identifier = {"ModelA", ""}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + start_stats_map.insert({{"ModelA", "4"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + end_stats_map.insert({{"ModelA", "4"}, old_stats}); + expected_model_version = 3; + } + SUBCASE("Two entries - unspecified case 2") + { + model_identifier = {"ModelA", ""}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + start_stats_map.insert({{"ModelA", "4"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "4"}, new_stats}); + expected_model_version = 4; + } + SUBCASE("Two entries - unspecified case 3") + { + model_identifier = {"ModelA", ""}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + start_stats_map.insert({{"ModelA", "4"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + 
end_stats_map.insert({{"ModelA", "4"}, new_stats}); + expected_model_version = 4; + expect_warning = 1; + } + SUBCASE("Two entries - specified hit") + { + model_identifier = {"ModelA", "3"}; + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "4"}, old_stats}); + expected_model_version = 3; + } + SUBCASE("Two entries - specified miss") + { + model_identifier = {"ModelA", "2"}; + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "4"}, old_stats}); + expected_model_version = -1; + expect_exception = true; + } + + SUBCASE("One entry - version -1 - valid and in start") + { + model_identifier = {"ModelA", "-1"}; + start_stats_map.insert({{"ModelA", "3"}, old_stats}); + end_stats_map.insert({{"ModelA", "3"}, new_stats}); + cb::Error status = tip.SetTopLevelResponseCaching(true); + CHECK(status.IsOk()); + expected_model_version = -1; + } + + SUBCASE("One entry - version -1 - not valid") + { + model_identifier = {"ModelA", "-1"}; + end_stats_map.insert({{"ModelA", "3"}, old_stats}); + cb::Error status = tip.SetTopLevelResponseCaching(false); + CHECK(status.IsOk()); + expected_model_version = -1; + expect_exception = true; + } + + std::stringstream captured_cerr; + std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf()); + + int64_t result_model_version; + cb::Error result; + result = tip.DetermineStatsModelVersion( + model_identifier, start_stats_map, end_stats_map, &result_model_version); + + CHECK(result_model_version == expected_model_version); + CHECK(result.IsOk() != expect_exception); + CHECK(captured_cerr.str().empty() != expect_warning); + + std::cerr.rdbuf(old); +} + +TEST_CASE( + "valid_latency_measurement: testing the ValidLatencyMeasurement function") +{ + MockInferenceProfiler mock_inference_profiler{}; + + SUBCASE("testing logic relevant to response throughput metric") + { + auto clock_epoch{std::chrono::time_point()}; + + auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(1)}; + auto response1_timestamp{clock_epoch + std::chrono::nanoseconds(2)}; + auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(3)}; + auto request_record1{RequestRecord( + request1_timestamp, + std::vector>{ + response1_timestamp, response2_timestamp}, + {}, {}, 0, false, 0, false)}; + + auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)}; + RequestRecord request_record2{}; + size_t expected_response_count{0}; + + SUBCASE("second request has three data responses") + { + auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)}; + auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)}; + auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)}; + request_record2 = RequestRecord( + request2_timestamp, + std::vector>{ + response3_timestamp, response4_timestamp, response5_timestamp}, + {}, {}, 0, false, 0, false); + expected_response_count = 5; + } + SUBCASE("second request has two data responses and one null response") + { + auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(5)}; + auto response4_timestamp{clock_epoch + std::chrono::nanoseconds(6)}; + auto response5_timestamp{clock_epoch + std::chrono::nanoseconds(7)}; + request_record2 = RequestRecord( + request2_timestamp, + std::vector>{ + response3_timestamp, response4_timestamp, response5_timestamp}, + {}, {}, 0, false, 0, true); + expected_response_count = 4; + } + SUBCASE("second request has one null response") + { + request_record2 = RequestRecord( + request2_timestamp, + std::vector>{}, {}, + 
{}, 0, false, 0, true); + expected_response_count = 2; + } + + mock_inference_profiler.all_request_records_ = { + request_record1, request_record2}; + + const std::pair valid_range{ + std::make_pair(0, UINT64_MAX)}; + size_t valid_sequence_count{0}; + size_t delayed_request_count{0}; + std::vector valid_latencies{}; + size_t response_count{0}; + std::vector valid_requests{}; + + mock_inference_profiler.ValidLatencyMeasurement( + valid_range, valid_sequence_count, delayed_request_count, + &valid_latencies, response_count, valid_requests); + + CHECK(response_count == expected_response_count); + } + SUBCASE("testing logic relevant to valid request output") + { + auto clock_epoch{std::chrono::time_point()}; + + auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(1)}; + auto response1_timestamp{clock_epoch + std::chrono::nanoseconds(2)}; + auto request_record1{RequestRecord( + request1_timestamp, + std::vector>{ + response1_timestamp}, + {}, {}, 0, false, 0, false)}; + + auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)}; + auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(4)}; + auto request_record2{RequestRecord( + request2_timestamp, + std::vector>{ + response2_timestamp}, + {}, {}, 0, false, 0, false)}; + + auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(5)}; + auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(6)}; + auto request_record3{RequestRecord( + request3_timestamp, + std::vector>{ + response3_timestamp}, + {}, {}, 0, false, 0, false)}; + + mock_inference_profiler.all_request_records_ = { + request_record1, request_record2, request_record3}; + + const std::pair valid_range{std::make_pair(0, 4)}; + size_t valid_sequence_count{0}; + size_t delayed_request_count{0}; + std::vector valid_latencies{}; + size_t response_count{0}; + std::vector valid_requests{}; + + mock_inference_profiler.ValidLatencyMeasurement( + valid_range, valid_sequence_count, delayed_request_count, + &valid_latencies, response_count, valid_requests); + + CHECK(valid_requests.size() == 2); + CHECK(valid_requests[0].start_time_ == request1_timestamp); + CHECK(valid_requests[1].start_time_ == request2_timestamp); + } +} + +TEST_CASE( + "merge_perf_status_reports: testing the MergePerfStatusReports function") +{ + MockInferenceProfiler mock_inference_profiler{}; + + SUBCASE("testing logic relevant to response throughput metric") + { + PerfStatus perf_status1{}; + perf_status1.client_stats.response_count = 8; + perf_status1.client_stats.duration_ns = 2000000000; + + PerfStatus perf_status2{}; + perf_status2.client_stats.response_count = 10; + perf_status2.client_stats.duration_ns = 4000000000; + + std::deque perf_status{perf_status1, perf_status2}; + PerfStatus summary_status{}; + + cb::Error error{}; + + EXPECT_CALL( + mock_inference_profiler, MergeServerSideStats(testing::_, testing::_)) + .WillOnce(testing::Return(cb::Error::Success)); + EXPECT_CALL( + mock_inference_profiler, SummarizeLatency(testing::_, testing::_)) + .WillOnce(testing::Return(cb::Error::Success)); + + error = mock_inference_profiler.MergePerfStatusReports( + perf_status, summary_status); + + REQUIRE(error.IsOk() == true); + CHECK(summary_status.client_stats.response_count == 18); + CHECK( + summary_status.client_stats.responses_per_sec == doctest::Approx(3.0)); + } +} + +TEST_CASE("summarize_client_stat: testing the SummarizeClientStat function") +{ + MockInferenceProfiler mock_inference_profiler{}; + + SUBCASE("testing logic relevant to response throughput metric") + { + 
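    // 8 responses over a 2-second window should be summarized as 4 responses
    // per second (see the duration_ns and response_count values below).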
mock_inference_profiler.parser_ = std::make_shared(); + mock_inference_profiler.manager_ = std::make_unique(); + + const cb::InferStat start_stat{}; + const cb::InferStat end_stat{}; + const uint64_t duration_ns{2000000000}; + const size_t valid_request_count{0}; + const size_t delayed_request_count{0}; + const size_t valid_sequence_count{0}; + const size_t response_count{8}; + PerfStatus summary{}; + + cb::Error error{}; + + error = mock_inference_profiler.SummarizeClientStat( + start_stat, end_stat, duration_ns, valid_request_count, + delayed_request_count, valid_sequence_count, response_count, summary); + + REQUIRE(error.IsOk() == true); + CHECK(summary.client_stats.response_count == 8); + CHECK(summary.client_stats.responses_per_sec == doctest::Approx(4.0)); + } +} }} // namespace triton::perfanalyzer diff --git a/test_load_manager.cc b/test_load_manager.cc new file mode 100644 index 00000000..3908374e --- /dev/null +++ b/test_load_manager.cc @@ -0,0 +1,460 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "command_line_parser.h" +#include "doctest.h" +#include "load_manager.h" +#include "test_load_manager_base.h" + +namespace cb = triton::perfanalyzer::clientbackend; + +namespace triton { namespace perfanalyzer { + +namespace { + +bool +operator==(const RequestRecord& lhs, const RequestRecord& rhs) +{ + return std::tie( + lhs.start_time_, lhs.response_timestamps_, lhs.request_inputs_, + lhs.response_outputs_, lhs.sequence_end_, lhs.delayed_, + lhs.sequence_id_, lhs.has_null_last_response_) == + std::tie( + rhs.start_time_, rhs.response_timestamps_, rhs.request_inputs_, + rhs.response_outputs_, rhs.sequence_end_, rhs.delayed_, + rhs.sequence_id_, rhs.has_null_last_response_); +} + +} // namespace + +class TestLoadManager : public TestLoadManagerBase, public LoadManager { + public: + ~TestLoadManager() = default; + TestLoadManager( + PerfAnalyzerParameters params, bool is_sequence_model = false, + bool is_decoupled_model = false) + : TestLoadManagerBase(params, is_sequence_model, is_decoupled_model), + LoadManager( + params.async, params.streaming, params.batch_size, + params.max_threads, params.shared_memory_type, + params.output_shm_size, GetParser(), GetFactory(), + params.request_parameters) + { + } + + std::vector>& threads_stat_{ + LoadManager::threads_stat_}; + + /// Test the public function CheckHealth + /// + /// It will return a bad result if any of the thread stats + /// have a bad status or cb_status + /// + void TestCheckHealth() + { + auto good = std::make_shared(); + good->status_ = cb::Error::Success; + good->cb_status_ = cb::Error::Success; + + auto bad_status = std::make_shared(); + bad_status->status_ = cb::Error::Failure; + bad_status->cb_status_ = cb::Error::Success; + + auto bad_cb_status = std::make_shared(); + bad_cb_status->status_ = cb::Error::Success; + bad_cb_status->cb_status_ = cb::Error::Failure; + + threads_stat_.clear(); + bool expect_ok = true; + + SUBCASE("Empty") + { + expect_ok = true; + } + SUBCASE("Good") + { + // Good entries: expect OK + threads_stat_.push_back(good); + threads_stat_.push_back(good); + expect_ok = true; + } + SUBCASE("BadStatus") + { + // Bad Status: expect not OK + threads_stat_.push_back(good); + threads_stat_.push_back(bad_status); + expect_ok = false; + } + SUBCASE("BadCbStatus") + { + // Bad cb_Status: expect not OK + threads_stat_.push_back(bad_cb_status); + threads_stat_.push_back(good); + expect_ok = false; + } + SUBCASE("BadBothStatus") + { + threads_stat_.push_back(bad_status); + threads_stat_.push_back(good); + threads_stat_.push_back(bad_cb_status); + expect_ok = false; + } + + CHECK(CheckHealth().IsOk() == expect_ok); + } + + /// Test the public function SwapRequestRecords + /// + /// It will gather all request records from the thread_stats + /// and return them, and clear the thread_stats request records + /// + void TestSwapRequestRecords() + { + using time_point = std::chrono::time_point; + using ns = std::chrono::nanoseconds; + auto request_record1 = RequestRecord( + time_point(ns(1)), std::vector{time_point(ns(2))}, {}, {}, + 0, false, 0, false); + auto request_record2 = RequestRecord( + time_point(ns(3)), std::vector{time_point(ns(4))}, {}, {}, + 0, false, 0, false); + auto request_record3 = RequestRecord( + time_point(ns(5)), std::vector{time_point(ns(6))}, {}, {}, + 0, false, 0, false); + + std::vector source_request_records; + + SUBCASE("No threads") + { + auto ret = SwapRequestRecords(source_request_records); + CHECK(source_request_records.size() == 0); + CHECK(ret.IsOk() == true); + } + 
SUBCASE("Source has request records") + { + // Any request records in the vector passed in to SwapRequestRecords will + // be dropped on the floor + // + source_request_records.push_back(request_record1); + auto ret = SwapRequestRecords(source_request_records); + CHECK(source_request_records.size() == 0); + CHECK(ret.IsOk() == true); + } + SUBCASE("One thread") + { + auto stat1 = std::make_shared(); + stat1->request_records_.push_back(request_record1); + stat1->request_records_.push_back(request_record2); + stat1->request_records_.push_back(request_record3); + threads_stat_.push_back(stat1); + + CHECK(stat1->request_records_.size() == 3); + auto ret = SwapRequestRecords(source_request_records); + CHECK(stat1->request_records_.size() == 0); + + REQUIRE(source_request_records.size() == 3); + CHECK(source_request_records[0] == request_record1); + CHECK(source_request_records[1] == request_record2); + CHECK(source_request_records[2] == request_record3); + CHECK(ret.IsOk() == true); + } + SUBCASE("Multiple threads") + { + auto stat1 = std::make_shared(); + stat1->request_records_.push_back(request_record2); + + auto stat2 = std::make_shared(); + stat2->request_records_.push_back(request_record1); + stat2->request_records_.push_back(request_record3); + + threads_stat_.push_back(stat1); + threads_stat_.push_back(stat2); + + CHECK(stat1->request_records_.size() == 1); + CHECK(stat2->request_records_.size() == 2); + auto ret = SwapRequestRecords(source_request_records); + CHECK(stat1->request_records_.size() == 0); + CHECK(stat2->request_records_.size() == 0); + + REQUIRE(source_request_records.size() == 3); + CHECK(source_request_records[0] == request_record2); + CHECK(source_request_records[1] == request_record1); + CHECK(source_request_records[2] == request_record3); + CHECK(ret.IsOk() == true); + } + } + + /// Test the public function GetAccumulatedClientStat + /// + /// It will accumulate all contexts_stat data from all threads_stat + /// + void TestGetAccumulatedClientStat() + { + cb::InferStat result_stat; + + SUBCASE("No threads") + { + auto ret = GetAccumulatedClientStat(&result_stat); + CHECK(result_stat.completed_request_count == 0); + CHECK(result_stat.cumulative_total_request_time_ns == 0); + CHECK(result_stat.cumulative_send_time_ns == 0); + CHECK(result_stat.cumulative_receive_time_ns == 0); + CHECK(ret.IsOk() == true); + } + SUBCASE("One thread one context stat") + { + auto stat1 = std::make_shared(); + stat1->contexts_stat_.push_back(cb::InferStat()); + stat1->contexts_stat_[0].completed_request_count = 2; + stat1->contexts_stat_[0].cumulative_total_request_time_ns = 3; + stat1->contexts_stat_[0].cumulative_send_time_ns = 4; + stat1->contexts_stat_[0].cumulative_receive_time_ns = 5; + threads_stat_.push_back(stat1); + + auto ret = GetAccumulatedClientStat(&result_stat); + CHECK(result_stat.completed_request_count == 2); + CHECK(result_stat.cumulative_total_request_time_ns == 3); + CHECK(result_stat.cumulative_send_time_ns == 4); + CHECK(result_stat.cumulative_receive_time_ns == 5); + CHECK(ret.IsOk() == true); + } + SUBCASE("Multiple thread multiple contexts") + { + auto stat1 = std::make_shared(); + stat1->contexts_stat_.push_back(cb::InferStat()); + stat1->contexts_stat_.push_back(cb::InferStat()); + stat1->contexts_stat_[0].completed_request_count = 2; + stat1->contexts_stat_[0].cumulative_total_request_time_ns = 3; + stat1->contexts_stat_[0].cumulative_send_time_ns = 4; + stat1->contexts_stat_[0].cumulative_receive_time_ns = 5; + stat1->contexts_stat_[1].completed_request_count 
= 3; + stat1->contexts_stat_[1].cumulative_total_request_time_ns = 4; + stat1->contexts_stat_[1].cumulative_send_time_ns = 5; + stat1->contexts_stat_[1].cumulative_receive_time_ns = 6; + threads_stat_.push_back(stat1); + + auto stat2 = std::make_shared(); + stat2->contexts_stat_.push_back(cb::InferStat()); + stat2->contexts_stat_.push_back(cb::InferStat()); + stat2->contexts_stat_[0].completed_request_count = 7; + stat2->contexts_stat_[0].cumulative_total_request_time_ns = 8; + stat2->contexts_stat_[0].cumulative_send_time_ns = 9; + stat2->contexts_stat_[0].cumulative_receive_time_ns = 10; + stat2->contexts_stat_[1].completed_request_count = 11; + stat2->contexts_stat_[1].cumulative_total_request_time_ns = 12; + stat2->contexts_stat_[1].cumulative_send_time_ns = 13; + stat2->contexts_stat_[1].cumulative_receive_time_ns = 14; + threads_stat_.push_back(stat2); + + auto ret = GetAccumulatedClientStat(&result_stat); + // 2 + 3 + 7 + 11 + // + CHECK(result_stat.completed_request_count == 23); + // 3 + 4 + 8 + 12 + // + CHECK(result_stat.cumulative_total_request_time_ns == 27); + // 4 + 5 + 9 + 13 + // + CHECK(result_stat.cumulative_send_time_ns == 31); + // 5 + 6 + 10 + 14 + // + CHECK(result_stat.cumulative_receive_time_ns == 35); + + CHECK(ret.IsOk() == true); + } + } + + /// Test the public function CountCollectedRequests + /// + /// It will count all request records in the thread_stats (and not modify + /// the thread_stats in any way) + /// + void TestCountCollectedRequests() + { + using time_point = std::chrono::time_point; + using ns = std::chrono::nanoseconds; + auto request_record1 = RequestRecord( + time_point(ns(1)), std::vector{time_point(ns(2))}, {}, {}, + 0, false, 0, false); + auto request_record2 = RequestRecord( + time_point(ns(3)), std::vector{time_point(ns(4))}, {}, {}, + 0, false, 0, false); + auto request_record3 = RequestRecord( + time_point(ns(5)), std::vector{time_point(ns(6))}, {}, {}, + 0, false, 0, false); + + SUBCASE("No threads") + { + CHECK(CountCollectedRequests() == 0); + } + SUBCASE("One thread") + { + auto stat1 = std::make_shared(); + stat1->request_records_.push_back(request_record1); + stat1->request_records_.push_back(request_record2); + stat1->request_records_.push_back(request_record3); + threads_stat_.push_back(stat1); + + CHECK(stat1->request_records_.size() == 3); + CHECK(CountCollectedRequests() == 3); + CHECK(stat1->request_records_.size() == 3); + } + SUBCASE("Multiple threads") + { + auto stat1 = std::make_shared(); + stat1->request_records_.push_back(request_record2); + + auto stat2 = std::make_shared(); + stat2->request_records_.push_back(request_record1); + stat2->request_records_.push_back(request_record3); + + threads_stat_.push_back(stat1); + threads_stat_.push_back(stat2); + + CHECK(stat1->request_records_.size() == 1); + CHECK(stat2->request_records_.size() == 2); + CHECK(CountCollectedRequests() == 3); + CHECK(stat1->request_records_.size() == 1); + CHECK(stat2->request_records_.size() == 2); + } + } + + void TestIdle() + { + auto stat1 = std::make_shared(); + auto stat2 = std::make_shared(); + threads_stat_.push_back(stat1); + threads_stat_.push_back(stat2); + + SUBCASE("All active") + { + // If multiple threads are active, their idle times are averaged + stat1->idle_timer.idle_ns_ = 5; + stat2->idle_timer.idle_ns_ = 7; + CHECK(GetIdleTime() == 6); + ResetIdleTime(); + CHECK(GetIdleTime() == 0); + } + + SUBCASE("One inactive") + { + // If a thread has no idle time, it is considered inactive and not + // factored in to the average + 
stat1->idle_timer.idle_ns_ = 0; + stat2->idle_timer.idle_ns_ = 7; + CHECK(GetIdleTime() == 7); + ResetIdleTime(); + CHECK(GetIdleTime() == 0); + } + } +}; + +TEST_CASE("load_manager_check_health: Test the public function CheckHealth()") +{ + TestLoadManager tlm(PerfAnalyzerParameters{}); + tlm.TestCheckHealth(); +} + +TEST_CASE( + "load_manager_swap_request_records: Test the public function " + "SwapRequestRecords()") +{ + TestLoadManager tlm(PerfAnalyzerParameters{}); + tlm.TestSwapRequestRecords(); +} + +TEST_CASE( + "load_manager_get_accumulated_client_stat: Test the public function " + "GetAccumulatedClientStat()") +{ + TestLoadManager tlm(PerfAnalyzerParameters{}); + tlm.TestGetAccumulatedClientStat(); +} + +TEST_CASE( + "load_manager_count_collected_requests: Test the public function " + "CountCollectedRequests()") +{ + TestLoadManager tlm(PerfAnalyzerParameters{}); + tlm.TestCountCollectedRequests(); +} + +TEST_CASE("load_manager_batch_size: Test the public function BatchSize()") +{ + PerfAnalyzerParameters params; + + SUBCASE("batch size 0") + { + params.batch_size = 0; + } + SUBCASE("batch size 1") + { + params.batch_size = 1; + } + SUBCASE("batch size 4") + { + params.batch_size = 4; + } + + TestLoadManager tlm(params); + CHECK(tlm.BatchSize() == params.batch_size); +} + +TEST_CASE("load_manager: Test public idle time functions") +{ + PerfAnalyzerParameters params; + TestLoadManager tlm(params); + tlm.TestIdle(); +} + +TEST_CASE( + "send_request_rate_load_manager: testing the GetAndResetNumSentRequests " + "function") +{ + PerfAnalyzerParameters params{}; + + TestLoadManager tlm(params); + + std::shared_ptr thread_stat_1{std::make_shared()}; + std::shared_ptr thread_stat_2{std::make_shared()}; + + std::chrono::steady_clock::time_point start_time{ + std::chrono::steady_clock::time_point::min()}; + + thread_stat_1->num_sent_requests_ = 6; + thread_stat_2->num_sent_requests_ = 5; + + tlm.threads_stat_ = {thread_stat_1, thread_stat_2}; + + const size_t result{tlm.GetAndResetNumSentRequests()}; + + CHECK(result == 11); + CHECK(tlm.threads_stat_.size() == 2); + CHECK(tlm.threads_stat_[0]->num_sent_requests_ == 0); + CHECK(tlm.threads_stat_[1]->num_sent_requests_ == 0); +} + +}} // namespace triton::perfanalyzer diff --git a/test_load_manager_base.h b/test_load_manager_base.h new file mode 100644 index 00000000..6bbdf6d2 --- /dev/null +++ b/test_load_manager_base.h @@ -0,0 +1,305 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include + +#include "command_line_parser.h" +#include "doctest.h" +#include "mock_client_backend.h" +#include "mock_data_loader.h" +#include "mock_model_parser.h" +#include "sequence_manager.h" + +namespace cb = triton::perfanalyzer::clientbackend; + +namespace triton { namespace perfanalyzer { + +// Struct to hold the mock pieces to ingest custom json data +struct MockInputPipeline { + MockInputPipeline( + std::shared_ptr mmp, std::shared_ptr mdl) + : mock_model_parser_(mmp), mock_data_loader_(mdl) + { + } + std::shared_ptr mock_model_parser_; + std::shared_ptr mock_data_loader_; +}; + +/// Helper base class to be inherited when testing any Load Manager class +/// +class TestLoadManagerBase { + public: + TestLoadManagerBase() = default; + TestLoadManagerBase( + PerfAnalyzerParameters params, bool is_sequence_model, + bool is_decoupled_model) + : params_(params) + { + stats_ = std::make_shared(); + factory_ = std::make_shared(stats_); + parser_ = std::make_shared( + is_sequence_model, is_decoupled_model); + } + + ~TestLoadManagerBase() + { + // Reset early_exit in case any test sets it to true during execution. + early_exit = false; + } + + // Helper function to process custom json data in testing + // Creates a model tensor to pass to a mock parser which is consumed by the + // mock data loader + static MockInputPipeline ProcessCustomJsonData( + const std::string& json_str, const bool is_sequence_model = false) + { + std::shared_ptr mmp{ + std::make_shared(is_sequence_model, false)}; + ModelTensor model_tensor{}; + model_tensor.datatype_ = "INT32"; + model_tensor.is_optional_ = false; + model_tensor.is_shape_tensor_ = false; + model_tensor.name_ = "INPUT0"; + model_tensor.shape_ = {1}; + mmp->inputs_ = std::make_shared(); + (*mmp->inputs_)[model_tensor.name_] = model_tensor; + + std::shared_ptr mdl{std::make_shared()}; + mdl->ReadDataFromStr(json_str, mmp->Inputs(), mmp->Outputs()); + return MockInputPipeline{mmp, mdl}; + } + + // Set up all combinations of parameters for sequence testing + // + static PerfAnalyzerParameters GetSequenceTestParams() + { + PerfAnalyzerParameters params; + bool is_async; + + SUBCASE("Async sequence") + { + is_async = true; + params = GetSequenceTestParamsHelper(is_async); + } + SUBCASE("Sync sequence") + { + is_async = false; + params = GetSequenceTestParamsHelper(is_async); + } + return params; + } + + void CheckInferType() + { + auto stats = GetStats(); + + if (params_.async) { + if (params_.streaming) { + CHECK(stats->num_infer_calls == 0); + CHECK(stats->num_async_infer_calls == 0); + CHECK(stats->num_async_stream_infer_calls > 0); + CHECK(stats->num_start_stream_calls > 0); + } else { + CHECK(stats->num_infer_calls == 0); + CHECK(stats->num_async_infer_calls > 0); + CHECK(stats->num_async_stream_infer_calls == 0); + CHECK(stats->num_start_stream_calls == 0); + } + } else { + if (params_.streaming) { + CHECK(stats->num_infer_calls > 0); + CHECK(stats->num_async_infer_calls == 0); + 
CHECK(stats->num_async_stream_infer_calls == 0); + CHECK(stats->num_start_stream_calls > 0); + } else { + CHECK(stats->num_infer_calls > 0); + CHECK(stats->num_async_infer_calls == 0); + CHECK(stats->num_async_stream_infer_calls == 0); + CHECK(stats->num_start_stream_calls == 0); + } + } + } + + + void CheckSharedMemory( + const cb::MockClientStats::SharedMemoryStats& expected_stats) + { + auto actual_stats = GetStats(); + CHECK(expected_stats == actual_stats->memory_stats); + } + + void CheckSequences(uint64_t expected_num_seq) + { + auto stats = GetStats(); + + // Make sure no live sequences remain + CHECK(stats->sequence_status.live_seq_ids_to_length.size() == 0); + + // Make sure all seq IDs are within range + // + for (auto seq_id : stats->sequence_status.used_seq_ids) { + CHECK(seq_id >= params_.start_sequence_id); + CHECK(seq_id <= params_.start_sequence_id + params_.sequence_id_range); + } + + // Make sure that we had the correct number of concurrently live sequences + // + // If the sequence length is only 1 then there is nothing to check because + // there are never any overlapping requests -- they always immediately exit + // + if (params_.sequence_length != 1) { + expected_num_seq = std::min(expected_num_seq, params_.sequence_id_range); + CHECK(expected_num_seq == stats->sequence_status.max_live_seq_count); + } + + // Make sure that the length of each sequence is as expected + // + // All but X of them should be within 20% (The code explicitly has a 20% + // slop) of the requested sequence length, where X is the number of + // sequences (This is due to the shutdown of sequences at the end that will + // create shorter than expected sequences) + // + auto num_values = stats->sequence_status.seq_lengths.size(); + auto max_len = params_.sequence_length * 1.2; + auto min_len = params_.sequence_length * 0.8; + auto num_allowed_to_be_below_min_len = expected_num_seq; + auto num_below_min_len = 0; + + for (size_t i = 0; i < num_values; i++) { + auto len = stats->sequence_status.seq_lengths[i]; + + CHECK(len <= max_len); + if (len < min_len) { + num_below_min_len++; + } + } + CHECK(num_below_min_len <= num_allowed_to_be_below_min_len); + } + + std::shared_ptr stats_; + + protected: + PerfAnalyzerParameters params_; + std::shared_ptr factory_; + std::shared_ptr parser_; + + const std::shared_ptr& GetParser() { return parser_; } + const std::shared_ptr& GetFactory() + { + return factory_; + } + std::shared_ptr GetStats() { return stats_; } + void ResetStats() { stats_->Reset(); } + + // Verifies that the number of inferences for each sequence is n or n+1. 
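+ // (for example, per-sequence counts of {4, 5, 4, 5} are treated as
+ // balanced, while counts of {3, 6} would trip the CHECK_MESSAGE below)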
+ // + void CheckSequenceBalance() + { + auto first_value = -1; + auto second_value = -1; + + for (auto seq : stats_->sequence_status.seq_ids_to_count) { + auto count = seq.second; + // set first possible value for seqs + if (first_value == -1) { + first_value = count; + continue; + } + // set second possible value for seqs count + if (second_value == -1) { + if (count == first_value + 1 || count == first_value - 1) { + second_value = count; + continue; + } else if (first_value == count) { + continue; + } + } + + if (count != first_value || count != second_value) { + std::stringstream os; + os << "Sequence request counts were not balanced: "; + for (auto x : stats_->sequence_status.seq_ids_to_count) { + os << x.second << ","; + } + CHECK_MESSAGE( + (count == first_value || count == second_value), os.str()); + break; + } + } + } + + static PerfAnalyzerParameters GetSequenceTestParamsHelper(bool is_async) + { + PerfAnalyzerParameters params; + + params.async = is_async; + + // Generally we want short sequences for testing + // so we can hit the corner cases more often + // + params.sequence_length = 4; + params.max_concurrency = 8; + params.max_threads = 8; + + SUBCASE("Normal") {} + SUBCASE("sequence IDs test 1") + { + params.start_sequence_id = 1; + params.sequence_id_range = 3; + } + SUBCASE("sequence IDs test 2") + { + params.start_sequence_id = 17; + params.sequence_id_range = 8; + } + SUBCASE("num_of_sequences 1") + { + params.num_of_sequences = 1; + } + SUBCASE("less threads than seq") + { + params.num_of_sequences = 12; + } + SUBCASE("num_of_sequences 8") + { + params.num_of_sequences = 8; + // Make sequences long so we actually get 8 in flight at a time + params.sequence_length = 20; + } + SUBCASE("sequence_length 1") + { + params.sequence_length = 1; + } + SUBCASE("sequence_length 10") + { + params.sequence_length = 10; + } + return params; + } +}; +}} // namespace triton::perfanalyzer diff --git a/test_metrics_manager.cc b/test_metrics_manager.cc index 5a63cfbe..b6fb1eb7 100644 --- a/test_metrics_manager.cc +++ b/test_metrics_manager.cc @@ -27,6 +27,7 @@ #include #include #include + #include "doctest.h" #include "metrics_manager.h" diff --git a/test_model_parser.cc b/test_model_parser.cc index b16dc2d3..dabf8c9e 100644 --- a/test_model_parser.cc +++ b/test_model_parser.cc @@ -25,31 +25,90 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include + #include + #include "client_backend/client_backend.h" #include "constants.h" #include "doctest.h" -#include "model_parser.h" +#include "mock_client_backend.h" +#include "mock_model_parser.h" + +namespace cb = triton::perfanalyzer::clientbackend; namespace triton { namespace perfanalyzer { class TestModelParser { public: - static cb::Error GetInt(const rapidjson::Value& value, int64_t* integer_value) + constexpr static const char* no_batching = + R"({ "name": "NoBatchingModel", "platform":"not_ensemble" })"; + + constexpr static const char* seq_batching = + R"({ "name": "SeqBatchingModel", "platform":"not_ensemble", "sequence_batching":{} })"; + + constexpr static const char* dyn_batching = + R"({ "name": "DynBatchingModel", "platform":"not_ensemble", "dynamic_batching":{} })"; + + constexpr static const char* ensemble = R"({ + "name": "EnsembleModel", + "platform": "ensemble", + "ensemble_scheduling": { + "step": [{ + "model_name": "ModelA", + "model_version": 2 + }, + { + "model_name": "ModelB", + "model_version": -1 + } + ] + } + })"; + + constexpr static const char* nested_ensemble = R"({ + "name": "ModelA", + "platform": "ensemble", + "ensemble_scheduling": { + "step": [{ + "model_name": "ModelC", + "model_version": -1 + }, + { + "model_name": "ModelD", + "model_version": -1 + } + ] + } + })"; + + static cb::Error SetJsonPtrNoSeq(rapidjson::Document* model_config) { - ModelParser mp{}; - return mp.GetInt(value, integer_value); - } + model_config->Parse(no_batching); + return cb::Error::Success; + }; + + static cb::Error SetJsonPtrYesSeq(rapidjson::Document* model_config) + { + model_config->Parse(seq_batching); + return cb::Error::Success; + }; + + static cb::Error SetJsonPtrNestedEnsemble(rapidjson::Document* model_config) + { + model_config->Parse(nested_ensemble); + return cb::Error::Success; + }; }; -TEST_CASE("testing the GetInt function") +TEST_CASE("ModelParser: testing the GetInt function") { int64_t integer_value{0}; + MockModelParser mmp; SUBCASE("valid string") { rapidjson::Value value("100"); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; CHECK(result.Err() == SUCCESS); CHECK(integer_value == 100); } @@ -57,7 +116,7 @@ TEST_CASE("testing the GetInt function") SUBCASE("invalid string, alphabet") { rapidjson::Value value("abc"); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; CHECK(result.Err() == GENERIC_ERROR); CHECK(result.Message() == "unable to convert 'abc' to integer"); CHECK(integer_value == 0); @@ -66,7 +125,7 @@ TEST_CASE("testing the GetInt function") SUBCASE("invalid string, number out of range") { rapidjson::Value value("9223372036854775808"); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; CHECK(result.Err() == GENERIC_ERROR); CHECK( result.Message() == @@ -77,7 +136,7 @@ TEST_CASE("testing the GetInt function") SUBCASE("valid int, lowest Int64") { rapidjson::Value value(2147483648); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; CHECK(result.Err() == SUCCESS); CHECK(integer_value == 2147483648); } @@ -85,7 +144,7 @@ TEST_CASE("testing the GetInt function") SUBCASE("valid int, highest Int32") { rapidjson::Value value(2147483647); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; 
CHECK(result.Err() == SUCCESS); CHECK(integer_value == 2147483647); } @@ -93,11 +152,214 @@ TEST_CASE("testing the GetInt function") SUBCASE("invalid floating point") { rapidjson::Value value(100.1); - cb::Error result{TestModelParser::GetInt(value, &integer_value)}; + cb::Error result{mmp.GetInt(value, &integer_value)}; CHECK(result.Err() == GENERIC_ERROR); CHECK(result.Message() == "failed to parse the integer value"); CHECK(integer_value == 0); } } +TEST_CASE( + "ModelParser: DetermineComposingModelMap" * + doctest::description( + "This test confirms that the composing model map will be correctly " + "populated by DetermineComposingModelMap()")) +{ + std::shared_ptr stats = + std::make_shared(); + std::unique_ptr mock_backend = + std::make_unique(stats); + + rapidjson::Document config; + std::vector input_bls_composing_models; + ComposingModelMap expected_composing_model_map; + + std::string parent_model_name; + + + const auto& ParameterizeListedComposingModels{[&]() { + SUBCASE("No listed composing models") {} + SUBCASE("Yes listed composing models") + { + input_bls_composing_models.push_back({"ListedModelA", ""}); + input_bls_composing_models.push_back({"ListedModelB", ""}); + expected_composing_model_map[parent_model_name].emplace( + "ListedModelA", ""); + expected_composing_model_map[parent_model_name].emplace( + "ListedModelB", ""); + } + EXPECT_CALL(*mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillRepeatedly(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + }}; + + SUBCASE("No Ensemble") + { + config.Parse(TestModelParser::no_batching); + parent_model_name = "NoBatchingModel"; + ParameterizeListedComposingModels(); + } + SUBCASE("Ensemble") + { + config.Parse(TestModelParser::ensemble); + parent_model_name = "EnsembleModel"; + ParameterizeListedComposingModels(); + + expected_composing_model_map["EnsembleModel"].emplace("ModelA", "2"); + expected_composing_model_map["EnsembleModel"].emplace("ModelB", ""); + EXPECT_CALL(*mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillRepeatedly(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + } + SUBCASE("Nested Ensemble") + { + config.Parse(TestModelParser::ensemble); + parent_model_name = "EnsembleModel"; + ParameterizeListedComposingModels(); + + expected_composing_model_map["EnsembleModel"].emplace("ModelA", "2"); + expected_composing_model_map["EnsembleModel"].emplace("ModelB", ""); + expected_composing_model_map["ModelA"].emplace("ModelC", ""); + expected_composing_model_map["ModelA"].emplace("ModelD", ""); + EXPECT_CALL(*mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce( + testing::WithArg<0>(TestModelParser::SetJsonPtrNestedEnsemble)) + .WillRepeatedly(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + } + SUBCASE("BLS with an Ensemble") + { + config.Parse(TestModelParser::no_batching); + parent_model_name = "NoBatchingModel"; + + input_bls_composing_models.push_back({"ModelA", ""}); + input_bls_composing_models.push_back({"ModelB", ""}); + + expected_composing_model_map[parent_model_name].emplace("ModelA", ""); + expected_composing_model_map[parent_model_name].emplace("ModelB", ""); + expected_composing_model_map["ModelA"].emplace("ModelC", ""); + expected_composing_model_map["ModelA"].emplace("ModelD", ""); + EXPECT_CALL(*mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce( + testing::WithArg<0>(TestModelParser::SetJsonPtrNestedEnsemble)) + .WillRepeatedly(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + } + + 
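+ // With one of the subcases above parameterized, hand ownership of the mock
+ // backend to the parser and verify that DetermineComposingModelMap() walks
+ // the (possibly nested) ensemble config via the mocked ModelConfig() calls.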
std::unique_ptr backend = std::move(mock_backend); + + MockModelParser mmp; + + mmp.DetermineComposingModelMap(input_bls_composing_models, config, backend); + + auto actual_composing_model_map = *mmp.GetComposingModelMap().get(); + CHECK(actual_composing_model_map == expected_composing_model_map); + + // Destruct gmock objects to determine gmock-related test failure + backend.reset(); +} + +TEST_CASE( + "ModelParser: determining scheduler type" * + doctest::description("This test confirms that scheduler_type_ will be set " + "correctly by DetermineSchedulerType()")) +{ + std::shared_ptr stats = + std::make_shared(); + std::unique_ptr mock_backend = + std::make_unique(stats); + + + rapidjson::Document config; + ModelParser::ModelSchedulerType expected_type; + + ComposingModelMap input_composing_model_map; + + + SUBCASE("No batching") + { + config.Parse(TestModelParser::no_batching); + expected_type = ModelParser::ModelSchedulerType::NONE; + } + SUBCASE("Sequence batching") + { + config.Parse(TestModelParser::seq_batching); + expected_type = ModelParser::ModelSchedulerType::SEQUENCE; + } + SUBCASE("Dynamic batching") + { + config.Parse(TestModelParser::dyn_batching); + expected_type = ModelParser::ModelSchedulerType::DYNAMIC; + } + SUBCASE("Ensemble") + { + config.Parse(TestModelParser::ensemble); + + input_composing_model_map["EnsembleModel"].emplace("ModelA", "2"); + input_composing_model_map["EnsembleModel"].emplace("ModelB", ""); + + SUBCASE("no sequences") + { + EXPECT_CALL( + *mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + + expected_type = ModelParser::ModelSchedulerType::ENSEMBLE; + } + SUBCASE("yes sequences") + { + EXPECT_CALL( + *mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrYesSeq)); + + expected_type = ModelParser::ModelSchedulerType::ENSEMBLE_SEQUENCE; + } + } + SUBCASE("Nested Ensemble") + { + config.Parse(TestModelParser::ensemble); + + input_composing_model_map["EnsembleModel"].emplace("ModelA", "2"); + input_composing_model_map["EnsembleModel"].emplace("ModelB", ""); + input_composing_model_map["ModelA"].emplace("ModelC", ""); + input_composing_model_map["ModelA"].emplace("ModelD", ""); + + SUBCASE("no sequences") + { + EXPECT_CALL( + *mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce( + testing::WithArg<0>(TestModelParser::SetJsonPtrNestedEnsemble)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + + expected_type = ModelParser::ModelSchedulerType::ENSEMBLE; + } + SUBCASE("yes sequences") + { + EXPECT_CALL( + *mock_backend, ModelConfig(testing::_, testing::_, testing::_)) + .WillOnce( + testing::WithArg<0>(TestModelParser::SetJsonPtrNestedEnsemble)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrYesSeq)) + .WillOnce(testing::WithArg<0>(TestModelParser::SetJsonPtrNoSeq)); + + expected_type = ModelParser::ModelSchedulerType::ENSEMBLE_SEQUENCE; + } + } + + std::unique_ptr backend = std::move(mock_backend); + + MockModelParser mmp; + mmp.composing_models_map_ = + std::make_shared(input_composing_model_map); + 
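+ // (the composing model map is pre-seeded here so that the call below can
+ // classify ensembles by querying each composing model's mocked config)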
mmp.DetermineSchedulerType(config, backend); + + auto actual_type = mmp.SchedulerType(); + CHECK(actual_type == expected_type); + + // Destruct gmock objects to determine gmock-related test failure + backend.reset(); +} + }} // namespace triton::perfanalyzer diff --git a/test_perf_utils.cc b/test_perf_utils.cc new file mode 100644 index 00000000..34a08a10 --- /dev/null +++ b/test_perf_utils.cc @@ -0,0 +1,374 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include +#include + +#include "doctest.h" +#include "perf_utils.h" +#include "test_utils.h" + +namespace triton { namespace perfanalyzer { + +/// Helper class to test perf_utils.cc +/// +class TestPerfUtils { + public: + /// Given a distributionType and request rate, confirm that request pattern + /// matches what is expected. 
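+ /// A rough sketch of the math checked below (the CalculateAverage and
+ /// CalculateVariance helpers are assumed to come from test_utils.h): the
+ /// mean delay should approach 1e9 / request_rate nanoseconds, with a
+ /// variance near 0 for CONSTANT and near the mean for POISSON.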
+ /// + static void TestDistribution( + Distribution distribution_type, uint32_t request_rate) + { + std::mt19937 schedule_rng; + std::vector delays; + + double avg, variance; + double expected_avg, expected_variance; + + auto dist_func = GetDistributionFunction(distribution_type, request_rate); + + for (int i = 0; i < 100000; i++) { + auto delay = dist_func(schedule_rng); + delays.push_back(delay.count()); + } + + avg = CalculateAverage(delays); + variance = CalculateVariance(delays, avg); + + std::chrono::nanoseconds ns_in_one_second = + std::chrono::duration_cast( + std::chrono::seconds(1)); + expected_avg = ns_in_one_second.count() / request_rate; + + if (distribution_type == CONSTANT) { + expected_variance = 0; + } else { + // By definition, variance = mean for poisson + expected_variance = expected_avg; + } + + CHECK(avg == doctest::Approx(expected_avg).epsilon(0.005)); + CHECK(variance == doctest::Approx(expected_variance).epsilon(0.005)); + } + + + private: + static std::function + GetDistributionFunction(Distribution type, uint32_t request_rate) + { + std::function distributionFunction; + + if (type == CONSTANT) { + distributionFunction = ScheduleDistribution(request_rate); + } else if (type == POISSON) { + distributionFunction = ScheduleDistribution(request_rate); + } else { + throw std::invalid_argument("Unexpected distribution type"); + } + return distributionFunction; + } +}; + +/// Test all distributions across various request rates +/// +TEST_CASE("perf_utils: TestDistribution") +{ + std::vector distTypes{CONSTANT, POISSON}; + std::vector requestRates{10, 100, 1000, 10000}; + + for (auto dist : distTypes) { + for (auto rate : requestRates) { + TestPerfUtils::TestDistribution(dist, rate); + } + } +} + +TEST_CASE("perf_utils: ParseTensorFormat") +{ + CHECK(ParseTensorFormat("binary") == cb::TensorFormat::BINARY); + CHECK(ParseTensorFormat("BINARY") == cb::TensorFormat::BINARY); + CHECK(ParseTensorFormat("json") == cb::TensorFormat::JSON); + CHECK(ParseTensorFormat("JSON") == cb::TensorFormat::JSON); + CHECK(ParseTensorFormat("abc") == cb::TensorFormat::UNKNOWN); + CHECK(ParseTensorFormat("") == cb::TensorFormat::UNKNOWN); +} + +TEST_CASE("perf_utils: ParseProtocol") +{ + CHECK(ParseProtocol("HTTP") == cb::ProtocolType::HTTP); + CHECK(ParseProtocol("http") == cb::ProtocolType::HTTP); + CHECK(ParseProtocol("GRPC") == cb::ProtocolType::GRPC); + CHECK(ParseProtocol("grpc") == cb::ProtocolType::GRPC); + CHECK(ParseProtocol("hhtp") == cb::ProtocolType::UNKNOWN); + CHECK(ParseProtocol("") == cb::ProtocolType::UNKNOWN); + CHECK(ParseProtocol("http2") == cb::ProtocolType::UNKNOWN); +} + +TEST_CASE("perf_utils: ConvertDTypeFromTFS") +{ + std::string datatype; + cb::Error status; + + SUBCASE("Check for correct conversion") + { + std::vector> tf_to_datatype{ + std::make_pair("DT_HALF", "FP16"), + std::make_pair("DT_BFLOAT16", "BF16"), + std::make_pair("DT_FLOAT", "FP32"), + std::make_pair("DT_DOUBLE", "FP64"), + std::make_pair("DT_INT32", "INT32"), + std::make_pair("DT_INT16", "INT16"), + std::make_pair("DT_INT8", "INT8"), + std::make_pair("DT_UINT8", "UINT8"), + std::make_pair("DT_STRING", "BYTES"), + std::make_pair("DT_INT64", "INT64"), + std::make_pair("DT_BOOL", "BOOL"), + std::make_pair("DT_UINT32", "UINT32"), + std::make_pair("DT_UINT64", "UINT64")}; + + for (const auto& type_pair : tf_to_datatype) { + status = ConvertDTypeFromTFS(type_pair.first, &datatype); + CHECK(status.IsOk()); + CHECK(datatype == type_pair.second); + } + } + + SUBCASE("Invalid tensorflow datatype") + { + 
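+ // Lowercase spellings of otherwise valid names are rejected, implying the
+ // lookup is case-sensitive; unknown and empty strings fail the same way.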
status = ConvertDTypeFromTFS("dt_bool", &datatype); + CHECK(!status.IsOk()); + CHECK(datatype == ""); + + status = ConvertDTypeFromTFS("dt_uint8", &datatype); + CHECK(!status.IsOk()); + CHECK(datatype == ""); + + status = ConvertDTypeFromTFS("abcdef", &datatype); + CHECK(!status.IsOk()); + CHECK(datatype == ""); + + status = ConvertDTypeFromTFS("", &datatype); + CHECK(!status.IsOk()); + CHECK(datatype == ""); + } +} + +TEST_CASE("perf_utils: IsDirectory") +{ + // Create a temporary directory /tmp/abcdef1234 + int status; + std::string temp_path{"/tmp/abcdef1234"}; + + CHECK(!IsDirectory(temp_path)); + + status = mkdir(temp_path.c_str(), S_IRWXU | S_IROTH | S_IXOTH); + REQUIRE(status == 0); + CHECK(IsDirectory(temp_path)); + + status = rmdir(temp_path.c_str()); + REQUIRE(status == 0); + CHECK(!IsDirectory(temp_path)); +} + +TEST_CASE("perf_utils: IsFile") +{ + // Create a temporary file /tmp/test.txt + int status; + std::string temp_path{"/tmp/test.txt"}; + + CHECK(!IsFile(temp_path)); + + std::ofstream file(temp_path); + CHECK(IsFile(temp_path)); + + std::remove(temp_path.c_str()); + CHECK(!IsFile(temp_path)); +} + +TEST_CASE("perf_utils: ByteSize") +{ + std::vector shape{3, 4, 5}; + constexpr int num_elements = 3 * 4 * 5; + + SUBCASE("Single byte elements") + { + CHECK(ByteSize(shape, "BOOL") == 1 * num_elements); + CHECK(ByteSize(shape, "INT8") == 1 * num_elements); + CHECK(ByteSize(shape, "UINT8") == 1 * num_elements); + } + + SUBCASE("2 byte elements") + { + CHECK(ByteSize(shape, "INT16") == 2 * num_elements); + CHECK(ByteSize(shape, "UINT16") == 2 * num_elements); + CHECK(ByteSize(shape, "FP16") == 2 * num_elements); + CHECK(ByteSize(shape, "BF16") == 2 * num_elements); + } + + SUBCASE("4 byte elements") + { + CHECK(ByteSize(shape, "INT32") == 4 * num_elements); + CHECK(ByteSize(shape, "UINT32") == 4 * num_elements); + CHECK(ByteSize(shape, "FP32") == 4 * num_elements); + } + + SUBCASE("8 byte elements") + { + CHECK(ByteSize(shape, "INT64") == 8 * num_elements); + CHECK(ByteSize(shape, "UINT64") == 8 * num_elements); + CHECK(ByteSize(shape, "FP64") == 8 * num_elements); + } + + SUBCASE("Dynamic shape tensor") + { + shape.insert(shape.begin(), -1); + + CHECK(ByteSize(shape, "BOOL") == -1); + CHECK(ByteSize(shape, "INT8") == -1); + CHECK(ByteSize(shape, "UINT8") == -1); + + CHECK(ByteSize(shape, "INT16") == -1); + CHECK(ByteSize(shape, "UINT16") == -1); + CHECK(ByteSize(shape, "FP16") == -1); + CHECK(ByteSize(shape, "BF16") == -1); + + CHECK(ByteSize(shape, "INT32") == -1); + CHECK(ByteSize(shape, "UINT32") == -1); + CHECK(ByteSize(shape, "FP32") == -1); + + CHECK(ByteSize(shape, "INT64") == -1); + CHECK(ByteSize(shape, "UINT64") == -1); + CHECK(ByteSize(shape, "FP64") == -1); + } + + SUBCASE("Unknown data types") + { + CHECK(ByteSize(shape, "bool") == -1); + CHECK(ByteSize(shape, "int8") == -1); + CHECK(ByteSize(shape, "uint8") == -1); + + CHECK(ByteSize(shape, "int16") == -1); + CHECK(ByteSize(shape, "uint16") == -1); + CHECK(ByteSize(shape, "fp16") == -1); + CHECK(ByteSize(shape, "bf16") == -1); + + CHECK(ByteSize(shape, "int32") == -1); + CHECK(ByteSize(shape, "uint32") == -1); + CHECK(ByteSize(shape, "fp32") == -1); + + CHECK(ByteSize(shape, "int64") == -1); + CHECK(ByteSize(shape, "uint64") == -1); + CHECK(ByteSize(shape, "fp64") == -1); + + CHECK(ByteSize(shape, "abc") == -1); + CHECK(ByteSize(shape, "1234") == -1); + CHECK(ByteSize(shape, "") == -1); + } +} + +TEST_CASE("perf_utils: ElementCount") +{ + std::vector shape{3, 4, 5}; + constexpr int num_elements = 3 * 4 * 5; + + 
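+ // ElementCount is expected to return the product of all dimensions
+ // (3 * 4 * 5 = 60 here) and -1 as soon as any dimension is dynamic (-1).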
SUBCASE("Static tensor shape") + { + CHECK(ElementCount(shape) == num_elements); + + shape.push_back(1); + CHECK(ElementCount(shape) == num_elements * 1); + + shape.push_back(300); + CHECK(ElementCount(shape) == num_elements * 1 * 300); + } + + SUBCASE("Dynamic tensor shape") + { + CHECK(ElementCount(shape) == num_elements); + + shape.push_back(-1); + CHECK(ElementCount(shape) == -1); + + shape.pop_back(); + shape.insert(shape.begin(), -1); + CHECK(ElementCount(shape) == -1); + } +} + +TEST_CASE("perf_utils: ShapeVecToString") +{ + std::vector shape{3, 4, 5}; + + SUBCASE("No skipping first dim") + { + CHECK(ShapeVecToString(shape, false) == "[3,4,5]"); + + shape.push_back(10); + CHECK(ShapeVecToString(shape, false) == "[3,4,5,10]"); + + shape.push_back(-1); + CHECK(ShapeVecToString(shape, false) == "[3,4,5,10,-1]"); + + shape.pop_back(); + shape.insert(shape.begin(), -1); + CHECK(ShapeVecToString(shape, false) == "[-1,3,4,5,10]"); + + shape.clear(); + CHECK(ShapeVecToString(shape, false) == "[]"); + } + + SUBCASE("Skipping first dim") + { + CHECK(ShapeVecToString(shape, true) == "[4,5]"); + + shape.push_back(-1); + CHECK(ShapeVecToString(shape, true) == "[4,5,-1]"); + + shape.pop_back(); + shape.insert(shape.begin(), -1); + CHECK(ShapeVecToString(shape, true) == "[3,4,5]"); + + shape.clear(); + CHECK(ShapeVecToString(shape, true) == "[]"); + } +} + +TEST_CASE("perf_utils: TensorToRegionName") +{ + CHECK(TensorToRegionName("name/with/slash") == "namewithslash"); + CHECK(TensorToRegionName("name//with//slash") == "namewithslash"); + CHECK(TensorToRegionName("name\\with\\backslash") == "namewithbackslash"); + CHECK(TensorToRegionName("name\\\\with\\\\backslash") == "namewithbackslash"); + CHECK(TensorToRegionName("name_without_slash") == "name_without_slash"); + CHECK(TensorToRegionName("abc123!@#") == "abc123!@#"); + CHECK(TensorToRegionName("") == ""); +} + + +}} // namespace triton::perfanalyzer diff --git a/test_profile_data_collector.cc b/test_profile_data_collector.cc new file mode 100644 index 00000000..926a9015 --- /dev/null +++ b/test_profile_data_collector.cc @@ -0,0 +1,161 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "doctest.h" +#include "mock_profile_data_collector.h" +#include "profile_data_collector.h" + +namespace triton { namespace perfanalyzer { + +TEST_CASE("profile_data_collector: FindExperiment") +{ + MockProfileDataCollector collector{}; + InferenceLoadMode infer_mode1{10, 20.0}; + + std::vector::iterator it; + it = collector.FindExperiment(infer_mode1); + CHECK(it == collector.experiments_.end()); + + std::vector request_records{RequestRecord{}}; + collector.AddData(infer_mode1, std::move(request_records)); + + it = collector.FindExperiment(infer_mode1); + CHECK(it != collector.experiments_.end()); + CHECK((*it).mode == infer_mode1); + + InferenceLoadMode infer_mode2{123, 0.0}; + it = collector.FindExperiment(infer_mode2); + CHECK(it == collector.experiments_.end()); +} + +TEST_CASE("profile_data_collector: AddData") +{ + using std::chrono::nanoseconds; + using std::chrono::system_clock; + using std::chrono::time_point; + + MockProfileDataCollector collector{}; + InferenceLoadMode infer_mode{10, 20.0}; + + // Add RequestRecords + auto clock_epoch{time_point()}; + + uint64_t sequence_id1{123}; + auto request1_timestamp{clock_epoch + nanoseconds(1)}; + auto request1_response1_timestamp{clock_epoch + nanoseconds(2)}; + auto request1_response2_timestamp{clock_epoch + nanoseconds(3)}; + uint8_t fake_data_in[] = {0x01, 0x02, 0x03, 0x04}; + uint8_t fake_data_out[] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}; + RequestRecord::RequestInput request1_request_input{ + {"key1", RecordData(fake_data_in, 1)}, + {"key2", RecordData(fake_data_in, 2)}}; + RequestRecord::ResponseOutput request1_response1_output{ + {"key1", RecordData(fake_data_out, 1)}, + {"key2", RecordData(fake_data_out, 2)}}; + RequestRecord::ResponseOutput request1_response2_output{ + {"key3", RecordData(fake_data_out, 3)}, + {"key4", RecordData(fake_data_out, 4)}}; + + RequestRecord request_record1{ + request1_timestamp, + std::vector>{ + request1_response1_timestamp, request1_response2_timestamp}, + {request1_request_input}, + {request1_response1_output, request1_response2_output}, + 0, + false, + sequence_id1, + false}; + + uint64_t sequence_id2{456}; + auto request2_timestamp{clock_epoch + nanoseconds(4)}; + auto request2_response1_timestamp{clock_epoch + nanoseconds(5)}; + auto request2_response2_timestamp{clock_epoch + nanoseconds(6)}; + RequestRecord::RequestInput request2_request_input{ + {"key3", RecordData(fake_data_in, 3)}, + {"key4", RecordData(fake_data_in, 4)}}; + RequestRecord::ResponseOutput request2_response1_output{ + {"key5", RecordData(fake_data_out, 5)}, + {"key6", RecordData(fake_data_out, 6)}}; + RequestRecord::ResponseOutput request2_response2_output{ + {"key7", RecordData(fake_data_out, 7)}, + {"key8", RecordData(fake_data_out, 8)}}; + + RequestRecord request_record2{ + request2_timestamp, + std::vector>{ + request2_response1_timestamp, request2_response2_timestamp}, + {request2_request_input}, + {request2_response1_output, request2_response2_output}, + 0, + 
false, + sequence_id2, + false}; + + std::vector request_records{request_record1, request_record2}; + collector.AddData(infer_mode, std::move(request_records)); + + CHECK(!collector.experiments_.empty()); + + std::vector rr{collector.experiments_[0].requests}; + CHECK(rr[0].sequence_id_ == sequence_id1); + CHECK(rr[0].start_time_ == request1_timestamp); + CHECK(rr[0].request_inputs_[0] == request1_request_input); + CHECK(rr[0].response_timestamps_[0] == request1_response1_timestamp); + CHECK(rr[0].response_timestamps_[1] == request1_response2_timestamp); + CHECK(rr[0].response_outputs_[0] == request1_response1_output); + CHECK(rr[0].response_outputs_[1] == request1_response2_output); + CHECK(rr[1].sequence_id_ == sequence_id2); + CHECK(rr[1].start_time_ == request2_timestamp); + CHECK(rr[1].request_inputs_[0] == request2_request_input); + CHECK(rr[1].response_timestamps_[0] == request2_response1_timestamp); + CHECK(rr[1].response_timestamps_[1] == request2_response2_timestamp); + CHECK(rr[1].response_outputs_[0] == request2_response1_output); + CHECK(rr[1].response_outputs_[1] == request2_response2_output); +} + +TEST_CASE("profile_data_collector: AddWindow") +{ + MockProfileDataCollector collector{}; + InferenceLoadMode infer_mode{10, 20.0}; + + uint64_t window_start1{123}; + uint64_t window_end1{456}; + collector.AddWindow(infer_mode, window_start1, window_end1); + + CHECK(!collector.experiments_.empty()); + CHECK(collector.experiments_[0].window_boundaries[0] == window_start1); + CHECK(collector.experiments_[0].window_boundaries[1] == window_end1); + + uint64_t window_start2{678}; + uint64_t window_end2{912}; + collector.AddWindow(infer_mode, window_start2, window_end2); + + CHECK(collector.experiments_[0].window_boundaries[2] == window_start2); + CHECK(collector.experiments_[0].window_boundaries[3] == window_end2); +} + +}} // namespace triton::perfanalyzer diff --git a/test_profile_data_exporter.cc b/test_profile_data_exporter.cc new file mode 100644 index 00000000..ffd958c5 --- /dev/null +++ b/test_profile_data_exporter.cc @@ -0,0 +1,327 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "doctest.h" +#include "mock_profile_data_exporter.h" +#include "profile_data_exporter.h" + +namespace triton { namespace perfanalyzer { + +TEST_CASE("profile_data_exporter: ConvertToJson") +{ + using std::chrono::nanoseconds; + using std::chrono::system_clock; + using std::chrono::time_point; + + MockProfileDataExporter exporter{}; + + InferenceLoadMode infer_mode{4, 0.0}; + uint64_t sequence_id{1}; + + auto clock_epoch{time_point()}; + auto request_timestamp{clock_epoch + nanoseconds(1)}; + auto response_timestamp1{clock_epoch + nanoseconds(2)}; + auto response_timestamp2{clock_epoch + nanoseconds(3)}; + + // Request inputs + const std::string in_buf1{"abc123"}; + const int32_t in_buf2{456}; + const bool in_buf3{true}; + const std::string in_buf4{"{\"abc\":\"def\"}"}; + + RequestRecord::RequestInput request_input{ + {"in_key1", + {reinterpret_cast(in_buf1.data()), in_buf1.size(), + "BYTES"}}, + {"in_key2", + {reinterpret_cast(&in_buf2), sizeof(in_buf2), "INT32"}}, + {"in_key3", + {reinterpret_cast(&in_buf3), sizeof(in_buf3), "BOOL"}}, + {"in_key4", + {reinterpret_cast(in_buf4.data()), sizeof(in_buf4), + "JSON"}}, + }; + + // Response outputs + std::vector out_bufs{"abc", "def", "ghi", "jkl"}; + RequestRecord::ResponseOutput response_output1{ + {"out_key1", + {reinterpret_cast(out_bufs[0].data()), + out_bufs[0].size()}}, + {"out_key2", + {reinterpret_cast(out_bufs[1].data()), + out_bufs[1].size()}}}; + RequestRecord::ResponseOutput response_output2{ + {"out_key3", + {reinterpret_cast(out_bufs[2].data()), + out_bufs[2].size()}}, + {"out_key4", + {reinterpret_cast(out_bufs[3].data()), + out_bufs[3].size()}}}; + + RequestRecord request_record{ + request_timestamp, + std::vector>{ + response_timestamp1, response_timestamp2}, + {request_input}, + {response_output1, response_output2}, + 0, + false, + sequence_id, + false}; + std::vector requests{request_record}; + std::vector window_boundaries{1, 5, 6}; + + Experiment experiment; + experiment.mode = infer_mode; + experiment.requests = requests; + experiment.window_boundaries = window_boundaries; + std::vector experiments{experiment}; + + std::string version{"1.2.3"}; + cb::BackendKind service_kind = cb::BackendKind::TRITON; + std::string endpoint{""}; + + exporter.ConvertToJson(experiments, version, service_kind, endpoint); + + std::string json{R"( + { + "experiments" : [ + { + "experiment" : { + "mode" : "concurrency", + "value" : 4 + }, + "requests" : [ + { + "timestamp" : 1, + "sequence_id" : 1, + "request_inputs" : {"in_key1":"abc123","in_key2":456,"in_key3":true,"in_key4":"{\"abc\":\"def\"}"}, + "response_timestamps" : [ 2, 3 ], + "response_outputs" : [ {"out_key1":"abc","out_key2":"def"}, {"out_key3":"ghi","out_key4":"jkl"} ] + } + ], + "window_boundaries" : [ 1, 5, 6 ] + } + ], + "version" : "1.2.3", + "service_kind": "triton", + "endpoint": "" + } + )"}; + + rapidjson::Document expected_document; + expected_document.Parse(json.c_str()); + + // FIXME (TMA-1339): Look into the testing the order 
of things in the json + const rapidjson::Value& expected_experiment{ + expected_document["experiments"][0]["experiment"]}; + const rapidjson::Value& expected_request{ + expected_document["experiments"][0]["requests"][0]}; + const rapidjson::Value& expected_windows{ + expected_document["experiments"][0]["window_boundaries"]}; + const rapidjson::Value& expected_version{expected_document["version"]}; + + const rapidjson::Value& actual_experiment{ + exporter.document_["experiments"][0]["experiment"]}; + const rapidjson::Value& actual_request{ + exporter.document_["experiments"][0]["requests"][0]}; + const rapidjson::Value& actual_windows{ + exporter.document_["experiments"][0]["window_boundaries"]}; + const rapidjson::Value& actual_version{exporter.document_["version"]}; + + CHECK(actual_experiment["mode"] == expected_experiment["mode"]); + CHECK(actual_experiment["value"] == expected_experiment["value"]); + + CHECK(actual_request["timestamp"] == expected_request["timestamp"]); + CHECK(actual_request["sequence_id"] == expected_request["sequence_id"]); + + CHECK( + actual_request["request_inputs"]["in_key1"] == + expected_request["request_inputs"]["in_key1"]); + CHECK( + actual_request["request_inputs"]["in_key2"] == + expected_request["request_inputs"]["in_key2"]); + CHECK( + actual_request["request_inputs"]["in_key3"] == + expected_request["request_inputs"]["in_key3"]); + auto act_inkey_4 = actual_request["request_inputs"]["in_key4"].GetString(); + auto exp_inkey_4 = expected_request["request_inputs"]["in_key4"].GetString(); + CHECK(std::string{act_inkey_4} == std::string{exp_inkey_4}); + + CHECK( + actual_request["response_timestamps"][0] == + expected_request["response_timestamps"][0]); + CHECK( + actual_request["response_timestamps"][1] == + expected_request["response_timestamps"][1]); + CHECK( + actual_request["response_outputs"][0] == + expected_request["response_outputs"][0]); + CHECK( + actual_request["response_outputs"][1] == + expected_request["response_outputs"][1]); + + CHECK(actual_windows[0] == expected_windows[0]); + CHECK(actual_windows[1] == expected_windows[1]); + CHECK(actual_windows[2] == expected_windows[2]); + + CHECK(actual_version == expected_version); +} + +TEST_CASE("profile_data_exporter: AddExperiment") +{ + MockProfileDataExporter exporter{}; + + Experiment raw_experiment; + rapidjson::Value entry(rapidjson::kObjectType); + rapidjson::Value experiment(rapidjson::kObjectType); + + SUBCASE("Concurrency mode") + { + InferenceLoadMode infer_mode{15, 0.0}; + raw_experiment.mode = infer_mode; + + exporter.AddExperiment(entry, experiment, raw_experiment); + CHECK(entry.HasMember("experiment")); + CHECK(entry["experiment"]["mode"] == "concurrency"); + CHECK(entry["experiment"]["value"] == 15); + } + + SUBCASE("Request rate mode") + { + InferenceLoadMode infer_mode{0, 23.5}; + raw_experiment.mode = infer_mode; + + exporter.AddExperiment(entry, experiment, raw_experiment); + CHECK(entry.HasMember("experiment")); + CHECK(entry["experiment"]["mode"] == "request_rate"); + CHECK(entry["experiment"]["value"] == 23.5); + } +} + +TEST_CASE("profile_data_exporter: OutputToFile") +{ + MockProfileDataExporter exporter{}; + std::string file_path; + + SUBCASE("Empty file path") + { + file_path = ""; + CHECK_THROWS_WITH_AS( + exporter.OutputToFile(file_path), + "failed to open file for outputting raw profile data", + PerfAnalyzerException); + } + + SUBCASE("With file path") + { + file_path = "/tmp/test-" + GetRandomString(4) + ".json"; + CHECK_NOTHROW(exporter.OutputToFile(file_path)); + 
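+ // The exporter should have created a file at the randomly named path; it
+ // is removed again below so the test leaves no artifacts behind.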
CHECK(IsFile(file_path)); + + std::remove(file_path.c_str()); + CHECK(!IsFile(file_path)); + } +} + +TEST_CASE("profile_data_exporter: AddServiceKind") +{ + MockProfileDataExporter exporter{}; + exporter.ClearDocument(); + + cb::BackendKind service_kind; + std::string json{""}; + + SUBCASE("Backend kind: TRITON") + { + service_kind = cb::BackendKind::TRITON; + json = R"({ "service_kind": "triton" })"; + } + + SUBCASE("Backend kind: TENSORFLOW_SERVING") + { + service_kind = cb::BackendKind::TENSORFLOW_SERVING; + json = R"({ "service_kind": "tfserving" })"; + } + + SUBCASE("Backend kind: TORCHSERVE") + { + service_kind = cb::BackendKind::TORCHSERVE; + json = R"({ "service_kind": "torchserve" })"; + } + + SUBCASE("Backend kind: TRITON_C_API") + { + service_kind = cb::BackendKind::TRITON_C_API; + json = R"({ "service_kind": "triton_c_api" })"; + } + + SUBCASE("Backend kind: OPENAI") + { + service_kind = cb::BackendKind::OPENAI; + json = R"({ "service_kind": "openai" })"; + } + + exporter.AddServiceKind(service_kind); + rapidjson::Document expected_document; + expected_document.Parse(json.c_str()); + + const rapidjson::Value& expected_kind{expected_document["service_kind"]}; + const rapidjson::Value& actual_kind{exporter.document_["service_kind"]}; + CHECK(actual_kind == expected_kind); +} + +TEST_CASE("profile_data_exporter: AddEndpoint") +{ + MockProfileDataExporter exporter{}; + exporter.ClearDocument(); + + std::string endpoint{""}; + std::string json{""}; + + SUBCASE("Endpoint: OpenAI Chat Completions") + { + endpoint = "v1/chat/completions"; + json = R"({ "endpoint": "v1/chat/completions" })"; + } + + SUBCASE("Endpoint: OpenAI Completions") + { + endpoint = "v1/completions"; + json = R"({ "endpoint": "v1/completions" })"; + } + + exporter.AddEndpoint(endpoint); + rapidjson::Document expected_document; + expected_document.Parse(json.c_str()); + + const rapidjson::Value& expected_endpoint{expected_document["endpoint"]}; + const rapidjson::Value& actual_endpoint{exporter.document_["endpoint"]}; + CHECK(actual_endpoint == expected_endpoint); +} + +}} // namespace triton::perfanalyzer diff --git a/test_report_writer.cc b/test_report_writer.cc index fdd1447f..5d341c30 100644 --- a/test_report_writer.cc +++ b/test_report_writer.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include + #include "doctest.h" #include "report_writer.h" @@ -51,7 +52,7 @@ TEST_CASE("testing WriteGpuMetrics") SUBCASE("single gpu complete output") { trw.WriteGpuMetrics(actual_output, m); - const std::string expected_output{"a:1;,a:2.2;,a:3;,a:4;,"}; + const std::string expected_output{",a:1;,a:2.2;,a:3;,a:4;"}; CHECK(actual_output.str() == expected_output); } @@ -59,7 +60,7 @@ TEST_CASE("testing WriteGpuMetrics") { m.gpu_power_usage_per_gpu.erase("a"); trw.WriteGpuMetrics(actual_output, m); - const std::string expected_output{"a:1;,,a:3;,a:4;,"}; + const std::string expected_output{",a:1;,,a:3;,a:4;"}; CHECK(actual_output.str() == expected_output); } @@ -74,7 +75,7 @@ TEST_CASE("testing WriteGpuMetrics") { trw.WriteGpuMetrics(actual_output, m); const std::string expected_output{ - "a:1;z:100;,a:2.2;z:222.2;,a:3;z:45;,a:4;z:89;,"}; + ",a:1;z:100;,a:2.2;z:222.2;,a:3;z:45;,a:4;z:89;"}; CHECK(actual_output.str() == expected_output); } @@ -83,7 +84,7 @@ TEST_CASE("testing WriteGpuMetrics") m.gpu_utilization_per_gpu.erase("z"); m.gpu_power_usage_per_gpu.erase("a"); trw.WriteGpuMetrics(actual_output, m); - const std::string expected_output{"a:1;,z:222.2;,a:3;z:45;,a:4;z:89;,"}; + const std::string expected_output{",a:1;,z:222.2;,a:3;z:45;,a:4;z:89;"}; CHECK(actual_output.str() == expected_output); } } diff --git a/test_request_rate_manager.cc b/test_request_rate_manager.cc new file mode 100644 index 00000000..07b9016d --- /dev/null +++ b/test_request_rate_manager.cc @@ -0,0 +1,2242 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include +#include + +#include "command_line_parser.h" +#include "common.h" +#include "doctest.h" +#include "mock_client_backend.h" +#include "mock_data_loader.h" +#include "mock_infer_data_manager.h" +#include "mock_model_parser.h" +#include "mock_request_rate_worker.h" +#include "mock_sequence_manager.h" +#include "request_rate_manager.h" +#include "test_load_manager_base.h" +#include "test_utils.h" + +namespace cb = triton::perfanalyzer::clientbackend; +using milliseconds = std::chrono::milliseconds; +using nanoseconds = std::chrono::nanoseconds; + +namespace triton { namespace perfanalyzer { + +/// Class to test the RequestRateManager +/// +class TestRequestRateManager : public TestLoadManagerBase, + public RequestRateManager { + public: + TestRequestRateManager( + PerfAnalyzerParameters params, bool is_sequence_model = false, + bool is_decoupled_model = false, bool use_mock_infer = false) + : use_mock_infer_(use_mock_infer), + TestLoadManagerBase(params, is_sequence_model, is_decoupled_model), + RequestRateManager( + params.async, params.streaming, params.request_distribution, + params.batch_size, params.measurement_window_ms, params.max_trials, + params.max_threads, params.num_of_sequences, + params.shared_memory_type, params.output_shm_size, + params.serial_sequences, GetParser(), GetFactory(), + params.request_parameters) + { + } + + std::shared_ptr MakeWorker( + std::shared_ptr thread_stat, + std::shared_ptr thread_config) override + { + size_t id = workers_.size(); + auto worker = std::make_shared( + id, thread_stat, thread_config, parser_, data_loader_, factory_, + on_sequence_model_, async_, max_threads_, using_json_data_, streaming_, + batch_size_, wake_signal_, wake_mutex_, execute_, start_time_, + serial_sequences_, infer_data_manager_, sequence_manager_); + + if (use_mock_infer_) { + EXPECT_CALL(*worker, Infer()) + .WillRepeatedly(testing::Invoke( + worker.get(), &MockRequestRateWorker::EmptyInfer)); + } + return worker; + } + + void TestConfigureThreads( + std::vector& expected_configs, size_t request_count) + { + RequestRateManager::ConfigureThreads(request_count); + + auto expected_size = expected_configs.size(); + + // Check that the correct number of threads are created + // + CHECK(threads_.size() == expected_size); + + // Check that threads_config has correct number of sequences and + // seq stat index offset + for (auto i = 0; i < expected_configs.size(); i++) { + CHECK( + threads_config_[i]->num_sequences_ == + expected_configs[i].num_sequences_); + CHECK( + threads_config_[i]->seq_stat_index_offset_ == + expected_configs[i].seq_stat_index_offset_); + CHECK( + threads_config_[i]->num_requests_ == + expected_configs[i].num_requests_); + } + } + + void TestCalculateThreadIds(std::vector& expected_thread_ids) + { + std::vector actual_thread_ids = + RequestRateManager::CalculateThreadIds(); + CHECK(actual_thread_ids.size() == expected_thread_ids.size()); + + for (auto i = 0; i < actual_thread_ids.size(); i++) { + CHECK(actual_thread_ids[i] == expected_thread_ids[i]); + } + } + + void StopWorkerThreads() { LoadManager::StopWorkerThreads(); } + + void TestSchedule(double rate, PerfAnalyzerParameters params) + { + PauseWorkers(); + ConfigureThreads(); + GenerateSchedule(rate); + + nanoseconds measurement_window_nanoseconds{ + params.measurement_window_ms * NANOS_PER_MILLIS}; + nanoseconds max_test_duration{ + measurement_window_nanoseconds * params.max_trials}; + + nanoseconds expected_time_between_requests{int(NANOS_PER_SECOND / rate)}; + nanoseconds 
expected_current_timestamp{0}; + + // Keep calling GetNextTimestamp for the entire test_duration to make sure + // the schedule is exactly as expected + // + while (expected_current_timestamp < max_test_duration) { + for (auto worker : workers_) { + expected_current_timestamp += expected_time_between_requests; + auto timestamp = std::dynamic_pointer_cast(worker) + ->GetNextTimestamp(); + REQUIRE(timestamp.count() == expected_current_timestamp.count()); + } + } + early_exit = true; + } + + void TestCreateSchedule( + double rate, PerfAnalyzerParameters params, + std::vector& expected_worker_ratio) + { + PauseWorkers(); + ConfigureThreads(); + GenerateSchedule(rate); + + std::vector worker_schedule_sizes; + uint32_t total_num_seqs{0}; + + for (auto worker : workers_) { + auto w = std::dynamic_pointer_cast(worker); + total_num_seqs += w->thread_config_->num_sequences_; + worker_schedule_sizes.push_back(w->schedule_->intervals.size()); + } + early_exit = true; + + CHECK(num_of_sequences_ == total_num_seqs); + for (int i = 0; i < worker_schedule_sizes.size() - 1; i++) { + CHECK( + worker_schedule_sizes[i] / expected_worker_ratio[i] == + worker_schedule_sizes[i + 1] / expected_worker_ratio[i + 1]); + } + } + + /// Test that the correct Infer function is called in the backend + /// + void TestInferType() + { + double request_rate = 50; + auto sleep_time = milliseconds(100); + + ChangeRequestRate(request_rate); + std::this_thread::sleep_for(sleep_time); + StopWorkerThreads(); + + CheckInferType(); + } + + /// Test that the inference distribution is as expected + /// + void TestDistribution(uint request_rate, uint duration_ms) + { + ChangeRequestRate(request_rate); + std::this_thread::sleep_for(milliseconds(duration_ms)); + StopWorkerThreads(); + + CheckCallDistribution(request_rate); + } + + /// Test that the schedule is properly update after calling ChangeRequestRate + /// + void TestMultipleRequestRate() + { + std::vector request_rates = {50, 200}; + auto sleep_time = milliseconds(500); + + for (auto request_rate : request_rates) { + ChangeRequestRate(request_rate); + ResetStats(); + std::this_thread::sleep_for(sleep_time); + CheckCallDistribution(request_rate); + } + } + + /// Test sequence handling + /// + void TestSequences(bool verify_seq_balance, bool check_expected_count) + { + stats_->SetDelays({10}); + double request_rate1 = 100; + double request_rate2 = 200; + + // A single sequence can't maintain the above rates + // + if (params_.num_of_sequences == 1) { + request_rate1 = 50; + request_rate2 = 100; + } + + auto stats = cb::InferStat(); + int sleep_ms = 500; + double num_seconds = double(sleep_ms) / 1000; + + auto sleep_time = milliseconds(sleep_ms); + size_t expected_count1 = num_seconds * request_rate1; + size_t expected_count2 = num_seconds * request_rate2 + expected_count1; + + // Run and check request rate 1 + // + ChangeRequestRate(request_rate1); + std::this_thread::sleep_for(sleep_time); + + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + if (check_expected_count) { + CHECK( + stats.completed_request_count == + doctest::Approx(expected_count1).epsilon(0.10)); + } + + PauseWorkers(); + CheckSequences(params_.num_of_sequences); + + // Make sure that the client and the manager are in agreement on the request + // count in between rates + // + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + int client_total_requests = stats_->num_async_infer_calls + + stats_->num_async_stream_infer_calls + + stats_->num_infer_calls; + CHECK(stats.completed_request_count 
== client_total_requests); + + if (verify_seq_balance) { + CheckSequenceBalance(); + } + + ResetStats(); + + // Run and check request rate 2 + // + ChangeRequestRate(request_rate2); + std::this_thread::sleep_for(sleep_time); + + stats = cb::InferStat(); + GetAccumulatedClientStat(&stats); + if (check_expected_count) { + CHECK( + stats.completed_request_count == + doctest::Approx(expected_count2).epsilon(0.10)); + } + + // Stop all threads and make sure everything is as expected + // + StopWorkerThreads(); + + CheckSequences(params_.num_of_sequences); + } + + /// Test that the shared memory methods are called correctly + /// + void TestSharedMemory(uint request_rate, uint duration_ms) + { + ChangeRequestRate(request_rate); + std::this_thread::sleep_for(milliseconds(duration_ms)); + StopWorkerThreads(); + } + + /// Test that tries to find deadlocks and livelocks + /// + void TestTimeouts() + { + TestWatchDog watchdog(1000); + ChangeRequestRate(100); + std::this_thread::sleep_for(milliseconds(100)); + StopWorkerThreads(); + watchdog.stop(); + } + + /// Test that idle time is tracked correctly + void TestOverhead(uint request_rate) + { + stats_->SetDelays({1}); + ChangeRequestRate(request_rate); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // During a run of 100 ms (100,000,000 ns), make sure that the idle time is + // at least 95% of that + // + auto idle_time_ns = GetIdleTime(); + CHECK(idle_time_ns > 95000000); + StopWorkerThreads(); + } + + /// Helper function that will setup and run a case to verify custom data + /// behavior + /// \param num_requests Integer number of requests to send during the test + /// \param num_threads Number of worker threads to create + /// \param tensors Vector of input ModelTensors + /// \param json_str The custom data json text + /// \param expected_values Vector of expected input values for each inference + /// \param expect_init_failure True if InitManager is expected to throw an + /// error + /// \param expect_thread_failure True if the thread is expected to have + /// an error + void TestCustomData( + size_t num_requests, size_t num_threads, + std::vector& tensors, const std::string json_str, + std::vector>& expected_values, + bool expect_init_failure, bool expect_thread_failure) + { + CustomDataTestSetup(tensors, json_str, expect_init_failure, num_threads); + if (expect_init_failure) { + // The rest of the test is invalid if init failed + return; + } + auto thread_status = CustomDataTestSendRequests(num_requests, num_threads); + CustomDataTestCheckResults( + thread_status, expect_thread_failure, expected_values); + } + + void CustomDataTestSetup( + std::vector& tensors, const std::string json_str, + bool expect_init_failure, size_t num_threads) + { + params_.user_data = {json_str}; + + std::shared_ptr mdl{ + std::make_shared(params_.batch_size)}; + + std::shared_ptr mmp{ + std::make_shared(on_sequence_model_, false)}; + mmp->inputs_ = std::make_shared(); + for (auto t : tensors) { + (*mmp->inputs_)[t.name_] = t; + } + + infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params_.max_threads, params_.batch_size, params_.shared_memory_type, + params_.output_shm_size, params_.request_parameters, mmp, factory_, + mdl); + + parser_ = mmp; + data_loader_ = mdl; + using_json_data_ = true; + execute_ = true; + max_threads_ = num_threads; + + if (expect_init_failure) { + REQUIRE_THROWS_AS( + InitManager( + params_.string_length, params_.string_data, params_.zero_input, + params_.user_data, 
params_.start_sequence_id, + params_.sequence_id_range, params_.sequence_length, + params_.sequence_length_specified, + params_.sequence_length_variation), + PerfAnalyzerException); + return; + } else { + REQUIRE_NOTHROW(InitManager( + params_.string_length, params_.string_data, params_.zero_input, + params_.user_data, params_.start_sequence_id, + params_.sequence_id_range, params_.sequence_length, + params_.sequence_length_specified, + params_.sequence_length_variation)); + } + } + + cb::Error CustomDataTestSendRequests(size_t num_requests, size_t num_threads) + { + std::vector> workers; + std::vector> thread_stats; + + for (auto i = 0; i < num_threads; i++) { + std::shared_ptr ts{std::make_shared()}; + thread_stats.push_back(ts); + std::shared_ptr tc{std::make_shared(i)}; + std::shared_ptr worker{MakeWorker(ts, tc)}; + workers_.push_back(worker); + + workers.push_back( + std::dynamic_pointer_cast(worker)); + + workers[i]->CreateContext(); + } + + size_t sent_requests = 0; + while (sent_requests < num_requests) { + for (auto i = 0; i < workers.size(); i++) { + workers[i]->SendInferRequest(); + sent_requests++; + } + } + + return thread_stats[0]->status_; + } + + void CustomDataTestCheckResults( + cb::Error& thread_status, bool expect_thread_failure, + std::vector>& expected_values) + { + if (expect_thread_failure) { + REQUIRE(!thread_status.IsOk()); + } else { + REQUIRE_MESSAGE(thread_status.IsOk(), thread_status.Message()); + } + + auto recorded_values = GetRecordedInputValues(); + + // Check that results are exactly as expected + REQUIRE(recorded_values.size() == expected_values.size()); + for (size_t i = 0; i < expected_values.size(); i++) { + REQUIRE(recorded_values[i].size() == expected_values[i].size()); + for (size_t j = 0; j < expected_values[i].size(); j++) { + CHECK(recorded_values[i][j] == expected_values[i][j]); + } + } + } + + std::shared_ptr& parser_{LoadManager::parser_}; + std::shared_ptr& data_loader_{LoadManager::data_loader_}; + std::shared_ptr& sequence_manager_{ + LoadManager::sequence_manager_}; + bool& using_json_data_{LoadManager::using_json_data_}; + bool& execute_{RequestRateManager::execute_}; + size_t& batch_size_{LoadManager::batch_size_}; + std::chrono::steady_clock::time_point& start_time_{ + RequestRateManager::start_time_}; + size_t& max_threads_{LoadManager::max_threads_}; + bool& async_{LoadManager::async_}; + bool& streaming_{LoadManager::streaming_}; + std::shared_ptr& factory_{ + TestLoadManagerBase::factory_}; + std::shared_ptr& infer_data_manager_{ + LoadManager::infer_data_manager_}; + + private: + bool use_mock_infer_; + + void CheckCallDistribution(int request_rate) + { + auto request_distribution = params_.request_distribution; + + auto timestamps = GetStats()->request_timestamps; + std::vector time_delays = GatherTimeBetweenRequests(timestamps); + + double delay_average = CalculateAverage(time_delays); + double delay_variance = CalculateVariance(time_delays, delay_average); + + double expected_delay_average = + NANOS_PER_SECOND / static_cast(request_rate); + + if (request_distribution == POISSON) { + // By definition, variance == average for Poisson. + // + // With such a small sample size for a poisson distribution, there will be + // noise. 
Allow 5% slop + // + CHECK( + delay_average == + doctest::Approx(expected_delay_average).epsilon(0.05)); + CHECK(delay_variance == doctest::Approx(delay_average).epsilon(0.05)); + } else if (request_distribution == CONSTANT) { + // constant should in theory have 0 variance, but with thread timing + // there is obviously some noise. + // + // Allow it to be at most 5% of average + // + auto max_allowed_delay_variance = 0.05 * delay_average; + + // Constant should be pretty tight. Allowing 1% slop there is noise in the + // thread scheduling + // + CHECK( + delay_average == + doctest::Approx(expected_delay_average).epsilon(0.1)); + CHECK_LT(delay_variance, max_allowed_delay_variance); + } else { + throw std::invalid_argument("Unexpected distribution type"); + } + } + + std::vector GatherTimeBetweenRequests( + const std::vector>& + timestamps) + { + std::vector time_between_requests; + + for (size_t i = 1; i < timestamps.size(); i++) { + auto diff = timestamps[i] - timestamps[i - 1]; + nanoseconds diff_ns = std::chrono::duration_cast(diff); + time_between_requests.push_back(diff_ns.count()); + } + return time_between_requests; + } + + // Gets the inputs recorded in the mock backend + // Returns a vector of vector of int32_t. Each entry in the parent vector is a + // list of all input values for a single inference request + // + std::vector> GetRecordedInputValues() + { + auto recorded_inputs{stats_->recorded_inputs}; + std::vector> recorded_values; + // Convert the recorded inputs into values, for both shared memory and non + // shared memory cases + // + if (params_.shared_memory_type != SharedMemoryType::NO_SHARED_MEMORY) { + auto recorded_memory_regions = + std::dynamic_pointer_cast( + infer_data_manager_) + ->mocked_shared_memory_regions; + for (auto recorded_input : recorded_inputs) { + std::vector recorded_value; + for (auto memory_label : recorded_input) { + auto itr = + recorded_memory_regions.find(memory_label.shared_memory_label); + if (itr == recorded_memory_regions.end()) { + std::string err_str = "Test error: Could not find label " + + memory_label.shared_memory_label + + " in recorded shared memory"; + REQUIRE_MESSAGE(false, err_str); + } else { + for (auto val : itr->second) { + recorded_value.push_back(val); + } + } + } + recorded_values.push_back(recorded_value); + } + } else { + for (auto recorded_input : recorded_inputs) { + std::vector recorded_value; + for (auto val : recorded_input) { + recorded_value.push_back(val.data); + } + recorded_values.push_back(recorded_value); + } + } + return recorded_values; + } + + std::shared_ptr MakeSequenceManager( + const uint64_t start_sequence_id, const uint64_t sequence_id_range, + const size_t sequence_length, const bool sequence_length_specified, + const double sequence_length_variation, const bool using_json_data, + std::shared_ptr data_loader) override + { + return std::make_shared( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data, + data_loader); + } +}; + +TEST_CASE("request_rate_schedule") +{ + PerfAnalyzerParameters params; + params.measurement_window_ms = 1000; + params.max_trials = 10; + bool is_sequence = false; + bool is_decoupled = false; + bool use_mock_infer = true; + double rate; + + + const auto& ParameterizeRate{[&]() { + SUBCASE("rate 10") + { + rate = 10; + } + SUBCASE("rate 30") + { + rate = 30; + } + SUBCASE("rate 100") + { + rate = 100; + } + }}; + + const auto& ParameterizeThreads{[&]() { + SUBCASE("threads 1") + { + 
ParameterizeRate(); + params.max_threads = 1; + } + SUBCASE("threads 2") + { + ParameterizeRate(); + params.max_threads = 2; + } + SUBCASE("threads 4") + { + ParameterizeRate(); + params.max_threads = 4; + } + SUBCASE("threads 7") + { + ParameterizeRate(); + params.max_threads = 7; + } + }}; + + const auto& ParameterizeTrials{[&]() { + SUBCASE("trials 3") + { + ParameterizeThreads(); + params.max_trials = 3; + } + SUBCASE("trials 10") + { + ParameterizeThreads(); + params.max_trials = 10; + } + SUBCASE("trials 20") + { + ParameterizeThreads(); + params.max_trials = 20; + } + }}; + + const auto& ParameterizeMeasurementWindow{[&]() { + SUBCASE("window 1000") + { + ParameterizeTrials(); + params.measurement_window_ms = 1000; + } + SUBCASE("window 10000") + { + ParameterizeTrials(); + params.measurement_window_ms = 10000; + } + SUBCASE("window 500") + { + ParameterizeTrials(); + params.measurement_window_ms = 500; + } + }}; + + ParameterizeMeasurementWindow(); + + TestRequestRateManager trrm( + params, is_sequence, is_decoupled, use_mock_infer); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestSchedule(rate, params); +} + +/// Check that the correct inference function calls +/// are used given different param values for async and stream +/// +TEST_CASE("request_rate_infer_type") +{ + bool async; + bool stream; + + SUBCASE("async_stream") + { + async = true; + stream = true; + } + SUBCASE("async_no_stream") + { + async = true; + stream = false; + } + SUBCASE("no_async_stream") + { + async = false; + stream = true; + } + SUBCASE("no_async_no_stream") + { + async = false; + stream = false; + } + + PerfAnalyzerParameters params; + params.async = async; + params.streaming = stream; + + TestRequestRateManager trrm(params, false); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestInferType(); +} + +/// Check that the request distribution is correct for +/// different Distribution types +/// +TEST_CASE("request_rate_distribution") +{ + PerfAnalyzerParameters params; + uint request_rate = 500; + uint duration_ms = 1000; + + SUBCASE("constant") + { + params.request_distribution = CONSTANT; + } + SUBCASE("poisson") + { + params.request_distribution = POISSON; + } + + TestRequestRateManager trrm(params); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestDistribution(request_rate, duration_ms); +} + +/// Check that the request distribution is correct +/// for the case where the measurement window is tiny. 
+/// +TEST_CASE("request_rate_tiny_window") +{ + PerfAnalyzerParameters params; + params.request_distribution = CONSTANT; + params.measurement_window_ms = 10; + params.max_trials = 100; + uint request_rate = 500; + uint duration_ms = 1000; + + + SUBCASE("one_thread") + { + params.max_threads = 1; + } + SUBCASE("odd_threads") + { + params.max_threads = 9; + } + + + TestRequestRateManager trrm(params); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestDistribution(request_rate, duration_ms); +} + +/// Check that the schedule properly handles mid-test +/// update to the request rate +/// +TEST_CASE("request_rate_multiple") +{ + PerfAnalyzerParameters params{}; + TestRequestRateManager trrm(PerfAnalyzerParameters{}); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestMultipleRequestRate(); +} + +/// Check that the inference requests for sequences +/// follow all rules and parameters +/// +TEST_CASE("request_rate_sequence") +{ + PerfAnalyzerParameters params = TestLoadManagerBase::GetSequenceTestParams(); + bool verify_seq_balance = false; + bool check_expected_count = true; + bool is_sequence_model = true; + + TestRequestRateManager trrm(params, is_sequence_model); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestSequences(verify_seq_balance, check_expected_count); +} + +TEST_CASE("request_rate_serial_sequences") +{ + PerfAnalyzerParameters params; + params.serial_sequences = true; + bool verify_seq_balance = false; + bool check_expected_count = true; + bool is_sequence_model = true; + + const auto& ParameterizeDistribution{[&]() { + SUBCASE("Constant") + { + params.request_distribution = CONSTANT; + } + SUBCASE("Poisson") + { + params.request_distribution = POISSON; + check_expected_count = false; + } + }}; + + SUBCASE("num seqs 7, threads 4") + { + verify_seq_balance = true; + params.sequence_length = 100; + params.num_of_sequences = 7; + params.max_threads = 4; + ParameterizeDistribution(); + } + SUBCASE("num seqs 13, threads 5") + { + verify_seq_balance = true; + params.sequence_length = 100; + params.num_of_sequences = 13; + params.max_threads = 5; + ParameterizeDistribution(); + } + + TestRequestRateManager trrm(params, is_sequence_model); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestSequences(verify_seq_balance, check_expected_count); +} + +TEST_CASE("request_rate max inflight per seq") +{ + // Confirm that we can have multiple inferences in-flight for a given sequence + // unless in serial-sequence mode + PerfAnalyzerParameters params; + bool is_sequence_model = true; + params.num_of_sequences = 2; + size_t rate = 1000; + size_t time_ms = 10; + + bool expect_multiple_in_flight_sequences = false; + + SUBCASE("sync will never have multiple in 
flight") + { + params.async = false; + expect_multiple_in_flight_sequences = false; + + SUBCASE("serial_sequences on") + { + params.serial_sequences = true; + } + SUBCASE("serial_sequences off") + { + params.serial_sequences = false; + } + } + SUBCASE("async may have multiple in flight depending on serial sequences") + { + params.async = true; + + SUBCASE("serial_sequences on") + { + params.serial_sequences = true; + expect_multiple_in_flight_sequences = false; + } + SUBCASE("serial_sequences off") + { + params.serial_sequences = false; + expect_multiple_in_flight_sequences = true; + } + } + + TestRequestRateManager trrm(params, is_sequence_model); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + trrm.stats_->SetDelays({100}); + + trrm.ChangeRequestRate(rate); + std::this_thread::sleep_for(std::chrono::milliseconds(time_ms)); + + auto max_observed_inflight = + trrm.stats_->sequence_status.max_inflight_seq_count; + + if (expect_multiple_in_flight_sequences) { + CHECK(max_observed_inflight > 1); + } else { + CHECK(max_observed_inflight == 1); + } + + trrm.StopWorkerThreads(); +} + + +TEST_CASE("request_rate_streaming: test that streaming-specific logic works") +{ + bool is_sequence = false; + bool is_decoupled; + bool expected_enable_stats_value; + + SUBCASE("enable_stats true") + { + is_decoupled = false; + expected_enable_stats_value = true; + } + SUBCASE("enable_stats false") + { + is_decoupled = true; + expected_enable_stats_value = false; + } + + PerfAnalyzerParameters params{}; + params.streaming = true; + + RateSchedulePtr_t schedule = std::make_shared(); + schedule->intervals = NanoIntervals{nanoseconds(1)}; + schedule->duration = nanoseconds{1}; + + std::shared_ptr thread_stat{std::make_shared()}; + std::shared_ptr thread_config{ + std::make_shared(0)}; + + TestRequestRateManager trrm(params, is_sequence, is_decoupled); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + auto worker = trrm.MakeWorker(thread_stat, thread_config); + std::dynamic_pointer_cast(worker)->SetSchedule(schedule); + std::future infer_future{std::async(&IWorker::Infer, worker)}; + + early_exit = true; + infer_future.get(); + + CHECK( + trrm.stats_->start_stream_enable_stats_value == + expected_enable_stats_value); +} + +TEST_CASE( + "custom_json_data: Check custom json data to ensure that it is processed " + "correctly") +{ + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = false; + model_tensor1.is_shape_tensor_ = false; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.name_ = "INPUT2"; + + size_t num_requests = 4; + size_t num_threads = 1; + std::string json_str; + + const auto& ParameterizeTensors{[&]() { + SUBCASE("one tensor") + { + tensors.push_back(model_tensor1); + + json_str = R"({ + "data": [ + { "INPUT1": [1] }, + { "INPUT1": [2] }, + 
{ "INPUT1": [3] } + ]})"; + + switch (params.batch_size) { + case 1: + expected_results = {{1}, {2}, {3}, {1}}; + break; + case 2: + expected_results = {{1, 2}, {3, 1}, {2, 3}, {1, 2}}; + break; + case 4: + expected_results = { + {1, 2, 3, 1}, {2, 3, 1, 2}, {3, 1, 2, 3}, {1, 2, 3, 1}}; + break; + default: + REQUIRE(false); + } + } + SUBCASE("two tensors") + { + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + json_str = R"({ + "data": [ + { "INPUT1": [1], "INPUT2": [21] }, + { "INPUT1": [2], "INPUT2": [22] }, + { "INPUT1": [3], "INPUT2": [23] } + ]})"; + + switch (params.batch_size) { + case 1: + expected_results = {{1, 21}, {2, 22}, {3, 23}, {1, 21}}; + break; + case 2: + expected_results = { + {1, 2, 21, 22}, {3, 1, 23, 21}, {2, 3, 22, 23}, {1, 2, 21, 22}}; + break; + case 4: + expected_results = { + {1, 2, 3, 1, 21, 22, 23, 21}, + {2, 3, 1, 2, 22, 23, 21, 22}, + {3, 1, 2, 3, 23, 21, 22, 23}, + {1, 2, 3, 1, 21, 22, 23, 21}}; + break; + default: + REQUIRE(false); + } + } + }}; + + const auto& ParameterizeBatchSize{[&]() { + SUBCASE("batchsize = 1") + { + params.batch_size = 1; + ParameterizeTensors(); + } + SUBCASE("batchsize = 2") + { + params.batch_size = 2; + ParameterizeTensors(); + } + SUBCASE("batchsize = 4") + { + params.batch_size = 4; + ParameterizeTensors(); + } + }}; + + const auto& ParameterizeSharedMemory{[&]() { + SUBCASE("no_shared_memory") + { + params.shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY; + ParameterizeBatchSize(); + } + SUBCASE("system_shared_memory") + { + params.shared_memory_type = SharedMemoryType::SYSTEM_SHARED_MEMORY; + ParameterizeBatchSize(); + } + SUBCASE("cuda_shared_memory") + { + params.shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY; + ParameterizeBatchSize(); + } + }}; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + ParameterizeSharedMemory(); + } + SUBCASE("2 threads") + { + num_threads = 2; + ParameterizeSharedMemory(); + } + }}; + + ParameterizeNumThreads(); + + TestRequestRateManager trrm(params, is_sequence_model); + + trrm.TestCustomData( + num_requests, num_threads, tensors, json_str, expected_results, + expect_init_failure, expect_thread_failure); +} + +TEST_CASE("custom_json_data: handling is_shape_tensor") +{ + // Test the case where is_shape_tensor is true and is the same + // across a batch: it only ends up in each batch once + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = false; + model_tensor1.is_shape_tensor_ = false; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.name_ = "INPUT2"; + + std::string json_str{R"({ + "data": [ + { "INPUT1": [1], "INPUT2": [21] }, + { "INPUT1": [1], "INPUT2": [22] }, + { "INPUT1": [1], "INPUT2": [23] } + ]})"}; + + model_tensor1.is_shape_tensor_ = true; + model_tensor2.is_optional_ = true; + + size_t num_requests = 4; + size_t num_threads = 1; + + const auto& ParameterizeBatch{[&]() { + SUBCASE("batch 1") + { + params.batch_size = 1; + expected_results = {{1, 21}, {1, 22}, {1, 23}, {1, 21}}; + } + SUBCASE("batch 2") + { + params.batch_size = 2; + expected_results = {{1, 21, 22}, {1, 23, 21}, {1, 22, 23}, {1, 21, 22}}; + } + SUBCASE("batch 4") + { + 
params.batch_size = 4; + expected_results = { + {1, 21, 22, 23, 21}, + {1, 22, 23, 21, 22}, + {1, 23, 21, 22, 23}, + {1, 21, 22, 23, 21}}; + } + }}; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + ParameterizeBatch(); + } + SUBCASE("2 threads") + { + num_threads = 2; + ParameterizeBatch(); + } + }}; + + // Being optional should have no impact + SUBCASE("optional = 0,0") + { + model_tensor1.is_optional_ = false; + model_tensor2.is_optional_ = false; + ParameterizeNumThreads(); + } + SUBCASE("optional = 0,1") + { + model_tensor1.is_optional_ = false; + model_tensor2.is_optional_ = true; + ParameterizeNumThreads(); + } + SUBCASE("optional = 1,0") + { + model_tensor1.is_optional_ = true; + model_tensor2.is_optional_ = false; + ParameterizeNumThreads(); + } + SUBCASE("optional = 1,1") + { + model_tensor1.is_optional_ = true; + model_tensor2.is_optional_ = true; + ParameterizeNumThreads(); + } + + + TestRequestRateManager trrm(params, is_sequence_model); + + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + trrm.TestCustomData( + num_requests, num_threads, tensors, json_str, expected_results, + expect_init_failure, expect_thread_failure); +} + +TEST_CASE("custom_json_data: handling missing optional is_shape_tensor") +{ + // Test the case where is_shape_tensor is true and is_optional_ is true + // and data for that input is completely omitted + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = true; + model_tensor1.is_shape_tensor_ = true; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.is_shape_tensor_ = false; + model_tensor2.is_optional_ = false; + model_tensor2.name_ = "INPUT2"; + + std::string json_str{R"({ + "data": [ + { "INPUT2": [21] }, + { "INPUT2": [22] }, + { "INPUT2": [23] } + ]})"}; + + + size_t num_requests = 4; + size_t num_threads = 1; + + const auto& ParameterizeBatch{[&]() { + SUBCASE("batch 1") + { + params.batch_size = 1; + expected_results = {{21}, {22}, {23}, {21}}; + } + SUBCASE("batch 2") + { + params.batch_size = 2; + expected_results = {{21, 22}, {23, 21}, {22, 23}, {21, 22}}; + } + SUBCASE("batch 4") + { + params.batch_size = 4; + expected_results = { + {21, 22, 23, 21}, + {22, 23, 21, 22}, + {23, 21, 22, 23}, + {21, 22, 23, 21}}; + } + }}; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + ParameterizeBatch(); + } + SUBCASE("2 threads") + { + num_threads = 2; + ParameterizeBatch(); + } + }}; + + SUBCASE("no shm") + { + params.shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY; + ParameterizeNumThreads(); + } + SUBCASE("system shm") + { + params.shared_memory_type = SharedMemoryType::SYSTEM_SHARED_MEMORY; + ParameterizeNumThreads(); + expect_init_failure = true; + } + SUBCASE("cuda shm") + { + params.shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY; + ParameterizeNumThreads(); + expect_init_failure = true; + } + + TestRequestRateManager trrm(params, is_sequence_model); + + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + trrm.TestCustomData( + num_requests, num_threads, tensors, json_str, expected_results, + expect_init_failure, expect_thread_failure); +} + 
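// Note on the parameterization used by the custom_json_data tests above and
// below: each Parameterize* lambda declares its own SUBCASEs and then calls the
// next lambda, so doctest re-enters the TEST_CASE once per leaf combination and
// the lambdas together form a cross-product of batch sizes, thread counts, and
// shared-memory modes. The following is a minimal, self-contained sketch of
// that pattern only; the test name, variables, and values are illustrative and
// are not part of this change.

#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN  // only needed when built standalone
#include "doctest.h"

TEST_CASE("subcase cross-product sketch")
{
  int batch_size{0};
  int num_threads{0};

  // Innermost parameter: picks a batch size.
  const auto& ParameterizeBatch{[&]() {
    SUBCASE("batch 1") { batch_size = 1; }
    SUBCASE("batch 2") { batch_size = 2; }
  }};

  // Outer parameter: picks a thread count, then forwards to the next lambda.
  const auto& ParameterizeNumThreads{[&]() {
    SUBCASE("1 thread")
    {
      num_threads = 1;
      ParameterizeBatch();
    }
    SUBCASE("2 threads")
    {
      num_threads = 2;
      ParameterizeBatch();
    }
  }};

  ParameterizeNumThreads();

  // doctest reruns the TEST_CASE once per leaf subcase, so each of the
  // 2 x 2 = 4 thread/batch combinations executes exactly once.
  CHECK(batch_size > 0);
  CHECK(num_threads > 0);
}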
+TEST_CASE("custom_json_data: handling invalid is_shape_tensor") +{ + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = true; + model_tensor1.is_shape_tensor_ = true; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.name_ = "INPUT2"; + + size_t num_requests = 4; + size_t num_threads = 1; + + std::string json_str; + + + const auto& ParameterizeJson{[&]() { + SUBCASE("different data") + { + json_str = R"({ + "data": [ + { "INPUT1": [1], "INPUT2": [21] }, + { "INPUT1": [2], "INPUT2": [22] }, + { "INPUT1": [3], "INPUT2": [23] } + ]})"; + expected_results = {{1, 21}, {2, 22}, {3, 23}, {1, 21}}; + } + SUBCASE("missing data") + { + json_str = R"({ + "data": [ + { "INPUT2": [21] }, + { "INPUT2": [22] } + ]})"; + expected_results = {{21}, {22}, {21}, {22}}; + } + }}; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + ParameterizeJson(); + } + SUBCASE("2 threads") + { + num_threads = 2; + ParameterizeJson(); + } + }}; + + SUBCASE("no batching is ok") + { + params.batch_size = 1; + ParameterizeNumThreads(); + } + SUBCASE("batching - no shm") + { + params.batch_size = 2; + params.shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY; + expect_init_failure = true; + ParameterizeNumThreads(); + } + SUBCASE("batching - shm") + { + params.batch_size = 2; + params.shared_memory_type = SharedMemoryType::SYSTEM_SHARED_MEMORY; + expect_init_failure = true; + ParameterizeNumThreads(); + } + + TestRequestRateManager trrm(params, is_sequence_model); + + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + trrm.TestCustomData( + num_requests, num_threads, tensors, json_str, expected_results, + expect_init_failure, expect_thread_failure); +} + + +TEST_CASE("custom_json_data: handling of optional tensors") +{ + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = false; + model_tensor1.is_shape_tensor_ = false; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.name_ = "INPUT2"; + + std::string json_str{R"({ + "data": [ + { "INPUT1": [1] }, + { "INPUT1": [2], "INPUT2": [22] }, + { "INPUT1": [3] } + ]})"}; + + size_t num_requests = 4; + size_t num_threads = 1; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + } + SUBCASE("2 threads") + { + num_threads = 2; + } + }}; + + SUBCASE("normal") + { + model_tensor2.is_optional_ = true; + params.batch_size = 1; + expected_results = {{1}, {2, 22}, {3}, {1}}; + ParameterizeNumThreads(); + } + SUBCASE("tensor not optional -- expect parsing fail") + { + model_tensor2.is_optional_ = false; + expect_init_failure = true; + ParameterizeNumThreads(); + } + SUBCASE("shared memory not supported") + { + model_tensor2.is_optional_ = true; + params.shared_memory_type = SharedMemoryType::SYSTEM_SHARED_MEMORY; + // FIXME: TMA-765 - Shared memory mode does not 
support optional inputs, + // currently, and will be implemented in the associated story. + expect_init_failure = true; + ParameterizeNumThreads(); + } + SUBCASE("batching with mismatching data") + { + model_tensor2.is_optional_ = true; + params.batch_size = 2; + // For batch sizes larger than 1, the same set of inputs + // must be specified for each batch. You cannot use different + // set of optional inputs for each individual batch. + expect_init_failure = true; + ParameterizeNumThreads(); + } + + TestRequestRateManager trrm(params, is_sequence_model); + + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + trrm.TestCustomData( + num_requests, num_threads, tensors, json_str, expected_results, + expect_init_failure, expect_thread_failure); +} + +TEST_CASE("custom_json_data: multiple streams") +{ + PerfAnalyzerParameters params{}; + params.user_data = {"fake_file.json"}; + params.num_of_sequences = 1; + bool is_sequence_model{false}; + + std::vector> expected_results; + std::vector tensors; + bool expect_init_failure = false; + bool expect_thread_failure = false; + + ModelTensor model_tensor1{}; + model_tensor1.datatype_ = "INT32"; + model_tensor1.is_optional_ = false; + model_tensor1.is_shape_tensor_ = false; + model_tensor1.name_ = "INPUT1"; + model_tensor1.shape_ = {1}; + + ModelTensor model_tensor2 = model_tensor1; + model_tensor2.name_ = "INPUT2"; + + std::string json_str{R"({ + "data": [[ + { "INPUT1": [1], "INPUT2": [21] }, + { "INPUT1": [2], "INPUT2": [22] }, + { "INPUT1": [3], "INPUT2": [23] } + ],[ + { "INPUT1": [201], "INPUT2": [221] }, + { "INPUT1": [202], "INPUT2": [222] } + ]]})"}; + + size_t num_requests = 10; + size_t num_threads = 1; + + const auto& ParameterizeMemory{[&]() { + SUBCASE("No shared memory") + { + params.shared_memory_type = NO_SHARED_MEMORY; + } + SUBCASE("system shared memory") + { + params.shared_memory_type = SYSTEM_SHARED_MEMORY; + } + SUBCASE("cuda shared memory") + { + params.shared_memory_type = CUDA_SHARED_MEMORY; + } + }}; + + const auto& ParameterizeNumThreads{[&]() { + SUBCASE("1 thread") + { + num_threads = 1; + ParameterizeMemory(); + } + SUBCASE("2 threads") + { + num_threads = 2; + ParameterizeMemory(); + } + }}; + + SUBCASE("yes sequence") + { + // Sequences will randomly pick among all streams + // (Although this test is hardcoded to pick ID 1 twice, and then ID 0 + // forever after) + is_sequence_model = true; + expected_results = {{201, 221}, {202, 222}, {201, 221}, {202, 222}, + {1, 21}, {2, 22}, {3, 23}, {1, 21}, + {2, 22}, {3, 23}}; + ParameterizeNumThreads(); + } + SUBCASE("no sequence") + { + // For the case of no sequences, only a single data stream is supported. 
The + // rest will be ignored + is_sequence_model = false; + expected_results = {{1, 21}, {2, 22}, {3, 23}, {1, 21}, {2, 22}, + {3, 23}, {1, 21}, {2, 22}, {3, 23}, {1, 21}}; + ParameterizeNumThreads(); + } + + TestRequestRateManager trrm(params, is_sequence_model); + + tensors.push_back(model_tensor1); + tensors.push_back(model_tensor2); + + trrm.CustomDataTestSetup(tensors, json_str, expect_init_failure, num_threads); + + if (is_sequence_model) { + // Force GetNewDataStreamId to return 1 twice and 0 every time after + EXPECT_CALL( + *std::dynamic_pointer_cast(trrm.sequence_manager_), + GetNewDataStreamId()) + .WillOnce(testing::Return(1)) + .WillOnce(testing::Return(1)) + .WillRepeatedly(testing::Return(0)); + } else { + // Expect that GetNewDataStreamId will never be called + EXPECT_CALL( + *std::dynamic_pointer_cast(trrm.sequence_manager_), + GetNewDataStreamId()) + .Times(0); + } + auto thread_status = + trrm.CustomDataTestSendRequests(num_requests, num_threads); + trrm.CustomDataTestCheckResults( + thread_status, expect_thread_failure, expected_results); +} + +/// Verify Shared Memory api calls +/// +TEST_CASE("Request rate - Shared memory methods") +{ + PerfAnalyzerParameters params; + bool is_sequence = false; + bool is_decoupled = false; + bool use_mock_infer = true; + + const std::string json_str{R"( + { + "data": [ + { + "INPUT0": [2123456789] + } + ] + } + )"}; + + + MockInputPipeline mip = TestLoadManagerBase::ProcessCustomJsonData(json_str); + + cb::MockClientStats::SharedMemoryStats expected_stats; + SUBCASE("System shared memory usage") + { + params.shared_memory_type = SYSTEM_SHARED_MEMORY; + TestRequestRateManager trrm( + params, is_sequence, is_decoupled, use_mock_infer); + + trrm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, trrm.factory_, mip.mock_data_loader_); + + trrm.parser_ = mip.mock_model_parser_; + trrm.data_loader_ = mip.mock_data_loader_; + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + expected_stats.num_unregister_all_shared_memory_calls = 1; + expected_stats.num_register_system_shared_memory_calls = 1; + expected_stats.num_create_shared_memory_region_calls = 1; + expected_stats.num_map_shared_memory_calls = 1; + trrm.CheckSharedMemory(expected_stats); + } + + SUBCASE("Cuda shared memory usage") + { + params.shared_memory_type = CUDA_SHARED_MEMORY; + TestRequestRateManager trrm( + params, is_sequence, is_decoupled, use_mock_infer); + + trrm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, trrm.factory_, mip.mock_data_loader_); + + trrm.parser_ = mip.mock_model_parser_; + trrm.data_loader_ = mip.mock_data_loader_; + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + expected_stats.num_unregister_all_shared_memory_calls = 1; + expected_stats.num_register_cuda_shared_memory_calls = 1; + 
trrm.CheckSharedMemory(expected_stats); + } + + SUBCASE("No shared memory usage") + { + params.shared_memory_type = NO_SHARED_MEMORY; + TestRequestRateManager trrm( + params, is_sequence, is_decoupled, use_mock_infer); + + trrm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, trrm.factory_, mip.mock_data_loader_); + + trrm.parser_ = mip.mock_model_parser_; + trrm.data_loader_ = mip.mock_data_loader_; + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + trrm.CheckSharedMemory(expected_stats); + } +} + +TEST_CASE("Request rate - Shared memory infer input calls") +{ + PerfAnalyzerParameters params{}; + bool is_sequence_model{false}; + + const auto& ParameterizeAsyncAndStreaming{[&]() { + SUBCASE("sync non-streaming") + { + params.async = false; + params.streaming = false; + } + SUBCASE("async non-streaming") + { + params.async = true; + params.streaming = false; + } + SUBCASE("async streaming") + { + params.async = true; + params.streaming = true; + } + }}; + + const auto& ParameterizeSequence{[&]() { + SUBCASE("non-sequence") + { + is_sequence_model = false; + ParameterizeAsyncAndStreaming(); + } + SUBCASE("sequence") + { + is_sequence_model = true; + params.num_of_sequences = 1; + ParameterizeAsyncAndStreaming(); + } + }}; + + const auto& ParameterizeMemory{[&]() { + SUBCASE("No shared memory") + { + params.shared_memory_type = NO_SHARED_MEMORY; + ParameterizeSequence(); + } + SUBCASE("system shared memory") + { + params.shared_memory_type = SYSTEM_SHARED_MEMORY; + ParameterizeSequence(); + } + SUBCASE("cuda shared memory") + { + params.shared_memory_type = CUDA_SHARED_MEMORY; + ParameterizeSequence(); + } + }}; + + ParameterizeMemory(); + TestRequestRateManager trrm(params, is_sequence_model); + + const std::string json_str{R"( + { + "data": [ + { + "INPUT0": [2000000000] + }, + { + "INPUT0": [2000000001] + } + ] + } + )"}; + MockInputPipeline mip = + TestLoadManagerBase::ProcessCustomJsonData(json_str, is_sequence_model); + + trrm.infer_data_manager_ = + MockInferDataManagerFactory::CreateMockInferDataManager( + params.max_threads, params.batch_size, params.shared_memory_type, + params.output_shm_size, params.request_parameters, + mip.mock_model_parser_, trrm.factory_, mip.mock_data_loader_); + + std::shared_ptr thread_stat{std::make_shared()}; + std::shared_ptr thread_config{ + std::make_shared(0)}; + + trrm.parser_ = mip.mock_model_parser_; + trrm.data_loader_ = mip.mock_data_loader_; + trrm.using_json_data_ = true; + trrm.execute_ = true; + trrm.batch_size_ = 1; + trrm.max_threads_ = 1; + + RateSchedulePtr_t schedule = std::make_shared(); + schedule->intervals = NanoIntervals{ + milliseconds(4), milliseconds(8), milliseconds(12), milliseconds(16)}; + schedule->duration = nanoseconds{16000000}; + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + trrm.start_time_ = std::chrono::steady_clock::now(); + + std::shared_ptr worker{trrm.MakeWorker(thread_stat, thread_config)}; + 
std::dynamic_pointer_cast(worker)->SetSchedule(schedule); + std::future infer_future{std::async(&IWorker::Infer, worker)}; + + std::this_thread::sleep_for(milliseconds(18)); + + early_exit = true; + infer_future.get(); + + const auto& actual_append_raw_calls{trrm.stats_->num_append_raw_calls}; + const auto& actual_set_shared_memory_calls{ + trrm.stats_->num_set_shared_memory_calls}; + + if (params.shared_memory_type == NO_SHARED_MEMORY) { + CHECK(actual_append_raw_calls > 0); + CHECK(actual_set_shared_memory_calls == 0); + } else { + CHECK(actual_append_raw_calls == 0); + CHECK(actual_set_shared_memory_calls > 0); + } +} + +TEST_CASE("request_rate_deadlock") +{ + PerfAnalyzerParameters params{}; + params.max_concurrency = 6; + bool is_sequence_model{true}; + bool some_infer_failures{false}; + + const auto& ParameterizeSync{[&]() { + SUBCASE("sync") + { + params.async = false; + params.streaming = false; + } + SUBCASE("aync no streaming") + { + params.async = true; + params.streaming = false; + } + SUBCASE("async streaming") + { + params.async = true; + params.streaming = true; + } + }}; + + const auto& ParameterizeThreads{[&]() { + SUBCASE("2 thread") + { + ParameterizeSync(); + params.max_threads = 2; + } + SUBCASE("10 thread") + { + ParameterizeSync(); + params.max_threads = 10; + } + }}; + + const auto& ParameterizeSequence{[&]() { + SUBCASE("non-sequence") + { + ParameterizeThreads(); + is_sequence_model = false; + } + SUBCASE("sequence") + { + ParameterizeThreads(); + is_sequence_model = true; + params.num_of_sequences = 3; + } + }}; + + const auto& ParameterizeFailures{[&]() { + SUBCASE("yes_failures") + { + some_infer_failures = true; + ParameterizeSequence(); + } + SUBCASE("no_failures") + { + some_infer_failures = false; + ParameterizeSequence(); + } + }}; + + std::vector delays; + + const auto& ParameterizeDelays{[&]() { + SUBCASE("no_delay") + { + delays = {0}; + ParameterizeFailures(); + } + SUBCASE("random_delay") + { + delays = {1, 5, 20, 4, 3}; + ParameterizeFailures(); + } + }}; + + ParameterizeDelays(); + + TestRequestRateManager trrm(params, is_sequence_model); + trrm.stats_->SetDelays(delays); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + // Sometimes have a request fail + if (some_infer_failures) { + trrm.stats_->SetReturnStatuses({true, true, true, false}); + } + + trrm.TestTimeouts(); +} + +TEST_CASE("request_rate_overhead") +{ + uint rate; + PerfAnalyzerParameters params{}; + SUBCASE("sync, rate 10") + { + params.async = false; + rate = 10; + } + SUBCASE("sync, rate 100") + { + params.async = false; + rate = 100; + } + SUBCASE("async, rate 10") + { + params.async = true; + rate = 10; + } + SUBCASE("async, rate 100") + { + params.async = true; + rate = 100; + } + TestRequestRateManager trrm(params, false); + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + trrm.TestOverhead(rate); +} + +std::chrono::steady_clock::time_point mk_start{}; + +TEST_CASE( + "send_request_rate_request_rate_manager: testing logic around detecting " + "send request count") +{ + PerfAnalyzerParameters params{}; + + std::vector delays; + bool is_sequence_model = false; + size_t rate = 
1000; + size_t time_ms = 50; + size_t expected_count = time_ms; + + SUBCASE("sync") + { + params.async = false; + delays = {0}; + } + SUBCASE("async - fast response") + { + params.async = true; + delays = {0}; + } + SUBCASE( + "async - slow response with sequences off should not slow down our send " + "rate") + { + params.async = true; + delays = {100}; + } + SUBCASE("async - slow response with sequences on") + { + is_sequence_model = true; + params.async = true; + params.num_of_sequences = 5; + delays = {100}; + + SUBCASE("send rate can be limited if serial sequences is on") + { + params.serial_sequences = true; + expected_count = params.num_of_sequences; + } + SUBCASE( + "send rate will not be affected by response time if serial sequences " + "is off") + { + params.serial_sequences = false; + } + } + + TestRequestRateManager trrm(params, is_sequence_model); + + trrm.stats_->SetDelays(delays); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + + trrm.ChangeRequestRate(rate); + std::this_thread::sleep_for(std::chrono::milliseconds(time_ms)); + const size_t num_sent_requests{trrm.GetAndResetNumSentRequests()}; + CHECK(num_sent_requests == doctest::Approx(expected_count).epsilon(0.1)); + + trrm.StopWorkerThreads(); +} + +TEST_CASE("request rate manager - Configure threads") +{ + PerfAnalyzerParameters params{}; + std::vector expected_config_values; + std::vector expected_number_of_sequences_owned_by_thread; + std::vector expected_seq_stat_index_offsets; + std::vector expected_num_requests; + bool is_sequence_model = true; + bool is_decoupled_model = false; + bool use_mock_infer = true; + size_t target_num_requests = 0; + + SUBCASE("normal") + { + params.max_threads = 4; + params.num_of_sequences = 4; + target_num_requests = 0; + + expected_number_of_sequences_owned_by_thread = {1, 1, 1, 1}; + expected_seq_stat_index_offsets = {0, 1, 2, 3}; + expected_num_requests = {0, 0, 0, 0}; + } + + SUBCASE("max_threads > num_seqs") + { + params.max_threads = 10; + params.num_of_sequences = 4; + target_num_requests = 8; + + expected_number_of_sequences_owned_by_thread = {1, 1, 1, 1}; + expected_seq_stat_index_offsets = {0, 1, 2, 3}; + expected_num_requests = {2, 2, 2, 2}; + } + + SUBCASE("num_seqs > max_threads") + { + params.max_threads = 4; + params.num_of_sequences = 10; + target_num_requests = 20; + + expected_number_of_sequences_owned_by_thread = {3, 3, 2, 2}; + expected_seq_stat_index_offsets = {0, 3, 6, 8}; + expected_num_requests = {5, 5, 5, 5}; + } + + SUBCASE("not divisible") + { + params.max_threads = 4; + params.num_of_sequences = 7; + target_num_requests = 13; + + expected_number_of_sequences_owned_by_thread = {2, 2, 2, 1}; + expected_seq_stat_index_offsets = {0, 2, 4, 6}; + expected_num_requests = {4, 3, 3, 3}; + } + + for (auto i = 0; i < expected_number_of_sequences_owned_by_thread.size(); + i++) { + ThreadConfig tc(i); + tc.num_sequences_ = expected_number_of_sequences_owned_by_thread[i]; + tc.seq_stat_index_offset_ = expected_seq_stat_index_offsets[i]; + tc.num_requests_ = expected_num_requests[i]; + + expected_config_values.push_back(tc); + } + TestRequestRateManager trrm( + params, is_sequence_model, is_decoupled_model, use_mock_infer); + trrm.TestConfigureThreads(expected_config_values, target_num_requests); +} + +TEST_CASE("request rate manager - Calculate thread ids") +{ + 
PerfAnalyzerParameters params{}; + bool is_sequence_model; + bool is_decoupled_model = false; + bool use_mock_infer = true; + std::vector expected_thread_ids; + + SUBCASE("normal, on sequence model") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 4; + expected_thread_ids = {0, 1, 2, 3}; + } + SUBCASE("normal, not sequence model") + { + is_sequence_model = false; + params.max_threads = 4; + params.num_of_sequences = 4; + expected_thread_ids = {0, 1, 2, 3}; + } + SUBCASE("num_seq > max_threads, on sequence model") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 5; + expected_thread_ids = {0, 1, 2, 3, 0}; + } + SUBCASE("num_seq > max_threads, not sequence model") + { + is_sequence_model = false; + params.max_threads = 4; + params.num_of_sequences = 5; + expected_thread_ids = {0, 1, 2, 3}; + } + SUBCASE("max_threads > num_seq, on sequence model") + { + is_sequence_model = true; + params.max_threads = 5; + params.num_of_sequences = 4; + expected_thread_ids = {0, 1, 2, 3}; + } + SUBCASE("max_threads > num_seq, not sequence model") + { + is_sequence_model = false; + params.max_threads = 5; + params.num_of_sequences = 4; + expected_thread_ids = {0, 1, 2, 3, 4}; + } + SUBCASE("large example") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 7; + expected_thread_ids = {0, 1, 2, 3, 0, 1, 2}; + } + + TestRequestRateManager trrm( + params, is_sequence_model, is_decoupled_model, use_mock_infer); + trrm.TestCalculateThreadIds(expected_thread_ids); +} + +TEST_CASE("request rate create schedule") +{ + PerfAnalyzerParameters params; + params.measurement_window_ms = 1000; + params.max_trials = 10; + bool is_sequence_model = false; + bool is_decoupled = false; + bool use_mock_infer = false; + double rate = 10; + std::vector expected_worker_ratio; + + SUBCASE("num_seq > max_threads, on sequence model, CONSTANT") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 5; + expected_worker_ratio = {2, 1, 1, 1}; + } + + SUBCASE("num_seq = 7, max_threads = 4, on sequence model, CONSTANT") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 7; + expected_worker_ratio = {2, 2, 2, 1}; + } + + SUBCASE("num_seq = 4, max_threads = 2, on sequence model, CONSTANT") + { + is_sequence_model = true; + params.max_threads = 2; + params.num_of_sequences = 4; + expected_worker_ratio = {1, 1}; + } + + SUBCASE("num_seq > max_threads, on sequence model, POISSON") + { + is_sequence_model = true; + params.max_threads = 4; + params.num_of_sequences = 5; + expected_worker_ratio = {2, 1, 1, 1}; + params.request_distribution = POISSON; + } + + TestRequestRateManager trrm( + params, is_sequence_model, is_decoupled, use_mock_infer); + + trrm.InitManager( + params.string_length, params.string_data, params.zero_input, + params.user_data, params.start_sequence_id, params.sequence_id_range, + params.sequence_length, params.sequence_length_specified, + params.sequence_length_variation); + trrm.TestCreateSchedule(rate, params, expected_worker_ratio); +} +}} // namespace triton::perfanalyzer diff --git a/test_sequence_manager.cc b/test_sequence_manager.cc new file mode 100644 index 00000000..243500b8 --- /dev/null +++ b/test_sequence_manager.cc @@ -0,0 +1,298 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "doctest.h" +#include "mock_data_loader.h" +#include "mock_sequence_manager.h" +#include "sequence_manager.h" + +namespace triton { namespace perfanalyzer { + +TEST_CASE("get_sequence_id: testing the GetSequenceID function") +{ + MockSequenceManager msm{}; + + std::shared_ptr sequence_status{ + std::make_shared(5)}; + + msm.sequence_statuses_.push_back(sequence_status); + + CHECK(msm.GetSequenceID(0) == 5); +} + +TEST_CASE( + "test_set_infer_sequence_options: testing the SetInferSequenceOptions " + "function") +{ + const uint64_t seq_id{5}; + std::vector> sequence_statuses{ + std::make_shared(seq_id)}; + std::uniform_int_distribution distribution(0, 0); + const uint64_t start_sequence_id{1}; + const uint64_t sequence_id_range{UINT32_MAX}; + const size_t sequence_length{20}; + const bool sequence_length_specified{false}; + const double sequence_length_variation{0.0}; + bool using_json_data{false}; + std::shared_ptr data_loader{ + std::make_shared()}; + const uint32_t seq_stat_index{0}; + const std::string model_name{"model"}; + std::unique_ptr options{ + std::make_unique(model_name)}; + + SUBCASE("start false, end false") + { + sequence_statuses[seq_stat_index]->remaining_queries_ = 2; + + MockSequenceManager msm( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data, + data_loader); + msm.sequence_statuses_ = sequence_statuses; + msm.curr_seq_id_ = 5; + + msm.SetInferSequenceOptions(seq_stat_index, options); + + CHECK(options->sequence_start_ == false); + CHECK(options->sequence_id_ == 5); + CHECK(options->sequence_end_ == false); + } + SUBCASE("start true, end false") + { + sequence_statuses[seq_stat_index]->remaining_queries_ = 0; + + MockSequenceManager msm( + start_sequence_id, sequence_id_range, sequence_length, + sequence_length_specified, sequence_length_variation, using_json_data, + data_loader); + msm.sequence_statuses_ = sequence_statuses; + msm.curr_seq_id_ = 5; + + msm.SetInferSequenceOptions(seq_stat_index, options); + + 
+    CHECK(options->sequence_start_ == true);
+    CHECK(options->sequence_id_ == 6);
+    CHECK(options->sequence_end_ == false);
+  }
+  SUBCASE("start false, end true")
+  {
+    sequence_statuses[seq_stat_index]->remaining_queries_ = 1;
+
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 5;
+
+    msm.SetInferSequenceOptions(seq_stat_index, options);
+
+    CHECK(options->sequence_start_ == false);
+    CHECK(options->sequence_id_ == 5);
+    CHECK(options->sequence_end_ == true);
+  }
+  SUBCASE("start true, end true")
+  {
+    sequence_statuses[seq_stat_index]->remaining_queries_ = 0;
+    using_json_data = true;
+    data_loader->step_num_.push_back(1);
+    data_loader->data_stream_cnt_ = 1;
+
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 5;
+
+    msm.SetInferSequenceOptions(seq_stat_index, options);
+
+    CHECK(options->sequence_start_ == true);
+    CHECK(options->sequence_id_ == 6);
+    CHECK(options->sequence_end_ == true);
+  }
+}
+
+TEST_CASE("init_new_sequence: testing the InitNewSequence function")
+{
+  const uint64_t seq_id{5};
+  std::vector<std::shared_ptr<SequenceStatus>> sequence_statuses{
+      std::make_shared<SequenceStatus>(seq_id)};
+  std::uniform_int_distribution<uint64_t> distribution(0, 0);
+  const uint64_t start_sequence_id{1};
+  const uint64_t sequence_id_range{UINT32_MAX};
+  size_t sequence_length{20};
+  bool sequence_length_specified{false};
+  const double sequence_length_variation{0.0};
+  bool using_json_data{false};
+  std::shared_ptr<MockDataLoader> data_loader{
+      std::make_shared<MockDataLoader>()};
+  int seq_stat_index{0};
+  size_t expected_sequence_length{0};
+
+  SUBCASE("not using json data")
+  {
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 5;
+
+    msm.InitNewSequence(seq_stat_index);
+
+    CHECK(msm.sequence_statuses_[seq_stat_index]->seq_id_ == 6);
+    CHECK(msm.sequence_statuses_[seq_stat_index]->remaining_queries_ > 0);
+  }
+
+  SUBCASE("using json data")
+  {
+    using_json_data = true;
+    data_loader->step_num_.push_back(5);
+    data_loader->data_stream_cnt_ = 1;
+
+    SUBCASE("sequence length not specified")
+    {
+      sequence_length_specified = false;
+      expected_sequence_length = 5;
+    }
+
+    SUBCASE("sequence length specified, smaller than input data")
+    {
+      sequence_length_specified = true;
+      sequence_length = 4;
+      expected_sequence_length = 4;
+    }
+
+    SUBCASE("sequence length specified, larger than input data")
+    {
+      sequence_length_specified = true;
+      sequence_length = 6;
+      expected_sequence_length = 6;
+    }
+
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 5;
+
+    msm.InitNewSequence(seq_stat_index);
+
+    CHECK(msm.sequence_statuses_[seq_stat_index]->seq_id_ == 6);
+    CHECK(
+        msm.sequence_statuses_[seq_stat_index]->remaining_queries_ ==
+        expected_sequence_length);
+    CHECK(
+        msm.sequence_statuses_[seq_stat_index]->sequence_length_ ==
+        expected_sequence_length);
+  }
+}
+
+TEST_CASE("get_next_seq_id: testing the GetNextSeqId function")
+{
+  std::vector<std::shared_ptr<SequenceStatus>> sequence_statuses{};
+  std::uniform_int_distribution<uint64_t> distribution(0, 0);
+  uint64_t start_sequence_id{0};
+  uint64_t sequence_id_range{0};
+  const size_t sequence_length{20};
+  const bool sequence_length_specified{false};
+  const double sequence_length_variation{0.0};
+  const bool using_json_data{false};
+  std::shared_ptr<MockDataLoader> data_loader{
+      std::make_shared<MockDataLoader>()};
+  int seq_stat_index{0};
+
+  SUBCASE("next sequence id not in use")
+  {
+    sequence_statuses.push_back(std::make_shared<SequenceStatus>(1));
+    start_sequence_id = 1;
+    sequence_id_range = 2;
+
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 3;
+
+    uint64_t result{msm.GetNextSeqId(seq_stat_index)};
+
+    CHECK(result == 2);
+  }
+
+  SUBCASE("next sequence id in use")
+  {
+    sequence_statuses.push_back(std::make_shared<SequenceStatus>(1));
+    sequence_statuses.push_back(std::make_shared<SequenceStatus>(2));
+    start_sequence_id = 1;
+    sequence_id_range = 2;
+
+    MockSequenceManager msm(
+        start_sequence_id, sequence_id_range, sequence_length,
+        sequence_length_specified, sequence_length_variation, using_json_data,
+        data_loader);
+    msm.sequence_statuses_ = sequence_statuses;
+    msm.curr_seq_id_ = 3;
+
+    uint64_t result{msm.GetNextSeqId(seq_stat_index)};
+
+    CHECK(result == 1);
+  }
+}
+
+TEST_CASE(
+    "get_random_sequence_length: testing the GetRandomSequenceLength function")
+{
+  std::vector<std::shared_ptr<SequenceStatus>> sequence_statuses{};
+  std::uniform_int_distribution<uint64_t> distribution(0, 0);
+  const uint64_t start_sequence_id{0};
+  const uint64_t sequence_id_range{0};
+  size_t sequence_length{20};
+  const bool sequence_length_specified{false};
+  const double sequence_length_variation{0.0};
+  const bool using_json_data{false};
+  std::shared_ptr<MockDataLoader> data_loader{
+      std::make_shared<MockDataLoader>()};
+  int seq_stat_index{0};
+  double offset_ratio{0.2};
+
+  MockSequenceManager msm(
+      start_sequence_id, sequence_id_range, sequence_length,
+      sequence_length_specified, sequence_length_variation, using_json_data,
+      data_loader);
+  msm.sequence_statuses_ = sequence_statuses;
+  msm.curr_seq_id_ = 3;
+
+  uint64_t result{msm.GetRandomSequenceLength(offset_ratio)};
+
+  CHECK(result >= 16);
+  CHECK(result <= 24);
+}
+
+}} // namespace triton::perfanalyzer
diff --git a/test_utils.h b/test_utils.h
new file mode 100644
index 00000000..168aba71
--- /dev/null
+++ b/test_utils.h
@@ -0,0 +1,112 @@
+// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+namespace triton { namespace perfanalyzer {
+
+/// This class will create a thread that will raise an error after a fixed
+/// amount of time, unless the stop function is called.
+///
+/// It can be used to detect livelock/deadlock cases in tests so that the test
+/// will be guaranteed to finish instead of hang
+///
+class TestWatchDog {
+ public:
+  /// Create the watchdog
+  ///
+  /// @param max_time_ms How long (in milliseconds) until this watchdog will
+  /// raise an error
+  TestWatchDog(unsigned int max_time_ms) { start(max_time_ms); }
+
+  /// Stop the watchdog so that it will not raise any errors
+  ///
+  void stop()
+  {
+    running_ = false;
+    thread_.join();
+  }
+
+ private:
+  uint sleep_interval_ms{40};
+  uint max_time_ms_;
+  std::atomic<uint> timer_;
+  std::atomic<bool> running_;
+  std::thread thread_;
+
+  void start(unsigned int max_time_ms)
+  {
+    max_time_ms_ = max_time_ms;
+    timer_ = 0;
+    running_ = true;
+    thread_ = std::thread(&TestWatchDog::loop, this);
+  }
+
+  void loop()
+  {
+    while (running_) {
+      if (timer_ >= max_time_ms_) {
+        running_ = false;
+        REQUIRE_MESSAGE(false, "WATCHDOG TIMEOUT!");
+      }
+
+      std::this_thread::sleep_for(
+          std::chrono::milliseconds(sleep_interval_ms));
+      timer_ += sleep_interval_ms;
+    }
+  }
+};
+
+/// Calculate the average of a vector of integers
+///
+static double
+CalculateAverage(const std::vector<int64_t>& values)
+{
+  double avg =
+      std::accumulate(values.begin(), values.end(), 0.0) / values.size();
+  return avg;
+}
+
+/// Calculate the variance of a vector of integers
+///
+static double
+CalculateVariance(const std::vector<int64_t>& values, double average)
+{
+  double tmp = 0;
+  for (auto value : values) {
+    tmp += (value - average) * (value - average) / values.size();
+  }
+  double variance = std::sqrt(tmp);
+  return variance;
+}
+
+}} // namespace triton::perfanalyzer
diff --git a/thread_config.h b/thread_config.h
new file mode 100644
index 00000000..4c4845a6
--- /dev/null
+++ b/thread_config.h
@@ -0,0 +1,58 @@
+// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+namespace triton { namespace perfanalyzer {
+
+// Holds the configuration for a worker thread
+struct ThreadConfig {
+  ThreadConfig(size_t thread_id) : thread_id_(thread_id) {}
+
+  // ID of corresponding worker thread
+  size_t thread_id_{0};
+
+  // The concurrency level that the worker should produce
+  // TPA-69: This is only used in concurrency mode and shouldn't be visible in
+  // other modes
+  size_t concurrency_{0};
+
+  // The number of sequences owned by this worker
+  // TPA-69: This is only used in request-rate mode and shouldn't be visible in
+  // other modes
+  uint32_t num_sequences_{1};
+
+  // How many requests to generate before stopping. If 0, generate indefinitely
+  size_t num_requests_{0};
+
+  // The starting sequence stat index for this worker
+  size_t seq_stat_index_offset_{0};
+
+  // Whether the thread is paused and should not issue new inference requests
+  bool is_paused_{false};
+};
+
+
+}} // namespace triton::perfanalyzer
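
For reference, a minimal sketch of how `TestWatchDog` from `test_utils.h` might guard a test against hanging; the test name, the 1000 ms budget, and the empty body are hypothetical, and it assumes `doctest.h` is available as in the test sources above.

```cpp
// Illustrative only: a hypothetical doctest case guarded by TestWatchDog.
#include "doctest.h"
#include "test_utils.h"

namespace triton { namespace perfanalyzer {

TEST_CASE("watchdog_example: fail instead of hang")
{
  // Raise a test failure (via REQUIRE_MESSAGE) if the body runs past ~1 second.
  TestWatchDog watchdog(1000);

  // ... exercise code here that could livelock or deadlock ...

  // Disarm the watchdog once the body has finished normally.
  watchdog.stop();
}

}}  // namespace triton::perfanalyzer
```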
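Similarly, a sketch of how per-worker `ThreadConfig` objects could be populated in concurrency mode; the helper name `MakeThreadConfigs` and the even-split policy are assumptions for illustration, not the actual manager logic.

```cpp
// Illustrative only: split a target concurrency of N across worker threads.
#include <cstddef>
#include <memory>
#include <vector>

#include "thread_config.h"

using triton::perfanalyzer::ThreadConfig;

std::vector<std::shared_ptr<ThreadConfig>>
MakeThreadConfigs(size_t num_threads, size_t total_concurrency)
{
  std::vector<std::shared_ptr<ThreadConfig>> configs;
  for (size_t id = 0; id < num_threads; id++) {
    auto config = std::make_shared<ThreadConfig>(id);
    // Give each worker an even share, spreading any remainder over the
    // first (total_concurrency % num_threads) workers.
    config->concurrency_ = total_concurrency / num_threads +
                           (id < total_concurrency % num_threads ? 1 : 0);
    configs.push_back(config);
  }
  return configs;
}
```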