
Commit

Merge pull request #3 from triton-inference-server/fpetrini-migrate-pa-history

Migrate PA History
fpetrini15 authored Jun 27, 2024
2 parents dfeb676 + 7a31081 commit 548d4b7
Showing 202 changed files with 33,670 additions and 3,785 deletions.
87 changes: 85 additions & 2 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -33,6 +33,16 @@ else()

add_subdirectory(client_backend)

find_package(Git REQUIRED)

execute_process(WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
COMMAND "${GIT_EXECUTABLE}" log -n 1 --abbrev-commit --format=format:%h
RESULT_VARIABLE RETURN_CODE
OUTPUT_VARIABLE GIT_SHA)
if(NOT RETURN_CODE EQUAL "0")
set(GIT_SHA "unknown")
endif()

set(
PERF_ANALYZER_SRCS
command_line_parser.cc
@@ -43,11 +53,23 @@ set(
data_loader.cc
concurrency_manager.cc
request_rate_manager.cc
load_worker.cc
concurrency_worker.cc
request_rate_worker.cc
custom_load_manager.cc
infer_context.cc
inference_profiler.cc
report_writer.cc
mpi_utils.cc
metrics_manager.cc
infer_data_manager_base.cc
infer_data_manager.cc
infer_data_manager_shm.cc
sequence_manager.cc
profile_data_collector.cc
profile_data_exporter.cc
periodic_concurrency_manager.cc
periodic_concurrency_worker.cc
)

set(
@@ -61,13 +83,36 @@ set(
concurrency_manager.h
request_rate_manager.h
custom_load_manager.h
iworker.h
load_worker.h
request_rate_worker.h
concurrency_worker.h
infer_context.h
inference_profiler.h
report_writer.h
mpi_utils.h
doctest.h
constants.h
metrics.h
metrics_manager.h
infer_data_manager_factory.h
iinfer_data_manager.h
infer_data_manager.h
infer_data_manager_shm.h
infer_data_manager_base.h
infer_data.h
sequence_manager.h
sequence_status.h
ictx_id_tracker.h
concurrency_ctx_id_tracker.h
fifo_ctx_id_tracker.h
rand_ctx_id_tracker.h
request_record.h
profile_data_collector.h
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
thread_config.h
)

add_executable(
@@ -85,6 +130,13 @@ target_link_libraries(
${CMAKE_DL_LIBS}
)

target_compile_definitions(
perf_analyzer
PRIVATE
PERF_ANALYZER_VERSION=${PERF_ANALYZER_VERSION}
GIT_SHA=${GIT_SHA}
)

# If gpu is enabled then compile with CUDA dependencies
if(TRITON_ENABLE_GPU)
target_compile_definitions(
@@ -119,6 +171,13 @@ if(TRITON_ENABLE_PERF_ANALYZER_TS)
)
endif()

if(TRITON_ENABLE_PERF_ANALYZER_OPENAI)
target_compile_definitions(
client-backend-library
PUBLIC TRITON_ENABLE_PERF_ANALYZER_OPENAI=1
)
endif()

install(
TARGETS perf_analyzer
RUNTIME DESTINATION bin
@@ -142,19 +201,43 @@ add_executable(
${PERF_ANALYZER_UNIT_TESTS_SRCS}
${PERF_ANALYZER_UNIT_TESTS_HDRS}
mock_inference_profiler.h
mock_model_parser.h
test_utils.h
client_backend/mock_client_backend.h
mock_concurrency_worker.h
mock_data_loader.h
mock_infer_context.h
mock_infer_data_manager.h
mock_request_rate_worker.h
mock_sequence_manager.h
mock_profile_data_collector.h
mock_profile_data_exporter.h
test_dataloader.cc
test_inference_profiler.cc
test_command_line_parser.cc
test_idle_timer.cc
test_load_manager_base.h
test_load_manager.cc
test_model_parser.cc
test_metrics_manager.cc
test_perf_utils.cc
test_report_writer.cc
client_backend/triton/test_triton_client_backend.cc
test_request_rate_manager.cc
test_concurrency_manager.cc
test_custom_load_manager.cc
test_sequence_manager.cc
test_infer_context.cc
test_ctx_id_tracker.cc
test_profile_data_collector.cc
test_profile_data_exporter.cc
$<TARGET_OBJECTS:json-utils-library>
)

# -Wno-write-strings is needed for the unit tests in order to statically create
# input argv cases in the CommandLineParser unit test
#
-set_target_properties(perf_analyzer_unit_tests
+set_target_properties(perf_analyzer_unit_tests
PROPERTIES COMPILE_FLAGS "-Wno-write-strings")

target_link_libraries(
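
A note on the `GIT_SHA=${GIT_SHA}` definition added above: the value arrives in
C++ as a bare preprocessor token, so consuming code typically stringifies it
before printing. A minimal sketch of that pattern follows; the `PA_STR` helper
macros and the `main` function are illustrative, not code from this commit.

```cpp
#include <iostream>

// Two-step stringification: the outer macro expands GIT_SHA (e.g. 548d4b7)
// before the inner macro turns it into a string literal.
#define PA_STR_IMPL(x) #x
#define PA_STR(x) PA_STR_IMPL(x)

#ifndef GIT_SHA
#define GIT_SHA unknown  // mirrors the CMake fallback when `git log` fails
#endif

int main()
{
  std::cout << "git sha: " << PA_STR(GIT_SHA) << std::endl;
  return 0;
}
```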
171 changes: 171 additions & 0 deletions README.md
@@ -0,0 +1,171 @@
<!--
Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->

# Triton Performance Analyzer

Triton Performance Analyzer is a CLI tool that helps you optimize the
inference performance of models running on Triton Inference Server by measuring
changes in performance as you experiment with different optimization strategies.

<br>

# Features

### Inference Load Modes

- [Concurrency Mode](docs/inference_load_modes.md#concurrency-mode) simulates
  load by maintaining a specific concurrency of outgoing requests to the
  server

- [Request Rate Mode](docs/inference_load_modes.md#request-rate-mode) simulates
load by sending consecutive requests at a specific rate to the server

- [Custom Interval Mode](docs/inference_load_modes.md#custom-interval-mode)
simulates load by sending consecutive requests at specific intervals to the
server
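
For illustration, one example invocation per mode. These are sketches: the
flag spellings follow the Perf Analyzer CLI documentation, so verify them
against `perf_analyzer --help` for your build.

```bash
# Concurrency Mode: sweep from 1 to 4 concurrent outstanding requests
perf_analyzer -m simple --concurrency-range 1:4

# Request Rate Mode: sweep from 100 to 200 requests/second in steps of 50
perf_analyzer -m simple --request-rate-range 100:200:50

# Custom Interval Mode: replay request intervals (in microseconds) from a
# file; intervals.txt is a placeholder path
perf_analyzer -m simple --request-intervals intervals.txt
```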

### Performance Measurement Modes

- [Time Windows Mode](docs/measurements_metrics.md#time-windows) measures model
performance repeatedly over a specific time interval until performance has
stabilized

- [Count Windows Mode](docs/measurements_metrics.md#count-windows) measures
model performance repeatedly over a specific number of requests until
performance has stabilized
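
The measurement mode is selectable from the CLI as well; a hedged sketch,
with the window sizes shown being the documented defaults:

```bash
# Time Windows Mode: repeat 5-second (5000 ms) measurement windows until
# results stabilize
perf_analyzer -m simple --measurement-mode time_windows --measurement-interval 5000

# Count Windows Mode: repeat windows of at least 50 requests until results
# stabilize
perf_analyzer -m simple --measurement-mode count_windows --measurement-request-count 50
```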

### Other Features

- [Sequence Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models),
[Ensemble Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models),
and
[Decoupled Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)
can be profiled in addition to standard/stateless/coupled models

- [Input Data](docs/input_data.md) to model inferences can be auto-generated or
  specified, and model output can be verified; an example input file is
  sketched below, after this list

- [TensorFlow Serving](docs/benchmarking.md#benchmarking-tensorflow-serving) and
[TorchServe](docs/benchmarking.md#benchmarking-torchserve) can be used as the
inference server in addition to the default Triton server
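
As noted in the Input Data item above, inputs can be supplied from a JSON
file. A hedged sketch for the `simple` example model; its two `[1,16]` INT32
inputs, `INPUT0` and `INPUT1`, are assumptions based on the example model
repository, and [Input Data](docs/input_data.md) has the authoritative format.

```bash
# Write a hypothetical input_data.json and point perf_analyzer at it
cat > input_data.json <<'EOF'
{
  "data": [
    {
      "INPUT0": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
      "INPUT1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    }
  ]
}
EOF

perf_analyzer -m simple --input-data input_data.json
```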

<br>

# Quick Start

The steps below will guide you on how to start using Perf Analyzer.

### Step 1: Start Triton Container

```bash
export RELEASE=<yy.mm> # e.g. for the February 2023 release, `export RELEASE=23.02`

docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3

docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3
```

### Step 2: Download `simple` Model

```bash
# inside triton container
git clone --depth 1 https://github.com/triton-inference-server/server

mkdir model_repository ; cp -r server/docs/examples/model_repository/simple model_repository
```

### Step 3: Start Triton Server

```bash
# inside triton container
tritonserver --model-repository $(pwd)/model_repository &> server.log &

# confirm server is ready, look for 'HTTP/1.1 200 OK'
curl -v localhost:8000/v2/health/ready

# detach (CTRL-p CTRL-q)
```

### Step 4: Start Triton SDK Container

```bash
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk

docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
```

### Step 5: Run Perf Analyzer

```bash
# inside sdk container
perf_analyzer -m simple
```

See the full [quick start guide](docs/quick_start.md) for additional tips on
how to analyze output.

<br>

# Documentation

- [Installation](docs/install.md)
- [Perf Analyzer CLI](docs/cli.md)
- [Inference Load Modes](docs/inference_load_modes.md)
- [Input Data](docs/input_data.md)
- [Measurements & Metrics](docs/measurements_metrics.md)
- [Benchmarking](docs/benchmarking.md)

<br>

# Contributing

Contributions to Triton Perf Analyzer are more than welcome. To contribute,
please review the [contribution
guidelines](https://github.com/triton-inference-server/server/blob/main/CONTRIBUTING.md),
then fork and create a pull request.

<br>

# Reporting problems, asking questions

We appreciate any feedback, questions, or bug reports regarding this
project. When you need help with code, follow the process outlined in
the Stack Overflow document (https://stackoverflow.com/help/mcve).
Ensure posted examples are:

- minimal - use as little code as possible that still produces the
same problem

- complete - provide all parts needed to reproduce the problem. Check
whether you can strip external dependencies and still show the problem. The
less time we spend on reproducing problems, the more time we have to
fix them

- verifiable - test the code you're about to provide to make sure it
reproduces the problem. Remove any other issues that are not
related to your request/question.
67 changes: 67 additions & 0 deletions base_queue_ctx_id_tracker.h
@@ -0,0 +1,67 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <queue>

#include "ictx_id_tracker.h"

namespace triton { namespace perfanalyzer {

// Base class for CtxIdTrackers that track available IDs via a queue
//
class BaseQueueCtxIdTracker : public ICtxIdTracker {
public:
BaseQueueCtxIdTracker() = default;

void Restore(size_t id) override { free_ctx_ids_.push(id); }

size_t Get() override
{
if (!IsAvailable()) {
throw std::runtime_error("free ctx id list is empty");
}

size_t ctx_id = free_ctx_ids_.front();
free_ctx_ids_.pop();
return ctx_id;
}

bool IsAvailable() override { return free_ctx_ids_.size() > 0; }

protected:
std::queue<size_t> free_ctx_ids_;

// Erase all entries in the tracking queue
//
void Clear()
{
std::queue<size_t> empty;
std::swap(free_ctx_ids_, empty);
}
};

}}; // namespace triton::perfanalyzer
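
For orientation, a hedged usage sketch of a tracker derived from this base
class. The `Reset` method is an assumption (presumably declared on
`ICtxIdTracker` alongside `Get`, `Restore`, and `IsAvailable`), and
`ExampleFifoCtxIdTracker` is hypothetical; the commit's real derived classes
live in `fifo_ctx_id_tracker.h` and `rand_ctx_id_tracker.h`.

```cpp
#include <cstddef>
#include <iostream>

#include "base_queue_ctx_id_tracker.h"

namespace triton { namespace perfanalyzer {

// Hypothetical FIFO tracker: Reset seeds the queue with IDs 0..count-1,
// so Get hands them out in insertion order.
class ExampleFifoCtxIdTracker : public BaseQueueCtxIdTracker {
 public:
  void Reset(size_t count)
  {
    Clear();
    for (size_t i = 0; i < count; i++) {
      free_ctx_ids_.push(i);
    }
  }
};

}}  // namespace triton::perfanalyzer

int main()
{
  triton::perfanalyzer::ExampleFifoCtxIdTracker tracker;
  tracker.Reset(2);           // IDs 0 and 1 become available
  size_t id = tracker.Get();  // takes 0 from the front of the queue
  tracker.Restore(id);        // pushes 0 back onto the queue
  std::cout << tracker.IsAvailable() << std::endl;  // prints 1
  return 0;
}
```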