Remove periodic concurrency feature #464

Closed
6 changes: 1 addition & 5 deletions src/c++/perf_analyzer/CMakeLists.txt
@@ -1,4 +1,4 @@
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -68,8 +68,6 @@ set(
sequence_manager.cc
profile_data_collector.cc
profile_data_exporter.cc
periodic_concurrency_manager.cc
periodic_concurrency_worker.cc
)

set(
@@ -110,8 +108,6 @@ set(
request_record.h
profile_data_collector.h
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
)

add_executable(
121 changes: 3 additions & 118 deletions src/c++/perf_analyzer/command_line_parser.cc
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -113,8 +113,6 @@ CLParser::Usage(const std::string& msg)
std::cerr << "\t--measurement-interval (-p) <measurement window (in msec)>"
<< std::endl;
std::cerr << "\t--concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--periodic-concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--request-period <number of responses>" << std::endl;
std::cerr << "\t--request-rate-range <start:end:step>" << std::endl;
std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">"
<< std::endl;
@@ -301,34 +299,6 @@ CLParser::Usage(const std::string& msg)
"not be 0 for sequence models while using asynchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--periodic-concurrency-range <start:end:step>: Determines the "
"range of concurrency levels in the similar but slightly "
"different manner as the --concurrency-range. Perf Analyzer will "
"start from the concurrency level of 'start' and increase by "
"'step' each time. Unlike --concurrency-range, the 'end' "
"indicates the *total* number of concurrency since the 'start' "
"(including) and will stop increasing once the cumulative number "
"of concurrent requests has reached the 'end'. The user can "
"specify *when* to periodically increase the concurrency level "
"using the --request-period option. The concurrency level will "
"periodically increase for every n-th response specified by "
"--request-period. Since this disables stability check in Perf "
"Analyzer and reports response timestamps only, the user must "
"provide --profile-export-file to specify where to dump all the "
"measured timestamps. The default values of 'start', 'end', and "
"'step' are 1.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-period <n>: Indicates the number of responses that "
"each request must receive before new, concurrent requests are "
"sent when --periodic-concurrency-range is specified. Default "
"value is 10.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-parameter <name:value:type>: Specifies a custom "
@@ -872,9 +842,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"output-tensor-format", required_argument, 0, 56},
{"version", no_argument, 0, 57},
{"profile-export-file", required_argument, 0, 58},
{"periodic-concurrency-range", required_argument, 0, 59},
{"request-period", required_argument, 0, 60},
{"request-parameter", required_argument, 0, 61},
{"request-parameter", required_argument, 0, 59},
{0, 0, 0, 0}};

// Parse commandline...
@@ -1538,56 +1506,6 @@ CLParser::ParseCommandLine(int argc, char** argv)
break;
}
case 59: {
params_->is_using_periodic_concurrency_mode = true;
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() < 2) {
Usage(
"Failed to parse --periodic-concurrency-range. Both <start> "
"and <end> values must be provided.");
} else if (values.size() > 3) {
Usage(
"Failed to parse --periodic-concurrency-range. The value does "
"not match <start:end:step>.");
}

for (size_t i = 0; i < values.size(); ++i) {
uint64_t val = std::stoull(values[i]);
if (i == 0) {
params_->periodic_concurrency_range.start = val;
} else if (i == 1) {
params_->periodic_concurrency_range.end = val;
} else if (i == 2) {
params_->periodic_concurrency_range.step = val;
}
}

Range<uint64_t> range{params_->periodic_concurrency_range};
if (range.step == 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be > 0.");
} else if (range.start > range.end) {
Usage(
"Failed to parse --periodic-concurrency-range. The <start> "
"must be <= <end>.");
} else if ((range.end - range.start) % range.step != 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be a factor of the range size (<end> - <start>).");
}
break;
}
case 60: {
std::string request_period{optarg};
if (std::stoi(request_period) > 0) {
params_->request_period = std::stoull(request_period);
} else {
Usage("Failed to parse --request-period. The value must be > 0");
}
break;
}
case 61: {
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() != 3) {
@@ -1766,46 +1684,13 @@ CLParser::VerifyOptions()
}

std::vector<bool> load_modes{
params_->is_using_periodic_concurrency_mode,
params_->using_concurrency_range, params_->using_request_rate_range,
params_->using_custom_intervals};
if (std::count(load_modes.begin(), load_modes.end(), true) > 1) {
Usage(
"Cannot specify more then one inference load mode. Please choose only "
"one of the following modes: --concurrency-range, "
"--periodic-concurrency-range, --request-rate-range, or "
"--request-intervals.");
}

if (params_->is_using_periodic_concurrency_mode && !params_->streaming) {
Usage(
"The --periodic-concurrency-range option requires bi-directional gRPC "
"streaming.");
}

if (params_->is_using_periodic_concurrency_mode &&
(params_->profile_export_file == "")) {
Usage(
"Must provide --profile-export-file when using the "
"--periodic-concurrency-range option.");
}

if (params_->is_using_periodic_concurrency_mode) {
if (params_->periodic_concurrency_range.end == pa::NO_LIMIT) {
std::cerr
<< "WARNING: The maximum attainable concurrency will be limited by "
"max_threads specification."
<< std::endl;
params_->periodic_concurrency_range.end = params_->max_threads;
} else {
if (params_->max_threads_specified) {
std::cerr << "WARNING: Overriding max_threads specification to ensure "
"requested concurrency range."
<< std::endl;
}
params_->max_threads = std::max(
params_->max_threads, params_->periodic_concurrency_range.end);
}
"--request-rate-range, or --request-intervals.");
}

if (params_->request_parameters.size() > 0 &&
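For reference, the argument validation that the deleted `case 59` handler performed on `--periodic-concurrency-range` boils down to three checks. The sketch below is a standalone, illustrative reconstruction — the `Range` struct and function name are stand-ins rather than Perf Analyzer types — and is not part of this PR.

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Stand-in for Perf Analyzer's Range<uint64_t>; names are illustrative only.
struct Range {
  uint64_t start{1};
  uint64_t end{1};
  uint64_t step{1};
};

// Returns an error message if the range is invalid, or an empty string if it is valid.
// The checks mirror the ones in the deleted case 59 block above.
std::string ValidatePeriodicConcurrencyRange(const Range& range)
{
  if (range.step == 0) {
    return "The <step> value must be > 0.";
  }
  if (range.start > range.end) {
    return "The <start> must be <= <end>.";
  }
  if ((range.end - range.start) % range.step != 0) {
    return "The <step> value must be a factor of the range size (<end> - <start>).";
  }
  return "";
}

int main()
{
  const Range range{10, 100, 30};  // e.g. --periodic-concurrency-range 10:100:30
  const std::string error = ValidatePeriodicConcurrencyRange(range);
  if (!error.empty()) {
    std::cerr << "Failed to parse --periodic-concurrency-range. " << error << std::endl;
    return 1;
  }
  std::cout << "Range 10:100:30 is valid." << std::endl;
  return 0;
}
```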
9 changes: 2 additions & 7 deletions src/c++/perf_analyzer/command_line_parser.h
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -131,8 +131,7 @@ struct PerfAnalyzerParameters {
{
return (
using_concurrency_range || using_old_options ||
!(using_request_rate_range || using_custom_intervals ||
is_using_periodic_concurrency_mode));
!(using_request_rate_range || using_custom_intervals));
}

// Sets the threshold for PA client overhead.
@@ -150,10 +149,6 @@ struct PerfAnalyzerParameters

// The profile export file path.
std::string profile_export_file{""};

bool is_using_periodic_concurrency_mode{false};
Range<uint64_t> periodic_concurrency_range{1, 1, 1};
uint64_t request_period{10};
};

using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
Expand Down
26 changes: 1 addition & 25 deletions src/c++/perf_analyzer/docs/cli.md
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -180,30 +180,6 @@ until the latency threshold is met. 'end' and `--latency-threshold` cannot
both be `0`. 'end' cannot be `0` for sequence models while using asynchronous
mode.

#### `--periodic-concurrency-range=<start:end:step>`

Specifies the range of concurrency levels in a similar but slightly different
manner than `--concurrency-range`. Perf Analyzer will start from the
concurrency level of 'start' and increase it by 'step' each time. Unlike
`--concurrency-range`, the 'end' indicates the *total* number of concurrent
requests issued since (and including) the 'start', and Perf Analyzer stops
increasing the concurrency once the cumulative number of concurrent requests
has reached 'end'. The user can specify *when* to periodically increase the
concurrency level using the `--request-period` option: the concurrency level
increases after every `n`-th response specified by `--request-period`. Since
this disables the stability check in Perf Analyzer and reports response
timestamps only, the user must provide `--profile-export-file` to specify
where to dump all the measured timestamps.

The default values of 'start', 'end', and 'step' are `1`.

#### `--request-period=<n>`

Specifies the number of responses that each request must receive before new,
concurrent requests are sent when `--periodic-concurrency-range` is specified.

Default value is `10`.

#### `--request-parameter=<name:value:type>`

Specifies a custom parameter that can be sent to a Triton backend as part of
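As a reference for the removed documentation above, the sketch below (illustrative only, not Perf Analyzer code) computes the cumulative concurrency levels implied by a `<start:end:step>` setting.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Computes the sequence of cumulative concurrency levels implied by
// <start:end:step>, assuming step > 0. For 10:100:30 this yields 10, 40, 70, 100.
std::vector<uint64_t> ConcurrencySchedule(uint64_t start, uint64_t end, uint64_t step)
{
  std::vector<uint64_t> levels;
  for (uint64_t level = start; level <= end; level += step) {
    levels.push_back(level);  // a new batch of 'step' requests is launched at each level
  }
  return levels;
}

int main()
{
  // Equivalent of the removed --periodic-concurrency-range 10:100:30
  for (const uint64_t level : ConcurrencySchedule(10, 100, 30)) {
    std::cout << level << " concurrent requests" << std::endl;
  }
  return 0;
}
```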
36 changes: 1 addition & 35 deletions src/c++/perf_analyzer/docs/inference_load_modes.md
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -40,40 +40,6 @@ example, when using
will attempt to have 4 outgoing inference requests at all times during
profiling.

## Periodic Concurrency Mode

In periodic concurrency mode, Perf Analyzer periodically launches a new set of
inference requests until the total number of inference requests launched since
the beginning reaches N requests.

For example, when using `--periodic-concurrency-range 10:100:30`, Perf Analyzer
will start with 10 concurrent requests and, at every step, launch 30 new
inference requests until the total number of requests launched since the
beginning reaches 100. The user can also specify *when* to launch the new
requests with `--request-period M`, which makes Perf Analyzer launch a new set
of requests whenever *all* of the most recently launched concurrent requests
have received M responses back from the server.

The user can also pass custom parameters to the model using the
`--request-parameter <name:value:type>` option.
For instance, passing `--request-parameter max_tokens:256:int` will set an
additional parameter `max_tokens` of type `int` to 256 as part of the request.

```bash
perf_analyzer -m <model_name> -i grpc --async --streaming \
--profile-export-file profile.json \
--periodic-concurrency-range 10:100:30 \
--request-period 10 \
--request-parameter max_tokens:256:int
```

> **Note**
>
> The periodic concurrency mode is currently supported only with the gRPC protocol and
> with [decoupled models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md).
> The user must also specify a file where Perf Analyzer can dump all the
> profiled data using `--profile-export-file`.

## Request Rate Mode

In request rate mode, Perf Analyzer attempts to send N inference requests per
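For reference, the `--request-period` gating described in the removed section above can be sketched as follows. The function and variable names are hypothetical and written only for illustration; this is not the actual Perf Analyzer worker code.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// responses_received[i] is the number of responses seen so far for the i-th
// request in the most recently launched set of concurrent requests.
bool ReadyToLaunchNextSet(
    const std::vector<std::size_t>& responses_received, std::size_t request_period)
{
  for (const std::size_t count : responses_received) {
    if (count < request_period) {
      return false;  // at least one request in the latest set is still behind
    }
  }
  return true;  // every request in the latest set has received M responses
}

int main()
{
  const std::size_t request_period = 10;  // e.g. --request-period 10
  std::cout << std::boolalpha
            << ReadyToLaunchNextSet({12, 10, 9}, request_period) << std::endl    // false
            << ReadyToLaunchNextSet({12, 10, 10}, request_period) << std::endl;  // true
  return 0;
}
```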
79 changes: 1 addition & 78 deletions src/c++/perf_analyzer/docs/llm.md
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -224,80 +224,3 @@ python profile.py -m ensemble -b trtllm --prompt-size-range 100 500 200 --max-to
# * Avg first token latency: 16.0468 ms
# ...
```

## Benchmark 3: Profiling In-Flight Batching

In this benchmarking scenario, we want to measure the effect of in-flight
batch size on token-to-token (T2T) latency. We systematically issue requests of
fixed input sizes to the server and ask the model to generate a fixed number of
tokens in order to increase the in-flight batch size over time.

#### Example

In this benchmark, we will run Perf Analyzer in
[periodic concurrency mode](inference_load_modes.md#periodic-concurrency-mode),
which periodically launches new concurrent requests to the model using the
`--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and keeps launching
new ones until the total number reaches 100.
You can also control the timing of the new requests: setting `--request-period`
to 32 (as shown below) makes Perf Analyzer wait for all requests to receive 32
responses before launching new ones.
Run the following command inside the client container.

```bash
# Install matplotlib to generate the benchmark plot
pip install matplotlib

# Run Perf Analyzer
# trtllm: -m ensemble -b trtllm
# vllm: -m vllm_model -b vllm
python profile.py -m ensemble -b trtllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

# [ BENCHMARK SUMMARY ]
# Prompt size: 10
# * Max first token latency: 125.7212 ms
# * Min first token latency: 18.4281 ms
# * Avg first token latency: 61.8372 ms
# ...
# Saved in-flight batching benchmark plots @ 'inflight_batching_benchmark-*.png'.
```

The resulting plot will look like

<img src="examples/inflight_batching_benchmark.png" width="600">

The plot demonstrates how the average T2T latency changes across the entire
benchmark process as we increase the number of requests.
To observe the change, we first align the responses of every request and then
split them into multiple segments of responses.
For instance, assume we ran the following benchmark command:

```bash
# trtllm: -m ensemble -b trtllm
# vllm: -m vllm_model -b vllm
python profile.py -m ensemble -b trtllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
```

We start from a single request and increment up to 4 requests, one at a time,
for every 32 responses (defined by `--request-period`).
Each request generates a total of 1024 responses (defined by `--max-tokens`).
We align these 1024 responses and split them by request period,
giving us 1024/32 = 32 segments per request, as shown below:

```
32 responses (=request period)
┌────┐
request 1 ──────┊──────┊──────┊──────┊─ ··· ─┊──────┊
request 2 ┊──────┊──────┊──────┊─ ··· ─┊──────┊──────┊
request 3 ┊ ┊──────┊──────┊─ ··· ─┊──────┊──────┊──────┊
request 4 ┊ ┊ ┊──────┊─ ··· ─┊──────┊──────┊──────┊──────

segment # 1 2 3 4 ··· 32 33 34 35
```

Then, for each segment, we compute the mean of the T2T latencies of the responses.
This allows us to visualize the change in T2T latency as the number of requests
increases, filling up the in-flight batch slots, and as the requests terminate.
See [profile.py](examples/profile.py) for more details.
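The per-segment averaging described above can be sketched as follows. This is an illustrative, standalone helper rather than the actual implementation in `profile.py`, and it assumes one new request is launched per request period (i.e. a step of 1).

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// t2t_latencies_ms[r][i] is the latency between response i and response i+1 of
// request r. Request r starts one request period after request r-1, so its
// local segment i lands in global segment r + i.
std::vector<double> SegmentMeanT2TLatency(
    const std::vector<std::vector<double>>& t2t_latencies_ms,
    std::size_t request_period)
{
  std::size_t num_segments = 0;
  for (std::size_t r = 0; r < t2t_latencies_ms.size(); ++r) {
    const std::size_t local_segments =
        (t2t_latencies_ms[r].size() + request_period - 1) / request_period;
    num_segments = std::max(num_segments, r + local_segments);
  }

  std::vector<double> sums(num_segments, 0.0);
  std::vector<std::size_t> counts(num_segments, 0);
  for (std::size_t r = 0; r < t2t_latencies_ms.size(); ++r) {
    for (std::size_t i = 0; i < t2t_latencies_ms[r].size(); ++i) {
      const std::size_t segment = r + i / request_period;
      sums[segment] += t2t_latencies_ms[r][i];
      counts[segment] += 1;
    }
  }

  std::vector<double> means(num_segments, 0.0);
  for (std::size_t s = 0; s < num_segments; ++s) {
    if (counts[s] > 0) {
      means[s] = sums[s] / counts[s];  // mean T2T latency of segment s
    }
  }
  return means;
}

int main()
{
  // Toy example: two requests, request period of 2 responses.
  const std::vector<std::vector<double>> latencies{
      {5.0, 6.0, 7.0, 8.0}, {9.0, 10.0, 11.0, 12.0}};
  for (const double mean : SegmentMeanT2TLatency(latencies, 2)) {
    std::cout << mean << " ms" << std::endl;  // prints 5.5, 8.5, 11.5
  }
  return 0;
}
```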
