Skip to content

Commit

Permalink
Remove periodic concurrency feature
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewkotila committed Feb 1, 2024
1 parent 24c1ff7 commit ca1fec6
Show file tree
Hide file tree
Showing 15 changed files with 15 additions and 842 deletions.
6 changes: 1 addition & 5 deletions src/c++/perf_analyzer/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -68,8 +68,6 @@ set(
sequence_manager.cc
profile_data_collector.cc
profile_data_exporter.cc
periodic_concurrency_manager.cc
periodic_concurrency_worker.cc
)

set(
Expand Down Expand Up @@ -110,8 +108,6 @@ set(
request_record.h
profile_data_collector.h
profile_data_exporter.h
periodic_concurrency_manager.h
periodic_concurrency_worker.h
)

add_executable(
Expand Down
121 changes: 3 additions & 118 deletions src/c++/perf_analyzer/command_line_parser.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -113,8 +113,6 @@ CLParser::Usage(const std::string& msg)
std::cerr << "\t--measurement-interval (-p) <measurement window (in msec)>"
<< std::endl;
std::cerr << "\t--concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--periodic-concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--request-period <number of responses>" << std::endl;
std::cerr << "\t--request-rate-range <start:end:step>" << std::endl;
std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">"
<< std::endl;
Expand Down Expand Up @@ -301,34 +299,6 @@ CLParser::Usage(const std::string& msg)
"not be 0 for sequence models while using asynchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--periodic-concurrency-range <start:end:step>: Determines the "
"range of concurrency levels in the similar but slightly "
"different manner as the --concurrency-range. Perf Analyzer will "
"start from the concurrency level of 'start' and increase by "
"'step' each time. Unlike --concurrency-range, the 'end' "
"indicates the *total* number of concurrency since the 'start' "
"(including) and will stop increasing once the cumulative number "
"of concurrent requests has reached the 'end'. The user can "
"specify *when* to periodically increase the concurrency level "
"using the --request-period option. The concurrency level will "
"periodically increase for every n-th response specified by "
"--request-period. Since this disables stability check in Perf "
"Analyzer and reports response timestamps only, the user must "
"provide --profile-export-file to specify where to dump all the "
"measured timestamps. The default values of 'start', 'end', and "
"'step' are 1.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-period <n>: Indicates the number of responses that "
"each request must receive before new, concurrent requests are "
"sent when --periodic-concurrency-range is specified. Default "
"value is 10.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--request-parameter <name:value:type>: Specifies a custom "
Expand Down Expand Up @@ -872,9 +842,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
{"output-tensor-format", required_argument, 0, 56},
{"version", no_argument, 0, 57},
{"profile-export-file", required_argument, 0, 58},
{"periodic-concurrency-range", required_argument, 0, 59},
{"request-period", required_argument, 0, 60},
{"request-parameter", required_argument, 0, 61},
{"request-parameter", required_argument, 0, 59},
{0, 0, 0, 0}};

// Parse commandline...
Expand Down Expand Up @@ -1538,56 +1506,6 @@ CLParser::ParseCommandLine(int argc, char** argv)
break;
}
case 59: {
params_->is_using_periodic_concurrency_mode = true;
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() < 2) {
Usage(
"Failed to parse --periodic-concurrency-range. Both <start> "
"and <end> values must be provided.");
} else if (values.size() > 3) {
Usage(
"Failed to parse --periodic-concurrency-range. The value does "
"not match <start:end:step>.");
}

for (size_t i = 0; i < values.size(); ++i) {
uint64_t val = std::stoull(values[i]);
if (i == 0) {
params_->periodic_concurrency_range.start = val;
} else if (i == 1) {
params_->periodic_concurrency_range.end = val;
} else if (i == 2) {
params_->periodic_concurrency_range.step = val;
}
}

Range<uint64_t> range{params_->periodic_concurrency_range};
if (range.step == 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be > 0.");
} else if (range.start > range.end) {
Usage(
"Failed to parse --periodic-concurrency-range. The <start> "
"must be <= <end>.");
} else if ((range.end - range.start) % range.step != 0) {
Usage(
"Failed to parse --periodic-concurrency-range. The <step> "
"value must be a factor of the range size (<end> - <start>).");
}
break;
}
case 60: {
std::string request_period{optarg};
if (std::stoi(request_period) > 0) {
params_->request_period = std::stoull(request_period);
} else {
Usage("Failed to parse --request-period. The value must be > 0");
}
break;
}
case 61: {
std::string arg = optarg;
std::vector<std::string> values{SplitString(arg)};
if (values.size() != 3) {
Expand Down Expand Up @@ -1766,46 +1684,13 @@ CLParser::VerifyOptions()
}

std::vector<bool> load_modes{
params_->is_using_periodic_concurrency_mode,
params_->using_concurrency_range, params_->using_request_rate_range,
params_->using_custom_intervals};
if (std::count(load_modes.begin(), load_modes.end(), true) > 1) {
Usage(
"Cannot specify more then one inference load mode. Please choose only "
"one of the following modes: --concurrency-range, "
"--periodic-concurrency-range, --request-rate-range, or "
"--request-intervals.");
}

if (params_->is_using_periodic_concurrency_mode && !params_->streaming) {
Usage(
"The --periodic-concurrency-range option requires bi-directional gRPC "
"streaming.");
}

if (params_->is_using_periodic_concurrency_mode &&
(params_->profile_export_file == "")) {
Usage(
"Must provide --profile-export-file when using the "
"--periodic-concurrency-range option.");
}

if (params_->is_using_periodic_concurrency_mode) {
if (params_->periodic_concurrency_range.end == pa::NO_LIMIT) {
std::cerr
<< "WARNING: The maximum attainable concurrency will be limited by "
"max_threads specification."
<< std::endl;
params_->periodic_concurrency_range.end = params_->max_threads;
} else {
if (params_->max_threads_specified) {
std::cerr << "WARNING: Overriding max_threads specification to ensure "
"requested concurrency range."
<< std::endl;
}
params_->max_threads = std::max(
params_->max_threads, params_->periodic_concurrency_range.end);
}
"--request-rate-range, or --request-intervals.");
}

if (params_->request_parameters.size() > 0 &&
Expand Down
9 changes: 2 additions & 7 deletions src/c++/perf_analyzer/command_line_parser.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -131,8 +131,7 @@ struct PerfAnalyzerParameters {
{
return (
using_concurrency_range || using_old_options ||
!(using_request_rate_range || using_custom_intervals ||
is_using_periodic_concurrency_mode));
!(using_request_rate_range || using_custom_intervals));
}

// Sets the threshold for PA client overhead.
Expand All @@ -150,10 +149,6 @@ struct PerfAnalyzerParameters {

// The profile export file path.
std::string profile_export_file{""};

bool is_using_periodic_concurrency_mode{false};
Range<uint64_t> periodic_concurrency_range{1, 1, 1};
uint64_t request_period{10};
};

using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
Expand Down
26 changes: 1 addition & 25 deletions src/c++/perf_analyzer/docs/cli.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -180,30 +180,6 @@ until the latency threshold is met. 'end' and `--latency-threshold` cannot
both be `0`. 'end' cannot be `0` for sequence models while using asynchronous
mode.

#### `--periodic-concurrency-range=<start:end:step>`

Specifies the range of concurrency levels in a similar but slightly different
manner from `--concurrency-range`. Perf Analyzer will start from the
concurrency level of 'start' and increase by 'step' each time. Unlike
`--concurrency-range`, the 'end' indicates the *total* number of concurrency
since the 'start' (including) and will stop increasing once the cumulative
number of concurrent requests has reached the 'end'. The user can specify
*when* to periodically increase the concurrency level using the
`--request-period` option. The concurrency level will periodically increase for
every `n`-th response specified by `--request-period`. Since this disables
stability check in Perf Analyzer and reports response timestamps only, the user
must provide `--profile-export-file` to specify where to dump all the measured
timestamps.

The default values of 'start', 'end', and 'step' are `1`.

#### `--request-period=<n>`

Specifies the number of responses that each request must receive before new,
concurrent requests are sent when `--periodic-concurrency-range` is specified.

Default value is `10`.

#### `--request-parameter=<name:value:type>`

Specifies a custom parameter that can be sent to a Triton backend as part of
Expand Down
36 changes: 1 addition & 35 deletions src/c++/perf_analyzer/docs/inference_load_modes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -40,40 +40,6 @@ example, when using
will attempt to have 4 outgoing inference requests at all times during
profiling.

## Periodic Concurrency Mode

In periodic concurrency mode, Perf Analyzer will periodically launch a new set
of inference requests until the total number of inference requests that has been
launched since the beginning reaches N requests.

For example, when using `--periodic-concurrency-range 10:100:30`, Perf Analyzer
will start with 10 concurrent requests and for every step, it will launch 30 new
inference requests until the total number of requests launched since the
beginning reaches 100. Additionally, the user can also specify *when* to launch
the new requests by specifying `--request-period M`. This will set Perf Analyzer
to launch a new set of requests whenever *all* of the latest set of launched
concurrent requests received M number of responses back from the server.

The user can also specify custom parameters to the model using
`--request-parameter <name:value:type>` option.
For instance, passing `--request-parameter max_tokens:256:int` will set an
additional parameter `max_tokens` of type `int` to 256 as part of the request.

```bash
perf_analyzer -m <model_name> -i grpc --async --streaming \
--profile-export-file profile.json \
--periodic-concurrency-range 10:100:30 \
--request-period 10 \
--request-parameter max_tokens:256:int
```

> **Note**
>
> The periodic concurrency mode is currently supported only by gRPC protocol and
> with [decoupled models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md).
> Additionally, the user must also specify a file where Perf Analyzer could dump all the
> profiled data using `--profile-export-file`.
## Request Rate Mode

In request rate mode, Perf Analyzer attempts to send N inference requests per
Expand Down
79 changes: 1 addition & 78 deletions src/c++/perf_analyzer/docs/llm.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -224,80 +224,3 @@ python profile.py -m ensemble -b trtllm --prompt-size-range 100 500 200 --max-to
# * Avg first token latency: 16.0468 ms
# ...
```
## Benchmark 3: Profiling In-Flight Batching

In this benchmarking scenario, we want to measure the effect of in-flight
batch size on token-to-token (T2T) latency. We systematically issue requests to
the server of fixed input sizes and request the model to compute a fixed amount
of tokens in order to increase the in-flight batch size over time.

#### Example

In this benchmark, we will run Perf Analyzer in
[periodic concurrency mode](inference_load_modes.md#periodic-concurrency-mode)
that periodically launches a new concurrent request to the model using
`--periodic-concurrency-range START END STEP` option.
In this example, Perf Analyzer starts with a single request and launches the new
ones until the total number reaches 100.
You can also specify the timing of the new requests:
Setting `--request-period` to 32 (as shown below) will make Perf Analyzer
wait for all the requests to receive 32 responses before launching new requests.
Run the following command inside the client container.

```bash
# Install matplotlib to generate the benchmark plot
pip install matplotlib

# Run Perf Analyzer
# trtllm: -m ensemble -b trtllm
# vllm: -m vllm_model -b vllm
python profile.py -m ensemble -b trtllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos

# [ BENCHMARK SUMMARY ]
# Prompt size: 10
# * Max first token latency: 125.7212 ms
# * Min first token latency: 18.4281 ms
# * Avg first token latency: 61.8372 ms
# ...
# Saved in-flight batching benchmark plots @ 'inflight_batching_benchmark-*.png'.
```

The resulting plot will look like

<img src="examples/inflight_batching_benchmark.png" width="600">

The plot demonstrates how the average T2T latency changes across the entire
benchmark process as we increase the number of requests.
To observe the change, we first align the responses of every request and then
split them into multiple segments of responses.
For instance, assume we ran the following benchmark command:

```bash
# trtllm: -m ensemble -b trtllm
# vllm: -m vllm_model -b vllm
python profile.py -m ensemble -b trtllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
```

We start from a single request and increment up to 4 requests one by one for
every 32 responses (defined by `--request-period`).
For each request, there are a total of 1024 generated responses (defined by `--max-tokens`).
We align these 1024 generated responses and split them by request period,
giving us 1024/32 = 32 total segments per request as shown below:

```
32 responses (=request period)
┌────┐
request 1 ──────┊──────┊──────┊──────┊─ ··· ─┊──────┊
request 2 ┊──────┊──────┊──────┊─ ··· ─┊──────┊──────┊
request 3 ┊ ┊──────┊──────┊─ ··· ─┊──────┊──────┊──────┊
request 4 ┊ ┊ ┊──────┊─ ··· ─┊──────┊──────┊──────┊──────
segment # 1 2 3 4 ··· 32 33 34 35
```

Then for each segment, we compute the mean of T2T latencies of the responses.
This will allow us to visualize the change in T2T latency as the number of
requests increases, filling up the inflight batch slots, and as they terminate.
See [profile.py](examples/profile.py) for more details.

Loading

0 comments on commit ca1fec6

Please sign in to comment.