Skip to content

Commit

Permalink
Merge branch 'main' into andy-tests-review
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyDai-nv authored Jul 25, 2024
2 parents aab2ae6 + c514dea commit 6eb20e8
Show file tree
Hide file tree
Showing 38 changed files with 1,698 additions and 405 deletions.
7 changes: 6 additions & 1 deletion src/c++/perf_analyzer/client_backend/openai/openai_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ namespace openai {
// Hands the currently buffered response to response_callback_.
//
// \param is_final True when this is the last response for the request.
// \param is_null  Forwarded unchanged to the ChatCompletionResult.
//
// The callback receives a newly allocated ChatCompletionResult; this function
// does not free it.
void
ChatCompletionRequest::SendResponse(bool is_final, bool is_null)
{
  // Keep the flag sticky: once a final response has gone out, a later
  // non-final call must not clear it, otherwise IsFinalResponseSent() would
  // report false again and a second final response could be sent on
  // request completion.
  final_response_sent_ = final_response_sent_ || is_final;
  // response_buffer_ is moved into the result, so it is left in a moved-from
  // state after this call.
  response_callback_(new ChatCompletionResult(
      http_code_, std::move(response_buffer_), is_final, is_null, request_id_));
}
Expand Down Expand Up @@ -172,7 +173,11 @@ ChatCompletionClient::AsyncInfer(
request->timer_.CaptureTimestamp(
triton::client::RequestTimers::Kind::REQUEST_END);
UpdateInferStat(request->timer_);
if (!request->is_stream_) {

// Send the final response on request completion unless it has
// already been sent (e.g. when [DONE] was already seen in the
// streaming case).
if (!request->IsFinalResponseSent()) {
request->SendResponse(true /* is_final */, false /* is_null */);
}
};
Expand Down
2 changes: 2 additions & 0 deletions src/c++/perf_analyzer/client_backend/openai/openai_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,14 @@ class ChatCompletionRequest : public HttpRequest {
request_id_(request_id)
{
}
bool IsFinalResponseSent() { return final_response_sent_; };
// Invokes response_callback_ with a new ChatCompletionResult carrying the
// given is_final / is_null flags (defined in openai_client.cc).
void SendResponse(bool is_final, bool is_null);
// Presumably true for streaming requests -- confirm against where it is set.
bool is_stream_{false};
// Callback that SendResponse() invokes with each result.
std::function<void(InferResult*)> response_callback_{nullptr};
// The timers for the infer request.
triton::client::RequestTimers timer_;
// Identifier passed to every ChatCompletionResult created for this request.
const std::string request_id_;
// Updated by SendResponse() from its is_final argument and read through
// IsFinalResponseSent().
bool final_response_sent_{false};
};

class ChatCompletionClient : public HttpClient {
Expand Down
3 changes: 2 additions & 1 deletion src/c++/perf_analyzer/command_line_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1715,7 +1715,8 @@ CLParser::ParseCommandLine(int argc, char** argv)

// Override the max_threads default when targeting concurrency and the
// user did not specify max_threads
if (!params_->max_threads_specified && params_->targeting_concurrency()) {
params_->max_threads = 16;
params_->max_threads =
std::max(DEFAULT_MAX_THREADS, params_->concurrency_range.end);
}

if (params_->using_custom_intervals) {
Expand Down
1 change: 1 addition & 0 deletions src/c++/perf_analyzer/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ constexpr static const uint32_t STABILITY_ERROR = 2;
constexpr static const uint32_t OPTION_ERROR = 3;

constexpr static const uint32_t GENERIC_ERROR = 99;
// Default thread count; the command-line parser uses it as the lower bound
// for max_threads when targeting concurrency without an explicit max_threads.
constexpr static const size_t DEFAULT_MAX_THREADS = 16;

const double DELAY_PCT_THRESHOLD{1.0};

Expand Down
7 changes: 7 additions & 0 deletions src/c++/perf_analyzer/docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ will also be reported in the results.
Default is `-1` indicating that the average latency is used to determine
stability.

#### `--request-count=<n>`

Specifies a total number of requests to use for measurement.

Default is `0`, which means that no request count is set and the measurement
will proceed using measurement windows until stabilization is detected.

#### `-r <n>`
#### `--max-trials=<n>`

Expand Down
Loading

0 comments on commit 6eb20e8

Please sign in to comment.