diff --git a/README.md b/README.md
index a157ed61..b90edcfc 100644
--- a/README.md
+++ b/README.md
@@ -307,15 +307,21 @@ or left empty (false by default) in [model.json](https://github.com/triton-infer
 *Note:* vLLM metrics are not reported to Triton metrics server by default due to potential performance slowdowns.
 To enable vLLM model's metrics reporting, please add the following lines to its config.pbtxt as well.

-```bash
+```
 parameters: {
   key: "REPORT_CUSTOM_METRICS"
   value: {
-    string_value:"yes"
+    string_value: "true"
   }
 }
 ```

+## vLLM Engine Health Check (BETA)
+
+vLLM Engine Health Check can be enabled optionally so that the server
+reports a more accurate model state. See [the docs](docs/health_check.md)
+for more information.
+
 ## Referencing the Tutorial

 You can read further in the
diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh
index fd976d4a..5564fb12 100755
--- a/ci/L0_backend_vllm/metrics_test/test.sh
+++ b/ci/L0_backend_vllm/metrics_test/test.sh
@@ -86,26 +86,26 @@ RET=0
 copy_model_repository
 run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

-# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
+# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "false" in config.pbtxt
 copy_model_repository
 echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"no\"
+    string_value: \"false\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
 run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

-# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
+# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "true" in config.pbtxt
 copy_model_repository
 cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
 echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -120,7 +120,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -134,7 +134,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -146,7 +146,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py
new file mode 100644
index 00000000..263a42bd
--- /dev/null
+++ b/ci/L0_check_health_vllm/check_health_test.py
@@ -0,0 +1,128 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+
+
+class TestCheckHealth:
+    _grpc_url = "localhost:8001"
+    _model_name = "vllm_opt"
+    _sampling_parameters = {"temperature": "0", "top_p": "1"}
+    _prompt = "In this example,"
+
+    def _get_inputs(self, prompt, stream=True, sampling_parameters=None):
+        inputs = []
+
+        inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
+        inputs[-1].set_data_from_numpy(
+            np.array([prompt.encode("utf-8")], dtype=np.object_)
+        )
+
+        inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
+        inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool))
+
+        if sampling_parameters is not None:
+            inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES"))
+            inputs[-1].set_data_from_numpy(
+                np.array(
+                    [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_
+                )
+            )
+
+        return inputs
+
+    def _callback(self, result, error):
+        self._responses.append({"result": result, "error": error})
+
+    def _llm_infer(self):
+        inputs = self._get_inputs(
+            self._prompt, stream=True, sampling_parameters=self._sampling_parameters
+        )
+        self._responses = []
+        with grpcclient.InferenceServerClient(self._grpc_url) as client:
+            client.start_stream(self._callback)
+            client.async_stream_infer(
+                self._model_name, inputs=inputs, parameters=self._sampling_parameters
+            )
+            client.stop_stream()
+
+    def _assert_text_output_valid(self):
+        text_output = ""
+        for response in self._responses:
+            result, error = response["result"], response["error"]
+            assert error is None
+            text_output += result.as_numpy(name="text_output")[0].decode("utf-8")
+        assert len(text_output) > 0, "output is empty"
+        assert text_output.count(" ") > 4, "output is not a sentence"
+
+    def _assert_infer_exception(self, expected_exception_message):
+        assert len(self._responses) == 1
+        for response in self._responses:
+            result, error = response["result"], response["error"]
+            assert result is None
+            assert str(error) == expected_exception_message
+
+    def _assert_model_ready(self, expected_readiness):
+        with grpcclient.InferenceServerClient(self._grpc_url) as client:
+            # is_model_ready API
+            assert client.is_model_ready(self._model_name) == expected_readiness
+            # get_model_repository_index API
+            model_state = None
+            for model_index in client.get_model_repository_index().models:
+                if model_index.name == self._model_name:
+                    assert model_state is None, "duplicate model index found"
+                    model_state = model_index.state == "READY"
+            assert model_state == expected_readiness
+
+    def test_vllm_is_healthy(self):
+        num_repeats = 3
+        for i in range(num_repeats):
+            self._assert_model_ready(True)
+            self._llm_infer()
+            self._assert_text_output_valid()
+        self._assert_model_ready(True)
+
+    def test_vllm_not_healthy(self):
+        self._assert_model_ready(True)
+        # The 1st infer should complete successfully
+        self._llm_infer()
+        self._assert_text_output_valid()
+        self._assert_model_ready(True)
+        # The 2nd infer should fail the health check
+        self._llm_infer()
+        self._assert_infer_exception(
+            "Model is unavailable due to unhealthy vLLM engine"
+        )
+        self._assert_model_ready(False)
+        # The 3rd infer should fail because the model has been unloaded
+        self._llm_infer()
+        self._assert_infer_exception(
+            "Request for unknown model: 'vllm_opt' has no available versions"
+        )
+        self._assert_model_ready(False)
diff --git a/ci/L0_check_health_vllm/mock_async_llm_engine.py b/ci/L0_check_health_vllm/mock_async_llm_engine.py
new file mode 100644
index 00000000..d8d9f038
--- /dev/null
+++ b/ci/L0_check_health_vllm/mock_async_llm_engine.py
@@ -0,0 +1,36 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine
+
+
+class mock_AsyncLLMEngine(real_AsyncLLMEngine):
+    _mock_check_health_count = 0
+
+    async def check_health(self) -> None:
+        self._mock_check_health_count += 1
+        if self._mock_check_health_count > 1:
+            raise RuntimeError("Simulated vLLM check_health() failure")
diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh
new file mode 100755
index 00000000..9c3b4eec
--- /dev/null
+++ b/ci/L0_check_health_vllm/test.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+export CUDA_VISIBLE_DEVICES=0
+source ../common/util.sh
+
+pip3 install pytest==8.1.1
+pip3 install tritonclient[grpc]
+
+RET=0
+
+function setup_model_repository {
+    local sample_model_repo_path=${1:-"../../samples/model_repository"}
+    rm -rf models vllm_baseline_output.pkl && mkdir -p models
+    cp -r $sample_model_repo_path/vllm_model models/vllm_opt
+}
+
+function enable_health_check {
+    local enable_vllm_health_check="$1"
+    echo -e "parameters: {" >> models/vllm_opt/config.pbtxt
+    echo -e "  key: \"ENABLE_VLLM_HEALTH_CHECK\"" >> models/vllm_opt/config.pbtxt
+    echo -e "  value: { string_value: \"$enable_vllm_health_check\" }" >> models/vllm_opt/config.pbtxt
+    echo -e "}" >> models/vllm_opt/config.pbtxt
+}
+
+function mock_vllm_async_llm_engine {
+    mv /opt/tritonserver/backends/vllm/model.py /opt/tritonserver/backends/vllm/.model.py.backup
+    cp /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
+    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' /opt/tritonserver/backends/vllm/model.py
+    cp mock_async_llm_engine.py /opt/tritonserver/backends/vllm
+}
+
+function unmock_vllm_async_llm_engine {
+    rm -f /opt/tritonserver/backends/vllm/mock_async_llm_engine.py /opt/tritonserver/backends/vllm/model.py
+    mv /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
+}
+
+function test_check_health {
+    local test_name="$1"
+    local unit_test_name="$2"
+
+    SERVER_LOG="$test_name.server.log"
+    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        exit 1
+    fi
+
+    set +e
+    python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** $test_name FAILED. \n***"
\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +} + +# Test health check unspecified +setup_model_repository +test_check_health "health_check_unspecified" "test_vllm_is_healthy" + +# Test health check disabled +setup_model_repository +enable_health_check "false" +test_check_health "health_check_disabled" "test_vllm_is_healthy" + +# Test health check enabled +setup_model_repository +enable_health_check "true" +test_check_health "health_check_enabled" "test_vllm_is_healthy" + +# Mock check_health() from vLLM +mock_vllm_async_llm_engine + +# Test health check unspecified with mocked vLLM check_health() failure +setup_model_repository +test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy" + +# Test health check disabled with mocked vLLM check_health() failure +setup_model_repository +enable_health_check "false" +test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy" + +# Test health check enabled with mocked vLLM check_health() failure +setup_model_repository +enable_health_check "true" +test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy" + +# Unmock check_health() +unmock_vllm_async_llm_engine + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/docs/health_check.md b/docs/health_check.md new file mode 100644 index 00000000..64c5ee8a --- /dev/null +++ b/docs/health_check.md @@ -0,0 +1,55 @@ + + +# vLLM Health Check (BETA) + +> [!NOTE] +> The vLLM Health Check support is currently in BETA. Its features and +> functionality are subject to change as we collect feedback. We are excited to +> hear any thoughts you have! + +The vLLM backend supports checking for +[vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185) +upon receiving each inference request. If the health check fails, the model +state will becomes NOT Ready at the server, which can be queried by the +[Repository Index](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md#index) +or +[Model Ready](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/library/http_client.h#L178-L192) +APIs. + +The Health Check is disabled by default. To enable it, set the following +parameter on the model config to true +``` +parameters: { + key: "ENABLE_VLLM_HEALTH_CHECK" + value: { string_value: "true" } +} +``` +and select +[Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit) +when the server is started. 
diff --git a/src/model.py b/src/model.py
index d7b550c6..b9fd25a9 100644
--- a/src/model.py
+++ b/src/model.py
@@ -154,6 +154,12 @@ def initialize(self, args):
         )
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])

+        # Setup vLLM engine health check
+        self._enable_health_check = self._get_bool_config_param(
+            "ENABLE_VLLM_HEALTH_CHECK"
+        )
+        self._is_healthy = True
+
         # Prepare vLLM engine
         self.init_engine()

@@ -206,9 +212,7 @@ def init_engine(self):
         # Create vLLM custom metrics
         self.vllm_metrics = None
         if (
-            "REPORT_CUSTOM_METRICS" in self.model_config["parameters"]
-            and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"]
-            == "yes"
+            self._get_bool_config_param("REPORT_CUSTOM_METRICS")
             and not aync_engine_args.disable_log_stats
         ):
             try:
@@ -229,6 +233,12 @@ def init_engine(self):
             else:
                 raise e

+    def _get_bool_config_param(self, param_name: str) -> bool:
+        return (param_name in self.model_config["parameters"]) and (
+            self.model_config["parameters"][param_name]["string_value"].lower()
+            == "true"
+        )
+
     def setup_lora(self):
         self.enable_lora = False

@@ -675,6 +685,30 @@ def verify_loras(self, request):
             verified_request = request
         return verified_request

+    def _check_health(self, requests):
+        coro = self.llm_engine.check_health()
+        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
+        try:
+            future.result()
+        except Exception as e:
+            self.logger.log_error(
+                f"[vllm] Engine is not healthy and model will be unloaded: {e}"
+            )
+            pb_utils.unload_model(self.model_config["name"])  # non-blocking
+            self._is_healthy = False
+        if not self._is_healthy:
+            for request in requests:
+                request.get_response_sender().send(
+                    pb_utils.InferenceResponse(
+                        error=pb_utils.TritonError(
+                            message="Model is unavailable due to unhealthy vLLM engine",
+                            code=pb_utils.TritonError.UNAVAILABLE,
+                        )
+                    ),
+                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                )
+        return self._is_healthy
+
     def execute(self, requests):
         """
         Triton core issues requests to the backend via this method.
@@ -685,6 +719,8 @@ def execute(self, requests):
         is too loaded. We are pushing all the requests on vllm and let it
         handle the full traffic.
         """
+        if self._enable_health_check and not self._check_health(requests):
+            return None
         for request in requests:
             request = self.verify_loras(request)
             if request is not None:
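`execute()` is synchronous while `AsyncLLMEngine.check_health()` is a coroutine, which is why `_check_health()` bridges the two with `asyncio.run_coroutine_threadsafe` on the engine's event loop (`self._loop`). The following self-contained sketch demonstrates that bridging pattern in isolation; `FakeEngine` and `check_health_sync` are hypothetical stand-ins for illustration, not part of the backend.

```
# Sketch of the pattern _check_health() relies on: a synchronous caller
# runs an async check_health() coroutine on a background event loop via
# asyncio.run_coroutine_threadsafe(). FakeEngine is a hypothetical
# stand-in for vLLM's AsyncLLMEngine.
import asyncio
import threading


class FakeEngine:
    """Stand-in engine whose health check starts failing on demand."""

    def __init__(self):
        self.healthy = True

    async def check_health(self) -> None:
        if not self.healthy:
            raise RuntimeError("engine dead")


# Background event loop, analogous to the backend's self._loop.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

engine = FakeEngine()


def check_health_sync() -> bool:
    # Submit the coroutine to the background loop and block on the
    # result, mirroring future.result() in _check_health().
    future = asyncio.run_coroutine_threadsafe(engine.check_health(), loop)
    try:
        future.result()
        return True
    except Exception as e:
        print(f"health check failed: {e}")
        return False


print(check_health_sync())  # True
engine.healthy = False
print(check_health_sync())  # False
loop.call_soon_threadsafe(loop.stop)
```

Blocking on `future.result()` keeps the check on the request path, so an unhealthy engine is detected before any new request is pushed into vLLM, at the cost of one round trip through the event loop per `execute()` call when the check is enabled.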