From ca13f02c15129653e131aaa55c44ef58777a3650 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:28:55 -0800 Subject: [PATCH 01/17] [WIP] Add vLLM health check * [WIP] vLLM check_health() is async * [WIP] Fix model name query * [WIP] Health check may only be enabled when instance count is 1 --- src/model.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/model.py b/src/model.py index 3f6e23bb..a06d079a 100644 --- a/src/model.py +++ b/src/model.py @@ -111,6 +111,9 @@ def initialize(self, args): ) self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + # Setup vLLM engine health check + self._setup_health_check() + # Prepare vLLM engine self.init_engine() @@ -131,6 +134,31 @@ def initialize(self, args): self._shutdown_event = asyncio.Event() self._event_thread.start() + def _setup_health_check(self): + # Check if health check should be enabled + self._enable_health_check = ( + "ENABLE_VLLM_HEALTH_CHECK" in self.model_config["parameters"] + ) and ( + self.model_config["parameters"]["ENABLE_VLLM_HEALTH_CHECK"][ + "string_value" + ].lower() + in ["yes", "true"] + ) + # Setup health check if enabled + if self._enable_health_check: + # Only enable health check if there is exactly 1 instance + num_instances = 0 + for group in self.model_config["instance_group"]: + num_instances += group["count"] + if num_instances != 1: + self.logger.log_warn( + f"[vllm] Health check may only be enabled when the model has exactly 1 instance but {num_instances} are found" + ) + self._enable_health_check = False + return + # Set is healthy flag + self._is_healthy = True + def init_engine(self): # Currently, Triton needs to use decoupled policy for asynchronously # forwarding requests to vLLM engine, so assert it. @@ -542,6 +570,28 @@ def verify_loras(self, request): verified_request = request return verified_request + def _check_health(self, requests): + coro = self.llm_engine.check_health() + future = asyncio.run_coroutine_threadsafe(coro, self._loop) + try: + future.result() + except Exception as e: + self.logger.log_error(f"[vllm] Engine is not healthy: {e}") + pb_utils.unload_model(self.model_config["name"]) # non-blocking + self._is_healthy = False + if not self._is_healthy: + for request in requests: + request.get_response_sender().send( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + message="vLLM engine is not healthy", + code=pb_utils.TritonError.UNAVAILABLE, + ) + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + return self._is_healthy + def execute(self, requests): """ Triton core issues requests to the backend via this method. @@ -552,6 +602,8 @@ def execute(self, requests): is too loaded. We are pushing all the requests on vllm and let it handle the full traffic. 
""" + if self._enable_health_check and not self._check_health(requests): + return None for request in requests: request = self.verify_loras(request) if request is not None: From ed62ba41f971e2ad56348e9e49f97e5bb4eb91e5 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:30:23 -0800 Subject: [PATCH 02/17] [WIP] Add L0_check_health_vllm --- ci/L0_check_health_vllm/check_health_test.py | 128 ++++++++++++++++ .../mock_async_llm_engine.py | 36 +++++ ci/L0_check_health_vllm/test.sh | 141 ++++++++++++++++++ 3 files changed, 305 insertions(+) create mode 100644 ci/L0_check_health_vllm/check_health_test.py create mode 100644 ci/L0_check_health_vllm/mock_async_llm_engine.py create mode 100755 ci/L0_check_health_vllm/test.sh diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py new file mode 100644 index 00000000..f40e9b2b --- /dev/null +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -0,0 +1,128 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import os + +import numpy as np +import pytest +import tritonclient.grpc as grpcclient + + +class TestCheckHealth: + _grpc_url = "localhost:8001" + _model_name = "vllm_opt" + _sampling_parameters = {"temperature": "0", "top_p": "1"} + _prompt = "In this example," + + def _get_inputs(self, prompt, stream=True, sampling_parameters=None): + inputs = [] + + inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) + inputs[-1].set_data_from_numpy( + np.array([prompt.encode("utf-8")], dtype=np.object_) + ) + + inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) + inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool)) + + if sampling_parameters is not None: + inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) + inputs[-1].set_data_from_numpy( + np.array( + [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ + ) + ) + + return inputs + + def _callback(self, result, error): + self._responses.append({"result": result, "error": error}) + + def _llm_infer(self): + inputs = self._get_inputs( + self._prompt, stream=True, sampling_parameters=self._sampling_parameters + ) + self._responses = [] + with grpcclient.InferenceServerClient(self._grpc_url) as client: + client.start_stream(self._callback) + client.async_stream_infer( + self._model_name, inputs=inputs, parameters=self._sampling_parameters + ) + client.stop_stream() + + def _assert_text_output_valid(self): + text_output = "" + for response in self._responses: + result, error = response["result"], response["error"] + assert error is None + text_output += result.as_numpy(name="text_output")[0].decode("utf-8") + assert len(text_output) > 0, "output is empty" + assert text_output.count(" ") > 4, "output is not a sentence" + + def _assert_infer_exception(self, expected_exception_message): + assert len(self._responses) == 1 + for response in self._responses: + result, error = response["result"], response["error"] + assert result is None + assert str(error) == expected_exception_message + + def _assert_model_ready(self, expected_readiness): + with grpcclient.InferenceServerClient(self._grpc_url) as client: + assert client.is_model_ready(self._model_name) == expected_readiness + + def test_vllm_is_healthy(self): + num_repeats = 3 + for i in range(num_repeats): + self._assert_model_ready(True) + self._llm_infer() + self._assert_text_output_valid() + self._assert_model_ready(True) + + def test_vllm_not_healthy(self): + self._assert_model_ready(True) + # The 1st infer should complete successfully + self._llm_infer() + self._assert_text_output_valid() + self._assert_model_ready(True) + # The 2nd infer should begin with health check failed + self._llm_infer() + self._assert_infer_exception("vLLM engine is not healthy") + self._assert_model_ready(False) + # The 3rd infer should have model not found + self._llm_infer() + self._assert_infer_exception( + "Request for unknown model: 'vllm_opt' has no available versions" + ) + self._assert_model_ready(False) + + def test_vllm_enable_health_check_multi_instance(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found" + assert expected_vllm_warning in server_log + # Health check should be disabled + self.test_vllm_is_healthy() diff --git a/ci/L0_check_health_vllm/mock_async_llm_engine.py b/ci/L0_check_health_vllm/mock_async_llm_engine.py new file mode 100644 index 00000000..d8d9f038 --- /dev/null +++ 
b/ci/L0_check_health_vllm/mock_async_llm_engine.py @@ -0,0 +1,36 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine + + +class mock_AsyncLLMEngine(real_AsyncLLMEngine): + _mock_check_health_count = 0 + + async def check_health(self) -> None: + self._mock_check_health_count += 1 + if self._mock_check_health_count > 1: + raise RuntimeError("Simulated vLLM check_health() failure") diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh new file mode 100755 index 00000000..02dbb327 --- /dev/null +++ b/ci/L0_check_health_vllm/test.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 +source ../common/util.sh + +pip3 install pytest==8.1.1 +pip3 install tritonclient[grpc] + +RET=0 + +function setup_model_repository { + local sample_model_repo_path=${1:-"../../samples/model_repository"} + rm -rf models vllm_baseline_output.pkl && mkdir -p models + cp -r $sample_model_repo_path/vllm_model models/vllm_opt +} + +function setup_model_repository_with_multi_instances { + setup_model_repository + echo -e "backend: \"vllm\"" > models/vllm_opt/config.pbtxt + echo -e "instance_group [" >> models/vllm_opt/config.pbtxt + echo -e " { kind: KIND_MODEL }," >> models/vllm_opt/config.pbtxt + echo -e " { kind: KIND_MODEL \n count: 1 }" >> models/vllm_opt/config.pbtxt + echo -e "]" >> models/vllm_opt/config.pbtxt +} + +function enable_health_check { + local enable_vllm_health_check="$1" + echo -e "parameters: {" >> models/vllm_opt/config.pbtxt + echo -e " key: \"ENABLE_VLLM_HEALTH_CHECK\"" >> models/vllm_opt/config.pbtxt + echo -e " value: { string_value: \"$enable_vllm_health_check\" }" >> models/vllm_opt/config.pbtxt + echo -e "}" >> models/vllm_opt/config.pbtxt +} + +function mock_vllm_async_llm_engine { + mv /opt/tritonserver/backends/vllm/model.py /opt/tritonserver/backends/vllm/.model.py.backup + cp /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py + sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' /opt/tritonserver/backends/vllm/model.py + cp mock_async_llm_engine.py /opt/tritonserver/backends/vllm +} + +function unmock_vllm_async_llm_engine { + rm -f /opt/tritonserver/backends/vllm/mock_async_llm_engine.py /opt/tritonserver/backends/vllm/model.py + mv /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py +} + +function test_check_health { + local test_name="$1" + local unit_test_name="$2" + + SERVER_LOG="$test_name.server.log" + SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log + if [ $? -ne 0 ]; then + echo -e "\n***\n*** $test_name FAILED. 
\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +} + +# Test health check unspecified +setup_model_repository +test_check_health "health_check_unspecified" "test_vllm_is_healthy" + +# Test health check disabled +setup_model_repository +enable_health_check "false" +test_check_health "health_check_disabled" "test_vllm_is_healthy" + +# Test health check enabled +setup_model_repository +enable_health_check "true" +test_check_health "health_check_enabled" "test_vllm_is_healthy" + +# Mock check_health() from vLLM +mock_vllm_async_llm_engine + +# Test health check unspecified with mocked vLLM check_health() failure +setup_model_repository +test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy" + +# Test health check disabled with mocked vLLM check_health() failure +setup_model_repository +enable_health_check "false" +test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy" + +# Test health check enabled with mocked vLLM check_health() failure +setup_model_repository +enable_health_check "true" +test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy" + +# Test health check enabled with mocked vLLM check_health() failure when there +# are multiple instances +setup_model_repository_with_multi_instances +enable_health_check "true" +test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance" + +# Unmock check_health() +unmock_vllm_async_llm_engine + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET From 4803ee0a805e4f3a9ce11395ddf8aad80867de08 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:43:45 -0800 Subject: [PATCH 03/17] [chore] Define server for tests --- ci/common/util.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/common/util.sh b/ci/common/util.sh index 8baf4f92..c6ea29bb 100755 --- a/ci/common/util.sh +++ b/ci/common/util.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +SERVER=${SERVER:=/opt/tritonserver/bin/tritonserver} SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost} SERVER_LOG=${SERVER_LOG:=./server.log} SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} From 33367f64b7c0cdcb8002194303edae46a106cc56 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:16:29 -0800 Subject: [PATCH 04/17] Allow health check to be enableable for all instance counts --- ci/L0_check_health_vllm/check_health_test.py | 10 ----- ci/L0_check_health_vllm/test.sh | 17 +-------- src/model.py | 40 ++++++-------------- 3 files changed, 12 insertions(+), 55 deletions(-) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index f40e9b2b..44dde9e3 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -25,10 +25,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import json -import os import numpy as np -import pytest import tritonclient.grpc as grpcclient @@ -118,11 +116,3 @@ def test_vllm_not_healthy(self): "Request for unknown model: 'vllm_opt' has no available versions" ) self._assert_model_ready(False) - - def test_vllm_enable_health_check_multi_instance(self): - with open(os.environ["SERVER_LOG"]) as f: - server_log = f.read() - expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found" - assert expected_vllm_warning in server_log - # Health check should be disabled - self.test_vllm_is_healthy() diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh index 02dbb327..9c3b4eec 100755 --- a/ci/L0_check_health_vllm/test.sh +++ b/ci/L0_check_health_vllm/test.sh @@ -39,15 +39,6 @@ function setup_model_repository { cp -r $sample_model_repo_path/vllm_model models/vllm_opt } -function setup_model_repository_with_multi_instances { - setup_model_repository - echo -e "backend: \"vllm\"" > models/vllm_opt/config.pbtxt - echo -e "instance_group [" >> models/vllm_opt/config.pbtxt - echo -e " { kind: KIND_MODEL }," >> models/vllm_opt/config.pbtxt - echo -e " { kind: KIND_MODEL \n count: 1 }" >> models/vllm_opt/config.pbtxt - echo -e "]" >> models/vllm_opt/config.pbtxt -} - function enable_health_check { local enable_vllm_health_check="$1" echo -e "parameters: {" >> models/vllm_opt/config.pbtxt @@ -82,7 +73,7 @@ function test_check_health { fi set +e - SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log + python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log if [ $? -ne 0 ]; then echo -e "\n***\n*** $test_name FAILED. 
\n***" RET=1 @@ -124,12 +115,6 @@ setup_model_repository enable_health_check "true" test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy" -# Test health check enabled with mocked vLLM check_health() failure when there -# are multiple instances -setup_model_repository_with_multi_instances -enable_health_check "true" -test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance" - # Unmock check_health() unmock_vllm_async_llm_engine diff --git a/src/model.py b/src/model.py index a06d079a..09f89b91 100644 --- a/src/model.py +++ b/src/model.py @@ -112,7 +112,10 @@ def initialize(self, args): self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) # Setup vLLM engine health check - self._setup_health_check() + self._enable_health_check = self._get_bool_config_param( + "ENABLE_VLLM_HEALTH_CHECK" + ) + self._is_healthy = True # Prepare vLLM engine self.init_engine() @@ -134,31 +137,6 @@ def initialize(self, args): self._shutdown_event = asyncio.Event() self._event_thread.start() - def _setup_health_check(self): - # Check if health check should be enabled - self._enable_health_check = ( - "ENABLE_VLLM_HEALTH_CHECK" in self.model_config["parameters"] - ) and ( - self.model_config["parameters"]["ENABLE_VLLM_HEALTH_CHECK"][ - "string_value" - ].lower() - in ["yes", "true"] - ) - # Setup health check if enabled - if self._enable_health_check: - # Only enable health check if there is exactly 1 instance - num_instances = 0 - for group in self.model_config["instance_group"]: - num_instances += group["count"] - if num_instances != 1: - self.logger.log_warn( - f"[vllm] Health check may only be enabled when the model has exactly 1 instance but {num_instances} are found" - ) - self._enable_health_check = False - return - # Set is healthy flag - self._is_healthy = True - def init_engine(self): # Currently, Triton needs to use decoupled policy for asynchronously # forwarding requests to vLLM engine, so assert it. 
@@ -191,9 +169,7 @@ def init_engine(self): # Create vLLM custom metrics self.vllm_metrics = None if ( - "REPORT_CUSTOM_METRICS" in self.model_config["parameters"] - and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"] - == "yes" + self._get_bool_config_param("REPORT_CUSTOM_METRICS") and not aync_engine_args.disable_log_stats ): try: @@ -214,6 +190,12 @@ def init_engine(self): else: raise e + def _get_bool_config_param(self, param_name: str) -> bool: + return (param_name in self.model_config["parameters"]) and ( + self.model_config["parameters"][param_name]["string_value"].lower() + in ["yes", "true"] + ) + def setup_lora(self): self.enable_lora = False From 6946b8965ab0f5d6e18515fe69a3725b96bfbda1 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:23:22 -0800 Subject: [PATCH 05/17] Update unhealthy unload message --- ci/L0_check_health_vllm/check_health_test.py | 2 +- src/model.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index 44dde9e3..9af67933 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -108,7 +108,7 @@ def test_vllm_not_healthy(self): self._assert_model_ready(True) # The 2nd infer should begin with health check failed self._llm_infer() - self._assert_infer_exception("vLLM engine is not healthy") + self._assert_infer_exception("vLLM engine is not healthy and will be unloaded") self._assert_model_ready(False) # The 3rd infer should have model not found self._llm_infer() diff --git a/src/model.py b/src/model.py index 09f89b91..7224ed81 100644 --- a/src/model.py +++ b/src/model.py @@ -558,7 +558,9 @@ def _check_health(self, requests): try: future.result() except Exception as e: - self.logger.log_error(f"[vllm] Engine is not healthy: {e}") + self.logger.log_error( + f"[vllm] Engine is not healthy and will be unloaded: {e}" + ) pb_utils.unload_model(self.model_config["name"]) # non-blocking self._is_healthy = False if not self._is_healthy: From 3427a3ac71745ce6aa19a4f606b016ca045e9de0 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:27:23 -0800 Subject: [PATCH 06/17] Update unhealthy unload message content --- src/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/model.py b/src/model.py index 7224ed81..b1fefa75 100644 --- a/src/model.py +++ b/src/model.py @@ -559,7 +559,7 @@ def _check_health(self, requests): future.result() except Exception as e: self.logger.log_error( - f"[vllm] Engine is not healthy and will be unloaded: {e}" + f"[vllm] Engine is not healthy and model will be unloaded: {e}" ) pb_utils.unload_model(self.model_config["name"]) # non-blocking self._is_healthy = False @@ -568,7 +568,7 @@ def _check_health(self, requests): request.get_response_sender().send( pb_utils.InferenceResponse( error=pb_utils.TritonError( - message="vLLM engine is not healthy", + message="vLLM engine is not healthy and model will be unloaded", code=pb_utils.TritonError.UNAVAILABLE, ) ), From 14b66a0681fcdcfd3d78d94e25ade6fd48f229e9 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:30:41 -0800 Subject: [PATCH 07/17] Update unhealthy unload message content --- ci/L0_check_health_vllm/check_health_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index 9af67933..628afebd 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -108,7 +108,7 @@ def test_vllm_not_healthy(self): self._assert_model_ready(True) # The 2nd infer should begin with health check failed self._llm_infer() - self._assert_infer_exception("vLLM engine is not healthy and will be unloaded") + self._assert_infer_exception("vLLM engine is not healthy and model will be unloaded") self._assert_model_ready(False) # The 3rd infer should have model not found self._llm_infer() From 1d58300141b3b426d8ecf7215c5e658a0c54e866 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 20 Nov 2024 16:33:54 -0800 Subject: [PATCH 08/17] Update unhealthy unload message content --- ci/L0_check_health_vllm/check_health_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index 628afebd..d56e627d 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -108,7 +108,9 @@ def test_vllm_not_healthy(self): self._assert_model_ready(True) # The 2nd infer should begin with health check failed self._llm_infer() - self._assert_infer_exception("vLLM engine is not healthy and model will be unloaded") + self._assert_infer_exception( + "vLLM engine is not healthy and model will be unloaded" + ) self._assert_model_ready(False) # The 3rd infer should have model not found self._llm_infer() From b986aea042e47e086bf2c86f41cb42a81422f7aa Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:18:05 -0800 Subject: [PATCH 09/17] [WIP] Enable backend specifc API for model load/unload --- ci/L0_check_health_vllm/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh index 9c3b4eec..7725a80c 100755 --- a/ci/L0_check_health_vllm/test.sh +++ b/ci/L0_check_health_vllm/test.sh @@ -64,7 +64,7 @@ function test_check_health { local unit_test_name="$2" SERVER_LOG="$test_name.server.log" - SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*" + SERVER_ARGS="--model-repository=models" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" From 7c9baf548f9e50b767b356f52723249d2b9f6f67 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Mon, 25 Nov 2024 16:04:26 -0800 Subject: [PATCH 10/17] Revert "[WIP] Enable backend specifc API for model load/unload" This reverts commit b986aea042e47e086bf2c86f41cb42a81422f7aa. 
--- ci/L0_check_health_vllm/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh index 7725a80c..9c3b4eec 100755 --- a/ci/L0_check_health_vllm/test.sh +++ b/ci/L0_check_health_vllm/test.sh @@ -64,7 +64,7 @@ function test_check_health { local unit_test_name="$2" SERVER_LOG="$test_name.server.log" - SERVER_ARGS="--model-repository=models" + SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" From a0822ff26024d5b96c61000479c1673557e4dbae Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:30:14 -0800 Subject: [PATCH 11/17] [docs] Add vLLM health check docs --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index 8a993d99..f6c308ac 100644 --- a/README.md +++ b/README.md @@ -311,6 +311,32 @@ parameters: { } ``` +## vLLM Health Check (BETA) + +> [!NOTE] +> The vLLM Health Check feature is currently in BETA. Its features and +> functionality are subject to change as we collect feedback. We are excited to +> hear any thoughts you have! + +The vLLM backend supports checking for +[vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185) +when an inference request is received. If the health check fails, the entire +model will be unloaded, so it becomes NOT Ready at the server. + +The Health Check is disabled by default. To enable it, set the following +parameter on the model config to true +``` +parameters: { + key: "ENABLE_VLLM_HEALTH_CHECK" + value: { string_value: "true" } +} +``` +and select +[Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit) +when the server is started. + +Supported since r24.12. 
+ ## Referencing the Tutorial You can read further in the From eb838cd90d4c82f53345a7932d0cd73509369924 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:37:43 -0800 Subject: [PATCH 12/17] Include model_repository_index API into _assert_model_ready() --- ci/L0_check_health_vllm/check_health_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index d56e627d..b5d8d6ea 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -90,7 +90,15 @@ def _assert_infer_exception(self, expected_exception_message): def _assert_model_ready(self, expected_readiness): with grpcclient.InferenceServerClient(self._grpc_url) as client: + # is_model_ready API assert client.is_model_ready(self._model_name) == expected_readiness + # get_model_repository_index API + model_state = None + for model_index in client.get_model_repository_index().models: + if model_index.name == self._model_name: + assert model_state == None, "duplicate model index found" + model_state = model_index.state == "READY" + assert model_state == expected_readiness def test_vllm_is_healthy(self): num_repeats = 3 From 7188485cd0c351f97b1c1b8dc071a0954736bd88 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:57:13 -0800 Subject: [PATCH 13/17] [docs] Enhance vLLM health check docs --- README.md | 28 +++------------------ docs/health_check.md | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 24 deletions(-) create mode 100644 docs/health_check.md diff --git a/README.md b/README.md index 141b7ace..f3baf018 100644 --- a/README.md +++ b/README.md @@ -316,31 +316,11 @@ parameters: { } ``` -## vLLM Health Check (BETA) +## vLLM Engine Health Check (BETA) -> [!NOTE] -> The vLLM Health Check feature is currently in BETA. Its features and -> functionality are subject to change as we collect feedback. We are excited to -> hear any thoughts you have! - -The vLLM backend supports checking for -[vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185) -when an inference request is received. If the health check fails, the entire -model will be unloaded, so it becomes NOT Ready at the server. - -The Health Check is disabled by default. To enable it, set the following -parameter on the model config to true -``` -parameters: { - key: "ENABLE_VLLM_HEALTH_CHECK" - value: { string_value: "true" } -} -``` -and select -[Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit) -when the server is started. - -Supported since r24.12. +vLLM Engine Health Check may be enabled optionally, for more accurate model +state reported by the server. See [this docs](docs/health_check.md) for more +information. ## Referencing the Tutorial diff --git a/docs/health_check.md b/docs/health_check.md new file mode 100644 index 00000000..14a7e68d --- /dev/null +++ b/docs/health_check.md @@ -0,0 +1,58 @@ + + +# vLLM Health Check (BETA) + +> [!NOTE] +> The vLLM Health Check support is currently in BETA. Its features and +> functionality are subject to change as we collect feedback. We are excited to +> hear any thoughts you have! 
+ +The vLLM backend supports checking for +[vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185) +upon receiving each inference request. If the health check fails, the entire +model will be unloaded, so its state becomes NOT Ready at the server, which can +be queried by the +[Repository Index](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md#index) +or +[Model Ready](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/library/http_client.h#L178-L192) +APIs. + +The Health Check is disabled by default. To enable it, set the following +parameter on the model config to true +``` +parameters: { + key: "ENABLE_VLLM_HEALTH_CHECK" + value: { string_value: "true" } +} +``` +and select +[Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit) +when the server is started. + +Supported since r24.12. From 83766518399d3f82ceb02745ecb57a0c6449be8d Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 26 Nov 2024 11:09:53 -0800 Subject: [PATCH 14/17] Minor fixes --- ci/L0_check_health_vllm/check_health_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index b5d8d6ea..dea599cd 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -96,7 +96,7 @@ def _assert_model_ready(self, expected_readiness): model_state = None for model_index in client.get_model_repository_index().models: if model_index.name == self._model_name: - assert model_state == None, "duplicate model index found" + assert model_state is None, "duplicate model index found" model_state = model_index.state == "READY" assert model_state == expected_readiness From b6bd6496627f7eb37ab94899a5eda98d3ffef48d Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:04:59 -0800 Subject: [PATCH 15/17] Update docs and messages --- ci/L0_check_health_vllm/check_health_test.py | 2 +- docs/health_check.md | 7 ++----- src/model.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py index dea599cd..263a42bd 100644 --- a/ci/L0_check_health_vllm/check_health_test.py +++ b/ci/L0_check_health_vllm/check_health_test.py @@ -117,7 +117,7 @@ def test_vllm_not_healthy(self): # The 2nd infer should begin with health check failed self._llm_infer() self._assert_infer_exception( - "vLLM engine is not healthy and model will be unloaded" + "Model is unavailable due to unhealthy vLLM engine" ) self._assert_model_ready(False) # The 3rd infer should have model not found diff --git a/docs/health_check.md b/docs/health_check.md index 14a7e68d..64c5ee8a 100644 --- a/docs/health_check.md +++ b/docs/health_check.md @@ -35,9 +35,8 @@ The vLLM backend supports checking for [vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185) -upon receiving each inference request. If the health check fails, the entire -model will be unloaded, so its state becomes NOT Ready at the server, which can -be queried by the +upon receiving each inference request. 
If the health check fails, the entire
-model will be unloaded, so its state becomes NOT Ready at the server, which can
-be queried by the
+upon receiving each inference request. If the health check fails, the model
+state will become NOT Ready at the server, which can be queried by the
 [Repository Index](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md#index)
 or
 [Model Ready](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/library/http_client.h#L178-L192)
 APIs.
@@ -54,5 +53,3 @@ parameters: {
 and select
 [Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit)
 when the server is started.
-
-Supported since r24.12.
diff --git a/src/model.py b/src/model.py
index 2a7e7663..a016f104 100644
--- a/src/model.py
+++ b/src/model.py
@@ -701,7 +701,7 @@ def _check_health(self, requests):
                 request.get_response_sender().send(
                     pb_utils.InferenceResponse(
                         error=pb_utils.TritonError(
-                            message="vLLM engine is not healthy and model will be unloaded",
+                            message="Model is unavailable due to unhealthy vLLM engine",
                             code=pb_utils.TritonError.UNAVAILABLE,
                         )
                     ),

From a26a083755da584ce3b117e09c907d4b0f90c71c Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:49:42 -0800
Subject: [PATCH 16/17] [chore] Set bool config param to accept only true/false - compat breaking

---
 README.md                               |  4 ++--
 ci/L0_backend_vllm/metrics_test/test.sh | 10 +++++-----
 src/model.py                            |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f3baf018..b90edcfc 100644
--- a/README.md
+++ b/README.md
@@ -307,11 +307,11 @@ or left empty (false by default) in [model.json](https://github.com/triton-infer
 *Note:* vLLM metrics are not reported to Triton metrics server by default
 due to potential performance slowdowns. To enable vLLM model's metrics
 reporting, please add following lines to its config.pbtxt as well.
-```bash +``` parameters: { key: "REPORT_CUSTOM_METRICS" value: { - string_value:"yes" + string_value: "true" } } ``` diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index fd976d4a..21573bda 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -98,14 +98,14 @@ parameters: { " >> models/vllm_opt/config.pbtxt run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled -# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt +# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "true" in config.pbtxt copy_model_repository cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" value: { - string_value:\"yes\" + string_value: \"true\" } } " >> models/vllm_opt/config.pbtxt @@ -120,7 +120,7 @@ echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" value: { - string_value:\"yes\" + string_value: \"true\" } } " >> models/vllm_opt/config.pbtxt @@ -134,7 +134,7 @@ echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" value: { - string_value:\"yes\" + string_value: \"true\" } } " >> models/vllm_opt/config.pbtxt @@ -146,7 +146,7 @@ echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" value: { - string_value:\"yes\" + string_value: \"true\" } } " >> models/vllm_opt/config.pbtxt diff --git a/src/model.py b/src/model.py index a016f104..b9fd25a9 100644 --- a/src/model.py +++ b/src/model.py @@ -236,7 +236,7 @@ def init_engine(self): def _get_bool_config_param(self, param_name: str) -> bool: return (param_name in self.model_config["parameters"]) and ( self.model_config["parameters"][param_name]["string_value"].lower() - in ["yes", "true"] + == "true" ) def setup_lora(self): From d4ac6501421b87a38dfaece95b1a7de278ee2cb6 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:04:39 -0800 Subject: [PATCH 17/17] [chore] More metrics flag value update --- ci/L0_backend_vllm/metrics_test/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/test.sh b/ci/L0_backend_vllm/metrics_test/test.sh index 21573bda..5564fb12 100755 --- a/ci/L0_backend_vllm/metrics_test/test.sh +++ b/ci/L0_backend_vllm/metrics_test/test.sh @@ -86,13 +86,13 @@ RET=0 copy_model_repository run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled -# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt +# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "false" in config.pbtxt copy_model_repository echo -e " parameters: { key: \"REPORT_CUSTOM_METRICS\" value: { - string_value:\"no\" + string_value: \"false\" } } " >> models/vllm_opt/config.pbtxt