-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Auto unload model if vLLM health check failed (#73)
- Loading branch information
Showing
7 changed files
with
399 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
import json | ||
|
||
import numpy as np | ||
import tritonclient.grpc as grpcclient | ||
|
||
|
||
class TestCheckHealth: | ||
_grpc_url = "localhost:8001" | ||
_model_name = "vllm_opt" | ||
_sampling_parameters = {"temperature": "0", "top_p": "1"} | ||
_prompt = "In this example," | ||
|
||
def _get_inputs(self, prompt, stream=True, sampling_parameters=None): | ||
inputs = [] | ||
|
||
inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) | ||
inputs[-1].set_data_from_numpy( | ||
np.array([prompt.encode("utf-8")], dtype=np.object_) | ||
) | ||
|
||
inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) | ||
inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool)) | ||
|
||
if sampling_parameters is not None: | ||
inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) | ||
inputs[-1].set_data_from_numpy( | ||
np.array( | ||
[json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ | ||
) | ||
) | ||
|
||
return inputs | ||
|
||
def _callback(self, result, error): | ||
self._responses.append({"result": result, "error": error}) | ||
|
||
def _llm_infer(self): | ||
inputs = self._get_inputs( | ||
self._prompt, stream=True, sampling_parameters=self._sampling_parameters | ||
) | ||
self._responses = [] | ||
with grpcclient.InferenceServerClient(self._grpc_url) as client: | ||
client.start_stream(self._callback) | ||
client.async_stream_infer( | ||
self._model_name, inputs=inputs, parameters=self._sampling_parameters | ||
) | ||
client.stop_stream() | ||
|
||
def _assert_text_output_valid(self): | ||
text_output = "" | ||
for response in self._responses: | ||
result, error = response["result"], response["error"] | ||
assert error is None | ||
text_output += result.as_numpy(name="text_output")[0].decode("utf-8") | ||
assert len(text_output) > 0, "output is empty" | ||
assert text_output.count(" ") > 4, "output is not a sentence" | ||
|
||
def _assert_infer_exception(self, expected_exception_message): | ||
assert len(self._responses) == 1 | ||
for response in self._responses: | ||
result, error = response["result"], response["error"] | ||
assert result is None | ||
assert str(error) == expected_exception_message | ||
|
||
def _assert_model_ready(self, expected_readiness): | ||
with grpcclient.InferenceServerClient(self._grpc_url) as client: | ||
# is_model_ready API | ||
assert client.is_model_ready(self._model_name) == expected_readiness | ||
# get_model_repository_index API | ||
model_state = None | ||
for model_index in client.get_model_repository_index().models: | ||
if model_index.name == self._model_name: | ||
assert model_state is None, "duplicate model index found" | ||
model_state = model_index.state == "READY" | ||
assert model_state == expected_readiness | ||
|
||
def test_vllm_is_healthy(self): | ||
num_repeats = 3 | ||
for i in range(num_repeats): | ||
self._assert_model_ready(True) | ||
self._llm_infer() | ||
self._assert_text_output_valid() | ||
self._assert_model_ready(True) | ||
|
||
def test_vllm_not_healthy(self): | ||
self._assert_model_ready(True) | ||
# The 1st infer should complete successfully | ||
self._llm_infer() | ||
self._assert_text_output_valid() | ||
self._assert_model_ready(True) | ||
# The 2nd infer should begin with health check failed | ||
self._llm_infer() | ||
self._assert_infer_exception( | ||
"Model is unavailable due to unhealthy vLLM engine" | ||
) | ||
self._assert_model_ready(False) | ||
# The 3rd infer should have model not found | ||
self._llm_infer() | ||
self._assert_infer_exception( | ||
"Request for unknown model: 'vllm_opt' has no available versions" | ||
) | ||
self._assert_model_ready(False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine | ||
|
||
|
||
class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    """Test double for AsyncLLMEngine whose health check fails after the first call.

    Used to simulate an engine that becomes unhealthy mid-serving, so the
    backend's auto-unload path can be exercised.
    """

    # How many times check_health() has been invoked so far.
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        """Succeed on the first invocation, then raise to mimic an unhealthy engine."""
        calls_so_far = self._mock_check_health_count + 1
        self._mock_check_health_count = calls_so_far
        if calls_so_far > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#!/bin/bash | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
# Pin the test to a single GPU so runs are reproducible.
export CUDA_VISIBLE_DEVICES=0
# Shared helpers; expected to define run_server (reads SERVER_ARGS/SERVER_LOG,
# sets SERVER_PID) used below — confirm against ../common/util.sh.
source ../common/util.sh

# Test-only dependencies for the pytest-based client checks.
pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

# Overall exit status; flipped to 1 by test_check_health on any failure.
RET=0
|
||
function setup_model_repository {
    # Start from a clean repository containing only the sample vLLM model,
    # installed under the name the tests expect ("vllm_opt").
    local src_repo="${1:-../../samples/model_repository}"
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    cp -r "$src_repo/vllm_model" models/vllm_opt
}
|
||
function enable_health_check {
    # Append the ENABLE_VLLM_HEALTH_CHECK parameter ("true"/"false") to the
    # model config; $1 is substituted into the string_value field.
    local enable_vllm_health_check="$1"
    cat >> models/vllm_opt/config.pbtxt <<EOF
parameters: {
 key: "ENABLE_VLLM_HEALTH_CHECK"
 value: { string_value: "$enable_vllm_health_check" }
}
EOF
}
|
||
function mock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    # Keep a pristine copy so unmock_vllm_async_llm_engine can restore it.
    mv "$backend_dir/model.py" "$backend_dir/.model.py.backup"
    cp "$backend_dir/.model.py.backup" "$backend_dir/model.py"
    # Point the backend at the mock engine whose check_health() fails after
    # the first call.
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' "$backend_dir/model.py"
    cp mock_async_llm_engine.py "$backend_dir"
}
|
||
function unmock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    # Drop the mock module and the patched model.py, then restore the backup.
    rm -f "$backend_dir/mock_async_llm_engine.py" "$backend_dir/model.py"
    mv "$backend_dir/.model.py.backup" "$backend_dir/model.py"
}
|
||
function test_check_health {
    # $1: prefix for server/pytest log files; $2: TestCheckHealth method to run.
    local log_prefix="$1"
    local pytest_case="$2"

    # run_server (from ../common/util.sh) reads SERVER_LOG/SERVER_ARGS and
    # sets SERVER_PID ("0" on startup failure).
    SERVER_LOG="$log_prefix.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    # Temporarily tolerate a non-zero pytest exit so we can record the failure
    # instead of aborting the whole script.
    set +e
    python3 -m pytest --junitxml=$log_prefix.report.xml -s -v "check_health_test.py::TestCheckHealth::$pytest_case" > $log_prefix.log
    local pytest_status=$?
    if [ $pytest_status -ne 0 ]; then
        echo -e "\n***\n*** $log_prefix FAILED. \n***"
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}
|
||
# Baseline: ENABLE_VLLM_HEALTH_CHECK absent from config.pbtxt.
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Health check explicitly disabled.
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Health check explicitly enabled; the real engine stays healthy.
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Swap in mock_AsyncLLMEngine, whose check_health() raises from the 2nd call on.
mock_vllm_async_llm_engine

# With the health check unspecified, the simulated engine failure must be
# ignored and inference must keep succeeding.
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Same expectation with the health check explicitly disabled.
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# With the health check enabled, the simulated failure must make the model
# unavailable (test_vllm_not_healthy asserts the unload sequence).
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Restore the real vLLM engine in the backend.
unmock_vllm_async_llm_engine

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
Oops, something went wrong.