Update docs and messages
kthui committed Nov 26, 2024
1 parent 8376651 commit b6bd649
Showing 3 changed files with 4 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ci/L0_check_health_vllm/check_health_test.py
@@ -117,7 +117,7 @@ def test_vllm_not_healthy(self):
         # The 2nd infer should begin with health check failed
         self._llm_infer()
         self._assert_infer_exception(
-            "vLLM engine is not healthy and model will be unloaded"
+            "Model is unavailable due to unhealthy vLLM engine"
         )
         self._assert_model_ready(False)
         # The 3rd infer should have model not found
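For context on what the updated assertion exercises, here is a minimal client-side sketch of the same sequence. The model name "vllm_model", the gRPC address, and the `text_input` tensor name are assumptions, not taken from this commit:

```python
# Sketch only: observing the new error message and the readiness state from a
# client after the vLLM engine health check has failed. Model name, address,
# and input name are assumptions.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
inputs[0].set_data_from_numpy(np.array(["Hello"], dtype=object))

try:
    client.infer("vllm_model", inputs=inputs)
except InferenceServerException as e:
    # After this commit, the error message reads:
    assert "Model is unavailable due to unhealthy vLLM engine" in e.message()

# Once the health check has failed, the model should report NOT Ready.
assert client.is_model_ready("vllm_model") is False
```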
7 changes: 2 additions & 5 deletions docs/health_check.md
@@ -35,9 +35,8 @@
 The vLLM backend supports checking for
 [vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185)
-upon receiving each inference request. If the health check fails, the entire
-model will be unloaded, so its state becomes NOT Ready at the server, which can
-be queried by the
+upon receiving each inference request. If the health check fails, the model
+state will become NOT Ready at the server, which can be queried by the
 [Repository Index](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md#index)
 or
 [Model Ready](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/library/http_client.h#L178-L192)
@@ -54,5 +53,3 @@ parameters: {
 and select
 [Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit)
 when the server is started.
-
-Supported since r24.12.
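The doc names two ways to query that NOT Ready state. A small sketch of both, assuming an HTTP endpoint at localhost:8000 and a model named "vllm_model" (neither is part of this commit):

```python
# Sketch only: querying model state via the HTTP client's Repository Index
# and Model Ready APIs. Server address and model name are assumptions.
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

# Repository Index lists every model with its current state.
for entry in client.get_model_repository_index():
    print(entry["name"], entry.get("state"))

# Model Ready returns False after the vLLM engine health check has failed.
print(client.is_model_ready("vllm_model"))
```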
2 changes: 1 addition & 1 deletion src/model.py
@@ -701,7 +701,7 @@ def _check_health(self, requests):
                 request.get_response_sender().send(
                     pb_utils.InferenceResponse(
                         error=pb_utils.TritonError(
-                            message="vLLM engine is not healthy and model will be unloaded",
+                            message="Model is unavailable due to unhealthy vLLM engine",
                             code=pb_utils.TritonError.UNAVAILABLE,
                         )
                     ),
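For reference, the surrounding pattern in a self-contained form: a Triton Python backend rejecting requests with an UNAVAILABLE error when the vLLM engine is unhealthy. The helper name `_reject_unhealthy` is illustrative; the `pb_utils` calls follow the Python backend API, including the final-response flag that the fragment above truncates:

```python
# Sketch only: rejecting requests in a Triton Python backend when the engine
# is unhealthy. The helper name is hypothetical; this module is only
# importable inside the Triton Python backend runtime.
import triton_python_backend_utils as pb_utils

def _reject_unhealthy(requests):
    for request in requests:
        request.get_response_sender().send(
            pb_utils.InferenceResponse(
                error=pb_utils.TritonError(
                    message="Model is unavailable due to unhealthy vLLM engine",
                    code=pb_utils.TritonError.UNAVAILABLE,
                )
            ),
            # Mark the stream complete so the client sees a final response.
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
        )
```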
