-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Auto unload model if vLLM health check failed (#73)
- Loading branch information
Showing
7 changed files
with
399 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
import json | ||
|
||
import numpy as np | ||
import tritonclient.grpc as grpcclient | ||
|
||
|
||
class TestCheckHealth: | ||
_grpc_url = "localhost:8001" | ||
_model_name = "vllm_opt" | ||
_sampling_parameters = {"temperature": "0", "top_p": "1"} | ||
_prompt = "In this example," | ||
|
||
def _get_inputs(self, prompt, stream=True, sampling_parameters=None): | ||
inputs = [] | ||
|
||
inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) | ||
inputs[-1].set_data_from_numpy( | ||
np.array([prompt.encode("utf-8")], dtype=np.object_) | ||
) | ||
|
||
inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) | ||
inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool)) | ||
|
||
if sampling_parameters is not None: | ||
inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) | ||
inputs[-1].set_data_from_numpy( | ||
np.array( | ||
[json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ | ||
) | ||
) | ||
|
||
return inputs | ||
|
||
def _callback(self, result, error): | ||
self._responses.append({"result": result, "error": error}) | ||
|
||
def _llm_infer(self): | ||
inputs = self._get_inputs( | ||
self._prompt, stream=True, sampling_parameters=self._sampling_parameters | ||
) | ||
self._responses = [] | ||
with grpcclient.InferenceServerClient(self._grpc_url) as client: | ||
client.start_stream(self._callback) | ||
client.async_stream_infer( | ||
self._model_name, inputs=inputs, parameters=self._sampling_parameters | ||
) | ||
client.stop_stream() | ||
|
||
def _assert_text_output_valid(self): | ||
text_output = "" | ||
for response in self._responses: | ||
result, error = response["result"], response["error"] | ||
assert error is None | ||
text_output += result.as_numpy(name="text_output")[0].decode("utf-8") | ||
assert len(text_output) > 0, "output is empty" | ||
assert text_output.count(" ") > 4, "output is not a sentence" | ||
|
||
def _assert_infer_exception(self, expected_exception_message): | ||
assert len(self._responses) == 1 | ||
for response in self._responses: | ||
result, error = response["result"], response["error"] | ||
assert result is None | ||
assert str(error) == expected_exception_message | ||
|
||
def _assert_model_ready(self, expected_readiness): | ||
with grpcclient.InferenceServerClient(self._grpc_url) as client: | ||
# is_model_ready API | ||
assert client.is_model_ready(self._model_name) == expected_readiness | ||
# get_model_repository_index API | ||
model_state = None | ||
for model_index in client.get_model_repository_index().models: | ||
if model_index.name == self._model_name: | ||
assert model_state is None, "duplicate model index found" | ||
model_state = model_index.state == "READY" | ||
assert model_state == expected_readiness | ||
|
||
def test_vllm_is_healthy(self): | ||
num_repeats = 3 | ||
for i in range(num_repeats): | ||
self._assert_model_ready(True) | ||
self._llm_infer() | ||
self._assert_text_output_valid() | ||
self._assert_model_ready(True) | ||
|
||
def test_vllm_not_healthy(self): | ||
self._assert_model_ready(True) | ||
# The 1st infer should complete successfully | ||
self._llm_infer() | ||
self._assert_text_output_valid() | ||
self._assert_model_ready(True) | ||
# The 2nd infer should begin with health check failed | ||
self._llm_infer() | ||
self._assert_infer_exception( | ||
"Model is unavailable due to unhealthy vLLM engine" | ||
) | ||
self._assert_model_ready(False) | ||
# The 3rd infer should have model not found | ||
self._llm_infer() | ||
self._assert_infer_exception( | ||
"Request for unknown model: 'vllm_opt' has no available versions" | ||
) | ||
self._assert_model_ready(False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine | ||
|
||
|
||
class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    """Test double for AsyncLLMEngine whose health check fails after the first call.

    Used to simulate an engine that becomes unhealthy mid-serving, so the
    backend's auto-unload path can be exercised.
    """

    # How many times check_health() has been invoked so far.
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        """Succeed on the first invocation, then raise to mimic an unhealthy engine."""
        calls_so_far = self._mock_check_health_count + 1
        self._mock_check_health_count = calls_so_far
        if calls_so_far > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#!/bin/bash | ||
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions | ||
# are met: | ||
# * Redistributions of source code must retain the above copyright | ||
# notice, this list of conditions and the following disclaimer. | ||
# * Redistributions in binary form must reproduce the above copyright | ||
# notice, this list of conditions and the following disclaimer in the | ||
# documentation and/or other materials provided with the distribution. | ||
# * Neither the name of NVIDIA CORPORATION nor the names of its | ||
# contributors may be used to endorse or promote products derived | ||
# from this software without specific prior written permission. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY | ||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | ||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | ||
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
# Pin the test to a single GPU so runs are reproducible.
export CUDA_VISIBLE_DEVICES=0
# Shared helpers; expected to define run_server (reads SERVER_ARGS/SERVER_LOG,
# sets SERVER_PID) used below — confirm against ../common/util.sh.
source ../common/util.sh

# Test-only dependencies for the pytest-based client checks.
pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

# Overall exit status; flipped to 1 by test_check_health on any failure.
RET=0
|
||
function setup_model_repository {
    # Start from a clean repository containing only the sample vLLM model,
    # installed under the name the tests expect ("vllm_opt").
    local src_repo="${1:-../../samples/model_repository}"
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    cp -r "$src_repo/vllm_model" models/vllm_opt
}
|
||
function enable_health_check {
    # Append the ENABLE_VLLM_HEALTH_CHECK parameter ("true"/"false") to the
    # model config; $1 is substituted into the string_value field.
    local enable_vllm_health_check="$1"
    cat >> models/vllm_opt/config.pbtxt <<EOF
parameters: {
 key: "ENABLE_VLLM_HEALTH_CHECK"
 value: { string_value: "$enable_vllm_health_check" }
}
EOF
}
|
||
function mock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    # Keep a pristine copy so unmock_vllm_async_llm_engine can restore it.
    mv "$backend_dir/model.py" "$backend_dir/.model.py.backup"
    cp "$backend_dir/.model.py.backup" "$backend_dir/model.py"
    # Point the backend at the mock engine whose check_health() fails after
    # the first call.
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' "$backend_dir/model.py"
    cp mock_async_llm_engine.py "$backend_dir"
}
|
||
function unmock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    # Drop the mock module and the patched model.py, then restore the backup.
    rm -f "$backend_dir/mock_async_llm_engine.py" "$backend_dir/model.py"
    mv "$backend_dir/.model.py.backup" "$backend_dir/model.py"
}
|
||
function test_check_health {
    # $1: prefix for server/pytest log files; $2: TestCheckHealth method to run.
    local log_prefix="$1"
    local pytest_case="$2"

    # run_server (from ../common/util.sh) reads SERVER_LOG/SERVER_ARGS and
    # sets SERVER_PID ("0" on startup failure).
    SERVER_LOG="$log_prefix.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    # Temporarily tolerate a non-zero pytest exit so we can record the failure
    # instead of aborting the whole script.
    set +e
    python3 -m pytest --junitxml=$log_prefix.report.xml -s -v "check_health_test.py::TestCheckHealth::$pytest_case" > $log_prefix.log
    local pytest_status=$?
    if [ $pytest_status -ne 0 ]; then
        echo -e "\n***\n*** $log_prefix FAILED. \n***"
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}
|
||
# Baseline: ENABLE_VLLM_HEALTH_CHECK absent from config.pbtxt.
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Health check explicitly disabled.
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Health check explicitly enabled; the real engine stays healthy.
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Swap in mock_AsyncLLMEngine, whose check_health() raises from the 2nd call on.
mock_vllm_async_llm_engine

# With the health check unspecified, the simulated engine failure must be
# ignored and inference must keep succeeding.
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Same expectation with the health check explicitly disabled.
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# With the health check enabled, the simulated failure must make the model
# unavailable (test_vllm_not_healthy asserts the unload sequence).
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Restore the real vLLM engine in the backend.
unmock_vllm_async_llm_engine

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
Oops, something went wrong.