Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Auto unload model if vLLM health check failed #73

Merged
merged 18 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions ci/L0_check_health_vllm/check_health_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import os

import numpy as np
import pytest
Fixed Show fixed Hide fixed
import tritonclient.grpc as grpcclient


class TestCheckHealth:
    """Exercise the vLLM backend health check through Triton's gRPC streaming API.

    The server under test is started externally (see test.sh); these tests only
    drive inference requests and assert on readiness / response state.
    """

    _grpc_url = "localhost:8001"
    _model_name = "vllm_opt"
    _sampling_parameters = {"temperature": "0", "top_p": "1"}
    _prompt = "In this example,"

    def _get_inputs(self, prompt, stream=True, sampling_parameters=None):
        """Build the input tensors for one streaming LLM request."""
        text_input = grpcclient.InferInput("text_input", [1], "BYTES")
        text_input.set_data_from_numpy(
            np.array([prompt.encode("utf-8")], dtype=np.object_)
        )

        stream_input = grpcclient.InferInput("stream", [1], "BOOL")
        stream_input.set_data_from_numpy(np.array([stream], dtype=bool))

        inputs = [text_input, stream_input]

        if sampling_parameters is not None:
            params_input = grpcclient.InferInput("sampling_parameters", [1], "BYTES")
            params_input.set_data_from_numpy(
                np.array(
                    [json.dumps(sampling_parameters).encode("utf-8")],
                    dtype=np.object_,
                )
            )
            inputs.append(params_input)

        return inputs

    def _callback(self, result, error):
        # Record every streamed response (or error) for later assertions.
        self._responses.append({"result": result, "error": error})

    def _llm_infer(self):
        """Send a single streaming inference and collect all responses."""
        inputs = self._get_inputs(
            self._prompt, stream=True, sampling_parameters=self._sampling_parameters
        )
        self._responses = []
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            client.start_stream(self._callback)
            client.async_stream_infer(
                self._model_name, inputs=inputs, parameters=self._sampling_parameters
            )
            client.stop_stream()

    def _assert_text_output_valid(self):
        """Assert the streamed chunks concatenate into a non-trivial sentence."""
        text_output = ""
        for response in self._responses:
            assert response["error"] is None
            chunk = response["result"].as_numpy(name="text_output")[0]
            text_output += chunk.decode("utf-8")
        assert len(text_output) > 0, "output is empty"
        assert text_output.count(" ") > 4, "output is not a sentence"

    def _assert_infer_exception(self, expected_exception_message):
        """Assert exactly one response arrived and it carries the expected error."""
        assert len(self._responses) == 1
        only_response = self._responses[0]
        assert only_response["result"] is None
        assert str(only_response["error"]) == expected_exception_message

    def _assert_model_ready(self, expected_readiness):
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            readiness = client.is_model_ready(self._model_name)
            assert readiness == expected_readiness

    def test_vllm_is_healthy(self):
        """Repeated inferences succeed and the model stays ready throughout."""
        for _ in range(3):
            self._assert_model_ready(True)
            self._llm_infer()
            self._assert_text_output_valid()
        self._assert_model_ready(True)

    def test_vllm_not_healthy(self):
        """After a failed health check the model is unloaded and stays gone."""
        self._assert_model_ready(True)
        # The 1st infer should complete successfully
        self._llm_infer()
        self._assert_text_output_valid()
        self._assert_model_ready(True)
        # The 2nd infer should begin with health check failed
        self._llm_infer()
        self._assert_infer_exception("vLLM engine is not healthy")
        self._assert_model_ready(False)
        # The 3rd infer should have model not found
        self._llm_infer()
        self._assert_infer_exception(
            "Request for unknown model: 'vllm_opt' has no available versions"
        )
        self._assert_model_ready(False)

    def test_vllm_enable_health_check_multi_instance(self):
        """With >1 instance the backend warns and leaves the health check off."""
        with open(os.environ["SERVER_LOG"]) as f:
            server_log = f.read()
        expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found"
        assert expected_vllm_warning in server_log
        # Health check should be disabled
        self.test_vllm_is_healthy()
36 changes: 36 additions & 0 deletions ci/L0_check_health_vllm/mock_async_llm_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine


class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    """AsyncLLMEngine whose health check passes once, then always fails.

    The first check_health() call succeeds; every later call raises, letting
    the CI exercise the backend's unhealthy-engine (auto-unload) path.
    """

    # Per-instance call counter (the class attribute is only the initial value;
    # the first increment creates an instance attribute shadowing it).
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        calls_so_far = self._mock_check_health_count
        self._mock_check_health_count = calls_so_far + 1
        if calls_so_far >= 1:
            raise RuntimeError("Simulated vLLM check_health() failure")
141 changes: 141 additions & 0 deletions ci/L0_check_health_vllm/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Run on a single GPU and pull in the shared server start/stop helpers.
export CUDA_VISIBLE_DEVICES=0
source ../common/util.sh

pip3 install pytest==8.1.1
# Quote the extras spec: an unquoted [grpc] is a shell glob pattern and can be
# expanded (or nulled out) depending on shell options and directory contents.
pip3 install "tritonclient[grpc]"

# Accumulated exit status for the whole test script (0 = all passed).
RET=0

function setup_model_repository {
    # Rebuild a clean model repository from the sample vLLM model.
    # $1 (optional): path to the sample model repository.
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    # Quote the expansion so a path containing spaces does not word-split.
    cp -r "$sample_model_repo_path/vllm_model" models/vllm_opt
}

function setup_model_repository_with_multi_instances {
    setup_model_repository
    # Overwrite the config with two KIND_MODEL instances; the backend only
    # allows the health check when the model has exactly one instance.
    local config=models/vllm_opt/config.pbtxt
    printf 'backend: "vllm"\n' > "$config"
    printf 'instance_group [\n' >> "$config"
    printf ' { kind: KIND_MODEL },\n' >> "$config"
    printf ' { kind: KIND_MODEL \n count: 1 }\n' >> "$config"
    printf ']\n' >> "$config"
}

function enable_health_check {
    # Append the ENABLE_VLLM_HEALTH_CHECK parameter to the model config.
    # $1: "true" or "false".
    local enable_vllm_health_check="$1"
    cat >> models/vllm_opt/config.pbtxt << EOF
parameters: {
  key: "ENABLE_VLLM_HEALTH_CHECK"
  value: { string_value: "$enable_vllm_health_check" }
}
EOF
}

function mock_vllm_async_llm_engine {
    # Back up the stock backend model.py, then patch the live copy so it
    # imports the mock engine in place of vLLM's AsyncLLMEngine.
    local backend_dir=/opt/tritonserver/backends/vllm
    mv $backend_dir/model.py $backend_dir/.model.py.backup
    cp $backend_dir/.model.py.backup $backend_dir/model.py
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' $backend_dir/model.py
    cp mock_async_llm_engine.py $backend_dir
}

function unmock_vllm_async_llm_engine {
    # Delete the mock module and restore the original model.py from backup.
    local backend_dir=/opt/tritonserver/backends/vllm
    rm -f $backend_dir/mock_async_llm_engine.py $backend_dir/model.py
    mv $backend_dir/.model.py.backup $backend_dir/model.py
}

function test_check_health {
    # Start the server, run one pytest case against it, and record any failure
    # in the global RET.
    # $1: test name, used for the server log / report / output file names.
    # $2: TestCheckHealth method name to run.
    local test_name="$1"
    local unit_test_name="$2"

    SERVER_LOG="$test_name.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    set +e
    SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** $test_name FAILED. \n***"
        RET=1
    fi

    kill $SERVER_PID
    # Keep errexit disabled until after wait: a signal-terminated server makes
    # wait return non-zero (128+SIG), which under `set -e` would abort the
    # whole script here instead of continuing to the next test.
    wait $SERVER_PID
    set -e
}

# Order matters in the sequence below: the first three tests run against the
# real vLLM engine; everything between mock_vllm_async_llm_engine and
# unmock_vllm_async_llm_engine sees a check_health() that fails from the second
# call onward; the unmock must run before exit to restore the backend files.

# Test health check unspecified
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Test health check disabled
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Test health check enabled
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Mock check_health() from vLLM
mock_vllm_async_llm_engine

# Test health check unspecified with mocked vLLM check_health() failure
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Test health check disabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# Test health check enabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Test health check enabled with mocked vLLM check_health() failure when there
# are multiple instances
setup_model_repository_with_multi_instances
enable_health_check "true"
test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance"

# Unmock check_health()
unmock_vllm_async_llm_engine

# RET was accumulated by test_check_health; report the overall verdict.
if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
4 changes: 2 additions & 2 deletions ci/common/util.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand All @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


SERVER=${SERVER:=/opt/tritonserver/bin/tritonserver}
SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost}
SERVER_LOG=${SERVER_LOG:=./server.log}
SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
Expand Down
Loading
Loading