Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Auto unload model if vLLM health check failed #73

Merged
merged 18 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions ci/L0_check_health_vllm/check_health_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import tritonclient.grpc as grpcclient


class TestCheckHealth:
    """End-to-end checks of the vLLM health-check / auto-unload behaviour.

    Every test talks to a Triton server over gRPC at ``_grpc_url`` and sends
    streaming inference requests to the model named ``_model_name``.
    """

    # Address of the Triton gRPC endpoint the tests connect to.
    _grpc_url = "localhost:8001"
    # Name of the model under test.
    _model_name = "vllm_opt"
    # Sampling parameters sent with every request.
    _sampling_parameters = {"temperature": "0", "top_p": "1"}
    # Prompt sent with every request.
    _prompt = "In this example,"

    def _get_inputs(self, prompt, stream=True, sampling_parameters=None):
        """Build the list of gRPC inputs for one generation request.

        Args:
            prompt: Text prompt; sent UTF-8 encoded as ``text_input``.
            stream: Value of the boolean ``stream`` input.
            sampling_parameters: Optional dict; when given it is serialized to
                JSON and sent as the ``sampling_parameters`` input.

        Returns:
            A list of ``grpcclient.InferInput`` objects.
        """
        text_input = grpcclient.InferInput("text_input", [1], "BYTES")
        text_input.set_data_from_numpy(
            np.array([prompt.encode("utf-8")], dtype=np.object_)
        )

        stream_input = grpcclient.InferInput("stream", [1], "BOOL")
        stream_input.set_data_from_numpy(np.array([stream], dtype=bool))

        inputs = [text_input, stream_input]

        if sampling_parameters is not None:
            params_input = grpcclient.InferInput("sampling_parameters", [1], "BYTES")
            params_input.set_data_from_numpy(
                np.array(
                    [json.dumps(sampling_parameters).encode("utf-8")],
                    dtype=np.object_,
                )
            )
            inputs.append(params_input)

        return inputs

    def _callback(self, result, error):
        """Stream callback: record each (result, error) pair as it arrives."""
        self._responses.append({"result": result, "error": error})

    def _llm_infer(self):
        """Send one streaming request and collect all responses.

        Resets ``self._responses`` and fills it through ``_callback``; the
        stream is stopped before the client is closed.
        """
        inputs = self._get_inputs(
            self._prompt, stream=True, sampling_parameters=self._sampling_parameters
        )
        self._responses = []
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            client.start_stream(self._callback)
            client.async_stream_infer(
                self._model_name, inputs=inputs, parameters=self._sampling_parameters
            )
            client.stop_stream()

    def _assert_text_output_valid(self):
        """Assert that the collected responses form a non-trivial text output.

        Every response must be error-free; the concatenated ``text_output``
        must be non-empty and contain more than four spaces.
        """
        pieces = []
        for response in self._responses:
            result, error = response["result"], response["error"]
            assert error is None, f"unexpected inference error: {error}"
            pieces.append(result.as_numpy(name="text_output")[0].decode("utf-8"))
        text_output = "".join(pieces)
        assert len(text_output) > 0, "output is empty"
        assert text_output.count(" ") > 4, "output is not a sentence"

    def _assert_infer_exception(self, expected_exception_message):
        """Assert that exactly one response arrived and that it is the error.

        Args:
            expected_exception_message: Exact expected ``str(error)`` value.
        """
        assert (
            len(self._responses) == 1
        ), f"expected exactly 1 response, got {len(self._responses)}"
        response = self._responses[0]
        assert response["result"] is None, "error response must not carry a result"
        assert str(response["error"]) == expected_exception_message

    def _assert_model_ready(self, expected_readiness):
        """Assert that the server reports the expected readiness of the model."""
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            assert client.is_model_ready(self._model_name) == expected_readiness

    def test_vllm_is_healthy(self):
        """Repeated inference succeeds and the model stays ready throughout."""
        num_repeats = 3
        for _ in range(num_repeats):
            self._assert_model_ready(True)
            self._llm_infer()
            self._assert_text_output_valid()
            self._assert_model_ready(True)

    def test_vllm_not_healthy(self):
        """A failed health check rejects the request and unloads the model."""
        self._assert_model_ready(True)
        # The 1st infer should complete successfully
        self._llm_infer()
        self._assert_text_output_valid()
        self._assert_model_ready(True)
        # The 2nd infer should begin with health check failed
        self._llm_infer()
        self._assert_infer_exception(
            "vLLM engine is not healthy and model will be unloaded"
        )
        self._assert_model_ready(False)
        # The 3rd infer should have model not found
        self._llm_infer()
        self._assert_infer_exception(
            "Request for unknown model: 'vllm_opt' has no available versions"
        )
        self._assert_model_ready(False)
36 changes: 36 additions & 0 deletions ci/L0_check_health_vllm/mock_async_llm_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine


class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    """Engine subclass whose health check passes once and then always fails."""

    # Number of check_health() calls seen so far. The class-level 0 is the
    # starting value; the first call stores the running count on the instance.
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        """Count the call; raise ``RuntimeError`` from the second call onward."""
        call_number = self._mock_check_health_count + 1
        self._mock_check_health_count = call_number
        if call_number <= 1:
            return
        raise RuntimeError("Simulated vLLM check_health() failure")
126 changes: 126 additions & 0 deletions ci/L0_check_health_vllm/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Restrict the test to the first GPU.
export CUDA_VISIBLE_DEVICES=0
source ../common/util.sh

pip3 install pytest==8.1.1
# Quoted so the shell cannot treat [grpc] as a pathname-expansion pattern.
pip3 install 'tritonclient[grpc]'

# Overall exit status; set to 1 by test_check_health when a test run fails.
RET=0

# Recreate ./models with a fresh copy of the sample vLLM model as "vllm_opt".
#   $1 - optional path to the sample model repository
#        (default: ../../samples/model_repository)
function setup_model_repository {
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    # Quoted so a repository path containing spaces or glob characters is
    # passed to cp as a single argument.
    cp -r "$sample_model_repo_path/vllm_model" models/vllm_opt
}

# Append an ENABLE_VLLM_HEALTH_CHECK parameter block to the model config.
#   $1 - string value to store for the parameter (e.g. "true" or "false")
function enable_health_check {
    local enable_vllm_health_check="$1"
    local config_file="models/vllm_opt/config.pbtxt"
    {
        echo -e "parameters: {"
        echo -e " key: \"ENABLE_VLLM_HEALTH_CHECK\""
        echo -e " value: { string_value: \"$enable_vllm_health_check\" }"
        echo -e "}"
    } >> "$config_file"
}

# Swap the backend's AsyncLLMEngine import for the mock engine.
# The pristine model.py is kept as .model.py.backup so that
# unmock_vllm_async_llm_engine can restore it.
function mock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    mv "$backend_dir/model.py" "$backend_dir/.model.py.backup"
    cp "$backend_dir/.model.py.backup" "$backend_dir/model.py"
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' "$backend_dir/model.py"
    cp mock_async_llm_engine.py "$backend_dir"
}

# Undo mock_vllm_async_llm_engine: remove the mock module and the patched
# model.py, then move the backed-up model.py back into place.
function unmock_vllm_async_llm_engine {
    local backend_dir="/opt/tritonserver/backends/vllm"
    rm -f "$backend_dir/mock_async_llm_engine.py" "$backend_dir/model.py"
    mv "$backend_dir/.model.py.backup" "$backend_dir/model.py"
}

# Start the server, run one pytest case from check_health_test.py against it,
# then stop the server.
#   $1 - test name; used as prefix for the server log, pytest log and report
#   $2 - name of the TestCheckHealth method to run
# Sets RET=1 when pytest fails; exits the script if the server did not start.
function test_check_health {
    local test_name="$1"
    local unit_test_name="$2"

    # run_server is expected to read SERVER_LOG / SERVER_ARGS and to set
    # SERVER_PID (presumably defined in ../common/util.sh - confirm there).
    SERVER_LOG="$test_name.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    # A failing pytest run must be recorded in RET, not abort the script.
    set +e
    if ! python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log; then
        echo -e "\n***\n*** $test_name FAILED. \n***"
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}

# Test plan: run the three health-check configurations (unspecified, disabled,
# enabled) first against the real engine, then against the mocked engine.
# Each case starts from a freshly copied model repository.

# Test health check unspecified
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Test health check disabled
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Test health check enabled
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Mock check_health() from vLLM
mock_vllm_async_llm_engine

# With the health check unspecified or disabled, the mocked failure is expected
# to have no visible effect, so the "is healthy" test is reused.

# Test health check unspecified with mocked vLLM check_health() failure
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Test health check disabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# Only with the health check enabled is the "not healthy" test run.

# Test health check enabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Unmock check_health()
unmock_vllm_async_llm_engine

# Report the overall result; RET is 1 if any test_check_health run failed.
if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
4 changes: 2 additions & 2 deletions ci/common/util.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand All @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


SERVER=${SERVER:=/opt/tritonserver/bin/tritonserver}
SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost}
SERVER_LOG=${SERVER_LOG:=./server.log}
SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
Expand Down
42 changes: 39 additions & 3 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def initialize(self, args):
)
self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])

# Setup vLLM engine health check
self._enable_health_check = self._get_bool_config_param(
"ENABLE_VLLM_HEALTH_CHECK"
)
self._is_healthy = True

# Prepare vLLM engine
self.init_engine()

Expand Down Expand Up @@ -163,9 +169,7 @@ def init_engine(self):
# Create vLLM custom metrics
self.vllm_metrics = None
if (
"REPORT_CUSTOM_METRICS" in self.model_config["parameters"]
and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"]
== "yes"
self._get_bool_config_param("REPORT_CUSTOM_METRICS")
and not aync_engine_args.disable_log_stats
):
try:
Expand All @@ -186,6 +190,12 @@ def init_engine(self):
else:
raise e

def _get_bool_config_param(self, param_name: str) -> bool:
    """Return whether a model-config parameter is set to a truthy string.

    Args:
        param_name: Key to look up in ``self.model_config["parameters"]``.

    Returns:
        True if the parameter is present and its ``string_value`` is
        ``"yes"`` or ``"true"`` (compared case-insensitively), else False.
    """
    parameters = self.model_config["parameters"]
    if param_name not in parameters:
        return False
    return parameters[param_name]["string_value"].lower() in ["yes", "true"]

def setup_lora(self):
self.enable_lora = False

Expand Down Expand Up @@ -542,6 +552,30 @@ def verify_loras(self, request):
verified_request = request
return verified_request

def _check_health(self, requests):
    """Check the vLLM engine's health and reject the batch if unhealthy.

    Schedules ``self.llm_engine.check_health()`` on ``self._loop`` and
    blocks until it finishes. If it raises, the error is logged, an unload
    of this model is requested and ``self._is_healthy`` is set to False;
    this method never sets the flag back to True.

    Args:
        requests: The requests passed to ``execute``. When the engine is
            unhealthy, each one receives a final UNAVAILABLE error response.

    Returns:
        The current value of ``self._is_healthy``.
    """
    coro = self.llm_engine.check_health()
    future = asyncio.run_coroutine_threadsafe(coro, self._loop)
    try:
        # Block the calling thread until the health check has completed.
        future.result()
    except Exception as e:
        # Any failure of the health check counts as unhealthy.
        self.logger.log_error(
            f"[vllm] Engine is not healthy and model will be unloaded: {e}"
        )
        pb_utils.unload_model(self.model_config["name"])  # non-blocking
        self._is_healthy = False
    if not self._is_healthy:
        # Answer every request with a final error so no client is left
        # waiting on a stream that will never produce output.
        for request in requests:
            request.get_response_sender().send(
                pb_utils.InferenceResponse(
                    error=pb_utils.TritonError(
                        message="vLLM engine is not healthy and model will be unloaded",
                        code=pb_utils.TritonError.UNAVAILABLE,
                    )
                ),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
    return self._is_healthy

def execute(self, requests):
"""
Triton core issues requests to the backend via this method.
Expand All @@ -552,6 +586,8 @@ def execute(self, requests):
is too loaded.
We are pushing all the requests on vllm and let it handle the full traffic.
"""
if self._enable_health_check and not self._check_health(requests):
return None
for request in requests:
request = self.verify_loras(request)
if request is not None:
Expand Down
Loading