Commit: Request cancellation test (#19)
pskiran1 authored Nov 7, 2023
1 parent d997922 commit 797038d
Showing 7 changed files with 189 additions and 13 deletions.
@@ -36,7 +36,7 @@


class VLLMTritonStreamTest(AsyncTestResultCollector):
-    async def test_vllm_model_stream_enabled(self):
+    async def test_vllm_model_enabled_stream(self):
async with grpcclient.InferenceServerClient(
url="localhost:8001"
) as triton_client:
@@ -31,10 +31,10 @@ TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
-SERVER_LOG="./stream_enabled_server.log"
-CLIENT_LOG="./stream_enabled_client.log"
+SERVER_LOG="./enabled_stream_server.log"
+CLIENT_LOG="./enabled_stream_client.log"
TEST_RESULT_FILE='test_results.txt'
-CLIENT_PY="./stream_enabled_test.py"
+CLIENT_PY="./enabled_stream_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

@@ -74,9 +74,9 @@ rm -rf models/
if [ $RET -eq 1 ]; then
cat $CLIENT_LOG
cat $SERVER_LOG
-    echo -e "\n***\n*** Straem Enabled test FAILED. \n***"
+    echo -e "\n***\n*** Enabled Stream test FAILED. \n***"
else
-    echo -e "\n***\n*** Straem Enabled test PASSED. \n***"
+    echo -e "\n***\n*** Enabled Stream test PASSED. \n***"
fi

collect_artifacts_from_subdir
88 changes: 88 additions & 0 deletions ci/L0_backend_vllm/request_cancellation/request_cancellation_test.py
@@ -0,0 +1,88 @@
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys
import time
import unittest
from functools import partial

import tritonclient.grpc as grpcclient
from tritonclient.utils import *

sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request


class VLLMRequestCancelTest(TestResultCollector):
def test_request_cancellation(self, send_parameters_as_tensor=True):
with grpcclient.InferenceServerClient(url="localhost:8001") as triton_client:
log_file_path = "./request_cancellation_server.log"
user_data = UserData()
model_name = "vllm_opt"
stream = False
sampling_parameters = {
"temperature": "0",
"top_p": "1",
"max_tokens": "1500",
}
prompt = f"Write an original and creative poem of at least 200 words."

triton_client.start_stream(callback=partial(callback, user_data))

request_data = create_vllm_request(
prompt,
"1",
stream,
sampling_parameters,
model_name,
send_parameters_as_tensor,
)
triton_client.async_stream_infer(
model_name=model_name,
request_id=request_data["request_id"],
inputs=request_data["inputs"],
outputs=request_data["outputs"],
parameters=sampling_parameters,
)
time.sleep(1)

triton_client.stop_stream(cancel_requests=True)
time.sleep(1)
self.assertFalse(user_data._completed_requests.empty())

result = user_data._completed_requests.get()
self.assertIsInstance(result, InferenceServerException)
self.assertEqual(result.status(), "StatusCode.CANCELLED")
self.assertTrue(user_data._completed_requests.empty())

with open(log_file_path, mode="r") as log_file:
log_text = log_file.read()
self.assertIn("[vllm] Cancelling the request", log_text)
self.assertIn("[vllm] Successfully cancelled the request", log_text)


if __name__ == "__main__":
unittest.main()
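
The test above relies on the UserData and callback helpers imported from ../../common/test_util.py, which are not part of this commit. A minimal sketch of what they are assumed to look like follows: the gRPC stream callback pushes either the result or the error (such as the CANCELLED InferenceServerException asserted above) onto a queue that the test drains after stopping the stream.

# Assumed sketch of the helpers imported from test_util; not part of this diff.
import queue


class UserData:
    def __init__(self):
        # Collects stream results and errors in arrival order.
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    # Triton async-stream callback: exactly one of result/error is set per
    # response; a cancelled request arrives as an InferenceServerException.
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)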
84 changes: 84 additions & 0 deletions ci/L0_backend_vllm/request_cancellation/test.sh
@@ -0,0 +1,84 @@
#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
SERVER_LOG="./request_cancellation_server.log"
CLIENT_LOG="./request_cancellation_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./request_cancellation_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt

RET=0

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

set +e
python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***"
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
rm -rf models/

if [ $RET -eq 1 ]; then
cat $CLIENT_LOG
cat $SERVER_LOG
echo -e "\n***\n*** Request Cancellation test FAILED. \n***"
else
echo -e "\n***\n*** Request Cancellation test PASSED. \n***"
fi

collect_artifacts_from_subdir

exit $RET
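
Before launching the server, this script copies the sample vLLM model into a local repository under the name vllm_opt. The resulting layout (inferred from the cp command above; the copied files live in samples/model_repository and are not shown in this diff) is roughly:

models/
└── vllm_opt/
    ├── 1/
    │   └── model.json    # vLLM engine arguments for the model
    └── config.pbtxt      # Triton model configuration for the vLLM backend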
4 changes: 2 additions & 2 deletions ci/L0_backend_vllm/test.sh
@@ -26,9 +26,9 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

RET=0
-SUBTESTS="accuracy_test stream_enabled vllm_backend"
+SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend"

-pip3 install tritonclient grpcio
+python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc]

for TEST in ${SUBTESTS}; do
(cd ${TEST} && bash -ex test.sh && cd ..)
2 changes: 1 addition & 1 deletion ci/L0_multi_gpu/test.sh
@@ -42,7 +42,7 @@ rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
sed -i '3s/^/ "tensor_parallel_size": 2,\n/' models/vllm_opt/1/model.json

-pip3 install tritonclient grpcio nvidia-ml-py3
+python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] nvidia-ml-py3

RET=0

12 changes: 8 additions & 4 deletions src/model.py
@@ -114,15 +114,17 @@ async def await_shutdown(self):
# Wait for the ongoing_requests
while self.ongoing_request_count > 0:
self.logger.log_info(
"Awaiting remaining {} requests".format(self.ongoing_request_count)
"[vllm] Awaiting remaining {} requests".format(
self.ongoing_request_count
)
)
await asyncio.sleep(5)

for task in asyncio.all_tasks(loop=self._loop):
if task is not asyncio.current_task():
task.cancel()

self.logger.log_info("Shutdown complete")
self.logger.log_info("[vllm] Shutdown complete")

def get_sampling_params_dict(self, params_json):
"""
@@ -209,7 +211,9 @@ async def generate(self, request):
prompt, sampling_params, request_id
):
if response_sender.is_cancelled():
self.logger.log_info("[vllm] Cancelling the request")
await self.llm_engine.abort(request_id)
self.logger.log_info("[vllm] Successfully cancelled the request")
break
if stream:
response_sender.send(self.create_response(output))
@@ -220,7 +224,7 @@ async def generate(self, request):
response_sender.send(self.create_response(last_output))

except Exception as e:
self.logger.log_info(f"Error generating stream: {e}")
self.logger.log_info(f"[vllm] Error generating stream: {e}")
error = pb_utils.TritonError(f"Error generating stream: {e}")
triton_output_tensor = pb_utils.Tensor(
"text_output", np.asarray(["N/A"], dtype=self.output_dtype)
@@ -252,7 +256,7 @@ def finalize(self):
"""
Triton virtual method; called when the model is unloaded.
"""
self.logger.log_info("Issuing finalize to vllm backend")
self.logger.log_info("[vllm] Issuing finalize to vllm backend")
self._shutdown_event.set()
if self._loop_thread is not None:
self._loop_thread.join()
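
The src/model.py change makes the response loop poll response_sender.is_cancelled() on every streamed output and abort the request inside the vLLM engine when the client drops the stream. The toy script below illustrates that polling pattern in isolation, assuming nothing about Triton or vLLM; CancelFlag, fake_engine, and consume are illustrative stand-ins, not backend APIs.

import asyncio


class CancelFlag:
    # Stand-in for response_sender.is_cancelled() in the Triton backend.
    def __init__(self):
        self._cancelled = False

    def cancel(self):
        self._cancelled = True

    def is_cancelled(self):
        return self._cancelled


async def fake_engine(num_tokens):
    # Stand-in for llm_engine.generate(): yields partial outputs over time.
    for i in range(num_tokens):
        await asyncio.sleep(0.1)
        yield f"token-{i}"


async def consume(flag):
    async for output in fake_engine(50):
        if flag.is_cancelled():
            print("cancelling the request")
            break  # the real backend also awaits llm_engine.abort(request_id) here
        print("sending", output)


async def main():
    flag = CancelFlag()
    task = asyncio.create_task(consume(flag))
    await asyncio.sleep(0.35)  # let a few outputs stream first
    flag.cancel()              # simulates the client cancelling the gRPC stream
    await task


asyncio.run(main())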
