From fb5bc9ffa26594d433c270e4cd2a480df52288c3 Mon Sep 17 00:00:00 2001 From: Katherine Yang <80359429+jbkyang-nvi@users.noreply.github.com> Date: Thu, 16 Nov 2023 14:59:48 -0800 Subject: [PATCH] Add timeout to client apis and tests (#6546) Client PR: triton-inference-server/client#429 --- Dockerfile.sdk | 3 - ...t_test.py => client_infer_timeout_test.py} | 20 +- .../client_non_infer_timeout_test.py | 340 ++++++++++++++++++ qa/L0_client_timeout/test.sh | 80 ++++- src/grpc/grpc_server.cc | 78 ++-- 5 files changed, 474 insertions(+), 47 deletions(-) rename qa/L0_client_timeout/{client_timeout_test.py => client_infer_timeout_test.py} (93%) create mode 100755 qa/L0_client_timeout/client_non_infer_timeout_test.py diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 46ed8c8deb..496185816a 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -34,7 +34,6 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:23.10-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_COMMON_REPO_TAG=main ARG TRITON_CORE_REPO_TAG=main -ARG TRITON_BACKEND_REPO_TAG=main ARG TRITON_THIRD_PARTY_REPO_TAG=main ARG TRITON_MODEL_ANALYZER_REPO_TAG=main ARG TRITON_ENABLE_GPU=ON @@ -107,7 +106,6 @@ RUN rm -f /usr/bin/python && \ ARG TRITON_CLIENT_REPO_SUBDIR ARG TRITON_COMMON_REPO_TAG ARG TRITON_CORE_REPO_TAG -ARG TRITON_BACKEND_REPO_TAG ARG TRITON_THIRD_PARTY_REPO_TAG ARG TRITON_ENABLE_GPU ARG JAVA_BINDINGS_MAVEN_VERSION @@ -123,7 +121,6 @@ RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ - -DTRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG} \ -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \ -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \ diff --git a/qa/L0_client_timeout/client_timeout_test.py b/qa/L0_client_timeout/client_infer_timeout_test.py similarity index 93% rename from qa/L0_client_timeout/client_timeout_test.py rename to qa/L0_client_timeout/client_infer_timeout_test.py index 7b0081074a..700e9bfe9b 100755 --- a/qa/L0_client_timeout/client_timeout_test.py +++ b/qa/L0_client_timeout/client_infer_timeout_test.py @@ -37,9 +37,9 @@ import numpy as np import test_util as tu -import tritongrpcclient as grpcclient -import tritonhttpclient as httpclient -from tritonclientutils import InferenceServerException +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException class UserData: @@ -54,10 +54,12 @@ def callback(user_data, result, error): user_data._completed_requests.put(result) -class ClientTimeoutTest(tu.TestResultCollector): +class ClientInferTimeoutTest(tu.TestResultCollector): def setUp(self): self.model_name_ = "custom_identity_int32" self.input0_data_ = np.array([[10]], dtype=np.int32) + self.input0_data_byte_size_ = 32 + self.INFER_SMALL_INTERVAL = 2.0 # seconds for a timeout def _prepare_request(self, protocol): if protocol == "grpc": @@ -118,7 +120,7 @@ def test_grpc_async_infer(self): inputs=self.inputs_, callback=partial(callback, user_data), outputs=self.outputs_, - client_timeout=2, + client_timeout=self.INFER_SMALL_INTERVAL, ) data_item = user_data._completed_requests.get() if type(data_item) == InferenceServerException: @@ -190,7 +192,9 @@ def test_http_infer(self): # response. Expect an exception for small timeout values. 
with self.assertRaises(socket.timeout) as cm: triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True, network_timeout=2.0 + url="localhost:8000", + verbose=True, + network_timeout=self.INFER_SMALL_INTERVAL, ) _ = triton_client.infer( model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ @@ -216,7 +220,9 @@ def test_http_async_infer(self): # response. Expect an exception for small timeout values. with self.assertRaises(socket.timeout) as cm: triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True, network_timeout=2.0 + url="localhost:8000", + verbose=True, + network_timeout=self.INFER_SMALL_INTERVAL, ) async_request = triton_client.async_infer( model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ diff --git a/qa/L0_client_timeout/client_non_infer_timeout_test.py b/qa/L0_client_timeout/client_non_infer_timeout_test.py new file mode 100755 index 0000000000..bbaf8c34e8 --- /dev/null +++ b/qa/L0_client_timeout/client_non_infer_timeout_test.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class ClientNonInferTimeoutTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "custom_identity_int32" + self.input0_data_ = np.array([[10]], dtype=np.int32) + self.input0_data_byte_size_ = 32 + self.SMALL_INTERVAL = 0.1 # seconds for a timeout + self.NORMAL_INTERVAL = 5.0 # seconds for server to load then receive request + + def test_grpc_server_live(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_server_live(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_server_live(client_timeout=self.NORMAL_INTERVAL) + ) + + def test_grpc_is_server_ready(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_server_ready(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_server_ready(client_timeout=self.NORMAL_INTERVAL) + ) + + def test_grpc_is_model_ready(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_model_ready( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_model_ready( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + ) + + def test_grpc_get_server_metadata(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_server_metadata(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + triton_client.get_server_metadata(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_get_model_metadata(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_metadata( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_model_metadata( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_model_config(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_config( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_model_config( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_model_repository_index(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_repository_index( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + 
triton_client.get_model_repository_index(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_load_model(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + triton_client.unload_model(model_name=self.model_name_) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.load_model( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + triton_client.load_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_unload_model(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.load_model(model_name=self.model_name_) + triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + triton_client.load_model(model_name=self.model_name_) + + def test_grpc_get_inference_statistics(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_inference_statistics( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_inference_statistics( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_update_trace_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.update_trace_settings( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.update_trace_settings( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_trace_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_trace_settings( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_trace_settings( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_update_log_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + settings = {} + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.update_log_settings( + settings=settings, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.update_log_settings( + settings=settings, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_log_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_log_settings( + as_json=True, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_log_settings( + as_json=True, 
client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_system_shared_memory_status(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_system_shared_memory_status( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_system_shared_memory_status( + client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_register_system_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + triton_client.unregister_system_shared_memory() + import tritonclient.utils.shared_memory as shm + + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input_simple", self.input0_data_byte_size_ + ) + shm.set_shared_memory_region(shm_ip0_handle, [self.input0_data_]) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.register_system_shared_memory( + "input0_data", + "/input_simple", + self.input0_data_byte_size_, + client_timeout=self.SMALL_INTERVAL, + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_system_shared_memory() + triton_client.register_system_shared_memory( + "input0_data", + "/input_simple", + self.input0_data_byte_size_, + client_timeout=self.NORMAL_INTERVAL, + ) + triton_client.unregister_system_shared_memory() + + def test_grpc_unregister_system_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unregister_system_shared_memory( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_system_shared_memory( + client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_cuda_shared_memory_status(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_cuda_shared_memory_status( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_cuda_shared_memory_status(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_register_cuda_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + import tritonclient.utils.cuda_shared_memory as cshm + + input_data = np.array([[10]], dtype=np.int32) + byteSize = input_data.itemsize * input_data.size + shm_op0_handle = cshm.create_shared_memory_region( + "dummy_data", byte_size=byteSize, device_id=0 + ) + cshm.set_shared_memory_region(shm_op0_handle, [input_data]) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.register_cuda_shared_memory( + "dummy_data", + cshm.get_raw_handle(shm_op0_handle), + device_id=0, + byte_size=byteSize, + client_timeout=self.SMALL_INTERVAL, + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_cuda_shared_memory() + triton_client.register_cuda_shared_memory( + "dummy_data", + cshm.get_raw_handle(shm_op0_handle), + device_id=0, + byte_size=byteSize, + client_timeout=self.NORMAL_INTERVAL, + ) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_grpc_unregister_cuda_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with 
self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unregister_cuda_shared_memory( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_cuda_shared_memory(client_timeout=self.NORMAL_INTERVAL) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_client_timeout/test.sh b/qa/L0_client_timeout/test.sh index a832694b84..f250dc9fa3 100755 --- a/qa/L0_client_timeout/test.sh +++ b/qa/L0_client_timeout/test.sh @@ -39,10 +39,12 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then fi export CUDA_VISIBLE_DEVICES=0 - +TIMEOUT_VALUE=100000000 +SHORT_TIMEOUT_VALUE=1000 RET=0 -CLIENT_TIMEOUT_TEST=client_timeout_test.py +CLIENT_INFER_TIMEOUT_TEST=client_infer_timeout_test.py +CLIENT_NON_INFER_TIMEOUT_TEST=client_non_infer_timeout_test.py CLIENT_TIMEOUT_TEST_CPP=../clients/client_timeout_test TEST_RESULT_FILE='test_results.txt' @@ -50,27 +52,62 @@ rm -f *.log rm -f *.log.* CLIENT_LOG=`pwd`/client.log +CLIENT_GRPC_TIMEOUTS_LOG=`pwd`/client.log.grpc DATADIR=`pwd`/models SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=$DATADIR" +SERVER_ARGS="--model-repository=$DATADIR --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2" source ../common/util.sh mkdir -p $DATADIR/custom_identity_int32/1 +# Test all APIs apart from Infer. +export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=2 run_server +if [ $? -eq 1 ]; then + echo -e "\n***\n*** Test Failed: GRPC non-infer APIs\n***" + RET=1 +fi if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG exit 1 fi +set +e +# Expect timeout for everything +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG}.c++.grpc_non_infer_apis 2>&1 +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_non_infer_apis` != "18" ]; then + cat ${CLIENT_LOG}.c++.grpc_non_infer_apis + echo -e "\n***\n*** Test Failed. Expected 18 failed\n***" + RET=1 +fi +# Test all APIs with long timeout +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG} 2>&1 +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Test Failed: GRPC non-infer APIs\n***" + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test infer APIs +unset TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC +SERVER_ARGS="--model-repository=$DATADIR --log-verbose 2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi set +e # CASE 1: Provide too small a timeout and expect a failure. # Note, the custom_identity_int32 is configured with a delay # of 3 sec. # Test request timeout in grpc synchronous inference -$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc >> ${CLIENT_LOG}.c++.grpc_infer 2>&1 +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc >> ${CLIENT_LOG}.c++.grpc_infer 2>&1 if [ $? -eq 0 ]; then RET=1 fi @@ -81,7 +118,7 @@ if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_infer` != "1" ]; then fi # Test request timeout in grpc asynchronous inference -$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1 +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1 if [ $? 
-eq 0 ]; then RET=1 fi @@ -92,7 +129,7 @@ if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_async_infer` != "1" ]; fi # Test stream timeout in grpc asynchronous streaming inference -$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1 +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1 if [ $? -eq 0 ]; then RET=1 fi @@ -103,7 +140,7 @@ if [ `grep -c "Stream has been closed" ${CLIENT_LOG}.c++.grpc_async_stream_infer fi # Test request timeout in http synchronous inference -$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v >> ${CLIENT_LOG}.c++.http_infer 2>&1 +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v >> ${CLIENT_LOG}.c++.http_infer 2>&1 if [ $? -eq 0 ]; then RET=1 fi @@ -115,7 +152,7 @@ fi # Test request timeout in http asynchronous inference -$CLIENT_TIMEOUT_TEST_CPP -t 1000 -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1 +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1 if [ $? -eq 0 ]; then RET=1 fi @@ -136,7 +173,6 @@ fi # CASE 2: Provide sufficiently large timeout value -TIMEOUT_VALUE=100000000 set +e echo "TEST: GRPC Synchronous" >> ${CLIENT_LOG} @@ -174,7 +210,6 @@ if [ $? -ne 0 ]; then RET=1 fi - echo "TEST: Python Library" >> ${CLIENT_LOG} # CASE 3: Python Library @@ -185,7 +220,7 @@ for i in test_grpc_infer \ test_http_infer \ test_http_async_infer \ ; do - python $CLIENT_TIMEOUT_TEST ClientTimeoutTest.$i >>$CLIENT_LOG 2>&1 + python $CLIENT_INFER_TIMEOUT_TEST ClientInferTimeoutTest.$i >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG echo -e "\n***\n*** Test $i Failed\n***" @@ -204,6 +239,28 @@ set -e kill $SERVER_PID wait $SERVER_PID +# Test all APIs other than infer +export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=2 +SERVER_ARGS="${SERVER_ARGS} --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +python $CLIENT_NON_INFER_TIMEOUT_TEST >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else @@ -211,4 +268,5 @@ else echo -e "\n***\n*** Test FAILED\n***" fi +set +e exit $RET diff --git a/src/grpc/grpc_server.cc b/src/grpc/grpc_server.cc index 91b0068509..0fcb66f5ba 100644 --- a/src/grpc/grpc_server.cc +++ b/src/grpc/grpc_server.cc @@ -92,10 +92,11 @@ class CommonCallData : public ICallData { const StandardRegisterFunc OnRegister, const StandardCallbackFunc OnExecute, const bool async, ::grpc::ServerCompletionQueue* cq, - const std::pair& restricted_kv) + const std::pair& restricted_kv, + const uint64_t& response_delay = 0) : name_(name), id_(id), OnRegister_(OnRegister), OnExecute_(OnExecute), async_(async), cq_(cq), responder_(&ctx_), step_(Steps::START), - restricted_kv_(restricted_kv) + restricted_kv_(restricted_kv), response_delay_(response_delay) { OnRegister_(&ctx_, &request_, &responder_, this); LOG_VERBOSE(1) << "Ready for RPC '" << name_ << "', " << id_; @@ -140,6 +141,8 @@ class CommonCallData : public ICallData { Steps step_; std::pair restricted_kv_{"", ""}; + + const uint64_t response_delay_; }; template @@ -165,7 +168,8 @@ CommonCallData::Process(bool rpc_ok) // Start a new request to replace this one... if (!shutdown) { new CommonCallData( - name_, id_ + 1, OnRegister_, OnExecute_, async_, cq_, restricted_kv_); + name_, id_ + 1, OnRegister_, OnExecute_, async_, cq_, restricted_kv_, + response_delay_); } if (!async_) { @@ -234,6 +238,14 @@ template void CommonCallData::WriteResponse() { + if (response_delay_ != 0) { + // Will delay the write of the response by the specified time. + // This can be used to test the flow where there are other + // responses available to be written. + LOG_VERBOSE(1) << "Delaying the write of the response by " + << response_delay_ << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(response_delay_)); + } step_ = Steps::COMPLETE; responder_.Finish(response_, status_, this); } @@ -253,7 +265,7 @@ class CommonHandler : public HandlerBase { inference::GRPCInferenceService::AsyncService* service, ::grpc::health::v1::Health::AsyncService* health_service, ::grpc::ServerCompletionQueue* cq, - const RestrictedFeatures& restricted_keys); + const RestrictedFeatures& restricted_keys, const uint64_t response_delay); // Descriptive name of of the handler. 
const std::string& Name() const { return name_; } @@ -299,6 +311,7 @@ class CommonHandler : public HandlerBase { ::grpc::ServerCompletionQueue* cq_; std::unique_ptr thread_; RestrictedFeatures restricted_keys_{}; + const uint64_t response_delay_ = 0; }; CommonHandler::CommonHandler( @@ -309,11 +322,12 @@ CommonHandler::CommonHandler( inference::GRPCInferenceService::AsyncService* service, ::grpc::health::v1::Health::AsyncService* health_service, ::grpc::ServerCompletionQueue* cq, - const RestrictedFeatures& restricted_keys) + const RestrictedFeatures& restricted_keys, + const uint64_t response_delay = 0) : name_(name), tritonserver_(tritonserver), shm_manager_(shm_manager), trace_manager_(trace_manager), service_(service), health_service_(health_service), cq_(cq), - restricted_keys_(restricted_keys) + restricted_keys_(restricted_keys), response_delay_(response_delay) { } @@ -440,7 +454,7 @@ CommonHandler::RegisterServerLive() ::grpc::ServerAsyncResponseWriter, inference::ServerLiveRequest, inference::ServerLiveResponse>( "ServerLive", 0, OnRegisterServerLive, OnExecuteServerLive, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -476,7 +490,7 @@ CommonHandler::RegisterServerReady() ::grpc::ServerAsyncResponseWriter, inference::ServerReadyRequest, inference::ServerReadyResponse>( "ServerReady", 0, OnRegisterServerReady, OnExecuteServerReady, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -525,7 +539,7 @@ CommonHandler::RegisterHealthCheck() ::grpc::health::v1::HealthCheckRequest, ::grpc::health::v1::HealthCheckResponse>( "Check", 0, OnRegisterHealthCheck, OnExecuteHealthCheck, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -567,7 +581,7 @@ CommonHandler::RegisterModelReady() ::grpc::ServerAsyncResponseWriter, inference::ModelReadyRequest, inference::ModelReadyResponse>( "ModelReady", 0, OnRegisterModelReady, OnExecuteModelReady, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -645,7 +659,7 @@ CommonHandler::RegisterServerMetadata() ::grpc::ServerAsyncResponseWriter, inference::ServerMetadataRequest, inference::ServerMetadataResponse>( "ServerMetadata", 0, OnRegisterServerMetadata, OnExecuteServerMetadata, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -813,7 +827,7 @@ CommonHandler::RegisterModelMetadata() ::grpc::ServerAsyncResponseWriter, inference::ModelMetadataRequest, inference::ModelMetadataResponse>( "ModelMetadata", 0, OnRegisterModelMetadata, OnExecuteModelMetadata, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -866,7 +880,7 @@ CommonHandler::RegisterModelConfig() ::grpc::ServerAsyncResponseWriter, inference::ModelConfigRequest, inference::ModelConfigResponse>( "ModelConfig", 0, OnRegisterModelConfig, OnExecuteModelConfig, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -1196,7 +1210,7 @@ CommonHandler::RegisterModelStatistics() ::grpc::ServerAsyncResponseWriter, inference::ModelStatisticsRequest, inference::ModelStatisticsResponse>( "ModelStatistics", 0, OnRegisterModelStatistics, OnExecuteModelStatistics, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -1471,7 +1485,7 @@ 
CommonHandler::RegisterTrace() ::grpc::ServerAsyncResponseWriter, inference::TraceSettingRequest, inference::TraceSettingResponse>( "Trace", 0, OnRegisterTrace, OnExecuteTrace, false /* async */, cq_, - restricted_kv); + restricted_kv, response_delay_); } void @@ -1680,7 +1694,7 @@ CommonHandler::RegisterLogging() ::grpc::ServerAsyncResponseWriter, inference::LogSettingsRequest, inference::LogSettingsResponse>( "Logging", 0, OnRegisterLogging, OnExecuteLogging, false /* async */, cq_, - restricted_kv); + restricted_kv, response_delay_); } void @@ -1754,7 +1768,8 @@ CommonHandler::RegisterSystemSharedMemoryStatus() inference::SystemSharedMemoryStatusRequest, inference::SystemSharedMemoryStatusResponse>( "SystemSharedMemoryStatus", 0, OnRegisterSystemSharedMemoryStatus, - OnExecuteSystemSharedMemoryStatus, false /* async */, cq_, restricted_kv); + OnExecuteSystemSharedMemoryStatus, false /* async */, cq_, restricted_kv, + response_delay_); } void @@ -1793,7 +1808,7 @@ CommonHandler::RegisterSystemSharedMemoryRegister() inference::SystemSharedMemoryRegisterResponse>( "SystemSharedMemoryRegister", 0, OnRegisterSystemSharedMemoryRegister, OnExecuteSystemSharedMemoryRegister, false /* async */, cq_, - restricted_kv); + restricted_kv, response_delay_); } void @@ -1836,7 +1851,7 @@ CommonHandler::RegisterSystemSharedMemoryUnregister() inference::SystemSharedMemoryUnregisterResponse>( "SystemSharedMemoryUnregister", 0, OnRegisterSystemSharedMemoryUnregister, OnExecuteSystemSharedMemoryUnregister, false /* async */, cq_, - restricted_kv); + restricted_kv, response_delay_); } void @@ -1902,7 +1917,8 @@ CommonHandler::RegisterCudaSharedMemoryStatus() inference::CudaSharedMemoryStatusRequest, inference::CudaSharedMemoryStatusResponse>( "CudaSharedMemoryStatus", 0, OnRegisterCudaSharedMemoryStatus, - OnExecuteCudaSharedMemoryStatus, false /* async */, cq_, restricted_kv); + OnExecuteCudaSharedMemoryStatus, false /* async */, cq_, restricted_kv, + response_delay_); } void @@ -1952,7 +1968,8 @@ CommonHandler::RegisterCudaSharedMemoryRegister() inference::CudaSharedMemoryRegisterRequest, inference::CudaSharedMemoryRegisterResponse>( "CudaSharedMemoryRegister", 0, OnRegisterCudaSharedMemoryRegister, - OnExecuteCudaSharedMemoryRegister, false /* async */, cq_, restricted_kv); + OnExecuteCudaSharedMemoryRegister, false /* async */, cq_, restricted_kv, + response_delay_); } void @@ -1995,7 +2012,7 @@ CommonHandler::RegisterCudaSharedMemoryUnregister() inference::CudaSharedMemoryUnregisterResponse>( "CudaSharedMemoryUnregister", 0, OnRegisterCudaSharedMemoryUnregister, OnExecuteCudaSharedMemoryUnregister, false /* async */, cq_, - restricted_kv); + restricted_kv, response_delay_); } void @@ -2097,7 +2114,7 @@ CommonHandler::RegisterRepositoryIndex() ::grpc::ServerAsyncResponseWriter, inference::RepositoryIndexRequest, inference::RepositoryIndexResponse>( "RepositoryIndex", 0, OnRegisterRepositoryIndex, OnExecuteRepositoryIndex, - false /* async */, cq_, restricted_kv); + false /* async */, cq_, restricted_kv, response_delay_); } void @@ -2209,7 +2226,8 @@ CommonHandler::RegisterRepositoryModelLoad() inference::RepositoryModelLoadRequest, inference::RepositoryModelLoadResponse>( "RepositoryModelLoad", 0, OnRegisterRepositoryModelLoad, - OnExecuteRepositoryModelLoad, true /* async */, cq_, restricted_kv); + OnExecuteRepositoryModelLoad, true /* async */, cq_, restricted_kv, + response_delay_); } void @@ -2278,7 +2296,8 @@ CommonHandler::RegisterRepositoryModelUnload() inference::RepositoryModelUnloadRequest, 
inference::RepositoryModelUnloadResponse>( "RepositoryModelUnload", 0, OnRegisterRepositoryModelUnload, - OnExecuteRepositoryModelUnload, true /* async */, cq_, restricted_kv); + OnExecuteRepositoryModelUnload, true /* async */, cq_, restricted_kv, + response_delay_); } } // namespace @@ -2387,10 +2406,17 @@ Server::Server( model_infer_cq_ = builder_.AddCompletionQueue(); model_stream_infer_cq_ = builder_.AddCompletionQueue(); + // For testing purposes only, add artificial delay in grpc responses. + const char* dstr = getenv("TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC"); + uint64_t response_delay = 0; + if (dstr != nullptr) { + response_delay = atoi(dstr); + } // A common Handler for other non-inference requests common_handler_.reset(new CommonHandler( "CommonHandler", tritonserver_, shm_manager_, trace_manager_, &service_, - &health_service_, common_cq_.get(), options.restricted_protocols_)); + &health_service_, common_cq_.get(), options.restricted_protocols_, + response_delay)); // [FIXME] "register" logic is different for infer // Handler for model inference requests.
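
For reference, a minimal sketch of how the new per-call gRPC timeout added by this change is expected to behave, mirroring the tests in client_non_infer_timeout_test.py. The client_timeout keyword, the localhost:8001 endpoint, and the TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC test hook all come from this patch; the specific interval values are assumptions matching the QA setup, not part of the change itself.

# Sketch only: exercise the per-call gRPC client timeout on a non-infer API.
# Assumes a local Triton server started with
#   TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=2
# so non-infer responses are artificially delayed (test hook added above).
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=True)

try:
    # 0.1 s is shorter than the injected 2 s delay, so the call is
    # expected to fail with a gRPC "Deadline Exceeded" error.
    client.is_server_live(client_timeout=0.1)
except InferenceServerException as e:
    print("timed out as expected:", e)

# A generous timeout lets the same call complete normally.
assert client.is_server_live(client_timeout=5.0)

The tests added above assert the same behavior for each non-infer API: a short client_timeout surfaces as an InferenceServerException containing "Deadline Exceeded", while a sufficiently large timeout lets the call succeed.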