Skip to content

Commit

Permalink
Add metrics test
Browse files Browse the repository at this point in the history
  • Loading branch information
yinggeh committed Aug 3, 2024
1 parent d95bb2c commit 321faa0
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 3 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ container with the following commands:

```
mkdir -p /opt/tritonserver/backends/vllm
wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
git clone https://github.com/triton-inference-server/vllm_backend.git /opt/tritonserver/backends/vllm/vllm_backend
cp -r /opt/tritonserver/backends/vllm/vllm_backend/src/* /opt/tritonserver/backends/vllm
rm -rf /opt/tritonserver/backends/vllm/vllm_backend
```

## Using the vLLM Backend
Expand Down
98 changes: 98 additions & 0 deletions ci/L0_backend_vllm/metrics_test/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Pull in the shared CI helpers. The functions this script calls later
# (run_server, check_test_results, collect_artifacts_from_subdir) are not
# defined in this file and presumably come from here.
source ../../common/util.sh

# Triton install root; may be overridden from the environment.
TRITON_DIR="${TRITON_DIR:-/opt/tritonserver}"
BACKEND_DIR="${TRITON_DIR}/backends"
SERVER="${TRITON_DIR}/bin/tritonserver"

# Server command line: explicit model control with only `vllm_opt` loaded at
# start-up, verbose logging enabled.
SERVER_ARGS="--model-repository=$(pwd)/models"
SERVER_ARGS+=" --backend-directory=${BACKEND_DIR}"
SERVER_ARGS+=" --model-control-mode=explicit"
SERVER_ARGS+=" --load-model=vllm_opt"
SERVER_ARGS+=" --log-verbose=1"

# Log files written by the server and by the Python client.
SERVER_LOG="./vllm_metrics_server.log"
CLIENT_LOG="./vllm_metrics_client.log"

# Python test driver, the result file checked after it runs, and the number
# of tests that result file is expected to report.
CLIENT_PY="./vllm_metrics_test.py"
TEST_RESULT_FILE="test_results.txt"
EXPECTED_NUM_TESTS=1

# Location of the sample model repository the test model is copied from.
SAMPLE_MODELS_REPO="../../../samples/model_repository"
# Helpers =======================================
# Record a test failure when the last HTTP request did not return 200.
#
# Arguments:
#   $1 - message printed when the check fails.
# Globals read:
#   code - HTTP status to check; it is not set anywhere in the visible part
#          of this script, so the caller must set it before calling.
# Globals written:
#   message - copy of $1.
#   RET     - set to 1 on failure.
function assert_curl_success {
    message="${1}"

    # Nothing to report when the request succeeded.
    if [ "$code" == "200" ]; then
        return 0
    fi

    # Show the saved response body, then the message with the caller's line.
    cat ./curl.out
    echo -e "\n***\n*** ${message} : line ${BASH_LINENO[0]}\n***"
    RET=1
}

# Build a fresh model repository that holds the sample vLLM model under the
# name `vllm_opt`.
rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
# `vllm_opt` is lowered from a "gpu_memory_utilization" of 0.5 to 0.4 so that
# at least 60% of GPU memory stays available for other models.
# NOTE(review): the original rationale referred to a later `vllm_load_test`
# load, which this script never performs - confirm whether the reduction is
# still needed for this test.
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json

# Overall result of the script: 0 = pass, 1 = fail.
RET=0

# Start the server. run_server is not defined in this file (presumably it
# comes from util.sh); a SERVER_PID of "0" is treated as a failed start.
run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
echo -e "\n***\n*** Failed to start $SERVER\n***"
exit 1
fi

# Disable errexit while the client runs so that a failing test is recorded in
# RET instead of aborting the script before the server is shut down.
set +e
python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1

# $? below is the exit status of the python3 client above; no command may be
# inserted between the two.
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***"
RET=1
else
# check_test_results is defined outside this file; presumably it verifies that
# $TEST_RESULT_FILE reports $EXPECTED_NUM_TESTS tests - confirm in util.sh.
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Result Verification FAILED.\n***"
RET=1
fi
fi
set -e

# Stop the server and wait for it to exit before removing the model repository.
kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models"

# On failure, dump both logs so they appear in the CI output.
if [ $RET -eq 1 ]; then
cat $CLIENT_LOG
cat $SERVER_LOG
echo -e "\n***\n*** vLLM test FAILED. \n***"
else
echo -e "\n***\n*** vLLM test PASSED. \n***"
fi

# collect_artifacts_from_subdir is defined outside this file.
collect_artifacts_from_subdir
exit $RET
171 changes: 171 additions & 0 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import sys
import unittest
from functools import partial

import requests
import tritonclient.grpc as grpcclient
from tritonclient.utils import *

sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request

# Host of the Triton server under test; taken from TRITONSERVER_IPADDR when
# that variable is set, otherwise "localhost".
_tritonserver_ipaddr = os.getenv("TRITONSERVER_IPADDR", "localhost")

# Prompts sent to the model, one request per prompt.
PROMPTS = [
    "The most dangerous animal is",
    "The capital of France is",
    "The future of AI is",
]

# Sampling parameters attached to every request (values are strings).
SAMPLING_PARAMETERS = dict(temperature="0", top_p="1")


def get_metrics():
    """
    Store vllm metrics in a dictionary.

    Fetches ``http://<_tritonserver_ipaddr>:8002/metrics`` and collects every
    sample line whose metric name starts with ``vllm:``.

    Returns:
        dict: metric name (label block stripped) mapped to its value. The
        value is an ``int`` when the sample is written as a plain integer
        and a ``float`` otherwise (e.g. ``0.5``, ``1e-05``). When the same
        metric name appears with several label sets, the last one wins.

    Raises:
        requests.HTTPError: the endpoint answered with an error status.
        requests.Timeout: the endpoint did not answer within the timeout.
        ValueError: a matching sample carries a non-numeric value.
    """
    # A timeout keeps the CI job from hanging forever on an unresponsive server.
    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics", timeout=30)
    r.raise_for_status()

    # <name>[{labels}] <value>; "# HELP"/"# TYPE" lines do not start with
    # "vllm:" and are therefore skipped. The name stops at the first "{" or
    # whitespace so the label block never leaks into the key.
    pattern = r"^(vllm:[^\s{]+)(?:\{.*\})? (\S+)$"
    vllm_dict = {}

    for key, value in re.findall(pattern, r.text, re.MULTILINE):
        # Keep integer samples as int (so exact comparisons still work) and
        # fall back to float for fractional or exponent notation.
        try:
            vllm_dict[key] = int(value)
        except ValueError:
            vllm_dict[key] = float(value)

    return vllm_dict


class VLLMTritonMetricsTest(TestResultCollector):
    """Check the vLLM metrics exposed by Triton after running inference."""

    def setUp(self):
        """Create the gRPC client used by the test."""
        # Honor TRITONSERVER_IPADDR (via _tritonserver_ipaddr) so the gRPC
        # client targets the same host as the metrics scrape instead of a
        # hard-coded "localhost".
        self.triton_client = grpcclient.InferenceServerClient(
            url=f"{_tritonserver_ipaddr}:8001"
        )
        self.vllm_model_name = "vllm_opt"

    def test_vllm_metrics(self):
        """Run two inference rounds and compare the scraped metrics each time."""
        # Supported vLLM metrics
        expected_metrics_dict = {
            "vllm:num_requests_running": 0,
            "vllm:num_requests_waiting": 0,
            "vllm:num_requests_swapped": 0,
            "vllm:gpu_cache_usage_perc": 0,
            "vllm:cpu_cache_usage_perc": 0,
            "vllm:num_preemptions_total": 0,
            "vllm:prompt_tokens_total": 0,
            "vllm:generation_tokens_total": 0,
        }

        # Round 1: sampling parameters sent as a tensor.
        self._test_vllm_model(
            prompts=PROMPTS,
            sampling_parameters=SAMPLING_PARAMETERS,
            stream=False,
            send_parameters_as_tensor=True,
            model_name=self.vllm_model_name,
        )
        # Token totals expected after one pass over PROMPTS.
        expected_metrics_dict["vllm:prompt_tokens_total"] = 18
        expected_metrics_dict["vllm:generation_tokens_total"] = 48
        # Scrape once so the values printed are exactly the values asserted.
        actual_metrics_dict = get_metrics()
        print(actual_metrics_dict)
        print(expected_metrics_dict)
        self.assertEqual(actual_metrics_dict, expected_metrics_dict)

        # Round 2: same prompts, sampling parameters not sent as a tensor.
        self._test_vllm_model(
            prompts=PROMPTS,
            sampling_parameters=SAMPLING_PARAMETERS,
            stream=False,
            send_parameters_as_tensor=False,
            model_name=self.vllm_model_name,
        )
        # The token counters are cumulative, so the totals double.
        expected_metrics_dict["vllm:prompt_tokens_total"] = 36
        expected_metrics_dict["vllm:generation_tokens_total"] = 96
        self.assertEqual(get_metrics(), expected_metrics_dict)

    def _test_vllm_model(
        self,
        prompts,
        sampling_parameters,
        stream,
        send_parameters_as_tensor,
        exclude_input_in_output=None,
        expected_output=None,
        model_name="vllm_opt",
    ):
        """Send one streaming request per prompt and validate every response.

        Args:
            prompts: sequence of prompts; one request is sent for each.
            sampling_parameters: passed to ``create_vllm_request`` and also
                sent as the request ``parameters``.
            stream: forwarded to ``create_vllm_request``.
            send_parameters_as_tensor: forwarded to ``create_vllm_request``.
            exclude_input_in_output: forwarded to ``create_vllm_request``.
            expected_output: optional sequence compared with the responses
                in the order they are received.
            model_name: name of the model to send the requests to.
        """
        user_data = UserData()

        self.triton_client.start_stream(callback=partial(callback, user_data))
        try:
            for i, prompt in enumerate(prompts):
                request_data = create_vllm_request(
                    prompt,
                    i,
                    stream,
                    sampling_parameters,
                    model_name,
                    send_parameters_as_tensor,
                    exclude_input_in_output=exclude_input_in_output,
                )
                self.triton_client.async_stream_infer(
                    model_name=model_name,
                    request_id=request_data["request_id"],
                    inputs=request_data["inputs"],
                    outputs=request_data["outputs"],
                    parameters=sampling_parameters,
                )

            # Exactly one completed result is consumed per request sent.
            for i in range(len(prompts)):
                result = user_data._completed_requests.get()
                if isinstance(result, InferenceServerException):
                    print(result.message())
                self.assertNotIsInstance(
                    result, InferenceServerException, str(result)
                )

                output = result.as_numpy("text_output")
                self.assertIsNotNone(output, "`text_output` should not be None")
                if expected_output is not None:
                    self.assertEqual(
                        output,
                        expected_output[i],
                        "Actual and expected outputs do not match.\n"
                        f' Expected "{expected_output[i]}" \n Actual:"{output}"',
                    )
        finally:
            # Stop the stream even when an assertion above fails, so a failing
            # test does not leave the stream open.
            self.triton_client.stop_stream()

    def tearDown(self):
        """Close the gRPC client created in setUp."""
        self.triton_client.close()


# Run the test case when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion ci/L0_backend_vllm/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

RET=0
SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend"
SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc]

Expand Down
2 changes: 1 addition & 1 deletion src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid

from metrics import VllmStatLogger
from utils.metrics import VllmStatLogger

_VLLM_ENGINE_ARGS_FILENAME = "model.json"
_MULTI_LORA_ARGS_FILENAME = "multi_lora.json"
Expand Down
File renamed without changes.

0 comments on commit 321faa0

Please sign in to comment.