Commit: Remove unused metrics and update comments

yinggeh committed Aug 6, 2024
1 parent 468539f commit 8b3e790
Showing 2 changed files with 77 additions and 111 deletions.
75 changes: 32 additions & 43 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -37,79 +37,68 @@
sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

PROMPTS = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]
SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"}


def get_metrics():
"""
Store vllm metrics in a dictionary.
"""
r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
r.raise_for_status()

# Regular expression to match the pattern
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {}
class VLLMTritonMetricsTest(TestResultCollector):
def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
self.vllm_model_name = "vllm_opt"
self.prompts = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]
self.sampling_parameters = {"temperature": "0", "top_p": "1"}

# Find all matches in the text
matches = re.findall(pattern, r.text, re.MULTILINE)
def get_metrics(self):
"""
Store vllm metrics in a dictionary.
"""
r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics")
r.raise_for_status()

for match in matches:
key, value = match
vllm_dict[key] = int(value)
# Regular expression to match the pattern
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {}

return vllm_dict
# Find all matches in the text
matches = re.findall(pattern, r.text, re.MULTILINE)

for match in matches:
key, value = match
vllm_dict[key] = int(value)

class VLLMTritonMetricsTest(TestResultCollector):
def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
self.vllm_model_name = "vllm_opt"
return vllm_dict

def test_vllm_metrics(self):
# Supported vLLM metrics
expected_metrics_dict = {
"vllm:num_requests_running": 0,
"vllm:num_requests_waiting": 0,
"vllm:num_requests_swapped": 0,
"vllm:gpu_cache_usage_perc": 0,
"vllm:cpu_cache_usage_perc": 0,
"vllm:num_preemptions_total": 0,
"vllm:prompt_tokens_total": 0,
"vllm:generation_tokens_total": 0,
}

# Test vLLM metrics
self._test_vllm_model(
prompts=PROMPTS,
sampling_parameters=SAMPLING_PARAMETERS,
prompts=self.prompts,
sampling_parameters=self.sampling_parameters,
stream=False,
send_parameters_as_tensor=True,
model_name=self.vllm_model_name,
)
expected_metrics_dict["vllm:prompt_tokens_total"] = 18
expected_metrics_dict["vllm:generation_tokens_total"] = 48
print(get_metrics())
print(expected_metrics_dict)
self.assertEqual(get_metrics(), expected_metrics_dict)
self.assertEqual(self.get_metrics(), expected_metrics_dict)

self._test_vllm_model(
prompts=PROMPTS,
sampling_parameters=SAMPLING_PARAMETERS,
prompts=self.prompts,
sampling_parameters=self.sampling_parameters,
stream=False,
send_parameters_as_tensor=False,
model_name=self.vllm_model_name,
)
expected_metrics_dict["vllm:prompt_tokens_total"] = 36
expected_metrics_dict["vllm:generation_tokens_total"] = 96
self.assertEqual(get_metrics(), expected_metrics_dict)
self.assertEqual(self.get_metrics(), expected_metrics_dict)

def _test_vllm_model(
self,
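For context, the new get_metrics helper scrapes the Triton metrics endpoint on port 8002 and parses each vLLM counter line with the regex shown above. A minimal, self-contained sketch of that parsing step follows; the sample text and its label set are hypothetical stand-ins for actual server output:

import re

# Hypothetical Prometheus-format lines as exposed on the Triton metrics port (8002).
sample = (
    'vllm:prompt_tokens_total{model="vllm_opt",version="1"} 18\n'
    'vllm:generation_tokens_total{model="vllm_opt",version="1"} 48\n'
)

# Same pattern as in get_metrics: metric name, label block, integer value.
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {key: int(value) for key, value in re.findall(pattern, sample, re.MULTILINE)}

print(vllm_dict)
# {'vllm:prompt_tokens_total': 18, 'vllm:generation_tokens_total': 48}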
113 changes: 45 additions & 68 deletions src/utils/metrics.py
@@ -34,41 +34,8 @@

class TritonMetrics:
def __init__(self, labels):
# System stats
# Scheduler State
self.gauge_scheduler_running_family = pb_utils.MetricFamily(
name="vllm:num_requests_running",
description="Number of requests currently running on GPU.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
name="vllm:num_requests_waiting",
description="Number of requests waiting to be processed.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
name="vllm:num_requests_swapped",
description="Number of requests swapped to CPU.",
kind=pb_utils.MetricFamily.GAUGE,
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
name="vllm:gpu_cache_usage_perc",
description="GPU KV-cache usage. 1 means 100 percent usage.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
name="vllm:cpu_cache_usage_perc",
description="CPU KV-cache usage. 1 means 100 percent usage.",
kind=pb_utils.MetricFamily.GAUGE,
)

# Initialize metric families
# Iteration stats
self.counter_num_preemption_family = pb_utils.MetricFamily(
name="vllm:num_preemptions_total",
description="Cumulative number of preemption from the engine.",
kind=pb_utils.MetricFamily.COUNTER,
)
self.counter_prompt_tokens_family = pb_utils.MetricFamily(
name="vllm:prompt_tokens_total",
description="Number of prefill tokens processed.",
@@ -79,30 +46,19 @@ def __init__(self, labels):
description="Number of generation tokens processed.",
kind=pb_utils.MetricFamily.COUNTER,
)

# System stats
# Scheduler State
self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
labels=labels
)
self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
labels=labels
self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
name="vllm:time_to_first_token_seconds",
description="Histogram of time to first token in seconds.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric(
labels=labels
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
labels=labels
)
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
labels=labels
self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
name="vllm:time_per_output_token_seconds",
description="Histogram of time per output token in seconds.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)

# Initialize metrics
# Iteration stats
self.counter_num_preemption = self.counter_num_preemption_family.Metric(
labels=labels
)
self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
labels=labels
)
@@ -124,30 +80,51 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None:
raise NotImplementedError

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
"""Convenience function for logging to gauge.
Args:
gauge: A gauge metric instance.
data: An int or float to set the gauge metric.
Returns:
None
"""
gauge.set(data)

def _log_counter(self, counter, data: Union[int, float]) -> None:
# Convenience function for logging to counter.
"""Convenience function for logging to counter.
Args:
counter: A counter metric instance.
data: An int or float to increment the count metric.
Returns:
None
"""
counter.increment(data)

def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
# Convenience function for logging list to histogram.
"""Convenience function for logging list to histogram.
Args:
histogram: A histogram metric instance.
data: A list of int or float data to observe into the histogram metric.
Returns:
None
"""
for datum in data:
histogram.observe(datum)

def log(self, stats: VllmStats) -> None:
# System state data
self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys)
self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys)

# Iteration level data
self._log_counter(
self.metrics.counter_num_preemption, stats.num_preemption_iter
)
"""Logs tracked stats to triton metrics server every iteration.
Args:
stats: Created by LLMEngine for use by VllmStatLogger.
Returns:
None
"""
self._log_counter(
self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
)
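To make the intent of the convenience loggers above concrete, here is a rough, self-contained sketch of the counter/histogram pattern using plain Python stand-ins. The real code uses pb_utils Metric objects inside the Triton Python backend; the stub classes and the sample values below are assumptions for illustration only.

from typing import List, Union


class StubCounter:
    """Stand-in for a pb_utils counter metric; only tracks a running total."""

    def __init__(self) -> None:
        self.value = 0

    def increment(self, amount: Union[int, float]) -> None:
        self.value += amount


class StubHistogram:
    """Stand-in for a pb_utils histogram metric; only records raw observations."""

    def __init__(self) -> None:
        self.observations: List[float] = []

    def observe(self, datum: Union[int, float]) -> None:
        self.observations.append(datum)


def _log_counter(counter: StubCounter, data: Union[int, float]) -> None:
    # Mirrors the _log_counter helper above: one increment per iteration.
    counter.increment(data)


def _log_histogram(histogram: StubHistogram, data: List[float]) -> None:
    # Mirrors the _log_histogram helper above: one observation per datum.
    for datum in data:
        histogram.observe(datum)


counter_prompt_tokens = StubCounter()
histogram_time_to_first_token = StubHistogram()

# One simulated engine iteration (values are made up).
_log_counter(counter_prompt_tokens, 18)
_log_histogram(histogram_time_to_first_token, [0.12, 0.08, 0.15])

print(counter_prompt_tokens.value)                  # 18
print(histogram_time_to_first_token.observations)   # [0.12, 0.08, 0.15]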
