Commit: Remove unused metrics and update comments

yinggeh committed Aug 6, 2024
1 parent 468539f commit 8b3e790
Showing 2 changed files with 77 additions and 111 deletions.
75 changes: 32 additions & 43 deletions ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -37,79 +37,68 @@
sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

PROMPTS = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]
SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"}


def get_metrics():
"""
Store vllm metrics in a dictionary.
"""
r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
r.raise_for_status()

# Regular expression to match the pattern
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {}
class VLLMTritonMetricsTest(TestResultCollector):
def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
self.vllm_model_name = "vllm_opt"
self.prompts = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]
self.sampling_parameters = {"temperature": "0", "top_p": "1"}

# Find all matches in the text
matches = re.findall(pattern, r.text, re.MULTILINE)
def get_metrics(self):
"""
Store vllm metrics in a dictionary.
"""
r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics")
r.raise_for_status()

for match in matches:
key, value = match
vllm_dict[key] = int(value)
# Regular expression to match the pattern
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {}

return vllm_dict
# Find all matches in the text
matches = re.findall(pattern, r.text, re.MULTILINE)

for match in matches:
key, value = match
vllm_dict[key] = int(value)

class VLLMTritonMetricsTest(TestResultCollector):
def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
self.vllm_model_name = "vllm_opt"
return vllm_dict

def test_vllm_metrics(self):
# Supported vLLM metrics
expected_metrics_dict = {
"vllm:num_requests_running": 0,
"vllm:num_requests_waiting": 0,
"vllm:num_requests_swapped": 0,
"vllm:gpu_cache_usage_perc": 0,
"vllm:cpu_cache_usage_perc": 0,
"vllm:num_preemptions_total": 0,
"vllm:prompt_tokens_total": 0,
"vllm:generation_tokens_total": 0,
}

# Test vLLM metrics
self._test_vllm_model(
prompts=PROMPTS,
sampling_parameters=SAMPLING_PARAMETERS,
prompts=self.prompts,
sampling_parameters=self.sampling_parameters,
stream=False,
send_parameters_as_tensor=True,
model_name=self.vllm_model_name,
)
expected_metrics_dict["vllm:prompt_tokens_total"] = 18
expected_metrics_dict["vllm:generation_tokens_total"] = 48
print(get_metrics())
print(expected_metrics_dict)
self.assertEqual(get_metrics(), expected_metrics_dict)
self.assertEqual(self.get_metrics(), expected_metrics_dict)

self._test_vllm_model(
prompts=PROMPTS,
sampling_parameters=SAMPLING_PARAMETERS,
prompts=self.prompts,
sampling_parameters=self.sampling_parameters,
stream=False,
send_parameters_as_tensor=False,
model_name=self.vllm_model_name,
)
expected_metrics_dict["vllm:prompt_tokens_total"] = 36
expected_metrics_dict["vllm:generation_tokens_total"] = 96
self.assertEqual(get_metrics(), expected_metrics_dict)
self.assertEqual(self.get_metrics(), expected_metrics_dict)

def _test_vllm_model(
self,
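For context, the new get_metrics helper scrapes the Triton metrics endpoint on port 8002 and parses each vLLM counter line with the regex shown above. A minimal, self-contained sketch of that parsing step follows; the sample text and its label set are hypothetical stand-ins for actual server output:

import re

# Hypothetical Prometheus-format lines as exposed on the Triton metrics port (8002).
sample = (
    'vllm:prompt_tokens_total{model="vllm_opt",version="1"} 18\n'
    'vllm:generation_tokens_total{model="vllm_opt",version="1"} 48\n'
)

# Same pattern as in get_metrics: metric name, label block, integer value.
pattern = r"^(vllm:.*){.*} (\d+)$"
vllm_dict = {key: int(value) for key, value in re.findall(pattern, sample, re.MULTILINE)}

print(vllm_dict)
# {'vllm:prompt_tokens_total': 18, 'vllm:generation_tokens_total': 48}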
113 changes: 45 additions & 68 deletions src/utils/metrics.py
@@ -34,41 +34,8 @@

class TritonMetrics:
def __init__(self, labels):
# System stats
# Scheduler State
self.gauge_scheduler_running_family = pb_utils.MetricFamily(
name="vllm:num_requests_running",
description="Number of requests currently running on GPU.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
name="vllm:num_requests_waiting",
description="Number of requests waiting to be processed.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
name="vllm:num_requests_swapped",
description="Number of requests swapped to CPU.",
kind=pb_utils.MetricFamily.GAUGE,
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
name="vllm:gpu_cache_usage_perc",
description="GPU KV-cache usage. 1 means 100 percent usage.",
kind=pb_utils.MetricFamily.GAUGE,
)
self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
name="vllm:cpu_cache_usage_perc",
description="CPU KV-cache usage. 1 means 100 percent usage.",
kind=pb_utils.MetricFamily.GAUGE,
)

# Initialize metric families
# Iteration stats
self.counter_num_preemption_family = pb_utils.MetricFamily(
name="vllm:num_preemptions_total",
description="Cumulative number of preemption from the engine.",
kind=pb_utils.MetricFamily.COUNTER,
)
self.counter_prompt_tokens_family = pb_utils.MetricFamily(
name="vllm:prompt_tokens_total",
description="Number of prefill tokens processed.",
@@ -79,30 +46,19 @@ def __init__(self, labels):
description="Number of generation tokens processed.",
kind=pb_utils.MetricFamily.COUNTER,
)

# System stats
# Scheduler State
self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
labels=labels
)
self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
labels=labels
self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
name="vllm:time_to_first_token_seconds",
description="Histogram of time to first token in seconds.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)
self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric(
labels=labels
)
# KV Cache Usage in %
self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
labels=labels
)
self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
labels=labels
self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
name="vllm:time_per_output_token_seconds",
description="Histogram of time per output token in seconds.",
kind=pb_utils.MetricFamily.HISTOGRAM,
)

# Initialize metrics
# Iteration stats
self.counter_num_preemption = self.counter_num_preemption_family.Metric(
labels=labels
)
self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
labels=labels
)
@@ -124,30 +80,51 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None:
raise NotImplementedError

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
"""Convenience function for logging to gauge.
Args:
gauge: A gauge metric instance.
data: An int or float to set the gauge metric.
Returns:
None
"""
gauge.set(data)

def _log_counter(self, counter, data: Union[int, float]) -> None:
# Convenience function for logging to counter.
"""Convenience function for logging to counter.
Args:
counter: A counter metric instance.
data: An int or float to increment the count metric.
Returns:
None
"""
counter.increment(data)

def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
# Convenience function for logging list to histogram.
"""Convenience function for logging list to histogram.
Args:
histogram: A histogram metric instance.
data: A list of int or float data to observe into the histogram metric.
Returns:
None
"""
for datum in data:
histogram.observe(datum)

def log(self, stats: VllmStats) -> None:
# System state data
self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys)
self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys)
self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys)

# Iteration level data
self._log_counter(
self.metrics.counter_num_preemption, stats.num_preemption_iter
)
"""Logs tracked stats to triton metrics server every iteration.
Args:
stats: Created by LLMEngine for use by VllmStatLogger.
Returns:
None
"""
self._log_counter(
self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
)
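To make the intent of the convenience loggers above concrete, here is a rough, self-contained sketch of the counter/histogram pattern using plain Python stand-ins. The real code uses pb_utils Metric objects inside the Triton Python backend; the stub classes and the sample values below are assumptions for illustration only.

from typing import List, Union


class StubCounter:
    """Stand-in for a pb_utils counter metric; only tracks a running total."""

    def __init__(self) -> None:
        self.value = 0

    def increment(self, amount: Union[int, float]) -> None:
        self.value += amount


class StubHistogram:
    """Stand-in for a pb_utils histogram metric; only records raw observations."""

    def __init__(self) -> None:
        self.observations: List[float] = []

    def observe(self, datum: Union[int, float]) -> None:
        self.observations.append(datum)


def _log_counter(counter: StubCounter, data: Union[int, float]) -> None:
    # Mirrors the _log_counter helper above: one increment per iteration.
    counter.increment(data)


def _log_histogram(histogram: StubHistogram, data: List[float]) -> None:
    # Mirrors the _log_histogram helper above: one observation per datum.
    for datum in data:
        histogram.observe(datum)


counter_prompt_tokens = StubCounter()
histogram_time_to_first_token = StubHistogram()

# One simulated engine iteration (values are made up).
_log_counter(counter_prompt_tokens, 18)
_log_histogram(histogram_time_to_first_token, [0.12, 0.08, 0.15])

print(counter_prompt_tokens.value)                  # 18
print(histogram_time_to_first_token.observations)   # [0.12, 0.08, 0.15]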
