From bbe8569ae54034fe322383295d7d0500fd0136c5 Mon Sep 17 00:00:00 2001 From: lkomali Date: Wed, 21 Aug 2024 15:38:26 -0700 Subject: [PATCH] Fix exception thrown by genai-perf if metrics url is unreachable --- .../telemetry_data_collector.py | 15 +++++++ genai-perf/genai_perf/wrapper.py | 7 +++- .../tests/test_telemetry_data_collector.py | 41 +++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py b/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py index 79002ce4..64a0873f 100755 --- a/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py +++ b/genai-perf/genai_perf/telemetry_data/telemetry_data_collector.py @@ -46,6 +46,16 @@ def __init__( self._stop_event = Event() self._thread: Optional[Thread] = None + def check_url_reachability(self) -> bool: + """Check if the server metrics URL is reachable""" + if self._server_metrics_url: + try: + response = requests.get(self._server_metrics_url, timeout=5) + return response.status_code == requests.codes.ok + except requests.RequestException: + return False + return True + def start(self) -> None: """Start the telemetry data collection thread.""" if self._thread is None or not self._thread.is_alive(): @@ -81,3 +91,8 @@ def _collect_metrics(self) -> None: def metrics(self) -> TelemetryMetrics: """Return the collected metrics.""" return self._metrics + + @property + def metrics_url(self) -> str: + """Return server metrics url""" + return self._server_metrics_url diff --git a/genai-perf/genai_perf/wrapper.py b/genai-perf/genai_perf/wrapper.py index 8951cd35..d589fc6a 100644 --- a/genai-perf/genai_perf/wrapper.py +++ b/genai-perf/genai_perf/wrapper.py @@ -152,7 +152,12 @@ def run( ) -> None: try: if telemetry_data_collector is not None: - telemetry_data_collector.start() + if telemetry_data_collector.check_url_reachability(): + telemetry_data_collector.start() + else: + logger.warning( + f"The metrics url ({telemetry_data_collector.metrics_url}) is unreachable, cannot collect telemetry data" + ) cmd = Profiler.build_cmd(args, extra_args) logger.info(f"Running Perf Analyzer : '{' '.join(cmd)}'") if args and args.verbose: diff --git a/genai-perf/tests/test_telemetry_data_collector.py b/genai-perf/tests/test_telemetry_data_collector.py index 441c6e31..d605a745 100755 --- a/genai-perf/tests/test_telemetry_data_collector.py +++ b/genai-perf/tests/test_telemetry_data_collector.py @@ -143,3 +143,44 @@ def test_collect_metrics( self.triton_metrics_response ) mock_sleep.assert_called_once() + + @patch("requests.get") + def test_url_reachability_check_success( + self, + mock_get: MagicMock, + collector: MockTelemetryDataCollector, + ) -> None: + mock_get.return_value.status_code = requests.codes.ok # 200 + assert collector.check_url_reachability() is True + + @patch("requests.get") + def test_url_reachability_check_failure( + self, mock_get: MagicMock, collector: MockTelemetryDataCollector + ) -> None: + # Simulate a 404 Not Found error + mock_get.return_value.status_code = requests.codes.not_found # 404 + assert collector.check_url_reachability() is False + + # Simulate a 500 Internal Server Error + mock_get.return_value.status_code = requests.codes.server_error # 500 + assert collector.check_url_reachability() is False + + # Simulate a 403 Forbidden error + mock_get.return_value.status_code = requests.codes.forbidden # 403 + assert collector.check_url_reachability() is False + + # Simulate a timeout exception + mock_get.side_effect = requests.exceptions.Timeout + assert collector.check_url_reachability() is False + + # Simulate a connection error + mock_get.side_effect = requests.exceptions.ConnectionError + assert collector.check_url_reachability() is False + + # Simulate too many redirects + mock_get.side_effect = requests.exceptions.TooManyRedirects + assert collector.check_url_reachability() is False + + # Simulate a generic request exception + mock_get.side_effect = requests.exceptions.RequestException + assert collector.check_url_reachability() is False