From e8d70d52df98688dcd47166d5ebf23de094f45c9 Mon Sep 17 00:00:00 2001 From: Ivan Zubenko Date: Tue, 21 May 2024 18:50:49 +0200 Subject: [PATCH] ignore non platform nodes in capacity calculation (#915) --- platform_monitoring/jobs_service.py | 10 +++------- tests/unit/test_jobs_service.py | 12 +----------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/platform_monitoring/jobs_service.py b/platform_monitoring/jobs_service.py index 3513b38e..daa10f26 100644 --- a/platform_monitoring/jobs_service.py +++ b/platform_monitoring/jobs_service.py @@ -277,15 +277,11 @@ async def _get_available_resources_by_node_pool( nodes = await self._kube_client.get_nodes( label_selector=self._kube_node_pool_label ) + nodes_by_name = {node.metadata.name: node for node in nodes} for node_name, node_pods in self._get_pods_by_node(pods).items(): - for node in nodes: - if node.metadata.name == node_name: - break - else: - raise NodeNotFoundException(node_name) - node_pool_name = node.metadata.labels.get(self._kube_node_pool_label) - if not node_pool_name: # pragma: no coverage + if not (node := nodes_by_name.get(node_name)): continue + node_pool_name = node.metadata.labels[self._kube_node_pool_label] resource_requests = sum( (pod.resource_requests for pod in node_pods), ContainerResources() ) diff --git a/tests/unit/test_jobs_service.py b/tests/unit/test_jobs_service.py index 77be5662..44188db9 100644 --- a/tests/unit/test_jobs_service.py +++ b/tests/unit/test_jobs_service.py @@ -17,7 +17,7 @@ from neuro_sdk import Jobs as JobsClient from platform_monitoring.container_runtime_client import ContainerRuntimeClientRegistry -from platform_monitoring.jobs_service import JobsService, NodeNotFoundException +from platform_monitoring.jobs_service import JobsService from platform_monitoring.kube_client import KubeClient, Node, Pod @@ -268,13 +268,3 @@ async def test_get_available_jobs_count_for_pods_without_nodes( result = await service.get_available_jobs_counts() assert result == {"cpu": 10, "nvidia-gpu": 2, "amd-gpu": 4, "cpu-p": 0} - - async def test_get_available_jobs_count_node_not_found( - self, service: JobsService, kube_client: mock.Mock - ) -> None: - kube_client.get_pods.side_effect = get_pods_factory( - create_pod("unknown", cpu_m=1000, memory=2**30) - ) - - with pytest.raises(NodeNotFoundException, match="Node 'unknown' was not found"): - await service.get_available_jobs_counts()