From 58dad76e735c2a9bcdc8db5f646c484d918c48bf Mon Sep 17 00:00:00 2001 From: Cathal O'Brien Date: Mon, 11 Nov 2024 09:33:45 +0100 Subject: [PATCH] Change how mlflow measures CPU memory usage (#94) * Custom CPU Monitor * Replace old usage formula with new formula * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * passing pre-commit * update changelog --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 1 + .../training/diagnostics/mlflow/logger.py | 51 ++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1fef642..6ef73523 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -106,6 +106,7 @@ Keep it human-readable, your future self will thank you! - Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46) - Remove credential prompt from mlflow login, replace with seed refresh token via web - [#78](https://github.com/ecmwf/anemoi-training/pull/78) - Update CODEOWNERS +- Change how mlflow measures CPU Memory usage - [#94](https://github.com/ecmwf/anemoi-training/pull/94) ## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16 diff --git a/src/anemoi/training/diagnostics/mlflow/logger.py b/src/anemoi/training/diagnostics/mlflow/logger.py index ad07daa6..7c482ce1 100644 --- a/src/anemoi/training/diagnostics/mlflow/logger.py +++ b/src/anemoi/training/diagnostics/mlflow/logger.py @@ -433,10 +433,59 @@ def experiment(self) -> MLFlowLogger.experiment: def log_system_metrics(self) -> None: """Log system metrics (CPU, GPU, etc).""" import mlflow + import psutil + from mlflow.system_metrics.metrics.base_metrics_monitor import BaseMetricsMonitor + from mlflow.system_metrics.metrics.disk_monitor import DiskMonitor + from mlflow.system_metrics.metrics.gpu_monitor import 
GPUMonitor
+        from mlflow.system_metrics.metrics.network_monitor import NetworkMonitor
         from mlflow.system_metrics.system_metrics_monitor import SystemMetricsMonitor
 
+        class CustomCPUMonitor(BaseMetricsMonitor):
+            """Monitor CPU utilisation and system-memory usage.
+
+            Replacement for MLflow's default CPUMonitor that also reports total
+            memory and computes used memory as ``total - available``.
+
+            """
+
+            def collect_metrics(self) -> None:
+                """Append one sample of each CPU/memory metric to ``self._metrics``."""
+                cpu_percent = psutil.cpu_percent()
+                self._metrics["cpu_utilization_percentage"].append(cpu_percent)
+
+                system_memory = psutil.virtual_memory()
+                # Change the formula for measuring CPU memory usage
+                # By default Mlflow uses psutil.virtual_memory().used
+                # Tests have shown that "used" underreports memory usage by as much as a factor of 2,
+                # "used" also misses increased memory usage from using a higher prefetch factor
+                used_bytes = system_memory.total - system_memory.available
+                self._metrics["system_memory_usage_megabytes"].append(used_bytes / 1e6)
+                self._metrics["system_memory_usage_percentage"].append(system_memory.percent)
+
+                # QOL: report the total system memory in raw numbers
+                self._metrics["system_memory_total_megabytes"].append(system_memory.total / 1e6)
+
+            def aggregate_metrics(self) -> dict[str, float]:
+                """Return each metric's mean over the collected samples, rounded to one decimal."""
+                return {k: round(sum(v) / len(v), 1) for k, v in self._metrics.items()}
+
+        class CustomSystemMetricsMonitor(SystemMetricsMonitor):
+            """SystemMetricsMonitor whose CPU monitor is ``CustomCPUMonitor``."""
+
+            def __init__(self, run_id: str, resume_logging: bool = False) -> None:
+                super().__init__(run_id, resume_logging=resume_logging)
+
+                # Replace the CPUMonitor with custom implementation
+                self.monitors = [CustomCPUMonitor(), DiskMonitor(), NetworkMonitor()]
+                try:
+                    self.monitors.append(GPUMonitor())
+                except Exception as err:  # noqa: BLE001
+                    # GPU metrics are best effort. NOTE(review): GPUMonitor may raise more than
+                    # ImportError (e.g. when NVML cannot be initialised) - confirm for the pinned MLflow.
+                    LOGGER.warning("Skip logging GPU metrics, creating `GPUMonitor` failed: %s", err)
+
         mlflow.enable_system_metrics_logging()
-        system_monitor = SystemMetricsMonitor(
+        system_monitor = CustomSystemMetricsMonitor(
             self.run_id,
resume_logging=self.run_id is not None, )