Merge remote-tracking branch 'origin/develop' into netatmo
havardhhaugen committed Nov 26, 2024
2 parents 5749e2f + fa43078 commit bc3dfbc
Showing 17 changed files with 532 additions and 145 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -11,7 +11,14 @@ Keep it human-readable, your future self will thank you!
## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.3.0...HEAD)
### Fixed

- Fixed bug in power spectra plotting for the n320 resolution.
- Allow histogram and spectrum plot for one variable [#165](https://github.com/ecmwf/anemoi-training/pull/165)

### Added
- Introduce variable to configure the (Cosine Annealing) optimizer warm-up [#155](https://github.com/ecmwf/anemoi-training/pull/155)

- Add reader groups to reduce CPU memory usage and increase dataloader throughput [#76](https://github.com/ecmwf/anemoi-training/pull/76)

### Changed
## [0.3.0 - Loss & Callback Refactors](https://github.com/ecmwf/anemoi-training/compare/0.2.2...0.3.0) - 2024-11-14
@@ -47,6 +54,7 @@ Keep it human-readable, your future self will thank you!
- Feat: Save a gif for longer rollouts in validation [#65](https://github.com/ecmwf/anemoi-training/pull/65)
- New limited area config file added, limited_area.yaml. [#134](https://github.com/ecmwf/anemoi-training/pull/134/)
- New stretched grid config added, stretched_grid.yaml [#133](https://github.com/ecmwf/anemoi-training/pull/133)
- Custom System monitor for Nvidia and AMD GPUs [#147](https://github.com/ecmwf/anemoi-training/pull/147)

### Changed

@@ -112,6 +120,7 @@ Keep it human-readable, your future self will thank you!
- Feature: Support training for datasets with missing time steps [#48](https://github.com/ecmwf/anemoi-training/pulls/48)
- Feature: `AnemoiMlflowClient`, an mlflow client with authentication support [#86](https://github.com/ecmwf/anemoi-training/pull/86)
- Long Rollout Plots
- Mask NaN values in training loss function [#72](https://github.com/ecmwf/anemoi-training/pull/72) and [#271](https://github.com/ecmwf-lab/aifs-mono/issues/271)

### Fixed

4 changes: 4 additions & 0 deletions docs/user-guide/distributed.rst
@@ -45,6 +45,10 @@ number of GPUs you wish to shard the model across. It is recommended to
only shard if the model does not fit in GPU memory, as data distribution
is a much more efficient way to parallelise the training.

When using model sharding, ``config.dataloader.read_group_size`` allows
for sharded data loading in subgroups. This should be set to the number
of GPUs per model for optimal performance.
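
As a rough illustration of the mechanism (a simplified sketch, not the
anemoi-training implementation; the process-group handle and tensor
shapes are assumptions), each reader loads only its slice of the grid
dimension and the slices are then all-gathered within the group:

.. code:: python

   import torch
   import torch.distributed as dist

   def load_with_reader_group(data, reader_group, rank, group_size):
       """Read 1/group_size of the grid, then all-gather the full grid."""
       grid_size = data.shape[-1]
       shard = grid_size // group_size  # assume an even split for the sketch
       local = torch.as_tensor(data[..., rank * shard : (rank + 1) * shard])
       gathered = [torch.empty_like(local) for _ in range(group_size)]
       dist.all_gather(gathered, local, group=reader_group)
       return torch.cat(gathered, dim=-1)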

*********
Example
*********
15 changes: 10 additions & 5 deletions docs/user-guide/training.rst
@@ -188,10 +188,11 @@ level has a weighting less than 0.2).
***************

Anemoi training uses the ``CosineLRScheduler`` from PyTorch as its
learning rate scheduler. The user can configure the maximum learning
rate by setting ``config.training.lr.rate``. Note that this learning
rate is scaled by the number of GPUs used for `data parallelism
<distributed>`_.
learning rate scheduler. Docs for this scheduler can be found at
https://github.com/huggingface/pytorch-image-models/blob/main/timm/scheduler/cosine_lr.py
The user can configure the maximum learning rate by setting
``config.training.lr.rate``. Note that this learning rate is scaled by
the number of GPUs used for `data parallelism <distributed>`_.

.. code:: yaml
@@ -201,7 +202,11 @@ The user can also control the rate at which the learning rate decreases
by setting the total number of iterations through
``config.training.lr.iterations`` and the minimum learning rate reached
through ``config.training.lr.min``. Note that the minimum learning rate
is not scaled by the number of GPUs.
is not scaled by the number of GPUs. The user can also control the
warmup period by setting ``config.training.lr.warmup_t``. If the warmup
period is set to 0, the learning rate will start at the maximum learning
rate. If no warmup period is defined, a default warmup period of 1000
iterations is used.
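
As a hedged illustration of how these options map onto timm's
``CosineLRScheduler`` (the model, optimizer and step counts below are
placeholders, not Anemoi internals):

.. code:: python

   import torch
   from timm.scheduler.cosine_lr import CosineLRScheduler

   model = torch.nn.Linear(8, 8)  # placeholder model
   optimizer = torch.optim.AdamW(model.parameters(), lr=0.625e-4)  # lr.rate

   scheduler = CosineLRScheduler(
       optimizer,
       t_initial=300_000,   # lr.iterations (example value)
       lr_min=3e-7,         # lr.min, not scaled by the number of GPUs
       warmup_t=1000,       # lr.warmup_t
       warmup_lr_init=0.0,
       t_in_epochs=False,   # step the schedule per optimizer iteration
   )

   for step in range(10):
       optimizer.step()
       scheduler.step_update(num_updates=step + 1)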

*********
Rollout
11 changes: 11 additions & 0 deletions src/anemoi/training/config/dataloader/native_grid.yaml
@@ -1,6 +1,17 @@
prefetch_factor: 2
pin_memory: True

# ============
# read_group_size:
# Form subgroups of model comm groups that read data together.
# Each reader in the group only reads 1/read_group_size of the data
# which is then all-gathered between the group.
# This can reduce CPU memory usage as well as increase dataloader throughput.
# The number of GPUs per model must be divisible by read_group_size.
# To disable, set to 1.
# ============
read_group_size: ${hardware.num_gpus_per_model}

num_workers:
training: 8
validation: 8
15 changes: 6 additions & 9 deletions src/anemoi/training/config/training/default.yaml
@@ -43,19 +43,15 @@ zero_optimizer: False

# loss function for the model
training_loss:
- _target_: anemoi.training.losses.mse.WeightedMSELoss
scalars: ['variable']
ignore_nans: False
- _target_: anemoi.training.losses.mse.WeightedMSELoss
scalars: ['variable']
ignore_nans: False
# loss class to initialise
# _target_: anemoi.training.losses.mse.WeightedMSELoss
_target_: anemoi.training.losses.mse.WeightedMSELoss
# Scalars to include in loss calculation
# Available scalars include:
# - 'variable': See `variable_loss_scaling` for more information
# scalars: ['variable']
# ignore_nans: False
# - 'loss_weights_mask': Giving imputed NaNs a zero weight in the loss function
scalars: ['variable', 'loss_weights_mask']

ignore_nans: False

loss_gradient_scaling: False

Expand Down Expand Up @@ -87,6 +83,7 @@ lr:
rate: 0.625e-4 #local_lr
iterations: ${training.max_steps} # NOTE: When max_epochs < max_steps, scheduler will run for max_steps
min: 3e-7 #Not scaled by #GPU
warmup_t: 1000

# Changes in per-gpu batch_size should come with a rescaling of the local_lr
# in order to keep a constant global_lr
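The `loss_weights_mask` scalar introduced above is described as giving imputed NaNs a zero weight in the loss. A minimal sketch of that idea (not the anemoi-training `WeightedMSELoss`; function name, shapes and arguments are hypothetical):

```python
import torch

def weighted_mse(pred, target, node_weights, loss_weights_mask=None):
    """Grid-weighted MSE where masked (NaN-imputed) points get zero weight.

    pred, target: (batch, grid, vars); node_weights: (grid,);
    loss_weights_mask: same shape as pred, 0.0 where the target was imputed.
    """
    weights = node_weights[None, :, None].expand_as(pred)
    if loss_weights_mask is not None:
        weights = weights * loss_weights_mask  # imputed points contribute nothing
    return (weights * (pred - target) ** 2).sum() / weights.sum()
```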
39 changes: 6 additions & 33 deletions src/anemoi/training/data/datamodule.py
@@ -9,7 +9,6 @@


import logging
import os
from functools import cached_property
from typing import Callable

@@ -44,31 +43,6 @@ def __init__(self, config: DictConfig) -> None:

self.config = config

self.global_rank = int(os.environ.get("SLURM_PROCID", "0")) # global rank
self.model_comm_group_id = (
self.global_rank // self.config.hardware.num_gpus_per_model
) # id of the model communication group the rank is participating in
self.model_comm_group_rank = (
self.global_rank % self.config.hardware.num_gpus_per_model
) # rank within one model communication group
total_gpus = self.config.hardware.num_gpus_per_node * self.config.hardware.num_nodes
assert (
total_gpus
) % self.config.hardware.num_gpus_per_model == 0, (
f"GPUs per model {self.config.hardware.num_gpus_per_model} does not divide total GPUs {total_gpus}"
)
self.model_comm_num_groups = (
self.config.hardware.num_gpus_per_node
* self.config.hardware.num_nodes
// self.config.hardware.num_gpus_per_model
) # number of model communication groups
LOGGER.debug(
"Rank %d model communication group number %d, with local model communication group rank %d",
self.global_rank,
self.model_comm_group_id,
self.model_comm_group_rank,
)

# Set the maximum rollout to be expected
self.rollout = (
self.config.training.rollout.max
@@ -143,10 +117,12 @@ def ds_train(self) -> NativeGridDataset:
def ds_valid(self) -> NativeGridDataset:
r = max(self.rollout, self.config.dataloader.get("validation_rollout", 1))

assert self.config.dataloader.training.end < self.config.dataloader.validation.start, (
f"Training end date {self.config.dataloader.training.end} is not before"
f"validation start date {self.config.dataloader.validation.start}"
)
if not self.config.dataloader.training.end < self.config.dataloader.validation.start:
LOGGER.warning(
"Training end date %s is not before validation start date %s.",
self.config.dataloader.training.end,
self.config.dataloader.validation.start,
)
return self._get_dataset(
open_dataset(OmegaConf.to_container(self.config.dataloader.validation, resolve=True)),
shuffle=False,
@@ -183,9 +159,6 @@ def _get_dataset(
rollout=r,
multistep=self.config.training.multistep_input,
timeincrement=self.timeincrement,
model_comm_group_rank=self.model_comm_group_rank,
model_comm_group_id=self.model_comm_group_id,
model_comm_num_groups=self.model_comm_num_groups,
shuffle=shuffle,
label=label,
)
82 changes: 67 additions & 15 deletions src/anemoi/training/data/dataset.py
@@ -36,9 +36,6 @@ def __init__(
rollout: int = 1,
multistep: int = 1,
timeincrement: int = 1,
model_comm_group_rank: int = 0,
model_comm_group_id: int = 0,
model_comm_num_groups: int = 1,
shuffle: bool = True,
label: str = "generic",
) -> None:
@@ -54,12 +51,6 @@ def __init__(
time increment between samples, by default 1
multistep : int, optional
collate (t-1, ... t - multistep) into the input state vector, by default 1
model_comm_group_rank : int, optional
process rank in the torch.distributed group (important when running on multiple GPUs), by default 0
model_comm_group_id: int, optional
device group ID, default 0
model_comm_num_groups : int, optional
total number of device groups, by default 1
shuffle : bool, optional
Shuffle batches, by default True
label : str, optional
@@ -77,11 +68,14 @@ def __init__(
self.n_samples_per_epoch_total: int = 0
self.n_samples_per_epoch_per_worker: int = 0

# DDP-relevant info
self.model_comm_group_rank = model_comm_group_rank
self.model_comm_num_groups = model_comm_num_groups
self.model_comm_group_id = model_comm_group_id
self.global_rank = int(os.environ.get("SLURM_PROCID", "0"))
# lazy init model and reader group info, will be set by the DDPGroupStrategy:
self.model_comm_group_rank = 0
self.model_comm_num_groups = 1
self.model_comm_group_id = 0
self.global_rank = 0

self.reader_group_rank = 0
self.reader_group_size = 1

# additional state vars (lazy init)
self.n_samples_per_worker = 0
@@ -93,6 +87,8 @@ def __init__(
assert self.multi_step > 0, "Multistep value must be greater than zero."
self.ensemble_dim: int = 2
self.ensemble_size = self.data.shape[self.ensemble_dim]
self.grid_dim: int = -1
self.grid_size = self.data.shape[self.grid_dim]

@cached_property
def statistics(self) -> dict:
@@ -128,6 +124,58 @@ def valid_date_indices(self) -> np.ndarray:
"""
return get_usable_indices(self.data.missing, len(self.data), self.rollout, self.multi_step, self.timeincrement)

def set_comm_group_info(
self,
global_rank: int,
model_comm_group_id: int,
model_comm_group_rank: int,
model_comm_num_groups: int,
reader_group_rank: int,
reader_group_size: int,
) -> None:
"""Set model and reader communication group information (called by DDPGroupStrategy).
Parameters
----------
global_rank : int
Global rank
model_comm_group_id : int
Model communication group ID
model_comm_group_rank : int
Model communication group rank
model_comm_num_groups : int
Number of model communication groups
reader_group_rank : int
Reader group rank
reader_group_size : int
Reader group size
"""
self.global_rank = global_rank
self.model_comm_group_id = model_comm_group_id
self.model_comm_group_rank = model_comm_group_rank
self.model_comm_num_groups = model_comm_num_groups
self.reader_group_rank = reader_group_rank
self.reader_group_size = reader_group_size

if self.reader_group_size > 1:
# get the grid shard size and start/end indices
grid_shard_size = self.grid_size // self.reader_group_size
self.grid_start = self.reader_group_rank * grid_shard_size
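# the last reader absorbs any remainder so the shards cover the whole grid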
if self.reader_group_rank == self.reader_group_size - 1:
self.grid_end = self.grid_size
else:
self.grid_end = (self.reader_group_rank + 1) * grid_shard_size

LOGGER.debug(
"NativeGridDataset.set_group_info(): global_rank %d, model_comm_group_id %d, "
"model_comm_group_rank %d, model_comm_num_groups %d, reader_group_rank %d",
global_rank,
model_comm_group_id,
model_comm_group_rank,
model_comm_num_groups,
reader_group_rank,
)

def per_worker_init(self, n_workers: int, worker_id: int) -> None:
"""Called by worker_init_func on each copy of dataset.
@@ -233,7 +281,11 @@ def __iter__(self) -> torch.Tensor:
start = i - (self.multi_step - 1) * self.timeincrement
end = i + (self.rollout + 1) * self.timeincrement

x = self.data[start : end : self.timeincrement]
if self.reader_group_size > 1: # read only a subset of the grid
x = self.data[start : end : self.timeincrement, :, :, self.grid_start : self.grid_end]
else: # read the full grid
x = self.data[start : end : self.timeincrement, :, :, :]

x = rearrange(x, "dates variables ensemble gridpoints -> dates ensemble gridpoints variables")
self.ensemble_dim = 1

4 changes: 2 additions & 2 deletions src/anemoi/training/diagnostics/callbacks/evaluation.py
@@ -15,7 +15,6 @@

import torch
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities import rank_zero_only

if TYPE_CHECKING:
import pytorch_lightning as pl
@@ -103,7 +102,6 @@ def _log(self, pl_module: pl.LightningModule, loss: torch.Tensor, metrics: dict,
rank_zero_only=True,
)

@rank_zero_only
def on_validation_batch_end(
self,
trainer: pl.Trainer,
@@ -114,6 +112,8 @@ def on_validation_batch_end(
) -> None:
del outputs # outputs are not used
if batch_idx % self.every_n_batches == 0:
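# reader groups may shard the grid across ranks, so gather the full batch before evaluating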
batch = pl_module.allgather_batch(batch)

precision_mapping = {
"16-mixed": torch.float16,
"bf16-mixed": torch.bfloat16,