From 389ff8f614606766a8683039012cfc23bb4d9c01 Mon Sep 17 00:00:00 2001 From: wiederm Date: Wed, 21 Aug 2024 13:21:10 +0200 Subject: [PATCH 01/33] small modifications --- modelforge/tests/test_dataset.py | 3 ++- modelforge/train/training.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index 7702892b..e2120562 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -713,7 +713,8 @@ def test_numpy_dataset_assignment(dataset_name): def test_energy_postprocessing(): - # setup test dataset + # test that the mean and stddev of the dataset + # are correct from modelforge.dataset.dataset import DataModule # test the self energy calculation on the QM9 dataset diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 53f5df4b..ebcce9db 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -651,12 +651,12 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: for key, loss in loss_dict.items(): self.log( f"loss/{key}", - torch.mean(loss), + loss, on_step=False, prog_bar=True, on_epoch=True, - batch_size=1, - ) # batch size is 1 because the mean of the batch is logged + sync_dist=True, + ) return loss_dict["total_loss"] @@ -790,7 +790,9 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics[name] = metric.compute() metric.reset() # log dict, print val metrics to console - self.log_dict(metrics, on_epoch=True, prog_bar=(phase == "val")) + self.log_dict( + metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True + ) def configure_optimizers(self): """ From 5b75fed7feb17d0a30361db86515a0af599c7614 Mon Sep 17 00:00:00 2001 From: wiederm Date: Wed, 21 Aug 2024 14:19:48 +0200 Subject: [PATCH 02/33] add batchsize --- modelforge/dataset/dataset.py | 3 +++ modelforge/tests/test_training.py | 8 +++---- modelforge/train/training.py | 38 +++++++++++++++++-------------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 53142d86..c7b4d326 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -238,6 +238,9 @@ def to( return self + def batch_size(self): + return self.metadata.E.size(dim=0) + class TorchDataset(torch.utils.data.Dataset[BatchData]): """ Wraps a numpy dataset to make it compatible with PyTorch DataLoader. 
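
The `batch_size()` method added to the batch container above reads the number of molecules from the leading dimension of the energy tensor; later commits in this series pass that value to Lightning's `self.log(..., batch_size=...)` so that epoch-level averages are weighted correctly. A minimal sketch of the idea, using a stand-in `Metadata` container rather than the real modelforge classes:

    import torch
    from dataclasses import dataclass

    @dataclass
    class Metadata:
        E: torch.Tensor  # per-molecule energies, shape (n_molecules, 1)

    @dataclass
    class BatchData:
        metadata: Metadata

        def batch_size(self) -> int:
            # number of molecules in the batch = leading dimension of E
            return self.metadata.E.size(dim=0)

    batch = BatchData(Metadata(E=torch.zeros(16, 1)))
    assert batch.batch_size() == 16
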
diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 3b514847..76847915 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -108,8 +108,8 @@ def test_train_from_single_toml_file(): def test_error_calculation(single_batch_with_batchsize_16_with_force): # test the different Loss classes from modelforge.train.training import ( - FromPerAtomToPerMoleculeMeanSquaredError, - PerMoleculeMeanSquaredError, + FromPerAtomToPerMoleculeSquaredError, + PerMoleculeSquaredError, ) # generate data @@ -122,7 +122,7 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): predicted_F = true_F + torch.rand_like(true_F) * 10 # test error for property with shape (nr_of_molecules, 1) - error = PerMoleculeMeanSquaredError() + error = PerMoleculeSquaredError() E_error = error(predicted_E, true_E, data) # compare output (mean squared error scaled by number of atoms in the molecule) @@ -135,7 +135,7 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): assert torch.allclose(E_error, reference_E_error) # test error for property with shape (nr_of_atoms, 3) - error = FromPerAtomToPerMoleculeMeanSquaredError() + error = FromPerAtomToPerMoleculeSquaredError() F_error = error(predicted_F, true_F, data) # compare error (mean squared error scaled by number of atoms in the molecule) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index ebcce9db..97057941 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -25,10 +25,10 @@ __all__ = [ "Error", - "FromPerAtomToPerMoleculeMeanSquaredError", + "FromPerAtomToPerMoleculeSquaredError", "Loss", "LossFactory", - "PerMoleculeMeanSquaredError", + "PerMoleculeSquaredError", "ModelTrainer", "create_error_metrics", "ModelTrainer", @@ -107,7 +107,7 @@ def scale_by_number_of_atoms( return scaled_by_number_of_atoms -class FromPerAtomToPerMoleculeMeanSquaredError(Error): +class FromPerAtomToPerMoleculeSquaredError(Error): """ Calculates the per-atom error and aggregates it to per-molecule mean squared error. """ @@ -170,11 +170,11 @@ def forward( batch.metadata.atomic_subsystem_counts, prefactor=per_atom_prediction.shape[-1], ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + return per_molecule_square_error_scaled -class PerMoleculeMeanSquaredError(Error): + +class PerMoleculeSquaredError(Error): """ Calculates the per-molecule mean squared error. 
""" @@ -214,8 +214,7 @@ def forward( batch.metadata.atomic_subsystem_counts, ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + return per_molecule_square_error_scaled def calculate_error( self, @@ -259,15 +258,15 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): for prop, w in weight.items(): if prop in self._SUPPORTED_PROPERTIES: if prop == "per_atom_force": - self.loss[prop] = FromPerAtomToPerMoleculeMeanSquaredError( + self.loss[prop] = FromPerAtomToPerMoleculeSquaredError( scale_by_number_of_atoms=True ) elif prop == "per_atom_energy": - self.loss[prop] = PerMoleculeMeanSquaredError( + self.loss[prop] = PerMoleculeSquaredError( scale_by_number_of_atoms=True ) # FIXME: this is currently not working elif prop == "per_molecule_energy": - self.loss[prop] = PerMoleculeMeanSquaredError( + self.loss[prop] = PerMoleculeSquaredError( scale_by_number_of_atoms=False ) self.register_buffer(prop, torch.tensor(w)) @@ -638,27 +637,32 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: The loss tensor computed for the current training step. """ - # calculate energy and forces + # calculate energy and forces, Note that `predict_target` is a + # dictionary containing the predicted and true values for energy and + # force` predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss + # calculate the loss (for every entry in predict_target the squared + # error is calculated) loss_dict = self.loss(predict_target, batch) - # Update and log training error - self._update_metrics(self.train_error, predict_target) + # Update and log training error (if requested) + if self.log_on_training_step: + self._update_metrics(self.train_error, predict_target) # log the loss (this includes the individual contributions that the loss contains) for key, loss in loss_dict.items(): self.log( f"loss/{key}", - loss, + torch.mean(loss), on_step=False, prog_bar=True, on_epoch=True, sync_dist=True, + batch_size=batch.batch_size(), ) - return loss_dict["total_loss"] + return torch.mean(loss_dict["total_loss"]) @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: From 2876384ccbb663306dc609fbb58590dfce1179ff Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 16:13:50 +0200 Subject: [PATCH 03/33] update --- modelforge/dataset/dataset.py | 2 +- modelforge/tests/conftest.py | 28 ++-------- .../tests/data/training_defaults/default.toml | 6 +- modelforge/tests/test_dataset.py | 5 +- modelforge/tests/test_models.py | 55 ++++++++++++------- modelforge/tests/test_nn.py | 8 ++- modelforge/tests/test_painn.py | 11 ++-- modelforge/tests/test_physnet.py | 6 +- modelforge/tests/test_sake.py | 21 ++++--- modelforge/tests/test_training.py | 12 ++-- modelforge/train/training.py | 49 +++++++++++------ scripts/config.toml | 21 ++++--- 12 files changed, 125 insertions(+), 99 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index c7b4d326..440c867b 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1343,7 +1343,7 @@ def _per_datapoint_operations( from tqdm import tqdm # remove the self energies if requested - log.info("Precalculating pairlist for dataset") + log.info("Performing per datapoint operations in the dataset dataset") if self.remove_self_energies: log.info("Removing self energies from the dataset") diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index 7e8e475f..f7ccd939 100644 --- 
a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -96,35 +96,15 @@ def single_batch(batch_size: int = 64, dataset_name="QM9"): @pytest.fixture(scope="session") -def single_batch_with_batchsize_64(): +def single_batch_with_batchsize(): """ Utility fixture to create a single batch of data for testing. """ - return single_batch(batch_size=64) + def _create_single_batch(batch_size: int, dataset_name: str): + return single_batch(batch_size=batch_size, dataset_name=dataset_name) -@pytest.fixture(scope="session") -def single_batch_with_batchsize_1(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=1) - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_2_with_force(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=2, dataset_name="PHALKETHOH") - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_16_with_force(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=16, dataset_name="PHALKETHOH") + return _create_single_batch def initialize_dataset( diff --git a/modelforge/tests/data/training_defaults/default.toml b/modelforge/tests/data/training_defaults/default.toml index 19c0d3b7..1e4025a9 100644 --- a/modelforge/tests/data/training_defaults/default.toml +++ b/modelforge/tests/data/training_defaults/default.toml @@ -35,11 +35,11 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = ['per_molecule_energy', 'per_atom_force'] # use +loss_property = ['per_molecule_energy'] #, 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.999 #NOTE: reciprocal units -per_atom_force = 0.001 +per_molecule_energy = 1.0 #NOTE: reciprocal units +#per_atom_force = 0.001 [training.early_stopping] diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index e2120562..b591fa30 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -458,10 +458,11 @@ def test_data_item_format_of_datamodule( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_dataset_neighborlist(potential_name, single_batch_with_batchsize_64): +def test_dataset_neighborlist(potential_name, single_batch_with_batchsize): """Test the neighborlist.""" - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = single_batch_with_batchsize(64, 'QM9') + nnp_input = batch.nnp_input # test that the neighborlist is correctly generated from modelforge.tests.test_models import load_configs_into_pydantic_models diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 9709c6b0..58602e31 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -54,11 +54,13 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_JAX_wrapping(potential_name, single_batch_with_batchsize_64): +def test_JAX_wrapping(potential_name, single_batch_with_batchsize): from modelforge.potential.models import ( NeuralNetworkPotentialFactory, ) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") @@ -70,7 +72,7 @@ def test_JAX_wrapping(potential_name, 
single_batch_with_batchsize_64): ) assert "JAX" in str(type(model)) - nnp_input = single_batch_with_batchsize_64.nnp_input.as_jax_namedtuple() + nnp_input = batch.nnp_input.as_jax_namedtuple() out = model(nnp_input)["per_molecule_energy"] import jax @@ -329,13 +331,14 @@ def test_dataset_statistic(potential_name): "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_energy_between_simulation_environments( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): # compare that the energy is the same for the JAX and PyTorch Model import numpy as np import torch - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the forward pass through each of the models # cast input and model to torch.float64 # read default parameters @@ -416,20 +419,24 @@ def test_forward_pass_with_all_datasets( assert torch.all(pair_list[0, 1:] >= pair_list[0, :-1]) +@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) @pytest.mark.parametrize("simulation_environment", ["JAX", "PyTorch"]) def test_forward_pass( - potential_name, simulation_environment, single_batch_with_batchsize_64 + dataset_name, potential_name, simulation_environment, single_batch_with_batchsize ): # this test sends a single batch from different datasets through the model import torch - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=6, dataset_name=dataset_name) + nnp_input = batch.nnp_input # read default parameters - config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") + config = load_configs_into_pydantic_models( + f"{potential_name.lower()}", dataset_name + ) nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] # test the forward pass through each of the models @@ -443,6 +450,7 @@ def test_forward_pass( output = model(nnp_input) + # test that we get an energie per molecule assert len(output["per_molecule_energy"]) == nr_of_mols @@ -450,8 +458,13 @@ def test_forward_pass( # which have chemically equivalent hydrogens at the minimum geometry. # This has to be reflected in the atomic energies E_i, which # have to be equal for all hydrogens - if "JAX" not in str(type(model)): - from loguru import logger as log + if "JAX" not in str(type(model)) and dataset_name == "QM9": + # make sure that we are correctly reducing + ref = torch.zeros_like(output["per_molecule_energy"]).scatter_add_( + 0, nnp_input.atomic_subsystem_indices.long(), output["per_atom_energy"] + ) + + assert torch.allclose(ref, output["per_molecule_energy"]) # assert that the following tensor has equal values for dim=0 index 1 to 4 and 6 to 8 @@ -478,17 +491,18 @@ def test_forward_pass( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize_64): +def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize): """ Test the calculation of energies and forces for a molecule. 
""" import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") # get batch - nnp_input = single_batch_with_batchsize_64.nnp_input + nnp_input = batch.nnp_input # test the pass through each of the models model_training = NeuralNetworkPotentialFactory.generate_potential( @@ -536,7 +550,7 @@ def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsi "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_calculate_energies_and_forces_with_jax( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): """ Test the calculation of energies and forces for a molecule. @@ -545,8 +559,8 @@ def test_calculate_energies_and_forces_with_jax( # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") - - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the backward pass through each of the models nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] nr_of_atoms_per_batch = nnp_input.atomic_subsystem_indices.shape[0] @@ -929,11 +943,11 @@ def test_pairlist_on_dataset(): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_casting(potential_name, single_batch_with_batchsize_64): +def test_casting(potential_name, single_batch_with_batchsize): # test dtype casting import torch - batch = single_batch_with_batchsize_64 + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") batch_ = batch.to(dtype=torch.float64) assert batch_.nnp_input.positions.dtype == torch.float64 batch_ = batch_.to(dtype=torch.float32) @@ -978,7 +992,7 @@ def test_casting(potential_name, single_batch_with_batchsize_64): def test_equivariant_energies_and_forces( potential_name, simulation_environment, - single_batch_with_batchsize_64, + single_batch_with_batchsize, equivariance_utils, ): """ @@ -1001,7 +1015,8 @@ def test_equivariant_energies_and_forces( translation, rotation, reflection = equivariance_utils # define the tolerance atol = 1e-3 - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # initialize the models model = model.to(dtype=torch.float64) @@ -1009,7 +1024,9 @@ def test_equivariant_energies_and_forces( # ------------------- # # start the test # reference values - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input.to(dtype=torch.float64) reference_result = model(nnp_input)["per_molecule_energy"].to(dtype=torch.float64) reference_forces = -torch.autograd.grad( reference_result.sum(), diff --git a/modelforge/tests/test_nn.py b/modelforge/tests/test_nn.py index e399550a..a5b0b276 100644 --- a/modelforge/tests/test_nn.py +++ b/modelforge/tests/test_nn.py @@ -1,14 +1,16 @@ from .test_models import load_configs_into_pydantic_models -def test_embedding(single_batch_with_batchsize_64): +def test_embedding(single_batch_with_batchsize): # test the input featurization, including: # - nuclear charge embedding # - total charge mixing - import torch + import torch # noqa: F401 - nnp_input = 
single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input model_name = "SchNet" # read default parameters and extract featurization config = load_configs_into_pydantic_models(f"{model_name.lower()}", "qm9") diff --git a/modelforge/tests/test_painn.py b/modelforge/tests/test_painn.py index 647d9901..3dc60d48 100644 --- a/modelforge/tests/test_painn.py +++ b/modelforge/tests/test_painn.py @@ -2,7 +2,7 @@ from modelforge.potential.painn import PaiNN -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """Test initialization of the PaiNN neural network potential.""" # read default parameters from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -17,8 +17,9 @@ def test_forward(single_batch_with_batchsize_64): ], ) assert painn is not None, "PaiNN model should be initialized." + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32) + nnp_input = batch.nnp_input.to(dtype=torch.float32) energy = painn(nnp_input)["per_molecule_energy"] nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] @@ -27,11 +28,13 @@ def test_forward(single_batch_with_batchsize_64): ) # Assuming energy is calculated per sample in the batch -def test_equivariance(single_batch_with_batchsize_64): +def test_equivariance(single_batch_with_batchsize): from modelforge.potential.painn import PaiNN from dataclasses import replace import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + from modelforge.tests.test_models import load_configs_into_pydantic_models # read default parameters @@ -50,7 +53,7 @@ def test_equivariance(single_batch_with_batchsize_64): ], ).double() - methane_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + methane_input = batch.nnp_input.to(dtype=torch.float64) perturbed_methane_input = replace(methane_input) perturbed_methane_input.positions = torch.matmul( methane_input.positions, rotation_matrix diff --git a/modelforge/tests/test_physnet.py b/modelforge/tests/test_physnet.py index 8aadca05..1601654d 100644 --- a/modelforge/tests/test_physnet.py +++ b/modelforge/tests/test_physnet.py @@ -15,7 +15,7 @@ def test_init(): ) -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): import torch from modelforge.potential.physnet import PhysNet @@ -37,7 +37,9 @@ def test_forward(single_batch_with_batchsize_64): ) model = model.to(torch.float32) print(model) - yhat = model(single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32)) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + yhat = model(batch.nnp_input.to(dtype=torch.float32)) def test_compare_representation(): diff --git a/modelforge/tests/test_sake.py b/modelforge/tests/test_sake.py index 8fff65ba..b3f36ee1 100644 --- a/modelforge/tests/test_sake.py +++ b/modelforge/tests/test_sake.py @@ -33,12 +33,13 @@ def test_init(): from openff.units import unit -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """ Test the forward pass of the SAKE model. 
""" # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + methane = batch.nnp_input from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -91,7 +92,7 @@ def test_interaction_forward(): @pytest.mark.parametrize("eq_atol", [3e-1]) @pytest.mark.parametrize("h_atol", [8e-2]) -def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): +def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize): import torch from modelforge.potential.sake import SAKE from dataclasses import replace @@ -118,7 +119,9 @@ def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): ) # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + methane = batch.nnp_input perturbed_methane_input = replace(methane) perturbed_methane_input.positions = torch.matmul(methane.positions, rotation_matrix) @@ -424,7 +427,7 @@ def test_sake_layer_against_reference(include_self_pairs, v_is_none): # FIXME: this test is currently failing @pytest.mark.xfail -def test_model_against_reference(single_batch_with_batchsize_1): +def test_model_against_reference(single_batch_with_batchsize): nr_heads = 5 key = jax.random.PRNGKey(1884) torch.manual_seed(1884) @@ -462,7 +465,8 @@ def test_model_against_reference(single_batch_with_batchsize_1): ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1) + methane = batch.nnp_input pairlist_output = mf_sake.compute_interacting_pairs.prepare_inputs(methane) prepared_methane = mf_sake.core_module._model_specific_input_preparation( methane, pairlist_output @@ -605,7 +609,7 @@ def test_model_against_reference(single_batch_with_batchsize_1): # assert torch.allclose(mf_out.E, torch.from_numpy(onp.array(ref_out[0]))) -def test_model_invariance(single_batch_with_batchsize_1): +def test_model_invariance(single_batch_with_batchsize): from dataclasses import replace from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -620,7 +624,8 @@ def test_model_invariance(single_batch_with_batchsize_1): ], ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1) + methane = batch.nnp_input rotation_matrix = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) perturbed_methane_input = replace(methane) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 76847915..b50f6454 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,7 +80,7 @@ def get_trainer(potential_name: str, dataset_name: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -@pytest.mark.parametrize("dataset_name", ["QM9"]) +@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) def test_train_with_lightning(potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
@@ -105,7 +105,7 @@ def test_train_from_single_toml_file(): read_config_and_train(config_path) -def test_error_calculation(single_batch_with_batchsize_16_with_force): +def test_error_calculation(single_batch_with_batchsize): # test the different Loss classes from modelforge.train.training import ( FromPerAtomToPerMoleculeSquaredError, @@ -113,7 +113,9 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): ) # generate data - data = single_batch_with_batchsize_16_with_force + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") + + data = batch true_E = data.metadata.E true_F = data.metadata.F @@ -158,10 +160,10 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): assert torch.allclose(F_error, reference_F_error) -def test_loss(single_batch_with_batchsize_16_with_force): +def test_loss(single_batch_with_batchsize): from modelforge.train.training import Loss - batch = single_batch_with_batchsize_16_with_force + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") loss_porperty = ["per_molecule_energy", "per_atom_force"] loss_weights = {"per_molecule_energy": 0.5, "per_atom_force": 0.5} diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 97057941..84b19100 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -267,7 +267,7 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): ) # FIXME: this is currently not working elif prop == "per_molecule_energy": self.loss[prop] = PerMoleculeSquaredError( - scale_by_number_of_atoms=False + scale_by_number_of_atoms=True ) self.register_buffer(prop, torch.tensor(w)) else: @@ -376,20 +376,18 @@ def create_error_metrics(loss_properties: List[str]) -> ModuleDict: class CalculateProperties(torch.nn.Module): - def __init__(self): + def __init__(self, requested_properties: List[str]): """ A utility class for calculating properties such as energies and forces from batches using a neural network model. - Methods - ------- - _get_forces(batch: BatchData, energies: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor] - Computes the forces from a given batch using the model. - _get_energies(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies from a given batch using the model. - forward(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies and forces from a given batch using the model. + Parameters + """ super().__init__() + self.requested_properties = requested_properties + self.include_force = False + if "force" in self.requested_properties: + self.include_force = True def _get_forces( self, batch: "BatchData", energies: Dict[str, torch.Tensor] @@ -490,7 +488,10 @@ def forward( The true and predicted energies and forces from the dataset and the model. 
""" energies = self._get_energies(batch, model) - forces = self._get_forces(batch, energies) + if self.include_force: + forces = self._get_forces(batch, energies) + else: + forces = {} return {**energies, **forces} @@ -542,7 +543,9 @@ def __init__( potential_seed=potential_seed, ) - self.calculate_predictions = CalculateProperties() + self.calculate_predictions = CalculateProperties( + training_parameter.loss_parameter.loss_property + ) self.optimizer = training_parameter.optimizer self.learning_rate = training_parameter.lr self.lr_scheduler = training_parameter.lr_scheduler @@ -659,10 +662,12 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: prog_bar=True, on_epoch=True, sync_dist=True, - batch_size=batch.batch_size(), + batch_size=1, # batch.batch_size(), ) - return torch.mean(loss_dict["total_loss"]) + loss = torch.mean(loss_dict["total_loss"]) + + return loss @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -685,10 +690,15 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: batch.nnp_input.positions.requires_grad_(True) # calculate energy and forces predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss - loss = self.loss(predict_target, batch) - # log the loss self._update_metrics(self.val_error, predict_target) + # calculate the MSE with torch + l1 = torch.nn.functional.l1_loss( + predict_target["per_molecule_energy_predict"], + predict_target["per_molecule_energy_true"], + ) + + self.mae_validation_set += l1.item() + self.nr_of_batches += 1 @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -798,6 +808,11 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True ) + mse_loss = self.mse_training_set / self.nr_of_batches + mae_val = self.mae_validation_set / self.nr_of_batches + + a = 7 + def configure_optimizers(self): """ Configures the model's optimizers (and optionally schedulers). 
diff --git a/scripts/config.toml b/scripts/config.toml index 32e3018b..e0c84a82 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -2,11 +2,11 @@ potential_name = "SchNet" [potential.core_parameter] -number_of_radial_basis_functions = 20 +number_of_radial_basis_functions = 32 maximum_interaction_radius = "5.0 angstrom" -number_of_interaction_modules = 3 -number_of_filters = 32 -shared_interactions = false +number_of_interaction_modules = 5 +number_of_filters = 64 +shared_interactions = true [potential.core_parameter.activation_function_parameter] activation_function_name = "ShiftedSoftplus" @@ -14,7 +14,7 @@ activation_function_name = "ShiftedSoftplus" [potential.core_parameter.featurization] properties_to_featurize = ['atomic_number'] maximum_atomic_number = 101 -number_of_per_atom_features = 32 +number_of_per_atom_features = 128 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] @@ -31,10 +31,10 @@ num_workers = 4 pin_memory = true [training] -number_of_epochs = 4 +number_of_epochs = 1000 remove_self_energies = true -batch_size = 128 -lr = 1e-3 +batch_size = 16 +lr = 0.5e-3 monitor = "val/per_molecule_energy/rmse" [training.experiment_logger] @@ -56,11 +56,10 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = ['per_molecule_energy', 'per_atom_force'] # use +loss_property = ['per_molecule_energy'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.999 #NOTE: reciprocal units -per_atom_force = 0.001 +per_molecule_energy = 1.0 #NOTE: reciprocal units [training.early_stopping] From ba118ebbc83060d9b48773035cc958ed97c3e905 Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 16:28:02 +0200 Subject: [PATCH 04/33] update optimizer, fix bug --- modelforge/train/parameters.py | 2 +- modelforge/train/training.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 8fe6c2c9..1ea820d7 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -270,7 +270,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": stochastic_weight_averaging: Optional[StochasticWeightAveraging] = None experiment_logger: ExperimentLogger verbose: bool = False - optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam + optimizer: Type[torch.optim.Optimizer] = torch.optim.AdamW ### Runtime Parameters diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 84b19100..32671f1c 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -691,14 +691,7 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: # calculate energy and forces predict_target = self.calculate_predictions(batch, self.potential) self._update_metrics(self.val_error, predict_target) - # calculate the MSE with torch - l1 = torch.nn.functional.l1_loss( - predict_target["per_molecule_energy_predict"], - predict_target["per_molecule_energy_true"], - ) - self.mae_validation_set += l1.item() - self.nr_of_batches += 1 @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -808,10 +801,6 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True ) - mse_loss = self.mse_training_set / self.nr_of_batches - mae_val = self.mae_validation_set / self.nr_of_batches - - a = 7 def configure_optimizers(self): """ From ae7ee910575c889cab2f88ab72e7ab9863bd49cf Mon Sep 
17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 19:00:33 +0200 Subject: [PATCH 05/33] use custom torchmetric to sync accross differnt nodes --- modelforge/train/training.py | 79 ++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 32671f1c..8b9f2549 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -273,7 +273,9 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): else: raise NotImplementedError(f"Loss type {prop} not implemented.") - def forward(self, predict_target: Dict[str, torch.Tensor], batch): + def forward( + self, predict_target: Dict[str, torch.Tensor], batch + ) -> Dict[str, torch.Tensor]: """ Calculates the combined loss for the specified properties. @@ -341,7 +343,7 @@ def create_loss(loss_property: List[str], weight: Dict[str, float]) -> Type[Loss from torch.nn import ModuleDict -def create_error_metrics(loss_properties: List[str]) -> ModuleDict: +def create_error_metrics(loss_properties: List[str], loss: bool = False) -> ModuleDict: """ Creates a ModuleDict of MetricCollections for the given loss properties. @@ -361,14 +363,36 @@ def create_error_metrics(loss_properties: List[str]) -> ModuleDict: ) from torchmetrics import MetricCollection - return ModuleDict( - { - prop: MetricCollection( - [MeanAbsoluteError(), MeanSquaredError(squared=False)] - ) - for prop in loss_properties - } - ) + if loss: + return ModuleDict({prop: MeanLossMetric() for prop in loss_properties}) + else: + return ModuleDict( + { + prop: MetricCollection( + [MeanAbsoluteError(), MeanSquaredError(squared=False)] + ) + for prop in loss_properties + } + ) + + +import torchmetrics + + +class MeanLossMetric(torchmetrics.Metric): + def __init__(self): + super().__init__() + self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") + self.add_state("total_batches", default=torch.tensor(0), dist_reduce_fx="sum") + + def update(self, loss: torch.Tensor, batch_size: int): + # Accumulate the loss sum and batch count + self.sum_loss += loss.sum() + self.total_batches += batch_size + + def compute(self): + # Compute the mean loss + return self.sum_loss / self.total_batches from modelforge.train.parameters import RuntimeParameters, TrainingParameters @@ -573,6 +597,10 @@ def __init__( self.train_error = create_error_metrics( training_parameter.loss_parameter.loss_property ) + # Initialize your custom metric + self.train_loss_metric = create_error_metrics( + training_parameter.loss_parameter.loss_property, loss=True + ) def forward(self, batch: "BatchData") -> Dict[str, torch.Tensor]: """ @@ -645,29 +673,18 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # force` predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss (for every entry in predict_target the squared - # error is calculated) + # Calculate the loss loss_dict = self.loss(predict_target, batch) - # Update and log training error (if requested) - if self.log_on_training_step: - self._update_metrics(self.train_error, predict_target) - - # log the loss (this includes the individual contributions that the loss contains) - for key, loss in loss_dict.items(): + # Update the custom metric with the different loss components + for key, value in loss_dict.items(): + self.train_loss_metric[key].update(value, batch.batch_size()) + # Log the metric instead of the mean loss directly self.log( - f"loss/{key}", - 
torch.mean(loss), - on_step=False, - prog_bar=True, - on_epoch=True, - sync_dist=True, - batch_size=1, # batch.batch_size(), + "loss/{key}", self.train_loss_metric[key], on_epoch=True, prog_bar=True ) - loss = torch.mean(loss_dict["total_loss"]) - - return loss + return torch.mean(loss_dict["total_loss"]) @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -692,7 +709,6 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: predict_target = self.calculate_predictions(batch, self.potential) self._update_metrics(self.val_error, predict_target) - @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: """ @@ -798,10 +814,11 @@ def _log_on_epoch(self, log_mode: str = "train"): metric.reset() # log dict, print val metrics to console self.log_dict( - metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True + metrics, + on_epoch=True, + prog_bar=(phase == "val"), ) - def configure_optimizers(self): """ Configures the model's optimizers (and optionally schedulers). From 0d61ccbb35372c4d2e20a8d0559540628a41ef2e Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 11:42:17 +0200 Subject: [PATCH 06/33] make logging consistent and log also loss with torchmetric (necessary for synchronized training) --- modelforge/train/training.py | 87 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 8b9f2549..307346a8 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -2,26 +2,32 @@ This module contains classes and functions for training neural network potentials using PyTorch Lightning. """ -from torch.optim.lr_scheduler import ReduceLROnPlateau -from typing import Any, Union, Dict, Type, Optional, List +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Type, Union + +import lightning.pytorch as pL import torch -from loguru import logger as log -from modelforge.dataset.dataset import BatchData, NNPInput import torchmetrics +from lightning import Trainer +from loguru import logger as log from torch import nn -from abc import ABC, abstractmethod -from modelforge.dataset.dataset import DatasetParameters +from torch.optim.lr_scheduler import ReduceLROnPlateau + +from modelforge.dataset.dataset import ( + BatchData, + DataModule, + DatasetParameters, + NNPInput, +) from modelforge.potential.parameters import ( ANI2xParameters, - PhysNetParameters, - SchNetParameters, PaiNNParameters, + PhysNetParameters, SAKEParameters, + SchNetParameters, TensorNetParameters, ) -from lightning import Trainer -import lightning.pytorch as pL -from modelforge.dataset.dataset import DataModule +from modelforge.train.parameters import RuntimeParameters, TrainingParameters __all__ = [ "Error", @@ -306,7 +312,7 @@ def forward( # add total loss loss = loss + (self.weight[prop] * loss_) # save loss - loss_dict[f"{prop}/mse"] = loss_ + loss_dict[f"{prop}"] = loss_ # add total loss to results dict and return loss_dict["total_loss"] = loss @@ -339,8 +345,8 @@ def create_loss(loss_property: List[str], weight: Dict[str, float]) -> Type[Loss return Loss(loss_property, weight) -from torch.optim import Optimizer from torch.nn import ModuleDict +from torch.optim import Optimizer def create_error_metrics(loss_properties: List[str], loss: bool = False) -> ModuleDict: @@ -351,22 +357,25 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu 
---------- loss_properties : List[str] List of loss properties for which to create the metrics. - + loss : bool, optional + If True, only the loss metric is created, by default False. Returns ------- ModuleDict A dictionary where keys are loss properties and values are MetricCollections. """ - from torchmetrics.regression import ( - MeanAbsoluteError, - MeanSquaredError, - ) from torchmetrics import MetricCollection + from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError + from torchmetrics.aggregation import MeanMetric if loss: - return ModuleDict({prop: MeanLossMetric() for prop in loss_properties}) + metric_dict = ModuleDict( + {prop: MetricCollection([MeanMetric()]) for prop in loss_properties} + ) + metric_dict["total_loss"] = MetricCollection([MeanMetric()]) + return metric_dict else: - return ModuleDict( + metric_dict = ModuleDict( { prop: MetricCollection( [MeanAbsoluteError(), MeanSquaredError(squared=False)] @@ -374,12 +383,13 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu for prop in loss_properties } ) + return metric_dict -import torchmetrics +from torchmetrics import Metric -class MeanLossMetric(torchmetrics.Metric): +class MeanLossMetric(Metric): def __init__(self): super().__init__() self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") @@ -395,9 +405,6 @@ def compute(self): return self.sum_loss / self.total_batches -from modelforge.train.parameters import RuntimeParameters, TrainingParameters - - class CalculateProperties(torch.nn.Module): def __init__(self, requested_properties: List[str]): @@ -597,8 +604,9 @@ def __init__( self.train_error = create_error_metrics( training_parameter.loss_parameter.loss_property ) - # Initialize your custom metric - self.train_loss_metric = create_error_metrics( + + # Initialize loss metric + self.loss_metric = create_error_metrics( training_parameter.loss_parameter.loss_property, loss=True ) @@ -676,13 +684,9 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # Calculate the loss loss_dict = self.loss(predict_target, batch) - # Update the custom metric with the different loss components - for key, value in loss_dict.items(): - self.train_loss_metric[key].update(value, batch.batch_size()) - # Log the metric instead of the mean loss directly - self.log( - "loss/{key}", self.train_loss_metric[key], on_epoch=True, prog_bar=True - ) + # Update the loss metric with the different loss components + for key, metric in loss_dict.items(): + self.loss_metric[key].update(metric, batch.batch_size()) return torch.mean(loss_dict["total_loss"]) @@ -793,6 +797,7 @@ def _log_on_epoch(self, log_mode: str = "train"): errors = [ ("train", self.train_error), ("val", self.val_error), + ("loss", self.loss_metric), ] elif log_mode == "test": errors = [ @@ -809,7 +814,7 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics = {} for property, metrics_dict in error_dict.items(): for name, metric in metrics_dict.items(): - name = f"{phase}/{property}/{conv[name]}" + name = f"{phase}/{property}/{conv.get(name, name)}" metrics[name] = metric.compute() metric.reset() # log dict, print val metrics to console @@ -974,8 +979,8 @@ def setup_datamodule(self) -> DataModule: DataModule Configured DataModule instance. 
""" - from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES from modelforge.dataset.dataset import DataModule + from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES dm = DataModule( name=self.dataset_parameter.dataset_name, @@ -1074,8 +1079,8 @@ def setup_callbacks(self) -> List[Any]: List of configured callbacks. """ from lightning.pytorch.callbacks import ( - ModelCheckpoint, EarlyStopping, + ModelCheckpoint, StochasticWeightAveraging, ) @@ -1309,9 +1314,9 @@ def read_config( runtime_config_dict[key] = value # Load and instantiate the data classes with the merged configuration - from modelforge.potential import _Implemented_NNP_Parameters from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters + from modelforge.potential import _Implemented_NNP_Parameters + from modelforge.train.parameters import RuntimeParameters, TrainingParameters potential_name = potential_config_dict["potential_name"] PotentialParameters = ( @@ -1409,9 +1414,7 @@ def read_config_and_train( log_every_n_steps=log_every_n_steps, simulation_environment=simulation_environment, ) - from modelforge.potential.models import ( - NeuralNetworkPotentialFactory, - ) + from modelforge.potential.models import NeuralNetworkPotentialFactory model = NeuralNetworkPotentialFactory.generate_potential( use="training", From a62029c7ca3e40aa2c65420fe2d87a9c0cdf2f84 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 11:48:50 +0200 Subject: [PATCH 07/33] skip if ANI and SPICE --- modelforge/tests/test_training.py | 4 ++++ modelforge/train/training.py | 1 + 2 files changed, 5 insertions(+) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index b50f6454..f39fb79c 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -85,7 +85,11 @@ def test_train_with_lightning(potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
""" + # train potential + # SKIP if potential is ANI and dataset is SPICE2 + if potential_name == "ANI" and dataset_name == "SPICE2": + pytest.skip("ANI potential is not compatible with SPICE2 dataset") get_trainer(potential_name, dataset_name).train_potential().save_checkpoint( "test.chp" ) # save checkpoint diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 307346a8..7d44b95f 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -790,6 +790,7 @@ def _log_on_epoch(self, log_mode: str = "train"): conv = { "MeanAbsoluteError": "mae", "MeanSquaredError": "rmse", + "MeanMetric": "mse", # NOTE: MeanMetric is the MSE since we accumulate the squared error } # NOTE: MeanSquaredError(squared=False) is RMSE # Log all accumulated metrics for train and val phases From ac9e853afd1111e4d4781af814599e0dda70e485 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 13:19:05 +0200 Subject: [PATCH 08/33] include force training test --- .gitignore | 4 ++ .../training_defaults/default_with_force.toml | 54 +++++++++++++++++++ modelforge/tests/test_training.py | 25 +++++---- modelforge/train/training.py | 4 +- scripts/config.toml | 12 ++--- 5 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 modelforge/tests/data/training_defaults/default_with_force.toml diff --git a/.gitignore b/.gitignore index 2ac2cee4..3c452056 100644 --- a/.gitignore +++ b/.gitignore @@ -190,3 +190,7 @@ lightning_logs/ *.hdf5 */tb_logs/* .vscode/settings.json +logs/* +cache/* +*/logs/* +*/cache/* diff --git a/modelforge/tests/data/training_defaults/default_with_force.toml b/modelforge/tests/data/training_defaults/default_with_force.toml new file mode 100644 index 00000000..19c0d3b7 --- /dev/null +++ b/modelforge/tests/data/training_defaults/default_with_force.toml @@ -0,0 +1,54 @@ +[training] +number_of_epochs = 2 +remove_self_energies = true +batch_size = 128 +lr = 1e-3 +monitor = "val/per_molecule_energy/rmse" + + +[training.experiment_logger] +logger_name = "tensorboard" # this will set which logger to use + +# configuration for both loggers can be defined simultaneously, the logger_name variable defines which logger to use +[training.experiment_logger.tensorboard_configuration] +save_dir = "logs" + +[training.experiment_logger.wandb_configuration] +save_dir = "logs" +project = "training_potentials" +group = "exp00" +log_model = true +job_type = "testing" +tags = ["v_0.1.0"] +notes = "testing training" + +[training.lr_scheduler] +frequency = 1 +mode = "min" +factor = 0.1 +patience = 10 +cooldown = 5 +min_lr = 1e-8 +threshold = 0.1 +threshold_mode = "abs" +monitor = "val/per_molecule_energy/rmse" +interval = "epoch" + +[training.loss_parameter] +loss_property = ['per_molecule_energy', 'per_atom_force'] # use + +[training.loss_parameter.weight] +per_molecule_energy = 0.999 #NOTE: reciprocal units +per_atom_force = 0.001 + + +[training.early_stopping] +verbose = true +monitor = "val/per_molecule_energy/rmse" +min_delta = 0.001 +patience = 50 + +[training.splitting_strategy] +name = "random_record_splitting_strategy" +data_split = [0.8, 0.1, 0.1] +seed = 42 diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index f39fb79c..bec6e422 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -10,7 +10,9 @@ from modelforge.potential import NeuralNetworkPotentialFactory, _Implemented_NNPs -def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): +def 
load_configs_into_pydantic_models( + potential_name: str, dataset_name: str, training_toml: str +): from importlib import resources import toml @@ -26,7 +28,7 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): resources.files(potential_defaults) / f"{potential_name.lower()}.toml" ) dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" - training_path = resources.files(training_defaults) / "default.toml" + training_path = resources.files(training_defaults) / f"{training_toml}.toml" runtime_path = resources.files(runtime_defaults) / "runtime.toml" training_config_dict = toml.load(training_path) @@ -58,8 +60,10 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): } -def get_trainer(potential_name: str, dataset_name: str): - config = load_configs_into_pydantic_models(potential_name, dataset_name) +def get_trainer(potential_name: str, dataset_name: str, training_toml: str): + config = load_configs_into_pydantic_models( + potential_name, dataset_name, training_toml + ) # Extract parameters potential_parameter = config["potential"] @@ -81,21 +85,24 @@ def get_trainer(potential_name: str, dataset_name: str): "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) @pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) -def test_train_with_lightning(potential_name, dataset_name): +@pytest.mark.parametrize("training", ["with_force", "without_force"]) +def test_train_with_lightning(training, potential_name, dataset_name): """ Test that we can train, save and load checkpoints. """ # train potential - + training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 - if potential_name == "ANI" and dataset_name == "SPICE2": + if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - get_trainer(potential_name, dataset_name).train_potential().save_checkpoint( + get_trainer( + potential_name, dataset_name, training_toml + ).train_potential().save_checkpoint( "test.chp" ) # save checkpoint # continue training from checkpoint - get_trainer(potential_name, dataset_name).train_potential() + get_trainer(potential_name, dataset_name, training_toml).train_potential() def test_train_from_single_toml_file(): diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 7d44b95f..f3ec2c33 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -417,7 +417,7 @@ def __init__(self, requested_properties: List[str]): super().__init__() self.requested_properties = requested_properties self.include_force = False - if "force" in self.requested_properties: + if "per_atom_force" in self.requested_properties: self.include_force = True def _get_forces( @@ -822,7 +822,7 @@ def _log_on_epoch(self, log_mode: str = "train"): self.log_dict( metrics, on_epoch=True, - prog_bar=(phase == "val"), + prog_bar=(phase == "val" or phase == "loss"), ) def configure_optimizers(self): diff --git a/scripts/config.toml b/scripts/config.toml index e0c84a82..037437c6 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -25,7 +25,7 @@ keep_per_atom_property = true calculate_molecular_self_energy = true [dataset] -dataset_name = "QM9" +dataset_name = "PHALKETHOH" version_select = "nc_1000_v0" num_workers = 4 pin_memory = true @@ -56,11 +56,11 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = 
['per_molecule_energy'] # use +loss_property = ['per_molecule_energy', 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 1.0 #NOTE: reciprocal units - +per_molecule_energy = 0.009 #NOTE: reciprocal units +per_atom_force = 0.001 [training.early_stopping] verbose = true @@ -74,12 +74,12 @@ data_split = [0.8, 0.1, 0.1] seed = 42 [runtime] -save_dir = "lightning_logs" +save_dir = "test_setup" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" accelerator = "cpu" number_of_nodes = 1 -devices = 1 #[0,1,2,3] +devices = 1 #[0,1,2,3] checkpoint_path = "None" simulation_environment = "PyTorch" log_every_n_steps = 1 From 6ae7ca051e09f05aa5c9a7bab5fd4ba6869fa6a9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 17:40:02 +0200 Subject: [PATCH 09/33] still issues with mutliple GPUs --- modelforge/train/training.py | 9 +-------- scripts/config.toml | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index f3ec2c33..1ab1d3c4 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -812,18 +812,11 @@ def _log_on_epoch(self, log_mode: str = "train"): if phase == "train" and not self.log_on_training_step: continue - metrics = {} for property, metrics_dict in error_dict.items(): for name, metric in metrics_dict.items(): name = f"{phase}/{property}/{conv.get(name, name)}" - metrics[name] = metric.compute() + self.log(name, metric.compute(), prog_bar=True) metric.reset() - # log dict, print val metrics to console - self.log_dict( - metrics, - on_epoch=True, - prog_bar=(phase == "val" or phase == "loss"), - ) def configure_optimizers(self): """ diff --git a/scripts/config.toml b/scripts/config.toml index 037437c6..d6e92a10 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -1,28 +1,26 @@ [potential] -potential_name = "SchNet" +potential_name = "ANI2x" [potential.core_parameter] -number_of_radial_basis_functions = 32 -maximum_interaction_radius = "5.0 angstrom" -number_of_interaction_modules = 5 -number_of_filters = 64 -shared_interactions = true +angle_sections = 4 +maximum_interaction_radius = "5.1 angstrom" +minimum_interaction_radius = "0.8 angstrom" +number_of_radial_basis_functions = 16 +maximum_interaction_radius_for_angular_features = "3.5 angstrom" +minimum_interaction_radius_for_angular_features = "0.8 angstrom" +angular_dist_divisions = 8 [potential.core_parameter.activation_function_parameter] -activation_function_name = "ShiftedSoftplus" +activation_function_name = "CeLU" # for the original ANI behavior please stick with CeLu since the alpha parameter is currently hard coded and might lead to different behavior when another activation function is used. 
-[potential.core_parameter.featurization] -properties_to_featurize = ['atomic_number'] -maximum_atomic_number = 101 -number_of_per_atom_features = 128 +[potential.core_parameter.activation_function_parameter.activation_function_arguments] +alpha = 0.1 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] normalize = true from_atom_to_molecule_reduction = true keep_per_atom_property = true -[potential.postprocessing_parameter.general_postprocessing_operation] -calculate_molecular_self_energy = true [dataset] dataset_name = "PHALKETHOH" From ebba646b2e8696cec43c84bb248d96ec2b61d01b Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 19:10:59 +0200 Subject: [PATCH 10/33] make loss tensor's stride contiguous --- modelforge/tests/test_training.py | 4 +++- modelforge/train/training.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index bec6e422..545b31c0 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -93,7 +93,7 @@ def test_train_with_lightning(training, potential_name, dataset_name): # train potential training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 - if "ANI" in potential_name and dataset_name == "SPICE2": + if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") get_trainer( potential_name, dataset_name, training_toml @@ -104,6 +104,8 @@ def test_train_with_lightning(training, potential_name, dataset_name): # continue training from checkpoint get_trainer(potential_name, dataset_name, training_toml).train_potential() + assert False + def test_train_from_single_toml_file(): from importlib import resources diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 1ab1d3c4..a9dc3bf3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -574,6 +574,19 @@ def __init__( potential_seed=potential_seed, ) + # def check_strides(module, grad_input, grad_output): + # print(f"Layer: {module.__class__.__name__}") + + # for i, grad in enumerate(grad_input): + # if grad is not None: + # print( + # f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" + # ) + + # # Register the hook + # for module in self.potential.modules(): + # module.register_backward_hook(check_strides) + self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property ) @@ -688,7 +701,8 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: for key, metric in loss_dict.items(): self.loss_metric[key].update(metric, batch.batch_size()) - return torch.mean(loss_dict["total_loss"]) + loss = torch.mean(loss_dict["total_loss"]).contiguous() + return loss @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: From a315724f28d9a642f3a938af00ca18e28f3303ed Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 19:16:50 +0200 Subject: [PATCH 11/33] sync log --- modelforge/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index a9dc3bf3..e88a1768 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -829,7 +829,7 @@ def _log_on_epoch(self, log_mode: str = "train"): for property, metrics_dict in error_dict.items(): for name, metric in 
metrics_dict.items(): name = f"{phase}/{property}/{conv.get(name, name)}" - self.log(name, metric.compute(), prog_bar=True) + self.log(name, metric.compute(), prog_bar=True, sync_dist=True) metric.reset() def configure_optimizers(self): From c5b87be391bdbbfaac31ceeea37078b03af04fab Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 08:14:06 +0200 Subject: [PATCH 12/33] stride is an issue in the backward pass through the forces, this m might fix it --- modelforge/train/training.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index e88a1768..e2d814f3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -463,7 +463,7 @@ def _get_forces( return { "per_atom_force_true": per_atom_force_true, - "per_atom_force_predict": per_atom_force_predict, + "per_atom_force_predict": per_atom_force_predict.contiguous(), } def _get_energies( @@ -1128,6 +1128,10 @@ def setup_trainer(self) -> Trainer: """ from lightning import Trainer + # if devices is a list + if isinstance(self.runtime_parameter.devices, list): + strategy = "ddp" + trainer = Trainer( max_epochs=self.training_parameter.number_of_epochs, num_nodes=self.runtime_parameter.number_of_nodes, From 8da97be4ac322ee91270425120e3baa8be0a11c3 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 08:45:34 +0200 Subject: [PATCH 13/33] still stride issue --- modelforge/tests/test_training.py | 6 +++++- modelforge/train/training.py | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 545b31c0..891d9169 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,6 +80,8 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): ) + + @pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() @@ -90,11 +92,13 @@ def test_train_with_lightning(training, potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
""" - # train potential + # get correct training toml training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") + + # train potential get_trainer( potential_name, dataset_name, training_toml ).train_potential().save_checkpoint( diff --git a/modelforge/train/training.py b/modelforge/train/training.py index e2d814f3..b79d7b84 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -169,7 +169,7 @@ def forward( 0, batch.nnp_input.atomic_subsystem_indices.long().unsqueeze(1), per_atom_squared_error, - ) + ).contiguous() # divide by number of atoms per_molecule_square_error_scaled = self.scale_by_number_of_atoms( per_molecule_squared_error, @@ -574,18 +574,18 @@ def __init__( potential_seed=potential_seed, ) - # def check_strides(module, grad_input, grad_output): - # print(f"Layer: {module.__class__.__name__}") + def check_strides(module, grad_input, grad_output): + print(f"Layer: {module.__class__.__name__}") - # for i, grad in enumerate(grad_input): - # if grad is not None: - # print( - # f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" - # ) + for i, grad in enumerate(grad_input): + if grad is not None: + print( + f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" + ) - # # Register the hook - # for module in self.potential.modules(): - # module.register_backward_hook(check_strides) + # Register the hook + for module in self.potential.modules(): + module.register_backward_hook(check_strides) self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property From e4ce8d66eda970b472a059996f6370782da6eee4 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 14:59:32 +0200 Subject: [PATCH 14/33] add stride hook for backward --- modelforge/train/training.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index b79d7b84..0a870f7d 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -175,7 +175,7 @@ def forward( per_molecule_squared_error, batch.metadata.atomic_subsystem_counts, prefactor=per_atom_prediction.shape[-1], - ) + ).contiguous() return per_molecule_square_error_scaled @@ -542,6 +542,7 @@ def __init__( dataset_statistic: Dict[str, float], training_parameter: TrainingParameters, potential_seed: Optional[int] = None, + debugging: bool = True, ): """ Initializes the TrainingAdapter with the specified model and training configuration. 
@@ -576,16 +577,21 @@ def __init__( def check_strides(module, grad_input, grad_output): print(f"Layer: {module.__class__.__name__}") - for i, grad in enumerate(grad_input): if grad is not None: print( f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" ) + for i, grad in enumerate(grad_output): + if grad is not None: + print( + f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" + ) - # Register the hook - for module in self.potential.modules(): - module.register_backward_hook(check_strides) + # Register the full backward hook + if debugging is True: + for module in self.potential.modules(): + module.register_full_backward_hook(check_strides) self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property From 3f087ab2e64a5410e376669156a842a708a838f6 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 15:25:14 +0200 Subject: [PATCH 15/33] dicst as module output for stride --- modelforge/train/training.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 0a870f7d..caeaac59 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -582,11 +582,22 @@ def check_strides(module, grad_input, grad_output): print( f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" ) - for i, grad in enumerate(grad_output): - if grad is not None: - print( - f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" - ) + # Handle grad_output + if isinstance(grad_output, tuple) and isinstance(grad_output[0], dict): + # If the output is a dict wrapped in a tuple, extract the dict + grad_output = grad_output[0] + if isinstance(grad_output, dict): + for key, grad in grad_output.items(): + if grad is not None: + print( + f"Grad output [{key}]: size {grad.size()}, strides {grad.stride()}" + ) + else: + for i, grad in enumerate(grad_output): + if grad is not None: + print( + f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" + ) # Register the full backward hook if debugging is True: From 507209868a14c7b65de5084d96ecf22b5d21d29f Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 19:26:26 +0200 Subject: [PATCH 16/33] avoid saving grad in val/test routine --- modelforge/tests/test_training.py | 19 ++++++++----- modelforge/train/training.py | 45 +++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 891d9169..4acd629b 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,8 +80,6 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): ) - - @pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() @@ -108,8 +106,6 @@ def test_train_with_lightning(training, potential_name, dataset_name): # continue training from checkpoint get_trainer(potential_name, dataset_name, training_toml).train_potential() - assert False - def test_train_from_single_toml_file(): from importlib import resources @@ -188,8 +184,17 @@ def test_loss(single_batch_with_batchsize): assert loss is not None # get trainer - trainer = get_trainer("schnet", "QM9") - prediction = trainer.model.calculate_predictions(batch, trainer.model.potential) + trainer = get_trainer("schnet", "QM9", "default_with_force") + prediction = 
trainer.model.calculate_predictions( + batch, trainer.model.potential, train_mode=True + ) # train_mode=True is required for gradients in force prediction + + assert prediction["per_molecule_energy_predict"].size( + dim=0 + ) == batch.metadata.E.size(dim=0) + assert prediction["per_molecule_force_predict"].size( + dim=0 + ) == batch.metadata.E.size(dim=0) # pass prediction through loss module loss_output = loss(prediction, batch) @@ -209,7 +214,7 @@ def test_loss(single_batch_with_batchsize): ).pow(2) ) ) - assert torch.allclose(loss_output["per_molecule_energy/mse"], E_loss) + assert torch.allclose(loss_output["per_molecule_energy"], E_loss) # --------------------------------------------- # # now calculate F_loss diff --git a/modelforge/train/training.py b/modelforge/train/training.py index caeaac59..7243e819 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -421,7 +421,7 @@ def __init__(self, requested_properties: List[str]): self.include_force = True def _get_forces( - self, batch: "BatchData", energies: Dict[str, torch.Tensor] + self, batch: "BatchData", energies: Dict[str, torch.Tensor], train_mode: bool ) -> Dict[str, torch.Tensor]: """ Computes the forces from a given batch using the model. @@ -454,11 +454,17 @@ def _get_forces( # Compute the gradient (forces) from the predicted energies grad = torch.autograd.grad( - per_molecule_energy_predict.sum(), + per_molecule_energy_predict, nnp_input.positions, - create_graph=True, - retain_graph=True, + grad_outputs=torch.ones_like(per_molecule_energy_predict), + create_graph=train_mode, + retain_graph=train_mode, + allow_unused=True, )[0] + + if grad is None: + raise RuntimeWarning("Force calculation did not return a gradient") + per_atom_force_predict = -1 * grad # Forces are the negative gradient of energy return { @@ -501,7 +507,7 @@ def _get_energies( } def forward( - self, batch: "BatchData", model: Type[torch.nn.Module] + self, batch: "BatchData", model: Type[torch.nn.Module], train_mode: bool = False ) -> Dict[str, torch.Tensor]: """ Computes the energies and forces from a given batch using the model. @@ -520,7 +526,7 @@ def forward( """ energies = self._get_energies(batch, model) if self.include_force: - forces = self._get_forces(batch, energies) + forces = self._get_forces(batch, energies, train_mode) else: forces = {} return {**energies, **forces} @@ -542,7 +548,6 @@ def __init__( dataset_statistic: Dict[str, float], training_parameter: TrainingParameters, potential_seed: Optional[int] = None, - debugging: bool = True, ): """ Initializes the TrainingAdapter with the specified model and training configuration. 
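
For reference, the force-from-energy pattern used in _get_forces above can be exercised on its own. A minimal sketch follows, with a toy quadratic potential standing in for the neural network and illustrative shapes:

    import torch

    positions = torch.randn(5, 3, requires_grad=True)

    # toy quadratic "potential" standing in for the neural network energy model
    per_atom_energy = (positions ** 2).sum(dim=1)              # shape (5,)
    per_molecule_energy = per_atom_energy.sum().unsqueeze(0)   # shape (1,)

    train_mode = True
    grad = torch.autograd.grad(
        per_molecule_energy,
        positions,
        grad_outputs=torch.ones_like(per_molecule_energy),
        create_graph=train_mode,   # keep the graph so a force loss can backpropagate
        retain_graph=train_mode,
        allow_unused=True,
    )[0]
    per_atom_force = -1 * grad     # forces are the negative gradient, shape (5, 3)

Passing create_graph/retain_graph only in training mode avoids building and storing the second-order graph during validation and testing, which is the point of this commit.
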
@@ -600,10 +605,14 @@ def check_strides(module, grad_input, grad_output): ) # Register the full backward hook - if debugging is True: + if training_parameter.verbose is True: for module in self.potential.modules(): module.register_full_backward_hook(check_strides) + self.include_force = False + if "per_atom_force" in training_parameter.loss_parameter.loss_property: + self.include_force = True + self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property ) @@ -709,7 +718,9 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # calculate energy and forces, Note that `predict_target` is a # dictionary containing the predicted and true values for energy and # force` - predict_target = self.calculate_predictions(batch, self.potential) + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) # Calculate the loss loss_dict = self.loss(predict_target, batch) @@ -721,7 +732,6 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: loss = torch.mean(loss_dict["total_loss"]).contiguous() return loss - @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: """ Validation step to compute the RMSE/MAE across epochs. @@ -740,11 +750,15 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) - # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + with torch.inference_mode(False): + + # calculate energy and forces + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) + self._update_metrics(self.val_error, predict_target) - @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: """ Test step to compute the RMSE loss for a given batch. @@ -767,7 +781,10 @@ def test_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + with torch.inference_mode(False): + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) # Update and log metrics self._update_metrics(self.test_error, predict_target) From 5895acad99958093cdc8af657899e68e6b17f6c9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 19:35:10 +0200 Subject: [PATCH 17/33] only linting changes --- modelforge/dataset/dataset.py | 4 ++-- modelforge/tests/test_dataset.py | 4 ++-- modelforge/tests/test_models.py | 3 +-- modelforge/train/training.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 440c867b..8c78ab32 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -237,10 +237,10 @@ def to( self.metadata = self.metadata.to(device=device, dtype=dtype) return self - def batch_size(self): return self.metadata.E.size(dim=0) - + + class TorchDataset(torch.utils.data.Dataset[BatchData]): """ Wraps a numpy dataset to make it compatible with PyTorch DataLoader. 
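
Stepping back to the validation and test changes in the previous patch: Lightning runs evaluation under torch.inference_mode, so autograd has to be re-enabled locally for the force term. A minimal sketch of that pattern, with a quadratic energy standing in for the potential:

    import torch

    positions = torch.randn(10, 3)             # stand-in for batch.nnp_input.positions

    with torch.inference_mode():                # what the evaluation loop uses by default
        with torch.inference_mode(False):       # locally re-enable autograd
            pos = positions.clone().requires_grad_(True)
            energy = (pos ** 2).sum()           # stand-in for the potential's energy
            forces = -torch.autograd.grad(energy, pos)[0]
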
diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index b591fa30..91f963a3 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -461,7 +461,7 @@ def test_data_item_format_of_datamodule( def test_dataset_neighborlist(potential_name, single_batch_with_batchsize): """Test the neighborlist.""" - batch = single_batch_with_batchsize(64, 'QM9') + batch = single_batch_with_batchsize(64, "QM9") nnp_input = batch.nnp_input # test that the neighborlist is correctly generated @@ -714,7 +714,7 @@ def test_numpy_dataset_assignment(dataset_name): def test_energy_postprocessing(): - # test that the mean and stddev of the dataset + # test that the mean and stddev of the dataset # are correct from modelforge.dataset.dataset import DataModule diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 58602e31..e536b1b0 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -450,7 +450,6 @@ def test_forward_pass( output = model(nnp_input) - # test that we get an energie per molecule assert len(output["per_molecule_energy"]) == nr_of_mols @@ -459,7 +458,7 @@ def test_forward_pass( # This has to be reflected in the atomic energies E_i, which # have to be equal for all hydrogens if "JAX" not in str(type(model)) and dataset_name == "QM9": - # make sure that we are correctly reducing + # make sure that we are correctly reducing ref = torch.zeros_like(output["per_molecule_energy"]).scatter_add_( 0, nnp_input.atomic_subsystem_indices.long(), output["per_atom_energy"] ) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 7243e819..076a121c 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -756,7 +756,7 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: predict_target = self.calculate_predictions( batch, self.potential, self.training ) - + self._update_metrics(self.val_error, predict_target) def test_step(self, batch: "BatchData", batch_idx: int) -> None: From 701122d2793f2cdab40f9191ce8c82fc360dd21b Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 22:59:21 +0200 Subject: [PATCH 18/33] fix loss test --- modelforge/tests/test_training.py | 15 +++++++++------ modelforge/train/training.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 4acd629b..38c2ba24 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -192,9 +192,9 @@ def test_loss(single_batch_with_batchsize): assert prediction["per_molecule_energy_predict"].size( dim=0 ) == batch.metadata.E.size(dim=0) - assert prediction["per_molecule_force_predict"].size( + assert prediction["per_atom_force_predict"].size(dim=0) == batch.metadata.F.size( dim=0 - ) == batch.metadata.E.size(dim=0) + ) # pass prediction through loss module loss_output = loss(prediction, batch) @@ -212,9 +212,12 @@ def test_loss(single_batch_with_batchsize): prediction["per_molecule_energy_predict"] - prediction["per_molecule_energy_true"] ).pow(2) + / batch.metadata.atomic_subsystem_counts.unsqueeze(1) ) ) - assert torch.allclose(loss_output["per_molecule_energy"], E_loss) + # compare to referenc evalue obtained from Loos class + ref = torch.mean(loss_output["per_molecule_energy"]) + assert torch.allclose(ref, E_loss) # --------------------------------------------- # # now calculate F_loss @@ -239,15 +242,15 @@ def 
test_loss(single_batch_with_batchsize): ) per_atom_force_mse = torch.mean(per_molecule_squared_error) - assert torch.allclose(loss_output["per_atom_force/mse"], per_atom_force_mse) + assert torch.allclose(torch.mean(loss_output["per_atom_force"]), per_atom_force_mse) # --------------------------------------------- # # let's double check that the loss is calculated correctly # calculate the total loss assert torch.allclose( - loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy/mse"] - + loss_weights["per_atom_force"] * loss_output["per_atom_force/mse"], + loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy"] + + loss_weights["per_atom_force"] * loss_output["per_atom_force"], loss_output["total_loss"].to(torch.float32), ) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 076a121c..c9d36ec9 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -83,7 +83,8 @@ def calculate_squared_error( torch.Tensor The calculated error. """ - error = (predicted_tensor - reference_tensor).pow(2).sum(dim=1, keepdim=True) + squared_diff = (predicted_tensor - reference_tensor).pow(2) + error = squared_diff.sum(dim=1, keepdim=True) return error @staticmethod From 42dc20a9da92786ddc576d459d0794dd30a8a20c Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 26 Aug 2024 07:04:22 +0200 Subject: [PATCH 19/33] fix tests --- modelforge/tests/test_parameter_models.py | 7 +++++-- modelforge/tests/test_sake.py | 2 +- modelforge/tests/test_training.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modelforge/tests/test_parameter_models.py b/modelforge/tests/test_parameter_models.py index ce97537d..2c26e2cd 100644 --- a/modelforge/tests/test_parameter_models.py +++ b/modelforge/tests/test_parameter_models.py @@ -142,6 +142,9 @@ def test_training_parameter_model(): with pytest.raises(ValidationError): training_parameters.splitting_strategy.dataset_split = [0.7, 0.1, 0.1, 0.1] - # this will throw an error because the datafile has 2 entries for the loss_property dictionary + # this will throw an error because the datafile has 1 entries for the loss_property dictionary with pytest.raises(ValidationError): - training_parameters.loss_parameter.loss_property = ["per_molecule_energy"] + training_parameters.loss_parameter.loss_property = [ + "per_molecule_energy", + "per_atom_force", + ] diff --git a/modelforge/tests/test_sake.py b/modelforge/tests/test_sake.py index b3f36ee1..994b7112 100644 --- a/modelforge/tests/test_sake.py +++ b/modelforge/tests/test_sake.py @@ -624,7 +624,7 @@ def test_model_invariance(single_batch_with_batchsize): ], ) # get methane input - batch = single_batch_with_batchsize(batch_size=1) + batch = single_batch_with_batchsize(batch_size=1, dataset_name="QM9") methane = batch.nnp_input rotation_matrix = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 38c2ba24..a442f980 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -147,7 +147,7 @@ def test_error_calculation(single_batch_with_batchsize): 1 ) # FIXME : fi reference_E_error = torch.mean(scale_squared_error) - assert torch.allclose(E_error, reference_E_error) + assert torch.allclose(torch.mean(E_error), reference_E_error) # test error for property with shape (nr_of_atoms, 3) error = FromPerAtomToPerMoleculeSquaredError() @@ -170,7 +170,7 @@ def test_error_calculation(single_batch_with_batchsize): 
reference_F_error = torch.mean( per_mol_error / (3 * data.metadata.atomic_subsystem_counts.unsqueeze(1)) ) - assert torch.allclose(F_error, reference_F_error) + assert torch.allclose(torch.mean(F_error), reference_F_error) def test_loss(single_batch_with_batchsize): From 2c0b6436756c7995ce7a4c5728060e4c7acd2aa0 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:42:59 +0200 Subject: [PATCH 20/33] decorator for method locking --- modelforge/dataset/dataset.py | 43 +++++++++++++++-------- modelforge/utils/__init__.py | 1 + modelforge/utils/misc.py | 64 +++++++++++++++++++++++++++++++++-- 3 files changed, 91 insertions(+), 17 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 8c78ab32..4ae48bf9 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union, NamedTuple +from typing import TYPE_CHECKING, Dict, List, Literal, NamedTuple, Optional, Union import numpy as np import pytorch_lightning as pl @@ -19,11 +19,10 @@ if TYPE_CHECKING: from modelforge.potential.processing import AtomicSelfEnergies - -from pydantic import BaseModel, field_validator, ConfigDict, Field - from enum import Enum +from pydantic import BaseModel, ConfigDict, Field, field_validator + class CaseInsensitiveEnum(str, Enum): @classmethod @@ -208,8 +207,9 @@ def as_jax_namedtuple(self) -> NamedTuple: """Export the dataclass fields and values as a named tuple. Convert pytorch tensors to jax arrays.""" - from dataclasses import dataclass, fields import collections + from dataclasses import dataclass, fields + from modelforge.utils.io import import_ convert_to_jax = import_("pytorch2jax").pytorch2jax.convert_to_jax @@ -1042,7 +1042,6 @@ def create_dataset( return TorchDataset(data.numpy_data, data._property_names) -from torch import nn from openff.units import unit @@ -1101,6 +1100,9 @@ def __init__( regenerate_cache : bool, defaults to False Whether to regenerate the cache. """ + from modelforge.potential.models import Pairlist + import os + super().__init__() self.name = name @@ -1117,28 +1119,41 @@ def __init__( self.train_dataset = None self.test_dataset = None self.val_dataset = None - import os # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) self.regenerate_cache = regenerate_cache - from modelforge.potential.models import Pairlist self.pairlist = Pairlist() self.dataset_statistic_filename = ( f"{self.local_cache_dir}/{self.name}_dataset_statistic.toml" ) + self.cache_processed_dataset_filename = f"{self.local_cache_dir}/{self.name}_{self.version_select}processed_dataset.pt" def prepare_data( self, ) -> None: """ - Prepares the dataset for use. This method is responsible for the initial processing of the data such as calculating self energies, atomic energy statistics, and splitting. It is executed only once per node. + Prepares the dataset for use. This method is responsible for the initial + processing of the data such as calculating self energies, atomic energy + statistics, and splitting. It is executed only once per node. """ + # if the dataset has already been processed, skip this step + if ( + os.path.exists(self.cache_processed_dataset_filename) + and not self.regenerate_cache + ): + if not os.path.exists(self.dataset_statistic_filename): + raise FileNotFoundError( + f"Dataset statistics file {self.dataset_statistic_filename} not found. 
Please regenerate the cache." + ) + log.info('Processed dataset already exists. Skipping "prepare_data" step.') + return None + + # if the dataset is not already processed, process it from modelforge.dataset import _ImplementedDatasets - import toml - dataset_class = _ImplementedDatasets.get_dataset_class(self.name) + dataset_class = _ImplementedDatasets.get_dataset_class(str(self.name)) dataset = dataset_class( force_download=self.force_download, version_select=self.version_select, @@ -1284,11 +1299,11 @@ def _calculate_atomic_self_energies( def _cache_dataset(self, torch_dataset): """Cache the dataset and its statistics using PyTorch's serialization.""" - torch.save(torch_dataset, "torch_dataset.pt") - # sleep for 1 second to make sure that the dataset was written to disk + torch.save(torch_dataset, self.cache_processed_dataset_filename) + # sleep for 5 second to make sure that the dataset was written to disk import time - time.sleep(1) + time.sleep(5) def setup(self, stage: Optional[str] = None) -> None: """Sets up datasets for the train, validation, and test stages based on the stage argument.""" diff --git a/modelforge/utils/__init__.py b/modelforge/utils/__init__.py index 4e57d2f1..605aea59 100644 --- a/modelforge/utils/__init__.py +++ b/modelforge/utils/__init__.py @@ -1,3 +1,4 @@ """Module of general modelforge utilities.""" from .prop import SpeciesEnergies, PropertyNames +from .misc import lock_with_attribute diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index f56b2d8d..2a0e7280 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -2,15 +2,18 @@ Module of miscellaneous utilities. """ -from typing import Literal +from typing import Literal, TYPE_CHECKING import torch from loguru import logger -from modelforge.dataset.dataset import DataModule + +# import DataModule for typing hint +if TYPE_CHECKING: + from modelforge.dataset.dataset import DataModule def visualize_model( - dm: DataModule, + dm: 'DataModule', potential_name: Literal["ANI2x", "PhysNet", "SchNet", "PaiNN", "SAKE"], ): # visualize the compute graph @@ -314,3 +317,58 @@ def __exit__(self, *args): # fcntl.flock(self._file_handle.fileno(), fcntl.LOCK_UN) unlock_file(self._file_handle) self._file_handle.close() + + +import os +from functools import wraps + + +def lock_with_attribute(attribute_name): + """ + Decorator for locking a method using a lock file path stored in an instance + attribute. The attribute is accessed on the instance (`self`) at runtime. + + Parameters + ---------- + attribute_name : str + The name of the instance attribute that contains the lock file path. 
+ + Examples + -------- + >>> from modelforge.utils.misc import lock_with_attribute + >>> + >>> class MyClass: + >>> def __init__(self, lock_file): + >>> self.method_lock = lock_file + >>> + >>> @lock_with_attribute('method_lock') + >>> def critical_section(self): + >>> print("Executing critical section") + """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + # Retrieve the instance (`self`) + instance = args[0] + # Get the lock file path from the specified attribute + lock_file_path = getattr(instance, attribute_name) + with open(lock_file_path, 'w') as lock_file: + # Lock the file + lock_file(lock_file) + + try: + # Execute the wrapped function + result = func(*args, **kwargs) + finally: + # Unlock the file + unlock_file(lock_file) + + # Optionally, remove the lock file + os.remove(lock_file_path) + + return result + + return wrapper + + return decorator From 992b7bc4b10169db10267f5cb0933ae006055255 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:43:09 +0200 Subject: [PATCH 21/33] lock --- modelforge/dataset/dataset.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 4ae48bf9..433138de 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -15,13 +15,14 @@ from modelforge.dataset.utils import RandomRecordSplittingStrategy, SplittingStrategy from modelforge.utils.prop import PropertyNames +from modelforge.utils.misc import lock_with_attribute if TYPE_CHECKING: from modelforge.potential.processing import AtomicSelfEnergies from enum import Enum -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field class CaseInsensitiveEnum(str, Enum): @@ -1045,6 +1046,7 @@ def create_dataset( from openff.units import unit + class DataModule(pl.LightningDataModule): def __init__( self, @@ -1128,8 +1130,12 @@ def __init__( self.dataset_statistic_filename = ( f"{self.local_cache_dir}/{self.name}_dataset_statistic.toml" ) - self.cache_processed_dataset_filename = f"{self.local_cache_dir}/{self.name}_{self.version_select}processed_dataset.pt" + self.cache_processed_dataset_filename = ( + f"{self.local_cache_dir}/{self.name}_{self.version_select}_processed.pt" + ) + self.lock_file = f"{self.cache_processed_dataset_filename}.lockfile" + @lock_with_attribute("lock_file") def prepare_data( self, ) -> None: @@ -1138,6 +1144,8 @@ def prepare_data( processing of the data such as calculating self energies, atomic energy statistics, and splitting. It is executed only once per node. 
""" + # check if there is a filelock present, if so, wait until it is removed + # if the dataset has already been processed, skip this step if ( os.path.exists(self.cache_processed_dataset_filename) @@ -1151,6 +1159,7 @@ def prepare_data( return None # if the dataset is not already processed, process it + from modelforge.dataset import _ImplementedDatasets dataset_class = _ImplementedDatasets.get_dataset_class(str(self.name)) @@ -1161,7 +1170,6 @@ def prepare_data( regenerate_cache=self.regenerate_cache, ) torch_dataset = self._create_torch_dataset(dataset) - # if dataset statistics is present load it from disk if ( os.path.exists(self.dataset_statistic_filename) @@ -1308,7 +1316,7 @@ def _cache_dataset(self, torch_dataset): def setup(self, stage: Optional[str] = None) -> None: """Sets up datasets for the train, validation, and test stages based on the stage argument.""" - self.torch_dataset = torch.load("torch_dataset.pt") + self.torch_dataset = torch.load(self.cache_processed_dataset_filename) ( self.train_dataset, self.val_dataset, From b5f9888d7a02a4deb5688146f2d0e5c01a777cd9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:56:34 +0200 Subject: [PATCH 22/33] typo --- modelforge/utils/misc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index 2a0e7280..ea288a4a 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -13,7 +13,7 @@ def visualize_model( - dm: 'DataModule', + dm: "DataModule", potential_name: Literal["ANI2x", "PhysNet", "SchNet", "PaiNN", "SAKE"], ): # visualize the compute graph @@ -353,16 +353,16 @@ def wrapper(*args, **kwargs): instance = args[0] # Get the lock file path from the specified attribute lock_file_path = getattr(instance, attribute_name) - with open(lock_file_path, 'w') as lock_file: + with open(lock_file_path, "w") as f: # Lock the file - lock_file(lock_file) + lock_file(f) try: # Execute the wrapped function result = func(*args, **kwargs) finally: # Unlock the file - unlock_file(lock_file) + unlock_file(f) # Optionally, remove the lock file os.remove(lock_file_path) From acd06fee31ddcffda3f59e3ad36a80048c7e3d4e Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:37:55 +0200 Subject: [PATCH 23/33] correct lock file mode --- modelforge/utils/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index ea288a4a..f4e918f2 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -353,7 +353,7 @@ def wrapper(*args, **kwargs): instance = args[0] # Get the lock file path from the specified attribute lock_file_path = getattr(instance, attribute_name) - with open(lock_file_path, "w") as f: + with open(lock_file_path, "w+") as f: # Lock the file lock_file(f) From 62003b7815461670dce32cab839630c62db9f571 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:40:00 +0200 Subject: [PATCH 24/33] linting --- modelforge/dataset/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 433138de..4cfd8aa0 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1046,7 +1046,6 @@ def create_dataset( from openff.units import unit - class DataModule(pl.LightningDataModule): def __init__( self, From e48355191ba99634704d1666aa181c1bfd65a051 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:50:12 +0200 Subject: [PATCH 25/33] fix test failures --- 
modelforge/tests/conftest.py | 2 ++ modelforge/tests/test_training.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index f7ccd939..ad519b40 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,6 +51,7 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, + regenerate_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -64,6 +65,7 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, + regenerate_cache=regenerate_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index a442f980..9393e858 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -84,7 +84,7 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) +@pytest.mark.parametrize("dataset_name", ["PHALKETHOH"]) @pytest.mark.parametrize("training", ["with_force", "without_force"]) def test_train_with_lightning(training, potential_name, dataset_name): """ From c2cfb5f9b5cf95f73ad8a0e3feb85ed22764fa2f Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 17:30:17 +0200 Subject: [PATCH 26/33] reasonable defaults for regeneration --- modelforge/dataset/dataset.py | 15 ++++++++++++++- modelforge/tests/conftest.py | 4 ++-- modelforge/tests/test_dataset.py | 1 + modelforge/train/training.py | 1 + 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 4cfd8aa0..cb2e9a0e 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -64,6 +64,7 @@ class DatasetParameters(BaseModel): version_select: str num_workers: int = Field(gt=0) pin_memory: bool + regenerate_processed_cache: bool = False @dataclass(frozen=False) @@ -474,6 +475,7 @@ def __init__( local_cache_dir: str, force_download: bool = False, regenerate_cache: bool = False, + regenerate_processed_cache: bool = False, ): """ Initializes the HDF5Dataset with paths to raw and processed data files. @@ -504,6 +506,11 @@ def __init__( self.force_download = force_download self.regenerate_cache = regenerate_cache + # is True if regenerate_cache is True + self.regenerate_processed_cache = ( + regenerate_processed_cache or self.regenerate_cache + ) + self.hdf5data: Optional[Dict[str, List[np.ndarray]]] = None self.numpy_data: Optional[np.ndarray] = None @@ -1068,6 +1075,7 @@ def __init__( local_cache_dir: str = "./", regenerate_cache: bool = False, regenerate_dataset_statistic: bool = False, + regenerate_processed_cache: bool = True, ): """ Initializes adData module for PyTorch Lightning handling data preparation and loading object with the specified configuration. 
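
The regeneration flags introduced in this patch combine with the cached-file check added to prepare_data earlier in the series; roughly, the decision reduces to something like the sketch below. The helper name and cache path are illustrative only:

    import os

    def needs_processing(cache_file, regenerate_cache, regenerate_processed_cache):
        # regenerate_cache (re-download and re-curate everything) implies
        # regenerating the processed cache as well
        regenerate_processed = regenerate_processed_cache or regenerate_cache
        return regenerate_processed or not os.path.exists(cache_file)

    print(needs_processing("./cache/PHALKETHOH_nc_1000_v0_processed.pt", False, False))
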
@@ -1124,6 +1132,11 @@ def __init__( # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) self.regenerate_cache = regenerate_cache + # Use a logical OR to ensure regenerate_processed_cache is True when + # regenerate_cache is True + self.regenerate_processed_cache = ( + regenerate_processed_cache or self.regenerate_cache + ) self.pairlist = Pairlist() self.dataset_statistic_filename = ( @@ -1148,7 +1161,7 @@ def prepare_data( # if the dataset has already been processed, skip this step if ( os.path.exists(self.cache_processed_dataset_filename) - and not self.regenerate_cache + and not self.regenerate_processed_cache ): if not os.path.exists(self.dataset_statistic_filename): raise FileNotFoundError( diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index ad519b40..b7c0c48f 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,7 +51,7 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, - regenerate_cache: bool = True, + regenerate_processed_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -65,7 +65,7 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, - regenerate_cache=regenerate_cache, + regenerate_processed_cache=regenerate_processed_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index 91f963a3..de15775a 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -730,6 +730,7 @@ def test_energy_postprocessing(): splitting_strategy=FirstComeFirstServeSplittingStrategy(), remove_self_energies=True, regenerate_dataset_statistic=True, + regenerate_processed_cache=True, ) dm.prepare_data() dm.setup() diff --git a/modelforge/train/training.py b/modelforge/train/training.py index c9d36ec9..254e2acf 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -1037,6 +1037,7 @@ def setup_datamodule(self) -> DataModule: seed=self.training_parameter.splitting_strategy.seed, split=self.training_parameter.splitting_strategy.data_split, ), + regenerate_processed_cache=self.dataset_parameter.regenerate_processed_cache, ) dm.prepare_data() dm.setup() From a1daf2cd0853cc0f924120ab3a2b96fcfda065b9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 17:50:30 +0200 Subject: [PATCH 27/33] ha, didn't think about that --- modelforge/dataset/dataset.py | 8 ++------ modelforge/tests/conftest.py | 2 -- modelforge/tests/test_dataset.py | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index cb2e9a0e..7a7944d6 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -475,7 +475,6 @@ def __init__( local_cache_dir: str, force_download: bool = False, regenerate_cache: bool = False, - regenerate_processed_cache: bool = False, ): """ Initializes the HDF5Dataset with paths to raw and processed data files. 
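
Taken together, the caching patches above amount to a save-in-prepare_data, load-in-setup flow. A condensed, self-contained sketch of that flow; ToyDataModule and the file name are illustrative stand-ins, not the modelforge API:

    import os
    import torch

    class ToyDataModule:
        def __init__(self, cache_file="toy_processed.pt", regenerate_processed_cache=False):
            self.cache_file = cache_file
            self.regenerate_processed_cache = regenerate_processed_cache

        def prepare_data(self):
            # skip the expensive processing step when a cached copy already exists
            if os.path.exists(self.cache_file) and not self.regenerate_processed_cache:
                return
            processed = torch.randn(100, 3)     # stand-in for the processed dataset
            torch.save(processed, self.cache_file)

        def setup(self):
            self.dataset = torch.load(self.cache_file)

    dm = ToyDataModule()
    dm.prepare_data()
    dm.setup()
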
@@ -506,11 +505,6 @@ def __init__( self.force_download = force_download self.regenerate_cache = regenerate_cache - # is True if regenerate_cache is True - self.regenerate_processed_cache = ( - regenerate_processed_cache or self.regenerate_cache - ) - self.hdf5data: Optional[Dict[str, List[np.ndarray]]] = None self.numpy_data: Optional[np.ndarray] = None @@ -1131,6 +1125,8 @@ def __init__( # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) + # create the local cache directory if it does not exist + os.makedirs(self.local_cache_dir, exist_ok=True) self.regenerate_cache = regenerate_cache # Use a logical OR to ensure regenerate_processed_cache is True when # regenerate_cache is True diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index b7c0c48f..f7ccd939 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,7 +51,6 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, - regenerate_processed_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -65,7 +64,6 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, - regenerate_processed_cache=regenerate_processed_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index de15775a..91f963a3 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -730,7 +730,6 @@ def test_energy_postprocessing(): splitting_strategy=FirstComeFirstServeSplittingStrategy(), remove_self_energies=True, regenerate_dataset_statistic=True, - regenerate_processed_cache=True, ) dm.prepare_data() dm.setup() From 2bc8c3cb62bedd95393a487a73ccaee656bb62ef Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 21:31:55 +0200 Subject: [PATCH 28/33] update parameter name --- docs/getting_started.rst | 2 +- modelforge/tests/data/config.toml | 2 +- .../tests/data/training_defaults/default.toml | 2 +- .../training_defaults/default_with_force.toml | 2 +- modelforge/train/parameters.py | 3 ++- modelforge/train/training.py | 22 ++----------------- scripts/config.toml | 2 +- 7 files changed, 9 insertions(+), 26 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c533e47f..0dc1b6bd 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -149,7 +149,7 @@ Here is an example of a training routine definition: remove_self_energies = true # Whether to remove self-energies from the dataset batch_size = 128 # Number of samples per batch lr = 1e-3 # Learning rate for the optimizer - monitor = "val/per_molecule_energy/rmse" # Metric to monitor for early stopping and checkpointing + monitor_for_checkpoint = "val/per_molecule_energy/rmse" # Metric to monitor for checkpointing [training.experiment_logger] diff --git a/modelforge/tests/data/config.toml b/modelforge/tests/data/config.toml index c323c354..53e408cf 100644 --- a/modelforge/tests/data/config.toml +++ b/modelforge/tests/data/config.toml @@ -35,7 +35,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" diff --git a/modelforge/tests/data/training_defaults/default.toml 
b/modelforge/tests/data/training_defaults/default.toml index 1e4025a9..c8d96fc2 100644 --- a/modelforge/tests/data/training_defaults/default.toml +++ b/modelforge/tests/data/training_defaults/default.toml @@ -3,7 +3,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] diff --git a/modelforge/tests/data/training_defaults/default_with_force.toml b/modelforge/tests/data/training_defaults/default_with_force.toml index 19c0d3b7..4ba07f8b 100644 --- a/modelforge/tests/data/training_defaults/default_with_force.toml +++ b/modelforge/tests/data/training_defaults/default_with_force.toml @@ -3,7 +3,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 1ea820d7..f6daf0d2 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -262,7 +262,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": remove_self_energies: bool batch_size: int lr: float - monitor: str + monitor_for_checkpoint: str lr_scheduler: Optional[SchedulerConfig] = None loss_parameter: LossParameter early_stopping: Optional[EarlyStopping] = None @@ -271,6 +271,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": experiment_logger: ExperimentLogger verbose: bool = False optimizer: Type[torch.optim.Optimizer] = torch.optim.AdamW + min_number_of_epochs: Union[int, None] = None ### Runtime Parameters diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 254e2acf..3829c9e3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -387,25 +387,6 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu return metric_dict -from torchmetrics import Metric - - -class MeanLossMetric(Metric): - def __init__(self): - super().__init__() - self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total_batches", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, loss: torch.Tensor, batch_size: int): - # Accumulate the loss sum and batch count - self.sum_loss += loss.sum() - self.total_batches += batch_size - - def compute(self): - # Compute the mean loss - return self.sum_loss / self.total_batches - - class CalculateProperties(torch.nn.Module): def __init__(self, requested_properties: List[str]): @@ -1147,7 +1128,7 @@ def setup_callbacks(self) -> List[Any]: ) checkpoint_callback = ModelCheckpoint( save_top_k=2, - monitor=self.training_parameter.monitor, + monitor=self.training_parameter.monitor_for_checkpoint, filename=checkpoint_filename, ) callbacks.append(checkpoint_callback) @@ -1170,6 +1151,7 @@ def setup_trainer(self) -> Trainer: trainer = Trainer( max_epochs=self.training_parameter.number_of_epochs, + min_epochs=self.training_parameter.min_number_of_epochs, num_nodes=self.runtime_parameter.number_of_nodes, devices=self.runtime_parameter.devices, accelerator=self.runtime_parameter.accelerator, diff --git a/scripts/config.toml b/scripts/config.toml index d6e92a10..b49e1434 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -33,7 +33,7 @@ number_of_epochs = 1000 remove_self_energies = true batch_size = 16 lr = 0.5e-3 -monitor = "val/per_molecule_energy/rmse" 
+monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" From d3539e5b24907df5027ecd84eace87ec8e8f0363 Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 29 Aug 2024 16:39:25 +0200 Subject: [PATCH 29/33] detach metric --- modelforge/train/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 3829c9e3..23ab1a68 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -709,10 +709,10 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # Update the loss metric with the different loss components for key, metric in loss_dict.items(): - self.loss_metric[key].update(metric, batch.batch_size()) + self.loss_metric[key].update(metric.clone().detach(), batch.batch_size()) - loss = torch.mean(loss_dict["total_loss"]).contiguous() - return loss + loss = torch.mean(loss_dict["total_loss"]) + return loss.contiguous() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: """ From 7c374a592342fbd42d2e44d370d0a40aa3051ed0 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 30 Aug 2024 17:04:56 -0700 Subject: [PATCH 30/33] Having test_train_lightning skip sake+forces when on github CI because it allocates too much money --- modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +- modelforge/tests/test_training.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml index 60281c0c..ddac0202 100644 --- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml +++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml @@ -1,5 +1,5 @@ [dataset] dataset_name = "PHALKETHOH" -version_select = "nc_1000_v0" +version_select = "nc_1000_v1" num_workers = 4 pin_memory = true \ No newline at end of file diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 9393e858..937d2571 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -95,7 +95,10 @@ def test_train_with_lightning(training, potential_name, dataset_name): # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - + if IN_GITHUB_ACTIONS and potential_name == "sake" and training == "with_force": + pytest.skip( + "Skipping Phalkethoh with sake training with forces on GitHub Actions because it allocates too much memory" + ) # train potential get_trainer( potential_name, dataset_name, training_toml From 2ca2b0af5ae1298c6c8b6487c7da0d33b74161c8 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 30 Aug 2024 22:57:59 -0700 Subject: [PATCH 31/33] fixed captitalization error, test should now skip --- modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +- modelforge/tests/test_training.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml index ddac0202..60281c0c 100644 --- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml +++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml @@ -1,5 +1,5 @@ [dataset] dataset_name = "PHALKETHOH" -version_select = "nc_1000_v1" +version_select = "nc_1000_v0" num_workers = 4 pin_memory = true \ No newline at end of file diff --git 
a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 937d2571..2b862b88 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -95,9 +95,9 @@ def test_train_with_lightning(training, potential_name, dataset_name): # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - if IN_GITHUB_ACTIONS and potential_name == "sake" and training == "with_force": + if potential_name == "SAKE" and training == "with_force": pytest.skip( - "Skipping Phalkethoh with sake training with forces on GitHub Actions because it allocates too much memory" + "Skipping Sake training with forces on GitHub Actions because it allocates too much memory" ) # train potential get_trainer( From daaedff5c7f8c3cc3a45b62581fd3f0cc684d864 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Sat, 31 Aug 2024 00:49:19 -0700 Subject: [PATCH 32/33] in testing I removed the check to see if in github actions. This is resolved. --- modelforge/tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 2b862b88..72ee5e15 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -95,7 +95,7 @@ def test_train_with_lightning(training, potential_name, dataset_name): # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - if potential_name == "SAKE" and training == "with_force": + if IN_GITHUB_ACTIONS and potential_name == "SAKE" and training == "with_force": pytest.skip( "Skipping Sake training with forces on GitHub Actions because it allocates too much memory" ) From 4ffc4d5912ba9a38c0d9962235ff4c44bc99a272 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 31 Aug 2024 10:36:01 +0200 Subject: [PATCH 33/33] update weights --- scripts/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/config.toml b/scripts/config.toml index b49e1434..45e8d64c 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -57,7 +57,7 @@ interval = "epoch" loss_property = ['per_molecule_energy', 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.009 #NOTE: reciprocal units +per_molecule_energy = 0.999 #NOTE: reciprocal units per_atom_force = 0.001 [training.early_stopping]
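
The weight update in this final patch feeds into the Loss module's weighted sum over per-molecule components. Schematically, with random tensors standing in for the real loss components:

    import torch

    weights = {"per_molecule_energy": 0.999, "per_atom_force": 0.001}
    loss_components = {                          # one value per molecule in the batch
        "per_molecule_energy": torch.rand(16),
        "per_atom_force": torch.rand(16),
    }

    total_loss = torch.zeros(16)
    for name, weight in weights.items():
        total_loss = total_loss + weight * loss_components[name]

    scalar_loss = torch.mean(total_loss)         # the value returned from training_step

The training step then returns the mean of this per-molecule total, matching the torch.mean(loss_dict["total_loss"]) call earlier in the series.
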