diff --git a/.gitignore b/.gitignore index 2ac2cee4..3c452056 100644 --- a/.gitignore +++ b/.gitignore @@ -190,3 +190,7 @@ lightning_logs/ *.hdf5 */tb_logs/* .vscode/settings.json +logs/* +cache/* +*/logs/* +*/cache/* diff --git a/MANIFEST.in b/MANIFEST.in index e0267afd..54419180 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,8 @@ include CODE_OF_CONDUCT.md +include modelforge/dataset/yaml_files/*.yaml +include modelforge/curation/yaml_files/*.yaml +include modelforge/tests/data/potential_defaults/*.toml +include modelforge/tests/data/training_defaults/*.toml + global-exclude *.py[cod] __pycache__ *.so diff --git a/devtools/conda-envs/env.yaml b/devtools/conda-envs/env.yaml new file mode 100644 index 00000000..8535f561 --- /dev/null +++ b/devtools/conda-envs/env.yaml @@ -0,0 +1,27 @@ +name: modelforge_env +channels: + - conda-forge + - pytorch +dependencies: + # Base depends + - python + - pip + - h5py + - tqdm + - toml + - qcportal>=0.50 + - qcelemental + - pytorch>=2.1 + - loguru + - lightning>=2.0.8 + - tensorboard + - torchvision + - openff-units + - torchmetrics>=1.4 + - pint=0.23 + - rdkit + - retry + - sqlitedict + - pydantic>=2 + - ray-all + - jax diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index b045aecf..8e574780 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -8,6 +8,7 @@ dependencies: - pip - h5py - tqdm + - toml - qcportal>=0.50 - qcelemental - pytorch>=2.1 diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c533e47f..0dc1b6bd 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -149,7 +149,7 @@ Here is an example of a training routine definition: remove_self_energies = true # Whether to remove self-energies from the dataset batch_size = 128 # Number of samples per batch lr = 1e-3 # Learning rate for the optimizer - monitor = "val/per_molecule_energy/rmse" # Metric to monitor for early stopping and checkpointing + monitor_for_checkpoint = "val/per_molecule_energy/rmse" # Metric to monitor for checkpointing [training.experiment_logger] diff --git a/modelforge/curation/phalkethoh_curation.py b/modelforge/curation/phalkethoh_curation.py index 600ef8ca..52d1ec2f 100644 --- a/modelforge/curation/phalkethoh_curation.py +++ b/modelforge/curation/phalkethoh_curation.py @@ -267,7 +267,7 @@ def _calculate_total_charge( rdmol = Chem.MolFromSmiles(smiles, sanitize=False) total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms()) - return (int(total_charge) * unit.elementary_charge,) + return int(total_charge) * unit.elementary_charge def _process_downloaded( self, @@ -277,6 +277,8 @@ def _process_downloaded( max_conformers_per_record: Optional[int] = None, total_conformers: Optional[int] = None, atomic_numbers_to_limit: Optional[List[int]] = None, + max_force: Optional[unit.Quantity] = None, + final_conformer_only: Optional[bool] = None, ): """ Processes a downloaded dataset: extracts relevant information. @@ -295,6 +297,11 @@ def _process_downloaded( If set, this will limit the total number of conformers to the specified number. atomic_numbers_to_limit: Optional[List[int]], optional, default=None If set, this will limit the dataset to only include molecules with atomic numbers in the list. + max_force: Optional[float], optional, default=None + If set, this will exclude any conformers with a force that exceeds this value. 
+ final_conformer_only: Optional[bool], optional, default=None + If set to True, only the final conformer of each record will be processed. This should be the final + energy minimized conformer. """ from tqdm import tqdm import numpy as np @@ -358,7 +365,7 @@ def _process_downloaded( ] data_temp["n_configs"] = 0 - (data_temp["total_charge"],) = self._calculate_total_charge( + data_temp["total_charge"] = self._calculate_total_charge( data_temp[ "canonical_isomeric_explicit_hydrogen_mapped_smiles" ] @@ -377,105 +384,123 @@ def _process_downloaded( name = key index = self.molecule_names[name] + if final_conformer_only: + trajectory = [trajectory[-1]] for state in trajectory: + add_record = True properties, config = state - self.data[index]["n_configs"] += 1 - - # note, we will use the convention of names being lowercase - # and spaces denoted by underscore - quantity = "geometry" - quantity_o = "geometry" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = config.reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - config.reshape(1, -1, 3), + + # if set, let us see if the configuration has a force that exceeds the maximum + if max_force is not None: + force_magnitude = ( + np.abs( + properties["properties"]["current gradient"] + + properties["properties"][ + "dispersion correction gradient" + ] ) + * self.qm_parameters["dft_total_force"]["u_in"] ) + if np.any(force_magnitude > max_force): + add_record = False + if add_record: + self.data[index]["n_configs"] += 1 + + # note, we will use the convention of names being lowercase + # and spaces denoted by underscore + quantity = "geometry" + quantity_o = "geometry" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = config.reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + config.reshape(1, -1, 3), + ) + ) - # note, we will use the convention of names being lowercase - # and spaces denoted by underscore - quantity = "current energy" - quantity_o = "dft_total_energy" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = properties["properties"][ - quantity - ] - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - properties["properties"][quantity], + # note, we will use the convention of names being lowercase + # and spaces denoted by underscore + quantity = "current energy" + quantity_o = "dft_total_energy" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = properties["properties"][ + quantity + ] + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + properties["properties"][quantity], + ) ) - ) - quantity = "dispersion correction energy" - quantity_o = "dispersion_correction_energy" - # Note need to typecast here because of a bug in the - # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766 - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - float(properties["properties"][quantity]) - ).reshape(1, 1) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - float(properties["properties"][quantity]) - ).reshape(1, 1), - ), - ) + quantity = "dispersion correction energy" + quantity_o = "dispersion_correction_energy" + # Note need to typecast here because of a bug in the + # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766 
+ if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + float(properties["properties"][quantity]) + ).reshape(1, 1) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + float(properties["properties"][quantity]) + ).reshape(1, 1), + ), + ) - quantity = "current gradient" - quantity_o = "dft_total_gradient" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3), + quantity = "current gradient" + quantity_o = "dft_total_gradient" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3), + ) ) - ) - quantity = "dispersion correction gradient" - quantity_o = "dispersion_correction_gradient" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3), + quantity = "dispersion correction gradient" + quantity_o = "dispersion_correction_gradient" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3), + ) ) - ) - quantity = "scf dipole" - quantity_o = "scf_dipole" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, 3), + quantity = "scf dipole" + quantity_o = "scf_dipole" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, 3), + ) ) - ) # assign units for datapoint in self.data: @@ -564,6 +589,8 @@ def process( max_conformers_per_record: Optional[int] = None, total_conformers: Optional[int] = None, limit_atomic_species: Optional[list] = None, + max_force: Optional[unit.Quantity] = None, + final_conformer_only=None, n_threads=2, ) -> None: """ @@ -586,7 +613,11 @@ def process( Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2" limit_atomic_species: Optional[list] = None, If set to a list of element symbols, records that contain any elements not in this list will be ignored. - n_threads, int, default=6 + max_force: Optional[float], optional, default=None + If set this any confirugrations with a force that exceeds this value will be excluded. + final_conformer_only: Optional[bool], optional, default=None + If set to True, only the final conformer of each record will be processed. 
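A minimal, self-contained sketch of the per-conformer filter that the new max_force option enables in _process_downloaded (the hunk above sums the "current gradient" and "dispersion correction gradient" entries and compares the magnitude against the threshold). The helper name, array shapes, and explicit hartree/bohr units below are illustrative assumptions, not part of the patch:

    import numpy as np
    from openff.units import unit

    def keep_conformer(dft_gradient, dispersion_gradient,
                       max_force=1.0 * unit.hartree / unit.bohr):
        # combine the DFT and dispersion-correction gradients and take the
        # elementwise magnitude, attaching force units as the curation code does
        force_magnitude = (
            np.abs(np.asarray(dft_gradient) + np.asarray(dispersion_gradient))
            * unit.hartree / unit.bohr
        )
        # reject the conformer if any force component exceeds the threshold
        return not bool(np.any(force_magnitude > max_force))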
+ n_threads, int, default=2 Number of concurrent threads for retrieving data from QCArchive Examples -------- @@ -664,6 +695,8 @@ def process( max_conformers_per_record=max_conformers_per_record, total_conformers=total_conformers, atomic_numbers_to_limit=self.atomic_numbers_to_limit, + max_force=max_force, + final_conformer_only=final_conformer_only, ) self._generate_hdf5() diff --git a/modelforge/curation/scripts/curate_PhAlkEthOH.py b/modelforge/curation/scripts/curate_PhAlkEthOH.py index 6dfab740..cecff1ff 100644 --- a/modelforge/curation/scripts/curate_PhAlkEthOH.py +++ b/modelforge/curation/scripts/curate_PhAlkEthOH.py @@ -20,6 +20,8 @@ def PhAlkEthOH_openff_wrapper( max_conformers_per_record=None, total_conformers=None, limit_atomic_species=None, + max_force=None, + final_conformer_only=False, ): """ This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file. @@ -49,7 +51,10 @@ def PhAlkEthOH_openff_wrapper( limit_atomic_species: list, optional, default=None A list of atomic species to limit the dataset to. Any molecules that contain elements outside of this list will be ignored. If not defined, no filtering by atomic species will be performed. - + max_force: float, optional, default=None + The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored. + final_conformer_only: bool, optional, default=False + If True, only the final conformer for each molecule will be processed. If False, all conformers will be processed. """ from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration @@ -67,6 +72,8 @@ def PhAlkEthOH_openff_wrapper( total_conformers=total_conformers, limit_atomic_species=limit_atomic_species, n_threads=1, + max_force=max_force, + final_conformer_only=final_conformer_only, ) print(f"Total records: {PhAlkEthOH_dataset.total_records}") print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}") @@ -74,6 +81,8 @@ def PhAlkEthOH_openff_wrapper( def main(): + from openff.units import unit + # define the location where to store and output the files import os @@ -83,9 +92,9 @@ def main(): # We'll want to provide some simple means of versioning # if we make updates to either the underlying dataset, curation modules, or parameters given to the code - version = "0" + version = "1" # version of the dataset to curate - version_select = f"v_{version}" + version_select = f"v_0" # curate dataset with 1000 total conformers, max of 10 conformers per record hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000.hdf5" @@ -99,6 +108,7 @@ def main(): max_records=1000, total_conformers=1000, max_conformers_per_record=10, + max_force=1.0 * unit.hartree / unit.bohr, ) # curate the full dataset @@ -110,6 +120,36 @@ def main(): local_cache_dir, force_download=False, version_select=version_select, + max_force=1.0 * unit.hartree / unit.bohr, + ) + + # curate dataset with 1000 total conformers, max of 10 conformers per record + hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000_minimal.hdf5" + + PhAlkEthOH_openff_wrapper( + hdf5_file_name, + output_file_dir, + local_cache_dir, + force_download=False, + version_select=version_select, + max_records=1000, + total_conformers=1000, + max_conformers_per_record=10, + max_force=1.0 * unit.hartree / unit.bohr, + final_conformer_only=True, + ) + + # curate the full dataset + hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_minimal.hdf5" + print("total dataset") + PhAlkEthOH_openff_wrapper( + hdf5_file_name, + output_file_dir, 
+ local_cache_dir, + force_download=False, + version_select=version_select, + max_force=1.0 * unit.hartree / unit.bohr, + final_conformer_only=True, ) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 907a246f..c7f048fc 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union, NamedTuple +from typing import TYPE_CHECKING, Dict, List, Literal, NamedTuple, Optional, Union import numpy as np import pytorch_lightning as pl @@ -15,15 +15,15 @@ from modelforge.dataset.utils import RandomRecordSplittingStrategy, SplittingStrategy from modelforge.utils.prop import PropertyNames +from modelforge.utils.misc import lock_with_attribute if TYPE_CHECKING: from modelforge.potential.processing import AtomicSelfEnergies - -from pydantic import BaseModel, field_validator, ConfigDict, Field - from enum import Enum +from pydantic import BaseModel, ConfigDict, Field + class CaseInsensitiveEnum(str, Enum): @classmethod @@ -64,6 +64,7 @@ class DatasetParameters(BaseModel): version_select: str num_workers: int = Field(gt=0) pin_memory: bool + regenerate_processed_cache: bool = False @dataclass(frozen=False) @@ -208,8 +209,9 @@ def as_jax_namedtuple(self) -> NamedTuple: """Export the dataclass fields and values as a named tuple. Convert pytorch tensors to jax arrays.""" - from dataclasses import fields import collections + from dataclasses import dataclass, fields + from modelforge.utils.io import import_ convert_to_jax = import_("pytorch2jax").pytorch2jax.convert_to_jax @@ -236,6 +238,9 @@ def to( self.metadata = self.metadata.to(device=device, dtype=dtype) return self + def batch_size(self): + return self.metadata.E.size(dim=0) + class TorchDataset(torch.utils.data.Dataset[BatchData]): """ @@ -487,6 +492,8 @@ def __init__( Directory to store the files. force_download : bool, optional If set to True, the data will be downloaded even if it already exists. Default is False. + regenerate_cache : bool, optional + If set to True, the cache file will be regenerated even if it already exists. Default is False. """ self.url = url self.gz_data_file = gz_data_file @@ -1038,7 +1045,6 @@ def create_dataset( return TorchDataset(data.numpy_data, data._property_names) -from torch import nn from openff.units import unit @@ -1064,6 +1070,7 @@ def __init__( local_cache_dir: str = "./", regenerate_cache: bool = False, regenerate_dataset_statistic: bool = False, + regenerate_processed_cache: bool = True, ): """ Initializes adData module for PyTorch Lightning handling data preparation and loading object with the specified configuration. @@ -1097,6 +1104,9 @@ def __init__( regenerate_cache : bool, defaults to False Whether to regenerate the cache. 
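A usage sketch of the caching flags introduced in this hunk, assuming the first constructor argument is named `name` and the remaining keywords are as shown (`version_select`, `local_cache_dir`, `regenerate_cache`, and the new `regenerate_processed_cache`); note that `regenerate_cache=True` also forces the processed cache to be rebuilt, since the two flags are OR-ed together in `__init__`:

    from modelforge.dataset.dataset import DataModule

    dm = DataModule(
        name="PHALKETHOH",
        batch_size=64,
        version_select="nc_1000_v1",          # new v1 entry in PhAlkEthOH.yaml
        local_cache_dir="./cache",
        regenerate_cache=False,
        regenerate_processed_cache=True,      # rebuild only the processed .pt file
    )
    dm.prepare_data()
    dm.setup()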
""" + from modelforge.potential.models import Pairlist + import os + super().__init__() self.name = name @@ -1113,28 +1123,55 @@ def __init__( self.train_dataset = None self.test_dataset = None self.val_dataset = None - import os # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) + # create the local cache directory if it does not exist + os.makedirs(self.local_cache_dir, exist_ok=True) self.regenerate_cache = regenerate_cache - from modelforge.potential.models import Pairlist + # Use a logical OR to ensure regenerate_processed_cache is True when + # regenerate_cache is True + self.regenerate_processed_cache = ( + regenerate_processed_cache or self.regenerate_cache + ) self.pairlist = Pairlist() self.dataset_statistic_filename = ( f"{self.local_cache_dir}/{self.name}_dataset_statistic.toml" ) + self.cache_processed_dataset_filename = ( + f"{self.local_cache_dir}/{self.name}_{self.version_select}_processed.pt" + ) + self.lock_file = f"{self.cache_processed_dataset_filename}.lockfile" + @lock_with_attribute("lock_file") def prepare_data( self, ) -> None: """ - Prepares the dataset for use. This method is responsible for the initial processing of the data such as calculating self energies, atomic energy statistics, and splitting. It is executed only once per node. + Prepares the dataset for use. This method is responsible for the initial + processing of the data such as calculating self energies, atomic energy + statistics, and splitting. It is executed only once per node. """ + # check if there is a filelock present, if so, wait until it is removed + + # if the dataset has already been processed, skip this step + if ( + os.path.exists(self.cache_processed_dataset_filename) + and not self.regenerate_processed_cache + ): + if not os.path.exists(self.dataset_statistic_filename): + raise FileNotFoundError( + f"Dataset statistics file {self.dataset_statistic_filename} not found. Please regenerate the cache." + ) + log.info('Processed dataset already exists. 
Skipping "prepare_data" step.') + return None + + # if the dataset is not already processed, process it + from modelforge.dataset import _ImplementedDatasets - import toml - dataset_class = _ImplementedDatasets.get_dataset_class(self.name) + dataset_class = _ImplementedDatasets.get_dataset_class(str(self.name)) dataset = dataset_class( force_download=self.force_download, version_select=self.version_select, @@ -1142,7 +1179,6 @@ def prepare_data( regenerate_cache=self.regenerate_cache, ) torch_dataset = self._create_torch_dataset(dataset) - # if dataset statistics is present load it from disk if ( os.path.exists(self.dataset_statistic_filename) @@ -1280,16 +1316,16 @@ def _calculate_atomic_self_energies( def _cache_dataset(self, torch_dataset): """Cache the dataset and its statistics using PyTorch's serialization.""" - torch.save(torch_dataset, "torch_dataset.pt") - # sleep for 1 second to make sure that the dataset was written to disk + torch.save(torch_dataset, self.cache_processed_dataset_filename) + # sleep for 5 second to make sure that the dataset was written to disk import time - time.sleep(1) + time.sleep(5) def setup(self, stage: Optional[str] = None) -> None: """Sets up datasets for the train, validation, and test stages based on the stage argument.""" - self.torch_dataset = torch.load("torch_dataset.pt") + self.torch_dataset = torch.load(self.cache_processed_dataset_filename) ( self.train_dataset, self.val_dataset, @@ -1339,7 +1375,7 @@ def _per_datapoint_operations( from tqdm import tqdm # remove the self energies if requested - log.info("Precalculating pairlist for dataset") + log.info("Performing per datapoint operations in the dataset dataset") if self.remove_self_energies: log.info("Removing self energies from the dataset") @@ -1515,3 +1551,71 @@ def collate_conformers(conf_list: List[BatchData]) -> BatchData: number_of_atoms=atomic_numbers.numel(), ) return BatchData(nnp_input, metadata) + + +from modelforge.dataset.dataset import DatasetFactory +from modelforge.dataset.utils import ( + FirstComeFirstServeSplittingStrategy, + SplittingStrategy, +) + + +def initialize_datamodule( + dataset_name: str, + version_select: str = "nc_1000_v0", + batch_size: int = 64, + splitting_strategy: SplittingStrategy = FirstComeFirstServeSplittingStrategy(), + remove_self_energies: bool = True, + regression_ase: bool = False, + regenerate_dataset_statistic: bool = False, +) -> DataModule: + """ + Initialize a dataset for a given mode. + """ + + data_module = DataModule( + dataset_name, + splitting_strategy=splitting_strategy, + batch_size=batch_size, + version_select=version_select, + remove_self_energies=remove_self_energies, + regression_ase=regression_ase, + regenerate_dataset_statistic=regenerate_dataset_statistic, + ) + data_module.prepare_data() + data_module.setup() + return data_module + + +def single_batch(batch_size: int = 64, dataset_name="QM9"): + """ + Utility function to create a single batch of data for testing. + """ + data_module = initialize_datamodule( + dataset_name=dataset_name, + batch_size=batch_size, + version_select="nc_1000_v0", + ) + return next(iter(data_module.train_dataloader(shuffle=False))) + + +def initialize_dataset( + dataset_name: str, + local_cache_dir: str, + versions_select: str = "nc_1000_v0", + force_download: bool = False, +) -> DataModule: + """ + Initialize a dataset for a given mode. 
+ """ + from modelforge.dataset import _ImplementedDatasets + + factory = DatasetFactory() + data = _ImplementedDatasets.get_dataset_class(dataset_name)( + local_cache_dir=local_cache_dir, + version_select=versions_select, + force_download=force_download, + ) + dataset = factory.create_dataset(data) + + return dataset diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml index 06ca005e..7d8d186a 100644 --- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml +++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml @@ -1,6 +1,66 @@ dataset: PhAlkEthOH -latest: full_dataset_v0 -latest_test: nc_1000_v0 +latest: full_dataset_v1 +latest_test: nc_1000_v1 +full_dataset_v1: + version: 1 + doi: 10.5281/zenodo.13450735 + notes: removes high force conformers + gz_data_file: + length: 3300668359 + md5: b051af374f3233e2925f7a1b96707772 + name: PhAlkEthOH_dataset_v1.hdf5.gz + hdf5_data_file: + md5: f5d9dccb8e79a51892b671108bc57bde + name: PhAlkEthOH_dataset_v1.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_processed.npz + url: https://zenodo.org/records/13450735/files/PhAlkEthOH_openff_dataset_v1.hdf5.gz +nc_1000_v1: + version: 1 + doi: 10.5281/zenodo.13560343 + notes: removes high force conformers, 1000 conformers, max 10 per molecule + gz_data_file: + length: 2702091 + md5: 76b421802bef68f858757dba41f3ea2e + name: PhAlkEthOH_dataset_v1_nc_1000.hdf5.gz + hdf5_data_file: + md5: 244eb8d1b3547b8da229fd1507fb4d4e + name: PhAlkEthOH_dataset_v1_nc_1000.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_nc_1000_processed.npz + url: https://zenodo.org/records/13560343/files/PhAlkEthOH_openff_dataset_v1_ntc_1000.hdf5.gz +full_dataset_min_v1: + version: 1 + doi: 10.5281/zenodo.13561100 + notes: removes high force configurations, only contains final optimized configuration + gz_data_file: + length: 31352642 + md5: 205b0b7bc1858b1d3745480d9a29a770 + name: PhAlkEthOH_dataset_v1_min.hdf5.gz + hdf5_data_file: + md5: 41cb40718f8872baa6c468ab08574d46 + name: PhAlkEthOH_dataset_v1_min.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_min_processed.npz + url: https://zenodo.org/records/13561100/files/PhAlkEthOH_openff_dataset_v1_min.hdf5.gz +nc_1000_min_v1: + version: 1 + doi: 10.5281/zenodo.13576458 + notes: removes high force conformers, 1000 conformers, only contains final optimized configuration + gz_data_file: + length: 3476870 + md5: 7261f4738efd4bf8409268961837ba78 + name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5.gz + hdf5_data_file: + md5: 5d347a78c6c3b45531870a05d5aab77e + name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_nc_1000_min_processed.npz + url: https://zenodo.org/records/13576458/files/PhAlkEthOH_openff_dataset_v1_ntc_1000_min.hdf5.gz full_dataset_v0: version: 0 doi: 10.5281/zenodo.12174233 diff --git a/modelforge/potential/ani.py b/modelforge/potential/ani.py index 65033e5f..00f0751c 100644 --- a/modelforge/potential/ani.py +++ b/modelforge/potential/ani.py @@ -2,6 +2,9 @@ This module contains the classes for the ANI2x neural network potential. 
""" +from __future__ import annotations + + from dataclasses import dataclass from typing import Dict, Tuple, Type, List @@ -103,7 +106,6 @@ def __init__( angle_sections: int, nr_of_supported_elements: int = 7, ): - super().__init__() from modelforge.potential.utils import CosineAttenuationFunction @@ -601,7 +603,7 @@ def __init__( self.register_buffer("lookup_tensor", lookup_tensor) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> AniNeuralNetworkData: """ Prepare the model-specific input data for the ANI2x model. @@ -610,8 +612,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : Dict[str,PairListOutputs] + The output from the pairlist. Returns ------- @@ -620,6 +622,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_data = AniNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, @@ -714,7 +721,6 @@ def __init__( dataset_statistic: Optional[Dict[str, float]] = None, potential_seed: Optional[int] = None, ) -> None: - from modelforge.utils.units import _convert_str_to_unit self.only_unique_pairs = True # NOTE: need to be set before super().__init__ diff --git a/modelforge/potential/models.py b/modelforge/potential/models.py index a95dad4c..ede27330 100644 --- a/modelforge/potential/models.py +++ b/modelforge/potential/models.py @@ -311,24 +311,31 @@ def forward( class Neighborlist(Pairlist): """ - Manage neighbor list calculations with a specified cutoff distance. + Manage neighbor list calculations with a specified cutoff distance(s). This class extends Pairlist to consider a cutoff distance for neighbor calculations. """ - def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = False): + def __init__( + self, cutoffs: Dict[str, unit.Quantity], only_unique_pairs: bool = False + ): """ Initialize the Neighborlist with a specific cutoff distance. Parameters ---------- - cutoff : unit.Quantity - Cutoff distance for neighbor calculations. + cutoffs : Dict[str, unit.Quantity] + Cutoff distances for neighbor calculations. only_unique_pairs : bool, optional If True, only unique pairs are returned (default is False). 
""" super().__init__(only_unique_pairs=only_unique_pairs) - self.register_buffer("cutoff", torch.tensor(cutoff.to(unit.nanometer).m)) + + # self.register_buffer("cutoff", torch.tensor(cutoff.to(unit.nanometer).m)) + self.register_buffer( + "cutoffs", torch.tensor([c.to(unit.nanometer).m for c in cutoffs.values()]) + ) + self.labels = list(cutoffs.keys()) def forward( self, @@ -364,16 +371,20 @@ def forward( r_ij = self.calculate_r_ij(pair_indices, positions) d_ij = self.calculate_d_ij(r_ij) - # Find pairs within the cutoff - in_cutoff = (d_ij <= self.cutoff).squeeze() - # Get the atom indices within the cutoff - pair_indices_within_cutoff = pair_indices[:, in_cutoff] + interacting_outputs = {} + for cutoff, label in zip(self.cutoffs, self.labels): + # Find pairs within the cutoff + in_cutoff = (d_ij <= cutoff).squeeze() + # Get the atom indices within the cutoff + pair_indices_within_cutoff = pair_indices[:, in_cutoff] + + interacting_outputs[label] = PairListOutputs( + pair_indices=pair_indices_within_cutoff, + d_ij=d_ij[in_cutoff], + r_ij=r_ij[in_cutoff], + ) - return PairListOutputs( - pair_indices=pair_indices_within_cutoff, - d_ij=d_ij[in_cutoff], - r_ij=r_ij[in_cutoff], - ) + return interacting_outputs from typing import Callable, Literal, Optional, Union @@ -666,14 +677,16 @@ class ComputeInteractingAtomPairs(torch.nn.Module): distances (d_ij), and displacement vectors (r_ij) for molecular simulations. """ - def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = True): + def __init__( + self, cutoffs: Dict[str, unit.Quantity], only_unique_pairs: bool = True + ): """ Initialize the ComputeInteractingAtomPairs module. Parameters ---------- - cutoff : unit.Quantity - The cutoff distance for neighbor list calculations. + cutoffs : Dict[str, unit.Quantity] + The cutoff distance(s) for neighbor list calculations. only_unique_pairs : bool, optional Whether to only use unique pairs in the pair list calculation, by default True. This should be set to True for all message passing @@ -684,7 +697,7 @@ def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = True): from .models import Neighborlist self.only_unique_pairs = only_unique_pairs - self.calculate_distances_and_pairlist = Neighborlist(cutoff, only_unique_pairs) + self.calculate_distances_and_pairlist = Neighborlist(cutoffs, only_unique_pairs) def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): """ @@ -703,7 +716,7 @@ def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): Returns ------- PairListOutputs - A namedtuple containing the pair indices, Euclidean distances + A Dict for each cutoff type, where each entry is a namedtuple containing the pair indices, Euclidean distances (d_ij), and displacement vectors (r_ij). 
""" # --------------------------- @@ -736,6 +749,7 @@ def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): pair_indices=pair_list.to(torch.int64), ) + # this will return a Dict of the PairListOutputs for each cutoff we specify return pairlist_output def _input_checks(self, data: Union[NNPInput, NamedTuple]): @@ -1037,6 +1051,8 @@ def __init__( postprocessing_parameter: Dict[str, Dict[str, bool]], dataset_statistic: Optional[Dict[str, float]], maximum_interaction_radius: unit.Quantity, + maximum_dispersion_interaction_radius: Optional[unit.Quantity] = None, + maximum_coulomb_interaction_radius: Optional[unit.Quantity] = None, potential_seed: Optional[int] = None, ): """ @@ -1049,7 +1065,11 @@ def __init__( dataset_statistic : Optional[Dict[str, float]] Dataset statistics for normalization. maximum_interaction_radius : unit.Quantity - cutoff radius. + cutoff radius for local interactions + maximum_dispersion_interaction_radius : unit.Quantity, optional + cutoff radius for dispersion interactions. + maximum_coulomb_interaction_radius : unit.Quantity, optional + cutoff radius for Coulomb interactions. potential_seed : Optional[int], optional Value used for torch.manual_seed, by default None. """ @@ -1083,8 +1103,25 @@ def __init__( raise RuntimeError( "The only_unique_pairs attribute is not set in the child class. Please set it to True or False before calling super().__init__." ) + + # to handle multiple cutoffs, we will create a dictionary with the cutoffs + # the dictionary will make it more transparent which PairListOutputs belong to which cutoff + + cutoffs = {} + cutoffs["maximum_interaction_radius"] = _convert_str_to_unit( + maximum_interaction_radius + ) + if maximum_dispersion_interaction_radius is not None: + cutoffs["maximum_dispersion_interaction_radius"] = _convert_str_to_unit( + maximum_dispersion_interaction_radius + ) + if maximum_coulomb_interaction_radius is not None: + cutoffs["maximum_coulomb_interaction_radius"] = _convert_str_to_unit( + maximum_coulomb_interaction_radius + ) + self.compute_interacting_pairs = ComputeInteractingAtomPairs( - cutoff=_convert_str_to_unit(maximum_interaction_radius), + cutoffs=cutoffs, only_unique_pairs=self.only_unique_pairs, ) diff --git a/modelforge/potential/painn.py b/modelforge/potential/painn.py index 0d1925fc..ec5c31a9 100644 --- a/modelforge/potential/painn.py +++ b/modelforge/potential/painn.py @@ -148,7 +148,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: NNPInput, pairlist_output: PairListOutputs + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> PaiNNNeuralNetworkData: """ Prepare the model-specific input for the PaiNN network. @@ -157,8 +157,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : dict[str, PairListOutputs] + The output from the pairlist. Returns ------- @@ -169,6 +169,11 @@ def _model_specific_input_preparation( number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. 
"maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = PaiNNNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, @@ -221,7 +226,7 @@ def compute_properties( ) results = { - "per_atom_scalar_representation": per_atom_scalar_feature, + "per_atom_scalar_representation": per_atom_scalar_feature.squeeze(1), "per_atom_vector_representation": per_atom_vector_feature, "atomic_subsystem_indices": data.atomic_subsystem_indices, } diff --git a/modelforge/potential/parameters.py b/modelforge/potential/parameters.py index 179cc973..e38c1305 100644 --- a/modelforge/potential/parameters.py +++ b/modelforge/potential/parameters.py @@ -221,7 +221,7 @@ class CoreParameter(ParametersBase): number_of_radial_basis_functions: int maximum_interaction_radius: Union[str, unit.Quantity] minimum_interaction_radius: Union[str, unit.Quantity] - highest_atomic_number: int + maximum_atomic_number: int equivariance_invariance_group: str activation_function_parameter: ActivationFunctionConfig predicted_properties: List[PredictedPropertiesParameter] diff --git a/modelforge/potential/physnet.py b/modelforge/potential/physnet.py index fa981614..d70fe198 100644 --- a/modelforge/potential/physnet.py +++ b/modelforge/potential/physnet.py @@ -14,23 +14,32 @@ from modelforge.utils.units import _convert_str_to_unit from .models import BaseNetwork, CoreNetwork, NNPInput, PairListOutputs +from modelforge.utils.io import import_ +from modelforge.utils.units import _convert_str_to_unit + +from .models import BaseNetwork, CoreNetwork, NNPInput, PairListOutputs +from .utils import Dense @dataclass class PhysNetNeuralNetworkData(NeuralNetworkData): """ - A dataclass to structure the inputs for PhysNet-based neural network potentials, - facilitating the efficient and structured representation of atomic systems for - energy computation and property prediction within the PhysNet framework. + A dataclass to structure the inputs for PhysNet-based neural network + potentials, facilitating the efficient and structured representation of + atomic systems for energy computation and property prediction within the + PhysNet framework. Attributes ---------- atomic_embedding : torch.Tensor - A 2D tensor containing embeddings or features for each atom, derived from atomic numbers or other properties. - Shape: [num_atoms, embedding_dim]. + A 2D tensor containing embeddings or features for each atom, derived + from atomic numbers or other properties. Shape: [num_atoms, + embedding_dim]. f_ij : Optional[torch.Tensor] - A tensor representing the radial basis function (RBF) expansion applied to distances between atom pairs, - capturing the local chemical environment. Will be added after initialization. Shape: [num_pairs, num_rbf]. + A tensor representing the radial basis function (RBF) expansion applied + to distances between atom pairs, capturing the local chemical + environment. Will be added after initialization. Shape: [num_pairs, + num_rbf]. 
""" atomic_embedding: Optional[torch.Tensor] = field(default=None) @@ -68,6 +77,10 @@ def __init__( self.cutoff_module = CosineAttenuationFunction(maximum_interaction_radius) # Initialize radial symmetry function module + from modelforge.potential.utils import FeaturizeInput + + from .utils import PhysNetRadialBasisFunction + self.featurize_input = FeaturizeInput(featurization_config) self.radial_symmetry_function_module = PhysNetRadialBasisFunction( @@ -101,52 +114,23 @@ def forward(self, data: Type[PhysNetNeuralNetworkData]) -> Dict[str, torch.Tenso } -class GatingModule(nn.Module): - def __init__(self, number_of_atom_basis: int): - """ - Initializes a gating module that optionally applies a sigmoid gating mechanism to input features. - - Parameters - ---------- - number_of_atom_basis : int - The dimensionality of the input (and output) features. - """ - super().__init__() - self.gate = nn.Parameter(torch.ones(number_of_atom_basis)) - - def forward(self, x: torch.Tensor, activation_fn: bool = False) -> torch.Tensor: - """ - Apply gating to the input tensor. - - Parameters: - ----------- - x : torch.Tensor - The input tensor to gate. - - Returns: - -------- - torch.Tensor - The gated input tensor. - """ - gating_signal = torch.sigmoid(self.gate) - return gating_signal * x - - -from .utils import DenseWithCustomDist - - class PhysNetResidual(nn.Module): """ - Implements a preactivation residual block as described in Equation 4 of the PhysNet paper. + Implements a preactivation residual block as described in Equation 4 of the + PhysNet paper. - The block refines atomic feature vectors by adding a residual component computed through two linear transformations and a non-linear activation function (Softplus). This setup enhances gradient flow and supports effective deep network training by employing a preactivation scheme. + The block refines atomic feature vectors by adding a residual component + computed through two linear transformations and a non-linear activation + function (Softplus). This setup enhances gradient flow and supports + effective deep network training by employing a preactivation scheme. Parameters ---------- input_dim : int Dimensionality of the input feature vector. output_dim : int - Dimensionality of the output feature vector, which typically matches the input dimension. + Dimensionality of the output feature vector, which typically matches the + input dimension. activation_function : Type[torch.nn.Module] The activation function to be used in the residual block. """ @@ -159,27 +143,28 @@ def __init__( ): super().__init__() # Initialize dense layers and residual connection - self.dense = DenseWithCustomDist( - input_dim, output_dim, activation_function=activation_function + + self.dense = nn.Sequential( + activation_function, + Dense(input_dim, output_dim, activation_function), + Dense(output_dim, output_dim), ) - self.residual = DenseWithCustomDist(output_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: """ Forward pass of the ResidualBlock. - Parameters: - ----------- - x: torch.Tensor + Parameters + ---------- + x : torch.Tensor Input tensor containing feature vectors of atoms. - Returns: - -------- + Returns + ------- torch.Tensor Output tensor after applying the residual block operations. 
""" - # update x with residual - return x + self.residual(self.dense(x)) + return x + self.dense(x) class PhysNetInteractionModule(nn.Module): @@ -220,21 +205,19 @@ def __init__( ) # Initialize networks for processing atomic embeddings of i and j atoms - self.interaction_i = DenseWithCustomDist( + self.interaction_i = Dense( number_of_per_atom_features, number_of_per_atom_features, activation_function=activation_function, ) - self.interaction_j = DenseWithCustomDist( + self.interaction_j = Dense( number_of_per_atom_features, number_of_per_atom_features, activation_function=activation_function, ) # Initialize processing network - self.process_v = DenseWithCustomDist( - number_of_per_atom_features, number_of_per_atom_features - ) + self.process_v = Dense(number_of_per_atom_features, number_of_per_atom_features) # Initialize residual blocks self.residuals = nn.ModuleList( @@ -254,69 +237,68 @@ def __init__( def forward(self, data: PhysNetNeuralNetworkData) -> torch.Tensor: """ - Processes input tensors through the interaction module, applying Gaussian Logarithm Attention to modulate - the influence of pairwise distances on the interaction features, followed by aggregation to update atomic embeddings. + Processes input tensors through the interaction module, applying + Gaussian Logarithm Attention to modulate the influence of pairwise + distances on the interaction features, followed by aggregation to update + atomic embeddings. Parameters ---------- data : PhysNetNeuralNetworkData - Input data containing pair indices, distances, and atomic embeddings. + Input data containing pair indices, distances, and atomic + embeddings. Returns ------- torch.Tensor - Updated atomic feature representations incorporating interaction information. + Updated atomic feature representations incorporating interaction + information. """ - # Equation 6: Formation of the Proto-Message ṽ_i for an Atom i ṽ_i = - # σ(Wl_I * x_i^l + bl_I) + Σ_j (G_g * Wl * (σ(σl_J * x_j^l + bl_J)) * - # g(r_ij)) - # - # Equation 6 implementation overview: ṽ_i = x_i_prime + - # sum_over_j(x_j_prime * f_ij_prime) where: - # - x_i_prime and x_j_prime are the features of atoms i and j, - # respectively, processed through separate networks. - # - f_ij_prime represents the modulated radial basis functions (f_ij) by - # the Gaussian Logarithm Attention weights. 
# extract relevant variables - idx_i, idx_j = data.pair_indices - f_ij = data.f_ij - x = data.atomic_embedding + idx_i, idx_j = data.pair_indices # (nr_of_pairs, 2) + f_ij = data.f_ij # (nr_of_pairs, number_of_radial_basis_functions) # # Apply activation to atomic embeddings - xa = self.dropout(self.activation_function(x)) + per_atom_embedding = self.activation_function( + data.atomic_embedding + ) # (nr_of_atoms_in_batch, number_of_per_atom_features) # calculate attention weights and transform to # input shape: (number_of_pairs, number_of_radial_basis_functions) # output shape: (number_of_pairs, number_of_per_atom_features) g = self.attention_mask(f_ij) - # Calculate contribution of central atom - x_i = self.interaction_i(xa) + # Calculate contribution of central atom i + per_atom_updated_embedding = self.interaction_i(per_atom_embedding) + # Calculate contribution of neighbor atom - x_j = self.interaction_j(xa) - # Gather the results according to idx_j - x_j = x_j[idx_j] - # Multiply the gathered features by g - x_j_modulated = x_j * g - # Aggregate modulated contributions for each atom i - x_j_prime = torch.zeros_like(x_i) - x_j_prime.scatter_add_( - 0, idx_i.unsqueeze(-1).expand(-1, x_j_modulated.size(-1)), x_j_modulated + per_interaction_embededding_for_atom_j = ( + self.interaction_j(per_atom_embedding[idx_j]) * g ) - # Draft proto message v_tilde - m = x_i + x_j_prime - # shape of m (nr_of_atoms_in_batch, 1) - # Equation 4: Preactivation Residual Block Implementation - # xl+2_i = xl_i + Wl+1 * sigma(Wl * xl_i + bl) + bl+1 + per_atom_updated_embedding.scatter_add_( + 0, + idx_i.unsqueeze(-1).expand( + -1, per_interaction_embededding_for_atom_j.shape[-1] + ), + per_interaction_embededding_for_atom_j, + ) + + # apply residual blocks for residual in self.residuals: - m = residual( - m + per_atom_updated_embedding = residual( + per_atom_updated_embedding ) # shape (nr_of_atoms_in_batch, number_of_radial_basis_functions) - m = self.activation_function(m) - x = self.gate * x + self.process_v(m) - return x + + per_atom_updated_embedding = self.activation_function( + per_atom_updated_embedding + ) + + per_atom_embedding = self.gate * per_atom_embedding + self.process_v( + per_atom_updated_embedding + ) + return per_atom_embedding class PhysNetOutput(nn.Module): @@ -360,7 +342,6 @@ def __init__( number_of_per_atom_features, number_of_atomic_properties, weight_init=torch.nn.init.zeros_, - bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -436,25 +417,6 @@ def forward(self, data: PhysNetNeuralNetworkData) -> Dict[str, torch.Tensor]: Dict[str, torch.Tensor] Dictionary containing predictions and updated embeddings. """ - # The PhysNet module is a sequence of interaction modules and residual modules. - # x_1, ..., x_N - # | - # v - # ┌─────────────┐ - # │ interaction │ <-- g(d_ij) - # └─────────────┘ - # │ - # v - # ┌───────────┐ - # │ residual │ - # └───────────┘ - # ┌───────────┐ - # │ residual │ - # └───────────┘ - # ┌───────────┐ │ - # │ output │<-----│ - # └───────────┘ │ - # v # calculate the interaction updated_embedding = self.interaction(data) @@ -546,7 +508,7 @@ def __init__( self.predicted_properties = predicted_properties def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> PhysNetNeuralNetworkData: """ Prepare model-specific input data. 
@@ -555,7 +517,7 @@ def _model_specific_input_preparation( ---------- data : NNPInput Input data containing atomic information. - pairlist_output : PairListOutputs + pairlist_output : Dict[str, PairListOutputs] Output from the pairlist calculation. Returns @@ -565,6 +527,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = PhysNetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/processing.py b/modelforge/potential/processing.py index 99ad162d..00868022 100644 --- a/modelforge/potential/processing.py +++ b/modelforge/potential/processing.py @@ -389,7 +389,7 @@ def __init__( } self.atomic_self_energies = AtomicSelfEnergies(atomic_self_energies) - def forward(self, data: Dict[str, torch.Tensor]) -> torch.Tensor: + def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Calculates the molecular self energy. diff --git a/modelforge/potential/sake.py b/modelforge/potential/sake.py index c526725b..6e0ff14e 100644 --- a/modelforge/potential/sake.py +++ b/modelforge/potential/sake.py @@ -153,7 +153,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> SAKENeuralNetworkInput: """ Prepare the model-specific input. @@ -162,8 +162,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput Input data. - pairlist_output : PairListOutputs - Pairlist output. + pairlist_output : Dict[str,PairListOutputs] + Pairlist output(s) Returns ------- @@ -174,6 +174,11 @@ def _model_specific_input_preparation( number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = SAKENeuralNetworkInput( pair_indices=pairlist_output.pair_indices, number_of_atoms=number_of_atoms, @@ -219,7 +224,6 @@ def compute_properties(self, data: SAKENeuralNetworkInput): return results - class SAKEInteraction(nn.Module): """ Spatial Attention Kinetic Networks Layer. diff --git a/modelforge/potential/schnet.py b/modelforge/potential/schnet.py index a0ff151f..83dee317 100644 --- a/modelforge/potential/schnet.py +++ b/modelforge/potential/schnet.py @@ -94,7 +94,7 @@ def __init__( number_of_radial_basis_functions, featurization_config=featurization_config, ) - # Intialize interaction blocks + # Initialize interaction blocks if shared_interactions: self.interaction_modules = nn.ModuleList( [ @@ -143,7 +143,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: PairListOutputs + self, data: "NNPInput", pairlist_output: Dict[str, PairListOutputs] ) -> SchnetNeuralNetworkData: """ Prepare the input data for the SchNet model. @@ -152,8 +152,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : Dict[str, PairListOutputs] + The pairlist output(s). 
Returns ------- @@ -162,6 +162,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = SchnetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/tensornet.py b/modelforge/potential/tensornet.py index 824d87c2..5c7d973a 100644 --- a/modelforge/potential/tensornet.py +++ b/modelforge/potential/tensornet.py @@ -193,8 +193,8 @@ class TensorNet(BaseNetwork): Maximum interaction radius. minimum_interaction_radius : unit.Quantity Minimum interaction radius. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. equivariance_invariance_group : str Equivariance invariance group, either "O(3)" or "SO(3)". activation_function_parameter : Dict @@ -215,7 +215,7 @@ def __init__( number_of_radial_basis_functions: int, maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, - highest_atomic_number: int, + maximum_atomic_number: int, equivariance_invariance_group: str, activation_function_parameter: Dict, postprocessing_parameter: Dict[str, Dict[str, bool]], @@ -241,7 +241,7 @@ def __init__( maximum_interaction_radius=_convert_str_to_unit(maximum_interaction_radius), minimum_interaction_radius=_convert_str_to_unit(minimum_interaction_radius), trainable_centers_and_scale_factors=False, - highest_atomic_number=highest_atomic_number, + maximum_atomic_number=maximum_atomic_number, equivariance_invariance_group=equivariance_invariance_group, activation_function=activation_function, predicted_properties=predicted_properties, @@ -282,8 +282,8 @@ class TensorNetCore(CoreNetwork): Minimum interaction radius. trainable_centers_and_scale_factors : bool If True, centers and scale factors are trainable. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. equivariance_invariance_group : str Equivariance invariance group, either "O(3)" or "SO(3)". activation_function : Type[torch.nn.Module] @@ -298,7 +298,7 @@ def __init__( maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, trainable_centers_and_scale_factors: bool, - highest_atomic_number: int, + maximum_atomic_number: int, equivariance_invariance_group: str, activation_function: Type[torch.nn.Module], predicted_properties: List[Dict[str, str]], @@ -314,7 +314,7 @@ def __init__( maximum_interaction_radius=maximum_interaction_radius, minimum_interaction_radius=minimum_interaction_radius, trainable_centers_and_scale_factors=trainable_centers_and_scale_factors, - highest_atomic_number=highest_atomic_number, + maximum_atomic_number=maximum_atomic_number, ) self.interaction_modules = nn.ModuleList( [ @@ -407,7 +407,7 @@ def compute_properties( return results def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> TensorNetNeuralNetworkData: """ Prepare the input data for the TensorNet model. @@ -416,8 +416,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. 
+ pairlist_output : Dict[str, PairListOutputs] + The pairlist output(s) Returns ------- @@ -426,6 +426,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnpdata = TensorNetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, @@ -458,8 +463,8 @@ class TensorNetRepresentation(torch.nn.Module): Minimum interaction radius. trainable_centers_and_scale_factors : bool If True, centers and scale factors are trainable. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. """ def __init__( @@ -470,7 +475,7 @@ def __init__( maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, trainable_centers_and_scale_factors: bool, - highest_atomic_number: int, + maximum_atomic_number: int, ): super().__init__() from modelforge.potential.utils import Dense @@ -502,7 +507,7 @@ def __init__( } ) self.atomic_number_i_embedding_layer = nn.Embedding( - highest_atomic_number, + maximum_atomic_number, number_of_per_atom_features, ) self.atomic_number_ij_embedding_layer = nn.Linear( diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index 7e8e475f..a67ffcaf 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -1,9 +1,10 @@ -import torch +from dataclasses import dataclass +from typing import Dict, Optional + import pytest -from modelforge.dataset import DataModule +import torch -from typing import Optional, Dict -from dataclasses import dataclass +from modelforge.dataset import DataModule # let us setup a few pytest options @@ -37,39 +38,6 @@ def create_datamodule(**kwargs): return create_datamodule -from modelforge.dataset.utils import ( - FirstComeFirstServeSplittingStrategy, - SplittingStrategy, -) - - -def initialize_datamodule( - dataset_name: str, - version_select: str = "nc_1000_v0", - batch_size: int = 64, - splitting_strategy: SplittingStrategy = FirstComeFirstServeSplittingStrategy(), - remove_self_energies: bool = True, - regression_ase: bool = False, - regenerate_dataset_statistic: bool = False, -) -> DataModule: - """ - Initialize a dataset for a given mode. - """ - - data_module = DataModule( - dataset_name, - splitting_strategy=splitting_strategy, - batch_size=batch_size, - version_select=version_select, - remove_self_energies=remove_self_energies, - regression_ase=regression_ase, - regenerate_dataset_statistic=regenerate_dataset_statistic, - ) - data_module.prepare_data() - data_module.setup() - return data_module - - # dataset fixture @pytest.fixture def dataset_factory(): @@ -79,73 +47,23 @@ def create_dataset(**kwargs): return create_dataset -from modelforge.dataset.dataset import DatasetFactory, TorchDataset -from modelforge.dataset import _ImplementedDatasets - - -def single_batch(batch_size: int = 64, dataset_name="QM9"): - """ - Utility function to create a single batch of data for testing. - """ - data_module = initialize_datamodule( - dataset_name=dataset_name, - batch_size=batch_size, - version_select="nc_1000_v0", - ) - return next(iter(data_module.train_dataloader(shuffle=False))) - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_64(): - """ - Utility fixture to create a single batch of data for testing. 
- """ - return single_batch(batch_size=64) - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_1(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=1) - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_2_with_force(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=2, dataset_name="PHALKETHOH") +from modelforge.dataset.dataset import ( + initialize_datamodule, + initialize_dataset, + single_batch, +) @pytest.fixture(scope="session") -def single_batch_with_batchsize_16_with_force(): +def single_batch_with_batchsize(): """ Utility fixture to create a single batch of data for testing. """ - return single_batch(batch_size=16, dataset_name="PHALKETHOH") - - -def initialize_dataset( - dataset_name: str, - local_cache_dir: str, - versions_select: str = "nc_1000_v0", - force_download: bool = False, -) -> DataModule: - """ - Initialize a dataset for a given mode. - """ - factory = DatasetFactory() - data = _ImplementedDatasets.get_dataset_class(dataset_name)( - local_cache_dir=local_cache_dir, - version_select=versions_select, - force_download=force_download, - ) - dataset = factory.create_dataset(data) + def _create_single_batch(batch_size: int, dataset_name: str): + return single_batch(batch_size=batch_size, dataset_name=dataset_name) - return dataset + return _create_single_batch @pytest.fixture(scope="session") @@ -170,7 +88,6 @@ class DataSetContainer: from modelforge.dataset import _ImplementedDatasets - dataset_container: Dict[str, DataSetContainer] = { "QM9": DataSetContainer( name="QM9", @@ -288,7 +205,7 @@ def methane() -> BatchData: ------- BatchData """ - from modelforge.potential.utils import Metadata, NNPInput, BatchData + from modelforge.potential.utils import BatchData, Metadata, NNPInput atomic_numbers = torch.tensor([6, 1, 1, 1, 1], dtype=torch.int64) positions = ( @@ -322,9 +239,10 @@ def methane() -> BatchData: ) -import torch import math +import torch + def generate_uniform_quaternion(u=None): """ diff --git a/modelforge/tests/data/config.toml b/modelforge/tests/data/config.toml index c323c354..53e408cf 100644 --- a/modelforge/tests/data/config.toml +++ b/modelforge/tests/data/config.toml @@ -35,7 +35,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml index 60281c0c..ddac0202 100644 --- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml +++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml @@ -1,5 +1,5 @@ [dataset] dataset_name = "PHALKETHOH" -version_select = "nc_1000_v0" +version_select = "nc_1000_v1" num_workers = 4 pin_memory = true \ No newline at end of file diff --git a/modelforge/tests/data/potential_defaults/tensornet.toml b/modelforge/tests/data/potential_defaults/tensornet.toml index efe4cf71..cd82d9df 100644 --- a/modelforge/tests/data/potential_defaults/tensornet.toml +++ b/modelforge/tests/data/potential_defaults/tensornet.toml @@ -7,7 +7,7 @@ number_of_interaction_layers = 2 number_of_radial_basis_functions = 16 maximum_interaction_radius = "5.1 angstrom" minimum_interaction_radius = "0.0 angstrom" -highest_atomic_number = 128 +maximum_atomic_number = 128 equivariance_invariance_group = 
"O(3)" predicted_properties = [ { name = "per_atom_energy", type = "scalar" }, diff --git a/modelforge/tests/data/runtime_defaults/runtime.toml b/modelforge/tests/data/runtime_defaults/runtime.toml index 238ef10c..0dd5c758 100644 --- a/modelforge/tests/data/runtime_defaults/runtime.toml +++ b/modelforge/tests/data/runtime_defaults/runtime.toml @@ -1,5 +1,4 @@ [runtime] -save_dir = "lightning_logs" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" accelerator = "cpu" diff --git a/modelforge/tests/data/training_defaults/default.toml b/modelforge/tests/data/training_defaults/default.toml index 19c0d3b7..c8d96fc2 100644 --- a/modelforge/tests/data/training_defaults/default.toml +++ b/modelforge/tests/data/training_defaults/default.toml @@ -3,7 +3,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] @@ -35,11 +35,11 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = ['per_molecule_energy', 'per_atom_force'] # use +loss_property = ['per_molecule_energy'] #, 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.999 #NOTE: reciprocal units -per_atom_force = 0.001 +per_molecule_energy = 1.0 #NOTE: reciprocal units +#per_atom_force = 0.001 [training.early_stopping] diff --git a/modelforge/tests/data/training_defaults/default_with_force.toml b/modelforge/tests/data/training_defaults/default_with_force.toml new file mode 100644 index 00000000..4ba07f8b --- /dev/null +++ b/modelforge/tests/data/training_defaults/default_with_force.toml @@ -0,0 +1,54 @@ +[training] +number_of_epochs = 2 +remove_self_energies = true +batch_size = 128 +lr = 1e-3 +monitor_for_checkpoint = "val/per_molecule_energy/rmse" + + +[training.experiment_logger] +logger_name = "tensorboard" # this will set which logger to use + +# configuration for both loggers can be defined simultaneously, the logger_name variable defines which logger to use +[training.experiment_logger.tensorboard_configuration] +save_dir = "logs" + +[training.experiment_logger.wandb_configuration] +save_dir = "logs" +project = "training_potentials" +group = "exp00" +log_model = true +job_type = "testing" +tags = ["v_0.1.0"] +notes = "testing training" + +[training.lr_scheduler] +frequency = 1 +mode = "min" +factor = 0.1 +patience = 10 +cooldown = 5 +min_lr = 1e-8 +threshold = 0.1 +threshold_mode = "abs" +monitor = "val/per_molecule_energy/rmse" +interval = "epoch" + +[training.loss_parameter] +loss_property = ['per_molecule_energy', 'per_atom_force'] # use + +[training.loss_parameter.weight] +per_molecule_energy = 0.999 #NOTE: reciprocal units +per_atom_force = 0.001 + + +[training.early_stopping] +verbose = true +monitor = "val/per_molecule_energy/rmse" +min_delta = 0.001 +patience = 50 + +[training.splitting_strategy] +name = "random_record_splitting_strategy" +data_split = [0.8, 0.1, 0.1] +seed = 42 diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index 7702892b..91f963a3 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -458,10 +458,11 @@ def test_data_item_format_of_datamodule( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_dataset_neighborlist(potential_name, single_batch_with_batchsize_64): +def test_dataset_neighborlist(potential_name, single_batch_with_batchsize): 
"""Test the neighborlist.""" - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = single_batch_with_batchsize(64, "QM9") + nnp_input = batch.nnp_input # test that the neighborlist is correctly generated from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -713,7 +714,8 @@ def test_numpy_dataset_assignment(dataset_name): def test_energy_postprocessing(): - # setup test dataset + # test that the mean and stddev of the dataset + # are correct from modelforge.dataset.dataset import DataModule # test the self energy calculation on the QM9 dataset diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 9a946a38..de561b90 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -183,11 +183,13 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_JAX_wrapping(potential_name, single_batch_with_batchsize_64): +def test_JAX_wrapping(potential_name, single_batch_with_batchsize): from modelforge.potential.models import ( NeuralNetworkPotentialFactory, ) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") @@ -199,7 +201,7 @@ def test_JAX_wrapping(potential_name, single_batch_with_batchsize_64): ) assert "JAX" in str(type(model)) - nnp_input = single_batch_with_batchsize_64.nnp_input.as_jax_namedtuple() + nnp_input = batch.nnp_input.as_jax_namedtuple() out = model(nnp_input)["per_molecule_energy"] import jax @@ -458,13 +460,14 @@ def test_dataset_statistic(potential_name): "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_energy_between_simulation_environments( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): # compare that the energy is the same for the JAX and PyTorch Model import numpy as np import torch - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the forward pass through each of the models # cast input and model to torch.float64 # read default parameters @@ -545,6 +548,7 @@ def test_forward_pass_with_all_datasets( assert torch.all(pair_list[0, 1:] >= pair_list[0, :-1]) +@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) @pytest.mark.parametrize( "energy_expression", [ @@ -557,10 +561,11 @@ def test_forward_pass_with_all_datasets( ) @pytest.mark.parametrize("simulation_environment", ["JAX", "PyTorch"]) def test_forward_pass( + dataset_name. 
energy_expression, potential_name, simulation_environment, - single_batch_with_batchsize_64, + single_batch_with_batchsize, ): # this test sends a single batch from different datasets through the model @@ -581,7 +586,6 @@ def test_forward_pass( output, atomic_subsystem_indices = convert_to_pytorch_if_needed( output, nnp_input, model ) - # test that charge correction is working if energy_expression == "short_range_and_long_range_electrostatic": per_molecule_charge, per_molecule_charge_corrected = retrieve_molecular_charges( @@ -603,17 +607,18 @@ def test_forward_pass( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize_64): +def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize): """ Test the calculation of energies and forces for a molecule. """ import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") # get batch - nnp_input = single_batch_with_batchsize_64.nnp_input + nnp_input = batch.nnp_input # test the pass through each of the models model_training = NeuralNetworkPotentialFactory.generate_potential( @@ -661,7 +666,7 @@ def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsi "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_calculate_energies_and_forces_with_jax( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): """ Test the calculation of energies and forces for a molecule. @@ -670,8 +675,8 @@ def test_calculate_energies_and_forces_with_jax( # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") - - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the backward pass through each of the models nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] nr_of_atoms_per_batch = nnp_input.atomic_subsystem_indices.shape[0] @@ -785,8 +790,8 @@ def test_pairlist(): from openff.units import unit cutoff = 5.0 * unit.nanometer # no relevant cutoff - pairlist = Neighborlist(cutoff, only_unique_pairs=True) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=True) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices # pairlist describes the pairs of interacting atoms within a batch @@ -815,8 +820,8 @@ def test_pairlist(): # test with cutoff cutoff = 2.0 * unit.nanometer - pairlist = Neighborlist(cutoff, only_unique_pairs=True) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=True) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices assert torch.equal(pair_indices, torch.tensor([[0, 1, 3, 4], [1, 2, 4, 5]])) @@ -839,8 +844,8 @@ def test_pairlist(): # test with complete pairlist cutoff = 2.0 * unit.nanometer - pairlist = Neighborlist(cutoff, only_unique_pairs=False) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=False) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices print(pair_indices, flush=True) @@ -851,11 
+856,13 @@ def test_pairlist(): # make sure that Pairlist and Neighborlist behave the same for large cutoffs cutoff = 10.0 * unit.nanometer only_unique_pairs = False - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert torch.equal(pair_indices, neighbor_indices) @@ -863,11 +870,13 @@ def test_pairlist(): # make sure that they are the same also for non-redundant pairs cutoff = 10.0 * unit.nanometer only_unique_pairs = True - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert torch.equal(pair_indices, neighbor_indices) @@ -875,16 +884,59 @@ def test_pairlist(): # this should fail cutoff = 2.0 * unit.nanometer only_unique_pairs = True - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert not pair_indices.shape == neighbor_indices.shape +def test_multiple_neighborlists(): + from modelforge.potential.models import Pairlist, Neighborlist + import torch + from openff.units import unit + + atomic_subsystem_indices = torch.tensor([0, 0, 0, 0, 0]) + + positions = torch.tensor( + [ + [0.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [2.0, 0.0, 0.0], + [3.0, 0.0, 0.0], + [4.0, 0.0, 0.0], + ] + ) + + cutoff_short = 1.5 * unit.nanometer + cutoff_medium = 2.5 * unit.nanometer + cutoff_long = 3.5 * unit.nanometer + pairlist = Neighborlist( + {"short": cutoff_short, "medium": cutoff_medium, "long": cutoff_long}, + only_unique_pairs=True, + ) + r = pairlist(positions, atomic_subsystem_indices) + + assert torch.equal( + r["short"].pair_indices, torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]]) + ) + + assert torch.equal( + r["medium"].pair_indices, + torch.tensor([[0, 0, 1, 1, 2, 2, 3], [1, 2, 2, 3, 3, 4, 4]]), + ) + + assert torch.equal( + r["long"].pair_indices, + torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], [1, 2, 3, 2, 3, 4, 3, 4, 4]]), + ) + + def test_pairlist_precomputation(): from modelforge.potential.models import Pairlist import torch @@ -1054,11 +1106,11 @@ def test_pairlist_on_dataset(): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_casting(potential_name, single_batch_with_batchsize_64): +def test_casting(potential_name, single_batch_with_batchsize): # test dtype casting import torch - batch = single_batch_with_batchsize_64 + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") batch_ = 
batch.to(dtype=torch.float64) assert batch_.nnp_input.positions.dtype == torch.float64 batch_ = batch_.to(dtype=torch.float32) @@ -1101,9 +1153,9 @@ def test_casting(potential_name, single_batch_with_batchsize_64): ) @pytest.mark.parametrize("simulation_environment", ["PyTorch"]) def test_equivariant_energies_and_forces( - potential_name: str, - simulation_environment: str, - single_batch_with_batchsize_64, + potential_name, + simulation_environment, + single_batch_with_batchsize, equivariance_utils, ): """ @@ -1126,7 +1178,8 @@ def test_equivariant_energies_and_forces( translation, rotation, reflection = equivariance_utils # define the tolerance atol = 1e-3 - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # initialize the models model = model.to(dtype=torch.float64) @@ -1134,7 +1187,9 @@ def test_equivariant_energies_and_forces( # ------------------- # # start the test # reference values - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input.to(dtype=torch.float64) reference_result = model(nnp_input)["per_molecule_energy"].to(dtype=torch.float64) reference_forces = -torch.autograd.grad( reference_result.sum(), @@ -1238,7 +1293,7 @@ def test_pairlist_calculate_r_ij_and_d_ij(): # Create Pairlist instance # --------------------------- # # Only unique pairs - pairlist = Neighborlist(cutoff, only_unique_pairs=True) + pairlist = Neighborlist({"cutoff_1": cutoff}, only_unique_pairs=True) pair_indices = pairlist.enumerate_all_pairs(atomic_subsystem_indices) # Calculate r_ij and d_ij @@ -1260,7 +1315,7 @@ def test_pairlist_calculate_r_ij_and_d_ij(): # --------------------------- # # ALL pairs - pairlist = Neighborlist(cutoff, only_unique_pairs=False) + pairlist = Neighborlist({"cutoff_1": cutoff}, only_unique_pairs=False) pair_indices = pairlist.enumerate_all_pairs(atomic_subsystem_indices) # Calculate r_ij and d_ij diff --git a/modelforge/tests/test_nn.py b/modelforge/tests/test_nn.py index e399550a..a5b0b276 100644 --- a/modelforge/tests/test_nn.py +++ b/modelforge/tests/test_nn.py @@ -1,14 +1,16 @@ from .test_models import load_configs_into_pydantic_models -def test_embedding(single_batch_with_batchsize_64): +def test_embedding(single_batch_with_batchsize): # test the input featurization, including: # - nuclear charge embedding # - total charge mixing - import torch + import torch # noqa: F401 - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input model_name = "SchNet" # read default parameters and extract featurization config = load_configs_into_pydantic_models(f"{model_name.lower()}", "qm9") diff --git a/modelforge/tests/test_painn.py b/modelforge/tests/test_painn.py index 647d9901..3dc60d48 100644 --- a/modelforge/tests/test_painn.py +++ b/modelforge/tests/test_painn.py @@ -2,7 +2,7 @@ from modelforge.potential.painn import PaiNN -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """Test initialization of the PaiNN neural network potential.""" # read default parameters from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -17,8 +17,9 @@ def test_forward(single_batch_with_batchsize_64): ], ) assert painn is not None, "PaiNN model should be initialized." 
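The pairlist tests above exercise the updated Neighborlist interface, which now takes a dictionary of named cutoffs and returns one pairlist result per name. The following is a minimal sketch of that call pattern; the toy geometry and the cutoff names and values ("short", "long") are illustrative and not taken from this changeset.

# Sketch of the multi-cutoff Neighborlist usage shown in the tests above.
# Geometry and cutoff names are illustrative only.
import torch
from openff.units import unit

from modelforge.potential.models import Neighborlist

positions = torch.tensor(
    [[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [2.0, 0.0, 0.0], [3.0, 0.0, 0.0]]
)
atomic_subsystem_indices = torch.tensor([0, 0, 0, 0])

neighborlist = Neighborlist(
    {"short": 1.5 * unit.nanometer, "long": 3.5 * unit.nanometer},
    only_unique_pairs=True,
)
results = neighborlist(positions, atomic_subsystem_indices)
print(results["short"].pair_indices)  # only nearest-neighbor pairs
print(results["long"].pair_indices)   # all pairs within the long cutoff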
+ batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32) + nnp_input = batch.nnp_input.to(dtype=torch.float32) energy = painn(nnp_input)["per_molecule_energy"] nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] @@ -27,11 +28,13 @@ def test_forward(single_batch_with_batchsize_64): ) # Assuming energy is calculated per sample in the batch -def test_equivariance(single_batch_with_batchsize_64): +def test_equivariance(single_batch_with_batchsize): from modelforge.potential.painn import PaiNN from dataclasses import replace import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + from modelforge.tests.test_models import load_configs_into_pydantic_models # read default parameters @@ -50,7 +53,7 @@ def test_equivariance(single_batch_with_batchsize_64): ], ).double() - methane_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + methane_input = batch.nnp_input.to(dtype=torch.float64) perturbed_methane_input = replace(methane_input) perturbed_methane_input.positions = torch.matmul( methane_input.positions, rotation_matrix diff --git a/modelforge/tests/test_parameter_models.py b/modelforge/tests/test_parameter_models.py index ce97537d..2c26e2cd 100644 --- a/modelforge/tests/test_parameter_models.py +++ b/modelforge/tests/test_parameter_models.py @@ -142,6 +142,9 @@ def test_training_parameter_model(): with pytest.raises(ValidationError): training_parameters.splitting_strategy.dataset_split = [0.7, 0.1, 0.1, 0.1] - # this will throw an error because the datafile has 2 entries for the loss_property dictionary + # this will throw an error because the datafile has 1 entries for the loss_property dictionary with pytest.raises(ValidationError): - training_parameters.loss_parameter.loss_property = ["per_molecule_energy"] + training_parameters.loss_parameter.loss_property = [ + "per_molecule_energy", + "per_atom_force", + ] diff --git a/modelforge/tests/test_physnet.py b/modelforge/tests/test_physnet.py index 8aadca05..1601654d 100644 --- a/modelforge/tests/test_physnet.py +++ b/modelforge/tests/test_physnet.py @@ -15,7 +15,7 @@ def test_init(): ) -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): import torch from modelforge.potential.physnet import PhysNet @@ -37,7 +37,9 @@ def test_forward(single_batch_with_batchsize_64): ) model = model.to(torch.float32) print(model) - yhat = model(single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32)) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + yhat = model(batch.nnp_input.to(dtype=torch.float32)) def test_compare_representation(): diff --git a/modelforge/tests/test_sake.py b/modelforge/tests/test_sake.py index 8fff65ba..994b7112 100644 --- a/modelforge/tests/test_sake.py +++ b/modelforge/tests/test_sake.py @@ -33,12 +33,13 @@ def test_init(): from openff.units import unit -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """ Test the forward pass of the SAKE model. 
""" # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + methane = batch.nnp_input from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -91,7 +92,7 @@ def test_interaction_forward(): @pytest.mark.parametrize("eq_atol", [3e-1]) @pytest.mark.parametrize("h_atol", [8e-2]) -def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): +def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize): import torch from modelforge.potential.sake import SAKE from dataclasses import replace @@ -118,7 +119,9 @@ def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): ) # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + methane = batch.nnp_input perturbed_methane_input = replace(methane) perturbed_methane_input.positions = torch.matmul(methane.positions, rotation_matrix) @@ -424,7 +427,7 @@ def test_sake_layer_against_reference(include_self_pairs, v_is_none): # FIXME: this test is currently failing @pytest.mark.xfail -def test_model_against_reference(single_batch_with_batchsize_1): +def test_model_against_reference(single_batch_with_batchsize): nr_heads = 5 key = jax.random.PRNGKey(1884) torch.manual_seed(1884) @@ -462,7 +465,8 @@ def test_model_against_reference(single_batch_with_batchsize_1): ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1) + methane = batch.nnp_input pairlist_output = mf_sake.compute_interacting_pairs.prepare_inputs(methane) prepared_methane = mf_sake.core_module._model_specific_input_preparation( methane, pairlist_output @@ -605,7 +609,7 @@ def test_model_against_reference(single_batch_with_batchsize_1): # assert torch.allclose(mf_out.E, torch.from_numpy(onp.array(ref_out[0]))) -def test_model_invariance(single_batch_with_batchsize_1): +def test_model_invariance(single_batch_with_batchsize): from dataclasses import replace from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -620,7 +624,8 @@ def test_model_invariance(single_batch_with_batchsize_1): ], ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1, dataset_name="QM9") + methane = batch.nnp_input rotation_matrix = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) perturbed_methane_input = replace(methane) diff --git a/modelforge/tests/test_tensornet.py b/modelforge/tests/test_tensornet.py index c7a9fd4e..305c8332 100644 --- a/modelforge/tests/test_tensornet.py +++ b/modelforge/tests/test_tensornet.py @@ -106,7 +106,9 @@ def test_input(): ], ) tensornet.compute_interacting_pairs._input_checks(mf_input) - pairlist_output = tensornet.compute_interacting_pairs.prepare_inputs(mf_input) + pairlist_output = tensornet.compute_interacting_pairs.prepare_inputs(mf_input)[ + "maximum_interaction_radius" + ] # torchmd-net TensorNet if reference_data: @@ -222,7 +224,7 @@ def test_representation(): cutoff_lower = 0.0 cutoff_upper = 5.1 trainable_rbf = False - highest_atomic_number = 128 + maximum_atomic_number = 128 # Set up a dataset # prepare reference value @@ -264,7 +266,7 @@ def test_representation(): cutoff_upper * unit.angstrom, cutoff_lower * unit.angstrom, trainable_rbf, - highest_atomic_number, + maximum_atomic_number, ) nnp_input = 
tensornet.core_module._model_specific_input_preparation( mf_input, pairlist_output @@ -284,7 +286,7 @@ def test_representation(): cutoff_lower, cutoff_upper, trainable_rbf, - highest_atomic_number, + maximum_atomic_number, seed, ) ################ torchmd-net TensorNet ################ diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 2bc0f5bd..72ee5e15 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -1,30 +1,34 @@ import os -import pytest - import platform +import pytest +import torch + ON_MACOS = platform.system() == "Darwin" IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" -from modelforge.potential import _Implemented_NNPs -from modelforge.potential import NeuralNetworkPotentialFactory +from modelforge.potential import NeuralNetworkPotentialFactory, _Implemented_NNPs + + +def load_configs_into_pydantic_models( + potential_name: str, dataset_name: str, training_toml: str +): + from importlib import resources + import toml -def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): from modelforge.tests.data import ( - potential_defaults, - training_defaults, dataset_defaults, + potential_defaults, runtime_defaults, + training_defaults, ) - from importlib import resources - import toml potential_path = ( resources.files(potential_defaults) / f"{potential_name.lower()}.toml" ) dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" - training_path = resources.files(training_defaults) / "default.toml" + training_path = resources.files(training_defaults) / f"{training_toml}.toml" runtime_path = resources.files(runtime_defaults) / "runtime.toml" training_config_dict = toml.load(training_path) @@ -42,7 +46,7 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): potential_parameters = PotentialParameters(**potential_config_dict["potential"]) from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters + from modelforge.train.parameters import RuntimeParameters, TrainingParameters dataset_parameters = DatasetParameters(**dataset_config_dict["dataset"]) training_parameters = TrainingParameters(**training_config_dict["training"]) @@ -56,18 +60,10 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): } -@pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") -@pytest.mark.parametrize( - "potential_name", _Implemented_NNPs.get_all_neural_network_names() -) -@pytest.mark.parametrize("dataset_name", ["QM9"]) -def test_train_with_lightning(potential_name, dataset_name): - """ - Test the forward pass for a given model and dataset. 
- """ - - # read default parameters - config = load_configs_into_pydantic_models(potential_name, dataset_name) +def get_trainer(potential_name: str, dataset_name: str, training_toml: str): + config = load_configs_into_pydantic_models( + potential_name, dataset_name, training_toml + ) # Extract parameters potential_parameter = config["potential"] @@ -75,53 +71,67 @@ def test_train_with_lightning(potential_name, dataset_name): dataset_parameter = config["dataset"] runtime_parameter = config["runtime"] - from modelforge.potential.models import ( - NeuralNetworkPotentialFactory, - ) - - trainer = ( - NeuralNetworkPotentialFactory.generate_potential( - use="training", - potential_parameter=potential_parameter, - training_parameter=training_parameter, - dataset_parameter=dataset_parameter, - runtime_parameter=runtime_parameter, - ) - .train_potential() - .save_checkpoint("test.chp") # save checkpoint - ) - # continue training - NeuralNetworkPotentialFactory.generate_potential( + return NeuralNetworkPotentialFactory.generate_potential( use="training", potential_parameter=potential_parameter, training_parameter=training_parameter, dataset_parameter=dataset_parameter, runtime_parameter=runtime_parameter, - ).train_potential() + ) + + +@pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") +@pytest.mark.parametrize( + "potential_name", _Implemented_NNPs.get_all_neural_network_names() +) +@pytest.mark.parametrize("dataset_name", ["PHALKETHOH"]) +@pytest.mark.parametrize("training", ["with_force", "without_force"]) +def test_train_with_lightning(training, potential_name, dataset_name): + """ + Test that we can train, save and load checkpoints. + """ + # get correct training toml + training_toml = "default_with_force" if training == "with_force" else "default" + # SKIP if potential is ANI and dataset is SPICE2 + if "ANI" in potential_name and dataset_name == "SPICE2": + pytest.skip("ANI potential is not compatible with SPICE2 dataset") + if IN_GITHUB_ACTIONS and potential_name == "SAKE" and training == "with_force": + pytest.skip( + "Skipping Sake training with forces on GitHub Actions because it allocates too much memory" + ) + # train potential + get_trainer( + potential_name, dataset_name, training_toml + ).train_potential().save_checkpoint( + "test.chp" + ) # save checkpoint + + # continue training from checkpoint + get_trainer(potential_name, dataset_name, training_toml).train_potential() def test_train_from_single_toml_file(): - from modelforge.train.training import read_config_and_train - from modelforge.tests import data from importlib import resources + from modelforge.tests import data + from modelforge.train.training import read_config_and_train + config_path = resources.files(data) / f"config.toml" read_config_and_train(config_path) -import torch - - -def test_error_calculation(single_batch_with_batchsize_16_with_force): +def test_error_calculation(single_batch_with_batchsize): # test the different Loss classes from modelforge.train.training import ( - FromPerAtomToPerMoleculeMeanSquaredError, - PerMoleculeMeanSquaredError, + FromPerAtomToPerMoleculeSquaredError, + PerMoleculeSquaredError, ) # generate data - data = single_batch_with_batchsize_16_with_force + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") + + data = batch true_E = data.metadata.E true_F = data.metadata.F @@ -130,7 +140,7 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): predicted_F = true_F + torch.rand_like(true_F) * 10 # test error for 
property with shape (nr_of_molecules, 1) - error = PerMoleculeMeanSquaredError() + error = PerMoleculeSquaredError() E_error = error(predicted_E, true_E, data) # compare output (mean squared error scaled by number of atoms in the molecule) @@ -140,14 +150,13 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): 1 ) # FIXME : fi reference_E_error = torch.mean(scale_squared_error) - assert torch.allclose(E_error, reference_E_error) + assert torch.allclose(torch.mean(E_error), reference_E_error) # test error for property with shape (nr_of_atoms, 3) - error = FromPerAtomToPerMoleculeMeanSquaredError() + error = FromPerAtomToPerMoleculeSquaredError() F_error = error(predicted_F, true_F, data) # compare error (mean squared error scaled by number of atoms in the molecule) - scaled_error = ( torch.linalg.vector_norm(predicted_F - true_F, dim=1, keepdim=True) ** 2 ) @@ -162,9 +171,91 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): ) reference_F_error = torch.mean( - per_mol_error / data.metadata.atomic_subsystem_counts.unsqueeze(1) + per_mol_error / (3 * data.metadata.atomic_subsystem_counts.unsqueeze(1)) + ) + assert torch.allclose(torch.mean(F_error), reference_F_error) + + +def test_loss(single_batch_with_batchsize): + from modelforge.train.training import Loss + + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") + + loss_porperty = ["per_molecule_energy", "per_atom_force"] + loss_weights = {"per_molecule_energy": 0.5, "per_atom_force": 0.5} + loss = Loss(loss_porperty, loss_weights) + assert loss is not None + + # get trainer + trainer = get_trainer("schnet", "QM9", "default_with_force") + prediction = trainer.model.calculate_predictions( + batch, trainer.model.potential, train_mode=True + ) # train_mode=True is required for gradients in force prediction + + assert prediction["per_molecule_energy_predict"].size( + dim=0 + ) == batch.metadata.E.size(dim=0) + assert prediction["per_atom_force_predict"].size(dim=0) == batch.metadata.F.size( + dim=0 + ) + + # pass prediction through loss module + loss_output = loss(prediction, batch) + # let's recalculate the loss (NOTE: we scale the loss by the number of atoms) + # --------------------------------------------- # + # make sure that both have gradients + assert prediction["per_molecule_energy_predict"].requires_grad + assert prediction["per_atom_force_predict"].requires_grad + + # --------------------------------------------- # + # first, calculate E_loss + E_loss = torch.mean( + ( + ( + prediction["per_molecule_energy_predict"] + - prediction["per_molecule_energy_true"] + ).pow(2) + / batch.metadata.atomic_subsystem_counts.unsqueeze(1) + ) + ) + # compare to referenc evalue obtained from Loos class + ref = torch.mean(loss_output["per_molecule_energy"]) + assert torch.allclose(ref, E_loss) + + # --------------------------------------------- # + # now calculate F_loss + per_atom_force_squared_error = ( + (prediction["per_atom_force_predict"] - prediction["per_atom_force_true"]) + .pow(2) + .sum(dim=1, keepdim=True) + ).squeeze(-1) + + # # Aggregate error per molecule + per_molecule_squared_error = torch.zeros_like( + batch.metadata.E.squeeze(-1), dtype=per_atom_force_squared_error.dtype + ) + per_molecule_squared_error.scatter_add_( + 0, + batch.nnp_input.atomic_subsystem_indices.long(), + per_atom_force_squared_error, + ) + # divide by number of atoms + per_molecule_squared_error = per_molecule_squared_error / ( + 3 * batch.metadata.atomic_subsystem_counts + ) + + 
per_atom_force_mse = torch.mean(per_molecule_squared_error) + assert torch.allclose(torch.mean(loss_output["per_atom_force"]), per_atom_force_mse) + + # --------------------------------------------- # + # let's double check that the loss is calculated correctly + # calculate the total loss + + assert torch.allclose( + loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy"] + + loss_weights["per_atom_force"] * loss_output["per_atom_force"], + loss_output["total_loss"].to(torch.float32), ) - assert torch.allclose(F_error, reference_F_error) @pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Skipping this test on GitHub Actions") diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 95bf3569..f6daf0d2 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -262,7 +262,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": remove_self_energies: bool batch_size: int lr: float - monitor: str + monitor_for_checkpoint: str lr_scheduler: Optional[SchedulerConfig] = None loss_parameter: LossParameter early_stopping: Optional[EarlyStopping] = None @@ -270,7 +270,8 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": stochastic_weight_averaging: Optional[StochasticWeightAveraging] = None experiment_logger: ExperimentLogger verbose: bool = False - optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam + optimizer: Type[torch.optim.Optimizer] = torch.optim.AdamW + min_number_of_epochs: Union[int, None] = None ### Runtime Parameters @@ -309,7 +310,6 @@ class RuntimeParameters(ParametersBase): A class to hold the runtime parameters that inherits from the pydantic BaseModel args: - save_dir (str): The save directory experiment_name (str): The experiment name accelerator (Accelerator): The accelerator, options are: "cpu", "gpu", "tpu" number_of_nodes (int): The number of nodes @@ -322,7 +322,6 @@ class RuntimeParameters(ParametersBase): """ - save_dir: str experiment_name: str accelerator: Accelerator number_of_nodes: int diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 401e1c91..23ab1a68 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -2,33 +2,39 @@ This module contains classes and functions for training neural network potentials using PyTorch Lightning. 
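For reference, the bookkeeping verified in test_loss above can be reproduced with plain tensor operations. The sketch below uses made-up shapes, values, and weights; it only illustrates how per-molecule energy errors are scaled by atom counts, how per-atom force errors are aggregated per molecule and divided by 3 * n_atoms, and how the weighted components form the total loss.

# Hand-rolled version of the loss recomputed in test_loss above (illustrative tensors only).
import torch

n_atoms_per_molecule = torch.tensor([3, 5])                # two molecules
atom_to_molecule = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])  # per-atom molecule index
E_true, E_pred = torch.zeros(2, 1), torch.rand(2, 1)
F_true, F_pred = torch.zeros(8, 3), torch.rand(8, 3)

# per-molecule energy error, scaled by the number of atoms in each molecule
energy_loss = torch.mean((E_pred - E_true).pow(2) / n_atoms_per_molecule.unsqueeze(1))

# per-atom force error, aggregated per molecule and divided by 3 * n_atoms
per_atom_sq = (F_pred - F_true).pow(2).sum(dim=1)
per_molecule_sq = torch.zeros(2).scatter_add_(0, atom_to_molecule, per_atom_sq)
force_loss = torch.mean(per_molecule_sq / (3 * n_atoms_per_molecule))

# weighted total, mirroring the weights passed to Loss
weights = {"per_molecule_energy": 0.5, "per_atom_force": 0.5}
total_loss = (
    weights["per_molecule_energy"] * energy_loss
    + weights["per_atom_force"] * force_loss
)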
""" -from torch.optim.lr_scheduler import ReduceLROnPlateau -from typing import Any, Union, Dict, Type, Optional, List +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Type, Union + +import lightning.pytorch as pL import torch -from loguru import logger as log -from modelforge.dataset.dataset import BatchData, NNPInput import torchmetrics +from lightning import Trainer +from loguru import logger as log from torch import nn -from abc import ABC, abstractmethod -from modelforge.dataset.dataset import DatasetParameters +from torch.optim.lr_scheduler import ReduceLROnPlateau + +from modelforge.dataset.dataset import ( + BatchData, + DataModule, + DatasetParameters, + NNPInput, +) from modelforge.potential.parameters import ( ANI2xParameters, - PhysNetParameters, - SchNetParameters, PaiNNParameters, + PhysNetParameters, SAKEParameters, + SchNetParameters, TensorNetParameters, ) -from lightning import Trainer -import lightning.pytorch as pL -from modelforge.dataset.dataset import DataModule +from modelforge.train.parameters import RuntimeParameters, TrainingParameters __all__ = [ "Error", - "FromPerAtomToPerMoleculeMeanSquaredError", + "FromPerAtomToPerMoleculeSquaredError", "Loss", "LossFactory", - "PerMoleculeMeanSquaredError", + "PerMoleculeSquaredError", "ModelTrainer", "create_error_metrics", "ModelTrainer", @@ -38,14 +44,16 @@ class Error(nn.Module, ABC): """ Class representing the error calculation for predicted and true values. + """ - Methods: - calculate_error(predicted: torch.Tensor, true: torch.Tensor) -> torch.Tensor: - Calculates the error between the predicted and true values. + def __init__(self, scale_by_number_of_atoms: bool = True): - scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: - Scales the error by the number of atoms in the atomic subsystems. - """ + super().__init__() + if not scale_by_number_of_atoms: + # If scaling is not desired, override the method to just return the input error unchanged + self.scale_by_number_of_atoms = ( + lambda error, atomic_subsystem_counts, prefactor=1: error + ) @abstractmethod def calculate_error( @@ -75,10 +83,14 @@ def calculate_squared_error( torch.Tensor The calculated error. """ - return (predicted_tensor - reference_tensor).pow(2).sum(dim=1, keepdim=True) + squared_diff = (predicted_tensor - reference_tensor).pow(2) + error = squared_diff.sum(dim=1, keepdim=True) + return error @staticmethod - def scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: + def scale_by_number_of_atoms( + error, atomic_subsystem_counts, prefactor: int = 1 + ) -> torch.Tensor: """ Scales the error by the number of atoms in the atomic subsystems. @@ -88,29 +100,27 @@ def scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: The error to be scaled. atomic_subsystem_counts : torch.Tensor The number of atoms in the atomic subsystems. - + prefactor : int + To consider the shape of the property, e.g., if the reference property has shape (N,3) it is necessary to further devide the result by 3 Returns ------- torch.Tensor The scaled error. 
""" # divide by number of atoms - scaled_by_number_of_atoms = error / atomic_subsystem_counts.unsqueeze( - 1 + scaled_by_number_of_atoms = error / ( + prefactor * atomic_subsystem_counts.unsqueeze(1) ) # FIXME: ensure that all per-atom properties have dimension (N, 1) return scaled_by_number_of_atoms -class FromPerAtomToPerMoleculeMeanSquaredError(Error): +class FromPerAtomToPerMoleculeSquaredError(Error): """ Calculates the per-atom error and aggregates it to per-molecule mean squared error. """ - def __init__(self): - """ - Initializes the PerAtomToPerMoleculeError class. - """ - super().__init__() + def __init__(self, scale_by_number_of_atoms: bool = True): + super().__init__(scale_by_number_of_atoms) def calculate_error( self, @@ -160,27 +170,24 @@ def forward( 0, batch.nnp_input.atomic_subsystem_indices.long().unsqueeze(1), per_atom_squared_error, - ) + ).contiguous() # divide by number of atoms per_molecule_square_error_scaled = self.scale_by_number_of_atoms( - per_molecule_squared_error, batch.metadata.atomic_subsystem_counts - ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + per_molecule_squared_error, + batch.metadata.atomic_subsystem_counts, + prefactor=per_atom_prediction.shape[-1], + ).contiguous() + + return per_molecule_square_error_scaled -class PerMoleculeMeanSquaredError(Error): +class PerMoleculeSquaredError(Error): """ Calculates the per-molecule mean squared error. - """ - def __init__(self): - """ - Initializes the PerMoleculeMeanSquaredError class. - """ - - super().__init__() + def __init__(self, scale_by_number_of_atoms: bool = True): + super().__init__(scale_by_number_of_atoms) def forward( self, @@ -210,11 +217,11 @@ def forward( per_molecule_prediction, per_molecule_reference ) per_molecule_square_error_scaled = self.scale_by_number_of_atoms( - per_molecule_squared_error, batch.metadata.atomic_subsystem_counts + per_molecule_squared_error, + batch.metadata.atomic_subsystem_counts, ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + return per_molecule_square_error_scaled def calculate_error( self, @@ -228,24 +235,12 @@ def calculate_error( class Loss(nn.Module): - """ - Calculates the combined loss for energy and force predictions. - Attributes - ---------- - loss_property : List[str] - List of properties to include in the loss calculation. - weight : Dict[str, float] - Dictionary containing the weights for each property in the loss calculation. - loss : nn.ModuleDict - Module dictionary containing the loss functions for each property. - """ - - _SUPPORTED_PROPERTIES = ["per_molecule_energy", "per_atom_force"] + _SUPPORTED_PROPERTIES = ["per_atom_energy", "per_molecule_energy", "per_atom_force"] def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): """ - Initializes the Loss class. + Calculates the combined loss for energy and force predictions. 
Parameters ---------- @@ -270,14 +265,24 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): for prop, w in weight.items(): if prop in self._SUPPORTED_PROPERTIES: if prop == "per_atom_force": - self.loss[prop] = FromPerAtomToPerMoleculeMeanSquaredError() + self.loss[prop] = FromPerAtomToPerMoleculeSquaredError( + scale_by_number_of_atoms=True + ) + elif prop == "per_atom_energy": + self.loss[prop] = PerMoleculeSquaredError( + scale_by_number_of_atoms=True + ) # FIXME: this is currently not working elif prop == "per_molecule_energy": - self.loss[prop] = PerMoleculeMeanSquaredError() + self.loss[prop] = PerMoleculeSquaredError( + scale_by_number_of_atoms=True + ) self.register_buffer(prop, torch.tensor(w)) else: raise NotImplementedError(f"Loss type {prop} not implemented.") - def forward(self, predict_target: Dict[str, torch.Tensor], batch): + def forward( + self, predict_target: Dict[str, torch.Tensor], batch + ) -> Dict[str, torch.Tensor]: """ Calculates the combined loss for the specified properties. @@ -302,13 +307,13 @@ def forward(self, predict_target: Dict[str, torch.Tensor], batch): # iterate over loss properties for prop in self.loss_property: # calculate loss per property - loss_ = self.weight[prop] * self.loss[prop]( + loss_ = self.loss[prop]( predict_target[f"{prop}_predict"], predict_target[f"{prop}_true"], batch ) # add total loss - loss = loss + loss_ + loss = loss + (self.weight[prop] * loss_) # save loss - loss_dict[f"{prop}/mse"] = loss_ + loss_dict[f"{prop}"] = loss_ # add total loss to results dict and return loss_dict["total_loss"] = loss @@ -341,11 +346,11 @@ def create_loss(loss_property: List[str], weight: Dict[str, float]) -> Type[Loss return Loss(loss_property, weight) -from torch.optim import Optimizer from torch.nn import ModuleDict +from torch.optim import Optimizer -def create_error_metrics(loss_properties: List[str]) -> ModuleDict: +def create_error_metrics(loss_properties: List[str], loss: bool = False) -> ModuleDict: """ Creates a ModuleDict of MetricCollections for the given loss properties. @@ -353,50 +358,52 @@ def create_error_metrics(loss_properties: List[str]) -> ModuleDict: ---------- loss_properties : List[str] List of loss properties for which to create the metrics. - + loss : bool, optional + If True, only the loss metric is created, by default False. Returns ------- ModuleDict A dictionary where keys are loss properties and values are MetricCollections. 
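The loss=True branch of create_error_metrics wraps each loss component in a torchmetrics MeanMetric, so the value logged at epoch end is the batch-size weighted mean of the per-batch losses. A small sketch of that accumulation, with invented numbers:

# How a MeanMetric created with loss=True accumulates per-batch losses (numbers are invented).
import torch
from torchmetrics.aggregation import MeanMetric

epoch_loss = MeanMetric()
epoch_loss.update(torch.tensor(0.50), weight=128)  # mean loss of batch 1, weighted by batch size
epoch_loss.update(torch.tensor(0.30), weight=64)   # mean loss of batch 2
print(epoch_loss.compute())  # weighted mean reported at epoch end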
""" - from torchmetrics.regression import ( - MeanAbsoluteError, - MeanSquaredError, - ) from torchmetrics import MetricCollection + from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError + from torchmetrics.aggregation import MeanMetric - return ModuleDict( - { - prop: MetricCollection( - [MeanAbsoluteError(), MeanSquaredError(squared=False)] - ) - for prop in loss_properties - } - ) - - -from modelforge.train.parameters import RuntimeParameters, TrainingParameters + if loss: + metric_dict = ModuleDict( + {prop: MetricCollection([MeanMetric()]) for prop in loss_properties} + ) + metric_dict["total_loss"] = MetricCollection([MeanMetric()]) + return metric_dict + else: + metric_dict = ModuleDict( + { + prop: MetricCollection( + [MeanAbsoluteError(), MeanSquaredError(squared=False)] + ) + for prop in loss_properties + } + ) + return metric_dict class CalculateProperties(torch.nn.Module): - def __init__(self): + def __init__(self, requested_properties: List[str]): """ A utility class for calculating properties such as energies and forces from batches using a neural network model. - Methods - ------- - _get_forces(batch: BatchData, energies: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor] - Computes the forces from a given batch using the model. - _get_energies(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies from a given batch using the model. - forward(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies and forces from a given batch using the model. + Parameters + """ super().__init__() + self.requested_properties = requested_properties + self.include_force = False + if "per_atom_force" in self.requested_properties: + self.include_force = True def _get_forces( - self, batch: "BatchData", energies: Dict[str, torch.Tensor] + self, batch: "BatchData", energies: Dict[str, torch.Tensor], train_mode: bool ) -> Dict[str, torch.Tensor]: """ Computes the forces from a given batch using the model. @@ -429,16 +436,22 @@ def _get_forces( # Compute the gradient (forces) from the predicted energies grad = torch.autograd.grad( - per_molecule_energy_predict.sum(), + per_molecule_energy_predict, nnp_input.positions, - create_graph=False, - retain_graph=True, + grad_outputs=torch.ones_like(per_molecule_energy_predict), + create_graph=train_mode, + retain_graph=train_mode, + allow_unused=True, )[0] + + if grad is None: + raise RuntimeWarning("Force calculation did not return a gradient") + per_atom_force_predict = -1 * grad # Forces are the negative gradient of energy return { "per_atom_force_true": per_atom_force_true, - "per_atom_force_predict": per_atom_force_predict, + "per_atom_force_predict": per_atom_force_predict.contiguous(), } def _get_energies( @@ -476,7 +489,7 @@ def _get_energies( } def forward( - self, batch: "BatchData", model: Type[torch.nn.Module] + self, batch: "BatchData", model: Type[torch.nn.Module], train_mode: bool = False ) -> Dict[str, torch.Tensor]: """ Computes the energies and forces from a given batch using the model. @@ -494,7 +507,10 @@ def forward( The true and predicted energies and forces from the dataset and the model. 
""" energies = self._get_energies(batch, model) - forces = self._get_forces(batch, energies) + if self.include_force: + forces = self._get_forces(batch, energies, train_mode) + else: + forces = {} return {**energies, **forces} @@ -546,7 +562,42 @@ def __init__( potential_seed=potential_seed, ) - self.calculate_predictions = CalculateProperties() + def check_strides(module, grad_input, grad_output): + print(f"Layer: {module.__class__.__name__}") + for i, grad in enumerate(grad_input): + if grad is not None: + print( + f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" + ) + # Handle grad_output + if isinstance(grad_output, tuple) and isinstance(grad_output[0], dict): + # If the output is a dict wrapped in a tuple, extract the dict + grad_output = grad_output[0] + if isinstance(grad_output, dict): + for key, grad in grad_output.items(): + if grad is not None: + print( + f"Grad output [{key}]: size {grad.size()}, strides {grad.stride()}" + ) + else: + for i, grad in enumerate(grad_output): + if grad is not None: + print( + f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" + ) + + # Register the full backward hook + if training_parameter.verbose is True: + for module in self.potential.modules(): + module.register_full_backward_hook(check_strides) + + self.include_force = False + if "per_atom_force" in training_parameter.loss_parameter.loss_property: + self.include_force = True + + self.calculate_predictions = CalculateProperties( + training_parameter.loss_parameter.loss_property + ) self.optimizer = training_parameter.optimizer self.learning_rate = training_parameter.lr self.lr_scheduler = training_parameter.lr_scheduler @@ -575,6 +626,11 @@ def __init__( training_parameter.loss_parameter.loss_property ) + # Initialize loss metric + self.loss_metric = create_error_metrics( + training_parameter.loss_parameter.loss_property, loss=True + ) + def forward(self, batch: "BatchData") -> Dict[str, torch.Tensor]: """ Computes the energies and forces from a given batch using the model. @@ -641,29 +697,23 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: The loss tensor computed for the current training step. """ - # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + # calculate energy and forces, Note that `predict_target` is a + # dictionary containing the predicted and true values for energy and + # force` + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) - # calculate the loss + # Calculate the loss loss_dict = self.loss(predict_target, batch) - # Update and log training error - self._update_metrics(self.train_error, predict_target) - - # log the loss (this includes the individual contributions that the loss contains) - for key, loss in loss_dict.items(): - self.log( - f"loss/{key}", - torch.mean(loss), - on_step=False, - prog_bar=True, - on_epoch=True, - batch_size=1, - ) # batch size is 1 because the mean of the batch is logged + # Update the loss metric with the different loss components + for key, metric in loss_dict.items(): + self.loss_metric[key].update(metric.clone().detach(), batch.batch_size()) - return loss_dict["total_loss"] + loss = torch.mean(loss_dict["total_loss"]) + return loss.contiguous() - @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: """ Validation step to compute the RMSE/MAE across epochs. 
@@ -682,14 +732,15 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) - # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss - loss = self.loss(predict_target, batch) - # log the loss + with torch.inference_mode(False): + + # calculate energy and forces + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) + self._update_metrics(self.val_error, predict_target) - @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: """ Test step to compute the RMSE loss for a given batch. @@ -712,7 +763,10 @@ def test_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + with torch.inference_mode(False): + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) # Update and log metrics self._update_metrics(self.test_error, predict_target) @@ -766,6 +820,7 @@ def _log_on_epoch(self, log_mode: str = "train"): conv = { "MeanAbsoluteError": "mae", "MeanSquaredError": "rmse", + "MeanMetric": "mse", # NOTE: MeanMetric is the MSE since we accumulate the squared error } # NOTE: MeanSquaredError(squared=False) is RMSE # Log all accumulated metrics for train and val phases @@ -773,6 +828,7 @@ def _log_on_epoch(self, log_mode: str = "train"): errors = [ ("train", self.train_error), ("val", self.val_error), + ("loss", self.loss_metric), ] elif log_mode == "test": errors = [ @@ -786,14 +842,11 @@ def _log_on_epoch(self, log_mode: str = "train"): if phase == "train" and not self.log_on_training_step: continue - metrics = {} for property, metrics_dict in error_dict.items(): for name, metric in metrics_dict.items(): - name = f"{phase}/{property}/{conv[name]}" - metrics[name] = metric.compute() + name = f"{phase}/{property}/{conv.get(name, name)}" + self.log(name, metric.compute(), prog_bar=True, sync_dist=True) metric.reset() - # log dict, print val metrics to console - self.log_dict(metrics, on_epoch=True, prog_bar=(phase == "val")) def configure_optimizers(self): """ @@ -950,8 +1003,8 @@ def setup_datamodule(self) -> DataModule: DataModule Configured DataModule instance. """ - from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES from modelforge.dataset.dataset import DataModule + from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES dm = DataModule( name=self.dataset_parameter.dataset_name, @@ -965,6 +1018,7 @@ def setup_datamodule(self) -> DataModule: seed=self.training_parameter.splitting_strategy.seed, split=self.training_parameter.splitting_strategy.data_split, ), + regenerate_processed_cache=self.dataset_parameter.regenerate_processed_cache, ) dm.prepare_data() dm.setup() @@ -1050,8 +1104,8 @@ def setup_callbacks(self) -> List[Any]: List of configured callbacks. 
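setup_callbacks below wires the checkpoint callback to the renamed monitor_for_checkpoint training parameter. A sketch of the resulting Lightning callback; the monitored metric matches the defaults in this changeset, and the filename pattern is illustrative.

# Sketch of the checkpoint callback configured from training_parameter.monitor_for_checkpoint.
from lightning.pytorch.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    save_top_k=2,
    monitor="val/per_molecule_energy/rmse",  # value taken from monitor_for_checkpoint
    filename="best_model-{epoch:02d}",       # illustrative filename pattern
)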
""" from lightning.pytorch.callbacks import ( - ModelCheckpoint, EarlyStopping, + ModelCheckpoint, StochasticWeightAveraging, ) @@ -1074,7 +1128,7 @@ def setup_callbacks(self) -> List[Any]: ) checkpoint_callback = ModelCheckpoint( save_top_k=2, - monitor=self.training_parameter.monitor, + monitor=self.training_parameter.monitor_for_checkpoint, filename=checkpoint_filename, ) callbacks.append(checkpoint_callback) @@ -1091,8 +1145,13 @@ def setup_trainer(self) -> Trainer: """ from lightning import Trainer + # if devices is a list + if isinstance(self.runtime_parameter.devices, list): + strategy = "ddp" + trainer = Trainer( max_epochs=self.training_parameter.number_of_epochs, + min_epochs=self.training_parameter.min_number_of_epochs, num_nodes=self.runtime_parameter.number_of_nodes, devices=self.runtime_parameter.devices, accelerator=self.runtime_parameter.accelerator, @@ -1285,9 +1344,9 @@ def read_config( runtime_config_dict[key] = value # Load and instantiate the data classes with the merged configuration - from modelforge.potential import _Implemented_NNP_Parameters from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters + from modelforge.potential import _Implemented_NNP_Parameters + from modelforge.train.parameters import RuntimeParameters, TrainingParameters potential_name = potential_config_dict["potential_name"] PotentialParameters = ( @@ -1385,9 +1444,7 @@ def read_config_and_train( log_every_n_steps=log_every_n_steps, simulation_environment=simulation_environment, ) - from modelforge.potential.models import ( - NeuralNetworkPotentialFactory, - ) + from modelforge.potential.models import NeuralNetworkPotentialFactory model = NeuralNetworkPotentialFactory.generate_potential( use="training", diff --git a/modelforge/utils/__init__.py b/modelforge/utils/__init__.py index 4e57d2f1..605aea59 100644 --- a/modelforge/utils/__init__.py +++ b/modelforge/utils/__init__.py @@ -1,3 +1,4 @@ """Module of general modelforge utilities.""" from .prop import SpeciesEnergies, PropertyNames +from .misc import lock_with_attribute diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index f56b2d8d..3a65e17d 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -2,15 +2,18 @@ Module of miscellaneous utilities. """ -from typing import Literal +from typing import Literal, TYPE_CHECKING import torch from loguru import logger -from modelforge.dataset.dataset import DataModule + +# import DataModule for typing hint +if TYPE_CHECKING: + from modelforge.dataset.dataset import DataModule def visualize_model( - dm: DataModule, + dm: "DataModule", potential_name: Literal["ANI2x", "PhysNet", "SchNet", "PaiNN", "SAKE"], ): # visualize the compute graph @@ -314,3 +317,104 @@ def __exit__(self, *args): # fcntl.flock(self._file_handle.fileno(), fcntl.LOCK_UN) unlock_file(self._file_handle) self._file_handle.close() + + +import os +from functools import wraps + + +def lock_with_attribute(attribute_name): + """ + Decorator for locking a method using a lock file path stored in an instance + attribute. The attribute is accessed on the instance (`self`) at runtime. + + Parameters + ---------- + attribute_name : str + The name of the instance attribute that contains the lock file path. 
+ + Examples + -------- + >>> from modelforge.utils.misc import lock_with_attribute + >>> + >>> class MyClass: + >>> def __init__(self, lock_file): + >>> self.method_lock = lock_file + >>> + >>> @lock_with_attribute('method_lock') + >>> def critical_section(self): + >>> print("Executing critical section") + """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + # Retrieve the instance (`self`) + instance = args[0] + # Get the lock file path from the specified attribute + lock_file_path = getattr(instance, attribute_name) + with open(lock_file_path, "w+") as f: + # Lock the file + lock_file(f) + + try: + # Execute the wrapped function + result = func(*args, **kwargs) + finally: + # Unlock the file + unlock_file(f) + + # Optionally, remove the lock file + os.remove(lock_file_path) + + return result + + return wrapper + + return decorator + + +def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): + from modelforge.tests.data import ( + potential_defaults, + training_defaults, + dataset_defaults, + runtime_defaults, + ) + from importlib import resources + import toml + + potential_path = ( + resources.files(potential_defaults) / f"{potential_name.lower()}.toml" + ) + dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" + training_path = resources.files(training_defaults) / "default.toml" + runtime_path = resources.files(runtime_defaults) / "runtime.toml" + + training_config_dict = toml.load(training_path) + dataset_config_dict = toml.load(dataset_path) + potential_config_dict = toml.load(potential_path) + runtime_config_dict = toml.load(runtime_path) + + potential_name = potential_config_dict["potential"]["potential_name"] + + from modelforge.potential import _Implemented_NNP_Parameters + + PotentialParameters = ( + _Implemented_NNP_Parameters.get_neural_network_parameter_class(potential_name) + ) + potential_parameters = PotentialParameters(**potential_config_dict["potential"]) + + from modelforge.dataset.dataset import DatasetParameters + from modelforge.train.parameters import TrainingParameters, RuntimeParameters + + dataset_parameters = DatasetParameters(**dataset_config_dict["dataset"]) + training_parameters = TrainingParameters(**training_config_dict["training"]) + runtime_parameters = RuntimeParameters(**runtime_config_dict["runtime"]) + + return { + "potential": potential_parameters, + "dataset": dataset_parameters, + "training": training_parameters, + "runtime": runtime_parameters, + } diff --git a/pyproject.toml b/pyproject.toml index ed8b70a9..c9652872 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,7 @@ [build-system] -requires = ["setuptools>=61.0", "versioningit~=2.0"] +requires = ["setuptools>=61.0", "versioningit~=3.0"] build-backend = "setuptools.build_meta" -# Self-descriptive entries which should always be present -# https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ [project] name = "modelforge" description = "Infrastructure to implement and train NNPs" @@ -17,45 +15,27 @@ license = { text = "MIT" } classifiers = [ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Operating System :: POSIX :: Linux", + "Environment :: GPU", + "Environment :: GPU :: NVIDIA CUDA", ] -requires-python = ">=3.8" -# Declare any run-time dependencies that should be installed with the package. 
-#dependencies = [ -# "importlib-resources;python_version<'3.10'", -#] +requires-python = ">=3.10" -# Update the urls once the hosting is set up. -#[project.urls] -#"Source" = "https://github.com//modelforge/" -#"Documentation" = "https://modelforge.readthedocs.io/" - -[project.optional-dependencies] -test = [ - "pytest>=6.1.2", - "pytest-runner" -] +[project.urls] +Source = "https://github.com/choderalab/modelforge" +Documentation = "https://modelforge.readthedocs.io/" +Wiki = "https://github.com/choderalab/modelforge/wiki" [tool.setuptools] -# This subkey is a beta stage development and keys may change in the future, see https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html for more details -# -# As of version 0.971, mypy does not support type checking of installed zipped -# packages (because it does not actually import the Python packages). -# We declare the package not-zip-safe so that our type hints are also available -# when checking client code that uses our (installed) package. -# Ref: +# Disable zipping because mypy cannot read zip imports and this may affect downstream development. # https://mypy.readthedocs.io/en/stable/installed_packages.html?highlight=zip#using-installed-packages-with-mypy-pep-561 +# NOTE: We might consider removing this once we can test the code in a +# production environment since zipping the package may increase performance. zip-safe = false -# Let setuptools discover the package in the current directory, -# but be explicit about non-Python files. -# See also: -# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html#setuptools-specific-configuration -# Note that behavior is currently evolving with respect to how to interpret the -# "data" and "tests" subdirectories. As of setuptools 63, both are automatically -# included if namespaces is true (default), even if the package is named explicitly -# (instead of using 'find'). With 'find', the 'tests' subpackage is discovered -# recursively because of its __init__.py file, but the data subdirectory is excluded -# with include-package-data = false and namespaces = false. 
-include-package-data = false +include-package-data = true + [tool.setuptools.packages.find] namespaces = false where = ["."] @@ -66,6 +46,7 @@ modelforge = [ "py.typed" ] +# https://versioningit.readthedocs.io/en/stable/configuration.html# [tool.versioningit] default-version = "1+unknown" @@ -75,9 +56,7 @@ dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" [tool.versioningit.vcs] -# The method key: -method = "git" # <- The method name -# Parameters to pass to the method: +method = "git" match = ["*"] default-tag = "1.0.0" @@ -90,7 +69,7 @@ file = "modelforge/_version.py" omit = [ # Omit the tests "*/tests/*", - # Omit generated versioneer + # Omit generated versioningit "modelforge/_version.py" ] diff --git a/scripts/config.toml b/scripts/config.toml index 32e3018b..e7fc2563 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -1,41 +1,39 @@ [potential] -potential_name = "SchNet" +potential_name = "ANI2x" [potential.core_parameter] -number_of_radial_basis_functions = 20 -maximum_interaction_radius = "5.0 angstrom" -number_of_interaction_modules = 3 -number_of_filters = 32 -shared_interactions = false +angle_sections = 4 +maximum_interaction_radius = "5.1 angstrom" +minimum_interaction_radius = "0.8 angstrom" +number_of_radial_basis_functions = 16 +maximum_interaction_radius_for_angular_features = "3.5 angstrom" +minimum_interaction_radius_for_angular_features = "0.8 angstrom" +angular_dist_divisions = 8 [potential.core_parameter.activation_function_parameter] -activation_function_name = "ShiftedSoftplus" +activation_function_name = "CeLU" # For the original ANI behavior, keep CeLU: its alpha parameter is currently hard-coded, so switching to another activation function may change the results.
-[potential.core_parameter.featurization] -properties_to_featurize = ['atomic_number'] -maximum_atomic_number = 101 -number_of_per_atom_features = 32 +[potential.core_parameter.activation_function_parameter.activation_function_arguments] +alpha = 0.1 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] normalize = true from_atom_to_molecule_reduction = true keep_per_atom_property = true -[potential.postprocessing_parameter.general_postprocessing_operation] -calculate_molecular_self_energy = true [dataset] -dataset_name = "QM9" +dataset_name = "PHALKETHOH" version_select = "nc_1000_v0" num_workers = 4 pin_memory = true [training] -number_of_epochs = 4 +number_of_epochs = 1000 remove_self_energies = true -batch_size = 128 -lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +batch_size = 16 +lr = 0.5e-3 +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" @@ -62,7 +60,6 @@ loss_property = ['per_molecule_energy', 'per_atom_force'] # use per_molecule_energy = 0.999 #NOTE: reciprocal units per_atom_force = 0.001 - [training.early_stopping] verbose = true monitor = "val/per_molecule_energy/rmse" @@ -75,12 +72,12 @@ data_split = [0.8, 0.1, 0.1] seed = 42 [runtime] -save_dir = "lightning_logs" +save_dir = "test_setup" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" accelerator = "cpu" number_of_nodes = 1 -devices = 1 #[0,1,2,3] +devices = 1 #[0,1,2,3] checkpoint_path = "None" simulation_environment = "PyTorch" -log_every_n_steps = 1 +log_every_n_steps = 1 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index c7e538b2..00000000 --- a/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -from setuptools import setup - -setup( - name="modelforge", - version="0.1", - packages=["modelforge"], - package_data={ - "modelforge": [ - "dataset/yaml_files/*", - "curation/yaml_files/*", - "tests/data/potential_defaults/*", - "tests/data/training_defaults/*", - ] - }, - url="https://github.com/choderalab/modelforge", - license="MIT", - author="Chodera lab, Marcus Wieder, Christopher Iacovella, and others", - author_email="", - description="A library for building and training neural network potentials", - include_package_data=True, -)
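
The validation_step and test_step hunks above set positions.requires_grad_(True) and wrap calculate_predictions in torch.inference_mode(False) because Lightning typically runs its evaluation hooks under inference mode, which forbids autograd, while per-atom forces are obtained as the negative gradient of the predicted energy with respect to the positions. Below is a minimal, self-contained sketch of that pattern; toy_energy, the tensor shapes, and all variable names are illustrative only and are not modelforge API.

    import torch


    def toy_energy(positions: torch.Tensor) -> torch.Tensor:
        # Stand-in potential: sum of squared pairwise differences.
        diff = positions.unsqueeze(0) - positions.unsqueeze(1)
        return (diff**2).sum()


    positions = torch.randn(5, 3)  # e.g. 5 atoms in 3D, created by a dataloader

    with torch.inference_mode():  # stands in for Lightning's evaluation loop
        with torch.inference_mode(False):  # locally re-enable autograd
            # Clone so the leaf tensor is created outside inference mode's restrictions,
            # then mark it as requiring gradients for the force calculation.
            pos = positions.clone().requires_grad_(True)
            energy = toy_energy(pos)
            # Forces are the negative gradient of the energy w.r.t. positions.
            forces = -torch.autograd.grad(energy, pos)[0]

    print(energy.item(), forces.shape)  # scalar energy, (5, 3) force tensor

The outer context mimics the evaluation loop; only the inner inference_mode(False) block makes the autograd.grad call legal, which is why the diff re-enters it explicitly rather than relying on the removed @torch.enable_grad() decorator.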