From e154d2a49bcab8778f02f04f19802753f5dd85de Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 6 Jan 2025 16:59:49 +0100 Subject: [PATCH 01/23] On-the-fly training works for the RAM case --- mala/datahandling/data_handler.py | 39 +++++++++++++++++++++++- mala/datahandling/data_handler_base.py | 14 ++++++++- mala/datahandling/snapshot.py | 3 ++ mala/descriptors/atomic_density.py | 4 +++ mala/descriptors/bispectrum.py | 32 +++++++++++-------- mala/descriptors/descriptor.py | 27 +++++++++++++++- mala/descriptors/minterpy_descriptors.py | 6 ++++ mala/network/trainer.py | 3 ++ mala/targets/target.py | 2 +- 9 files changed, 113 insertions(+), 17 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 3b9521e4..fa24d3c3 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -1,6 +1,7 @@ """DataHandler class that loads and scales data.""" import os +import tempfile import numpy as np import torch @@ -169,6 +170,19 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() + def delete_temporary_data(self): + """ + Delete temporary data files. + + These may have been created during a training or testing process + when using atomic positions for on-the-fly calculation of descriptors + rather than precomputed data files. + """ + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + # Preparing data ###################### @@ -595,6 +609,26 @@ def __load_data(self, function, data_type): snapshot.input_npy_directory, snapshot.input_npy_file ) units = snapshot.input_units + + # If the input for the descriptors is actually a JSON + # file then we need to calculate the descriptors. 
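+            # The JSON file holds the atomic configuration and grid
+            # information; the computed descriptors are cached in a
+            # temporary *.in.npy file next to the snapshot, so that they
+            # can be re-read like precomputed inputs and removed again
+            # once training or testing is done.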
+ if snapshot.snapshot_type == "json+numpy": + snapshot.temporary_input_file = ( + tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + ) + descriptors, grid = ( + self.descriptor_calculator.calculate_from_json( + file + ) + ) + np.save(snapshot.temporary_input_file, descriptors) + file = snapshot.temporary_input_file + else: file = os.path.join( snapshot.output_npy_directory, @@ -602,7 +636,10 @@ def __load_data(self, function, data_type): ) units = snapshot.output_units - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): calculator.read_from_numpy_file( file, units=units, diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index c141551f..95742633 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -207,6 +207,15 @@ def _check_snapshots(self, comm=None): ), comm=comm, ) + elif snapshot.snapshot_type == "json+numpy": + tmp_dimension = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + ) else: raise Exception("Unknown snapshot file type.") @@ -235,7 +244,10 @@ def _check_snapshots(self, comm=None): snapshot.output_npy_directory, min_verbosity=1, ) - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp_dimension = ( self.target_calculator.read_dimensions_from_numpy_file( os.path.join( diff --git a/mala/datahandling/snapshot.py b/mala/datahandling/snapshot.py index 1bac8488..e4472835 100644 --- a/mala/datahandling/snapshot.py +++ b/mala/datahandling/snapshot.py @@ -133,6 +133,9 @@ def __init__( self.input_dimension = None self.output_dimension = None + # Temporary descriptor files, which may be needed. + self.temporary_input_file = None + @classmethod def from_json(cls, json_dict): """ diff --git a/mala/descriptors/atomic_density.py b/mala/descriptors/atomic_density.py index 4459c838..6a052747 100755 --- a/mala/descriptors/atomic_density.py +++ b/mala/descriptors/atomic_density.py @@ -119,6 +119,10 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + # For now, has to be adapted in the multielement case. 
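+        # Three columns for the x/y/z grid coordinates plus one
+        # Gaussian density value per grid point.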
+ return 4 + def __calculate_lammps(self, outdir, **kwargs): """Perform actual Gaussian descriptor calculation.""" # For version compatibility; older lammps versions (the serial version diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index ab8bbff7..8932c61b 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -120,6 +120,24 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + if self.parameters.descriptors_contain_xyz: + return self.__get_feature_size() - 3 + else: + return self.__get_feature_size() + + def __get_feature_size(self): + ncols0 = 3 + + # Analytical relation for fingerprint length + ncoeff = ( + (self.parameters.bispectrum_twojmax + 2) + * (self.parameters.bispectrum_twojmax + 3) + * (self.parameters.bispectrum_twojmax + 4) + ) + ncoeff = ncoeff // 24 # integer division + return ncols0 + ncoeff + def __calculate_lammps(self, outdir, **kwargs): """ Perform bispectrum calculation using LAMMPS. @@ -173,19 +191,7 @@ def __calculate_lammps(self, outdir, **kwargs): # Do the LAMMPS calculation and clean up. lmp.file(self.parameters.lammps_compute_file) - - # Set things not accessible from LAMMPS - # First 3 cols are x, y, z, coords - ncols0 = 3 - - # Analytical relation for fingerprint length - ncoeff = ( - (self.parameters.bispectrum_twojmax + 2) - * (self.parameters.bispectrum_twojmax + 3) - * (self.parameters.bispectrum_twojmax + 4) - ) - ncoeff = ncoeff // 24 # integer division - self.feature_size = ncols0 + ncoeff + self.feature_size = self.__get_feature_size() # Extract data from LAMMPS calculation. # This is different for the parallel and the serial case. diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py index 041dd4b3..2794e716 100644 --- a/mala/descriptors/descriptor.py +++ b/mala/descriptors/descriptor.py @@ -1,11 +1,12 @@ """Base class for all descriptor calculators.""" from abc import abstractmethod -from functools import cached_property +import json import os import tempfile import ase +from ase.cell import Cell from ase.units import m from ase.neighborlist import NeighborList, NewPrimitiveNeighborList import numpy as np @@ -375,6 +376,16 @@ def calculate_from_qe_out( return self._calculate(working_directory, **kwargs) + def calculate_from_json(self, json_file, working_directory=".", **kwargs): + if isinstance(json_file, str): + json_dict = json.load(open(json_file, encoding="utf-8")) + else: + json_dict = json.load(json_file) + self.grid_dimensions = json_dict["grid_dimensions"] + self._atoms = ase.Atoms.fromdict(json_dict["atoms"]) + self._voxel = Cell(json_dict["voxel"]["array"]) + return self._calculate(working_directory, **kwargs) + def calculate_from_atoms( self, atoms, grid_dimensions, working_directory=".", **kwargs ): @@ -573,6 +584,16 @@ def convert_local_to_3d(self, descriptors_np): ).transpose([2, 1, 0, 3]) return descriptors_full, local_offset, local_reach + def read_dimensions_from_json(self, json_file): + if isinstance(json_file, str): + json_dict = json.load(open(json_file, encoding="utf-8")) + else: + json_dict = json.load(json_file) + grid_dimensions = json_dict["grid_dimensions"] + [ + self._read_feature_dimension_from_json(json_dict) + ] + return grid_dimensions + # Private methods ################# @@ -1021,5 +1042,9 @@ def _grid_to_coord(self, gridpoint): def _calculate(self, outdir, **kwargs): pass + @abstractmethod + def _read_feature_dimension_from_json(self, 
json_dict): + pass + def _set_feature_size_from_array(self, array): self.feature_size = np.shape(array)[-1] diff --git a/mala/descriptors/minterpy_descriptors.py b/mala/descriptors/minterpy_descriptors.py index 2d9d5216..cfedf113 100755 --- a/mala/descriptors/minterpy_descriptors.py +++ b/mala/descriptors/minterpy_descriptors.py @@ -87,6 +87,12 @@ def backconvert_units(array, out_units): else: raise Exception("Unsupported unit for Minterpy descriptors.") + def _read_feature_dimension_from_json(self, json_dict): + raise Exception( + "This feature has not been implemented for Minterpy " + "descriptors." + ) + def _calculate(self, atoms, outdir, grid_dimensions, **kwargs): # For version compatibility; older lammps versions (the serial version # we still use on some machines) have these constants as part of the diff --git a/mala/network/trainer.py b/mala/network/trainer.py index ccd0ab70..b69faae5 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -597,6 +597,9 @@ def train_network(self): ) self.final_validation_loss = vloss + # Cleaning up temporary data files. + self.data.delete_temporary_data() + # Clean-up for pre-fetching lazy loading. if self.data.parameters.use_lazy_loading_prefetch: self._training_data_loaders.cleanup() diff --git a/mala/targets/target.py b/mala/targets/target.py index 10a414c6..17eb5a4d 100644 --- a/mala/targets/target.py +++ b/mala/targets/target.py @@ -653,7 +653,7 @@ def read_additional_calculation_data(self, data, data_type=None): } self.atomic_forces_dft = None self.entropy_contribution_dft_calculation = None - self.grid_dimensions = [0, 0, 0] + self.grid_dimensions = json_dict["grid_dimensions"] self.atoms = None for key in json_dict: From 88a5edb4568e8f15013a7be34a05d95da2e9e2db Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 6 Jan 2025 17:41:46 +0100 Subject: [PATCH 02/23] Lazy Loading training works now --- mala/datahandling/data_handler.py | 51 ++++++++++++++++++-------- mala/datahandling/lazy_load_dataset.py | 12 ++++++ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index fa24d3c3..0a11ff2c 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -350,6 +350,21 @@ def get_snapshot_calculation_output(self, snapshot_number): snapshot_number ].calculation_output + def calculate_temporary_inputs(self, snapshot: Snapshot): + snapshot.temporary_input_file = tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + tmp, grid = self.descriptor_calculator.calculate_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + np.save(snapshot.temporary_input_file, tmp) + # Debugging ###################### @@ -613,22 +628,8 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. 
if snapshot.snapshot_type == "json+numpy": - snapshot.temporary_input_file = ( - tempfile.NamedTemporaryFile( - delete=False, - prefix=snapshot.input_npy_file.split(".")[0], - suffix=".in.npy", - dir=snapshot.input_npy_directory, - ).name - ) - descriptors, grid = ( - self.descriptor_calculator.calculate_from_json( - file - ) - ) - np.save(snapshot.temporary_input_file, descriptors) + self.calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file - else: file = os.path.join( snapshot.output_npy_directory, @@ -753,11 +754,20 @@ def __build_datasets(self): self.training_data_sets[0].add_snapshot_to_dataset( snapshot ) + # For training snapshots, temporary files (if needed) have + # already been built during parametrization, for all other + # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": + if snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": + if snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + self.test_data_sets[0].add_snapshot_to_dataset(snapshot) # I don't think we need to mix them here. We can use the standard @@ -915,6 +925,12 @@ def __parametrize_scalers(self): ) ) ) + elif snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + tmp = self.descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + ) else: raise Exception("Unknown snapshot file type.") @@ -956,7 +972,10 @@ def __parametrize_scalers(self): for snapshot in self.parameters.snapshot_directories_list: # Data scaling is only performed on the training data sets. if snapshot.snapshot_function == "tr": - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp = self.target_calculator.read_from_numpy_file( os.path.join( snapshot.output_npy_directory, diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index a5a2b1a5..78d11431 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -163,6 +163,18 @@ def get_new_data(self, file_index): ), units=self._snapshot_list[file_index].output_units, ) + elif self._snapshot_list[file_index].snapshot_type == "json+numpy": + self.input_data = self._descriptor_calculator.read_from_numpy_file( + self._snapshot_list[file_index].temporary_input_file, + units=self._snapshot_list[file_index].input_units, + ) + self.output_data = self._target_calculator.read_from_numpy_file( + os.path.join( + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, + ), + units=self._snapshot_list[file_index].output_units, + ) elif self._snapshot_list[file_index].snapshot_type == "openpmd": self.input_data = ( From 0e4ddfea0aed7509688cff339b1edb488cba2bbd Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 11:36:40 +0100 Subject: [PATCH 03/23] Checkpointing works now as well --- mala/datahandling/data_handler.py | 81 ++++++++++++++++++++----------- mala/network/trainer.py | 2 +- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 0a11ff2c..b9127ca4 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -7,7 +7,7 @@ import torch from torch.utils.data import TensorDataset 
-from mala.common.parallelizer import printout, barrier +from mala.common.parallelizer import printout, barrier, get_rank from mala.common.parameters import Parameters, DEFAULT_NP_DATA_DTYPE from mala.datahandling.data_handler_base import DataHandlerBase from mala.datahandling.data_scaler import DataScaler @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_data(self): + def _delete_temporary_data(self): """ Delete temporary data files. @@ -178,10 +178,12 @@ def delete_temporary_data(self): when using atomic positions for on-the-fly calculation of descriptors rather than precomputed data files. """ - for snapshot in self.parameters.snapshot_directories_list: - if snapshot.temporary_input_file is not None: - if os.path.isfile(snapshot.temporary_input_file): - os.remove(snapshot.temporary_input_file) + if get_rank() == 0: + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + barrier() # Preparing data ###################### @@ -241,16 +243,8 @@ def prepare_data(self, reparametrize_scaler=True): printout("Initializing the data scalers.", min_verbosity=1) self.__parametrize_scalers() printout("Data scalers initialized.", min_verbosity=0) - elif ( - self.parameters.use_lazy_loading is False - and self.nr_training_data != 0 - ): - printout( - "Data scalers already initilized, loading data to RAM.", - min_verbosity=0, - ) - self.__load_data("training", "inputs") - self.__load_data("training", "outputs") + elif self.nr_training_data != 0: + self.__parametrized_load_training_data() # Build Datasets. printout("Build datasets.", min_verbosity=1) @@ -267,6 +261,11 @@ def prepare_data(self, reparametrize_scaler=True): # allows for parallel I/O. barrier() + # In the RAM case, there is no reason not to delete all temporary files + # now. + if self.parameters.use_lazy_loading is False: + self._delete_temporary_data() + def prepare_for_testing(self): """ Prepare DataHandler for usage within Tester class. 
@@ -351,19 +350,24 @@ def get_snapshot_calculation_output(self, snapshot_number):
         ].calculation_output
 
     def calculate_temporary_inputs(self, snapshot: Snapshot):
-        snapshot.temporary_input_file = tempfile.NamedTemporaryFile(
-            delete=False,
-            prefix=snapshot.input_npy_file.split(".")[0],
-            suffix=".in.npy",
-            dir=snapshot.input_npy_directory,
-        ).name
-        tmp, grid = self.descriptor_calculator.calculate_from_json(
-            os.path.join(
-                snapshot.input_npy_directory,
-                snapshot.input_npy_file,
+        if snapshot.temporary_input_file is not None:
+            if not os.path.isfile(snapshot.temporary_input_file):
+                snapshot.temporary_input_file = None
+
+        if snapshot.temporary_input_file is None:
+            snapshot.temporary_input_file = tempfile.NamedTemporaryFile(
+                delete=False,
+                prefix=snapshot.input_npy_file.split(".")[0],
+                suffix=".in.npy",
+                dir=snapshot.input_npy_directory,
+            ).name
+            tmp, grid = self.descriptor_calculator.calculate_from_json(
+                os.path.join(
+                    snapshot.input_npy_directory,
+                    snapshot.input_npy_file,
+                )
             )
-        )
-        np.save(snapshot.temporary_input_file, tmp)
+            np.save(snapshot.temporary_input_file, tmp)
 
     # Debugging
     ######################
@@ -1014,6 +1018,27 @@ def __parametrize_scalers(self):
 
         printout("Output scaler parametrized.", min_verbosity=1)
 
+    def __parametrized_load_training_data(self):
+        if self.parameters.use_lazy_loading:
+            printout(
+                "Data scalers already initialized, preparing input data.",
+                min_verbosity=0,
+            )
+            for snapshot in self.parameters.snapshot_directories_list:
+                # Data scaling is only performed on the training data sets.
+                if (
+                    snapshot.snapshot_function == "tr"
+                    and snapshot.snapshot_type == "json+numpy"
+                ):
+                    self.calculate_temporary_inputs(snapshot)
+        else:
+            printout(
+                "Data scalers already initialized, loading data to RAM.",
+                min_verbosity=0,
+            )
+            self.__load_data("training", "inputs")
+            self.__load_data("training", "outputs")
+
     def __raw_numpy_to_converted_numpy(
         self, numpy_array, data_type="in", units=None
     ):
diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index b69faae5..c55e2b34 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -598,7 +598,7 @@ def train_network(self):
         self.final_validation_loss = vloss
 
         # Cleaning up temporary data files.
-        self.data.delete_temporary_data()
+        self.data._delete_temporary_data()
 
From c43fefc51103875f8200b35c6c557710f866182b Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 7 Jan 2025 11:55:54 +0100
Subject: [PATCH 04/23] Made method private

---
 mala/datahandling/data_handler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py
index b9127ca4..d0c8ee1e 100644
--- a/mala/datahandling/data_handler.py
+++ b/mala/datahandling/data_handler.py
@@ -349,7 +349,7 @@ def get_snapshot_calculation_output(self, snapshot_number):
             snapshot_number
         ].calculation_output
 
-    def calculate_temporary_inputs(self, snapshot: Snapshot):
+    def __calculate_temporary_inputs(self, snapshot: Snapshot):
         if snapshot.temporary_input_file is not None:
             if not os.path.isfile(snapshot.temporary_input_file):
                 snapshot.temporary_input_file = None
@@ -632,7 +632,7 @@ def __load_data(self, function, data_type):
 
             # If the input for the descriptors is actually a JSON
            # file then we need to calculate the descriptors. 
if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: file = os.path.join( @@ -763,14 +763,14 @@ def __build_datasets(self): # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -930,7 +930,7 @@ def __parametrize_scalers(self): ) ) elif snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, units=snapshot.input_units, @@ -1030,7 +1030,7 @@ def __parametrized_load_training_data(self): snapshot.snapshot_function == "tr" and snapshot.snapshot_type == "json+numpy" ): - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) else: printout( "Data scalers already initilized, loading data to RAM.", From 7a90ed5c4cc5704a1dce2f9f3122e6d6fb5a2467 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 13:13:30 +0100 Subject: [PATCH 05/23] Tester class now also works with on-the-fly calculations --- mala/datahandling/data_handler.py | 4 ++-- mala/network/tester.py | 2 ++ mala/network/trainer.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d0c8ee1e..d452b06f 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def _delete_temporary_data(self): + def delete_temporary_data(self): """ Delete temporary data files. @@ -264,7 +264,7 @@ def prepare_data(self, reparametrize_scaler=True): # In the RAM case, there is no reason not to delete all temporary files # now. if self.parameters.use_lazy_loading is False: - self._delete_temporary_data() + self.delete_temporary_data() def prepare_for_testing(self): """ diff --git a/mala/network/tester.py b/mala/network/tester.py index d7c07761..fd1b04c6 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -110,6 +110,8 @@ def test_all_snapshots(self): for observable in self.observables_to_test: results[observable].append(snapshot_result[observable]) + self.data.delete_temporary_data() + if self.output_format == "list": return results diff --git a/mala/network/trainer.py b/mala/network/trainer.py index c55e2b34..b69faae5 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -598,7 +598,7 @@ def train_network(self): self.final_validation_loss = vloss # Cleaning up temporary data files. - self.data._delete_temporary_data() + self.data.delete_temporary_data() # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: From 39a40c364233f67d801455139d2b1198ba0a42f8 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 16:16:39 +0100 Subject: [PATCH 06/23] Prefetching works --- mala/datahandling/data_handler.py | 5 +++ mala/datahandling/lazy_load_dataset_single.py | 20 ++++++++++ .../multi_lazy_load_data_loader.py | 37 ++++++++++++++++++- 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d452b06f..e1611677 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -816,6 +816,9 @@ def __build_datasets(self): self._use_ddp, ) ) + if snapshot.snapshot_type == "json+numpy": + self.__calculate_temporary_inputs(snapshot) + if snapshot.snapshot_function == "te": self.test_data_sets.append( LazyLoadDatasetSingle( @@ -831,6 +834,8 @@ def __build_datasets(self): input_requires_grad=True, ) ) + if snapshot.snapshot_type == "json+numpy": + self.__calculate_temporary_inputs(snapshot) else: if self.nr_training_data != 0: diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 402d149d..1baef9ea 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -180,6 +180,26 @@ def allocate_shared_mem(self): read_dtype=True, ) ) + elif self.snapshot.snapshot_type == "json+numpy": + self.input_shape = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + self.snapshot.input_npy_directory, + self.snapshot.input_npy_file, + ), + ) + ) + self.input_dtype = np.dtype(np.float32) + + self.output_shape, self.output_dtype = ( + self.target_calculator.read_dimensions_from_numpy_file( + os.path.join( + self.snapshot.output_npy_directory, + self.snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index a9aca6af..35b02601 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -192,6 +192,24 @@ def load_snapshot_to_shm( read_dtype=True, ) ) + elif snapshot.snapshot_type == "json+numpy": + input_shape = descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + ) + input_dtype = np.dtype(np.float32) + + output_shape, output_dtype = ( + target_calculator.read_dimensions_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, + snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") @@ -219,7 +237,22 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) - else: + + elif snapshot.snapshot_type == "json+numpy": + descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + array=input_data, + ) + target_calculator.read_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ), + units=snapshot.output_units, + array=output_data, + ) + + elif snapshot.snapshot_type == "openpmd": descriptor_calculator.read_from_openpmd_file( os.path.join( snapshot.input_npy_directory, snapshot.input_npy_file @@ -234,6 +267,8 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) + else: + raise Exception("Invalid snapshot type selected.") # This 
function only loads the numpy data with scaling. Remaining data # preprocessing occurs in __getitem__ of LazyLoadDatasetSingle From 2201e34fcd6995bfb24528ae6e29fcdbd094bd0b Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 16:27:14 +0100 Subject: [PATCH 07/23] Is this already enought to get DDP working? --- mala/datahandling/data_handler.py | 11 ++++++++--- mala/network/tester.py | 2 +- mala/network/trainer.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index e1611677..d0ce2ad4 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_data(self): + def delete_temporary_inputs(self): """ Delete temporary data files. @@ -264,7 +264,7 @@ def prepare_data(self, reparametrize_scaler=True): # In the RAM case, there is no reason not to delete all temporary files # now. if self.parameters.use_lazy_loading is False: - self.delete_temporary_data() + self.delete_temporary_inputs() def prepare_for_testing(self): """ @@ -367,7 +367,12 @@ def __calculate_temporary_inputs(self, snapshot: Snapshot): snapshot.input_npy_file, ) ) - np.save(snapshot.temporary_input_file, tmp) + if self.parameters._configuration["mpi"]: + tmp = self.descriptor_calculator.gather_descriptors(tmp) + + if get_rank() == 0: + np.save(snapshot.temporary_input_file, tmp) + barrier() # Debugging ###################### diff --git a/mala/network/tester.py b/mala/network/tester.py index fd1b04c6..b8ee1c70 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -110,7 +110,7 @@ def test_all_snapshots(self): for observable in self.observables_to_test: results[observable].append(snapshot_result[observable]) - self.data.delete_temporary_data() + self.data.delete_temporary_inputs() if self.output_format == "list": return results diff --git a/mala/network/trainer.py b/mala/network/trainer.py index b69faae5..9ff5ad12 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -598,7 +598,7 @@ def train_network(self): self.final_validation_loss = vloss # Cleaning up temporary data files. - self.data.delete_temporary_data() + self.data.delete_temporary_inputs() # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: From c1737c51273f760842c521b25b8a9c03f67325a5 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 17:26:57 +0100 Subject: [PATCH 08/23] Renamed "additional info", since it will be used more regularly with these changes --- docs/source/basic_usage/more_data.rst | 8 +-- .../advanced/ex10_convert_numpy_openpmd.py | 18 +++--- examples/basic/ex03_preprocess_data.py | 10 +-- mala/datahandling/data_converter.py | 64 +++++++++---------- mala/datahandling/data_shuffler.py | 10 ++- test/complete_interfaces_test.py | 12 ++-- 6 files changed, 60 insertions(+), 62 deletions(-) diff --git a/docs/source/basic_usage/more_data.rst b/docs/source/basic_usage/more_data.rst index d643e8c6..4232a79b 100644 --- a/docs/source/basic_usage/more_data.rst +++ b/docs/source/basic_usage/more_data.rst @@ -99,8 +99,8 @@ and fill it with data, e.g., by descriptor_input_path=outfile, target_input_type=".cube", target_input_path=ldosfile, - additional_info_input_type="espresso-out", - additional_info_input_path=outfile, + simulation_output_type="espresso-out", + simulation_output_path=outfile, target_units="1/(Ry*Bohr^3)") The ``add_snapshot`` function can be called multiple times to add @@ -108,7 +108,7 @@ multiple snapshots to MALA. For regular Quantum ESPRESSO calculations, the ``descriptor_input_type`` and ``target_input_type`` will always be ``"espresso-out"`` and ``".cube"``, respectively, and the ``target_units`` will always be ``"1/(Ry*Bohr^3)"``. -The paths have to be modified accordingly. ``additional_info_input_*`` refers +The paths have to be modified accordingly. ``simulation_output_*`` refers to the calculation output file - MALA provides an interface to condense the entire, verbose simulation output to ``.json`` files for further processing. In the preceding section, we had to specify calculation output @@ -121,7 +121,7 @@ Once data is provided, the conversion itself is simple. 
data_converter.convert_snapshots(descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="Be_snapshot*.npy", descriptor_calculation_kwargs= {"working_directory": data_path}) diff --git a/examples/advanced/ex10_convert_numpy_openpmd.py b/examples/advanced/ex10_convert_numpy_openpmd.py index 7ebc22da..22607b30 100644 --- a/examples/advanced/ex10_convert_numpy_openpmd.py +++ b/examples/advanced/ex10_convert_numpy_openpmd.py @@ -20,15 +20,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.npy".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -43,15 +43,15 @@ descriptor_input_path="converted_from_numpy_{}.in.h5".format(snapshot), target_input_type="openpmd", target_input_path="converted_from_numpy_{}.out.h5".format(snapshot), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="verify_against_original_numpy_data_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -84,15 +84,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.h5".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_openpmd_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index b0a10488..e9f4691a 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -46,12 +46,12 @@ # Data conversion itself is simple. We select input and output data # to be converted, add this data snapshot-wise and tell MALA to # convert snapshots. Inputs and outputs can be processed individually. -# Further, via the additional_info_input_* keywords, calculation output +# Further, via the simulation_output_input_* keywords, calculation output # can be processed from the original simulation *.out output files into # more convenient *.json files that can be used in their stead. This saves # on disk space. # To only process parts of the data, omit/add descriptor_input*, target_input_* -# and additional_info_input_* at your leisure. +# and simulation_output_* at your leisure. # Make sure to set the correct units - for QE, this should always be # 1/(Ry*Bohr^3). 
#################### @@ -65,8 +65,8 @@ descriptor_input_path=outfile, target_input_type=".cube", target_input_path=ldosfile, - additional_info_input_type="espresso-out", - additional_info_input_path=outfile, + simulation_output_type="espresso-out", + simulation_output_path=outfile, target_units="1/(Ry*Bohr^3)", ) @@ -84,7 +84,7 @@ data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="Be_snapshot*.npy", descriptor_calculation_kwargs={"working_directory": data_path}, ) diff --git a/mala/datahandling/data_converter.py b/mala/datahandling/data_converter.py index 21fb34cd..89558fec 100644 --- a/mala/datahandling/data_converter.py +++ b/mala/datahandling/data_converter.py @@ -12,7 +12,7 @@ descriptor_input_types = ["espresso-out", "openpmd", "numpy"] target_input_types = [".cube", ".xsf", "openpmd", "numpy"] -additional_info_input_types = ["espresso-out"] +simulation_output_types = ["espresso-out"] class DataConverter: @@ -78,7 +78,7 @@ def __init__( # Keep track of what has to be done by this data converter. self.__process_descriptors = False self.__process_targets = False - self.__process_additional_info = False + self.__process_simulation_output = False def add_snapshot( self, @@ -86,8 +86,8 @@ def add_snapshot( descriptor_input_path=None, target_input_type=None, target_input_path=None, - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, descriptor_units=None, metadata_input_type=None, metadata_input_path=None, @@ -114,22 +114,22 @@ def add_snapshot( target_input_path : string Path of target data to be processed. - additional_info_input_type : string + simulation_output_type : string Type of additional info data to be processed. - See mala.datahandling.data_converter.additional_info_input_types + See mala.datahandling.data_converter.simulation_output_types for options. - additional_info_input_path : string + simulation_output_path : string Path of additional info data to be processed. metadata_input_type : string Type of additional metadata to be processed. - See mala.datahandling.data_converter.additional_info_input_types + See mala.datahandling.data_converter.simulation_output_types for options. - This is essentially the same as additional_info_input_type, + This is essentially the same as simulation_output_type, but will not affect saving; i.e., the data given here will only be saved in OpenPMD files, not saved separately. - If additional_info_input_type is set, this argument will be + If simulation_output_type is set, this argument will be ignored. metadata_input_path : string @@ -161,20 +161,20 @@ def add_snapshot( raise Exception("Cannot process this type of target data.") self.__process_targets = True - if additional_info_input_type is not None: - metadata_input_type = additional_info_input_type - if additional_info_input_path is None: + if simulation_output_type is not None: + metadata_input_type = simulation_output_type + if simulation_output_path is None: raise Exception( "Cannot process additional info data with " "no path given." ) - if additional_info_input_type not in additional_info_input_types: + if simulation_output_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." 
) - self.__process_additional_info = True + self.__process_simulation_output = True - metadata_input_path = additional_info_input_path + metadata_input_path = simulation_output_path if metadata_input_type is not None: if metadata_input_path is None: @@ -182,7 +182,7 @@ def add_snapshot( "Cannot process additional info data with " "no path given." ) - if metadata_input_type not in additional_info_input_types: + if metadata_input_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." ) @@ -192,7 +192,7 @@ def add_snapshot( { "input": descriptor_input_path, "output": target_input_path, - "additional_info": additional_info_input_path, + "simulation_output": simulation_output_path, "metadata": metadata_input_path, } ) @@ -200,7 +200,7 @@ def add_snapshot( { "input": descriptor_input_type, "output": target_input_type, - "additional_info": additional_info_input_type, + "simulation_output": simulation_output_type, "metadata": metadata_input_type, } ) @@ -213,7 +213,7 @@ def convert_snapshots( complete_save_path=None, descriptor_save_path=None, target_save_path=None, - additional_info_save_path=None, + simulation_output_save_path=None, naming_scheme="ELEM_snapshot*.npy", starts_at=0, file_based_communication=False, @@ -231,7 +231,7 @@ def convert_snapshots( complete_save_path : string If not None: the directory in which all snapshots will be saved. Overwrites descriptor_save_path, target_save_path and - additional_info_save_path if set. + simulation_output_save_path if set. descriptor_save_path : string Directory in which to save descriptor data. @@ -239,7 +239,7 @@ def convert_snapshots( target_save_path : string Directory in which to save target data. - additional_info_save_path : string + simulation_output_save_path : string Directory in which to save additional info data. naming_scheme : string @@ -304,7 +304,7 @@ def convert_snapshots( if complete_save_path is not None: descriptor_save_path = complete_save_path target_save_path = complete_save_path - additional_info_save_path = complete_save_path + simulation_output_save_path = complete_save_path else: if self.__process_targets is True and target_save_path is None: raise Exception( @@ -318,8 +318,8 @@ def convert_snapshots( "No descriptor path specified, cannot process data." ) if ( - self.__process_additional_info is True - and additional_info_save_path is None + self.__process_simulation_output is True + and simulation_output_save_path is None ): raise Exception( "No additional info path specified, cannot " @@ -393,9 +393,9 @@ def convert_snapshots( snapshot_name = snapshot_name.replace("*", str(snapshot_number)) # Create the paths as needed. - if self.__process_additional_info: + if self.__process_simulation_output: info_path = os.path.join( - additional_info_save_path, snapshot_name + ".info.json" + simulation_output_save_path, snapshot_name + ".info.json" ) else: info_path = None @@ -454,7 +454,7 @@ def convert_snapshots( use_memmap=memmap, input_iteration=input_iteration, output_iteration=output_iteration, - additional_info_path=info_path, + simulation_output_info_path=info_path, use_fp64=use_fp64, ) @@ -479,7 +479,7 @@ def __convert_single_snapshot( target_calculator_kwargs, input_path=None, output_path=None, - additional_info_path=None, + simulation_output_info_path=None, use_memmap=None, output_iteration=None, input_iteration=None, @@ -719,11 +719,11 @@ def __convert_single_snapshot( del tmp_output # Parse and/or calculate the additional info. 
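         # The verbose simulation output is condensed into a compact
         # .info.json file that can be used in its stead later on.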
-        if description["additional_info"] is not None:
+        if description["simulation_output"] is not None:
             # Parsing and saving is done using the target calculator.
             self.target_calculator.read_additional_calculation_data(
-                snapshot["additional_info"], description["additional_info"]
+                snapshot["simulation_output"], description["simulation_output"]
             )
             self.target_calculator.write_additional_calculation_data(
-                additional_info_path
+                simulation_output_info_path
             )
diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 660a1fec..be859f5a 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -334,9 +334,7 @@ def __init__(self):
         # XXXXXXXOOO
         # OOOOOOOOOO
 
-        def __contiguous_slice_within_ndim_grid_as_blocks(
-            extent, x, y
-        ):
+        def __contiguous_slice_within_ndim_grid_as_blocks(extent, x, y):
             # Used for converting a block defined by inclusive lower and upper
             # coordinate to a chunk as defined by openPMD.
             # The openPMD extent is defined by (to-from)+1 (to make up for the
@@ -426,7 +424,8 @@ def worker(inner_idx, inner_strides):
             if not inner_strides:
                 if inner_idx != 0:
                     raise RuntimeError(
-                        "This cannot happen. There is bug somewhere.")
+                        "This cannot happen. There is a bug somewhere."
+                    )
                 else:
                     return []
             div, mod = divmod(inner_idx, inner_strides[0])
@@ -655,8 +654,7 @@ def shuffle_snapshots(
         ---------
         complete_save_path : string
             If not None: the directory in which all snapshots will be saved.
-            Overwrites descriptor_save_path, target_save_path and
-            additional_info_save_path if set.
+            Overwrites descriptor_save_path and target_save_path if set.
 
         descriptor_save_path : string
             Directory in which to save descriptor data.
diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py
index 6a795665..5288afa8 100644
--- a/test/complete_interfaces_test.py
+++ b/test/complete_interfaces_test.py
@@ -108,15 +108,15 @@ def test_convert_numpy_openpmd(self):
                 target_input_path=os.path.join(
                     data_path, "Be_snapshot{}.out.npy".format(snapshot)
                 ),
-                additional_info_input_type=None,
-                additional_info_input_path=None,
+                simulation_output_type=None,
+                simulation_output_path=None,
                 target_units=None,
             )
 
         data_converter.convert_snapshots(
             descriptor_save_path="./",
             target_save_path="./",
-            additional_info_save_path="./",
+            simulation_output_save_path="./",
             naming_scheme="converted_from_numpy_*.h5",
             descriptor_calculation_kwargs={"working_directory": "./"},
         )
@@ -135,15 +135,15 @@ def test_convert_numpy_openpmd(self):
             descriptor_input_path="converted_from_numpy_{}.in.h5".format(
                 snapshot
             ),
             target_input_path="converted_from_numpy_{}.out.h5".format(
                 snapshot
             ),
-            additional_info_input_type=None,
-            additional_info_input_path=None,
+            simulation_output_type=None,
+            simulation_output_path=None,
             target_units=None,
         )
 
         data_converter.convert_snapshots(
             descriptor_save_path="./",
             target_save_path="./",
-            additional_info_save_path="./",
+            simulation_output_save_path="./",
             naming_scheme="verify_against_original_numpy_data_*.npy",
             descriptor_calculation_kwargs={"working_directory": "./"},
         )

From 4001d0a61cdd492e835aa50dec3dfae2cf803594 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Wed, 8 Jan 2025 14:10:44 +0100
Subject: [PATCH 09/23] Fixing a parallel writing bug

---
 mala/descriptors/bispectrum.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py
index 8932c61b..47865372 100755
--- a/mala/descriptors/bispectrum.py
+++ b/mala/descriptors/bispectrum.py
@@ -9,7 +9,7 @@
 import numpy as np
 from scipy.spatial import distance
 
-from 
mala.common.parallelizer import printout +from mala.common.parallelizer import printout, get_rank, barrier from mala.descriptors.lammps_utils import extract_compute_np from mala.descriptors.descriptor import Descriptor @@ -156,9 +156,14 @@ def __calculate_lammps(self, outdir, **kwargs): lammps_format = "lammps-data" self.setup_lammps_tmp_files("bgrid", outdir) - ase.io.write( - self._lammps_temporary_input, self._atoms, format=lammps_format - ) + if get_rank() == 0: + ase.io.write( + self._lammps_temporary_input, + self._atoms, + format=lammps_format, + parallel=False, + ) + barrier() nx = self.grid_dimensions[0] ny = self.grid_dimensions[1] From e6a072302c9a23d2b8b22001b8d32f84cbefaa09 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:15:28 +0100 Subject: [PATCH 10/23] Can I use DDP and MPI at the same time? --- mala/common/parallelizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mala/common/parallelizer.py b/mala/common/parallelizer.py index e59b8a98..8e394999 100644 --- a/mala/common/parallelizer.py +++ b/mala/common/parallelizer.py @@ -64,10 +64,10 @@ def set_mpi_status(new_value): Value the MPI status has. """ - if use_ddp is True and new_value is True: - raise Exception( - "Cannot use ddp and inference-level MPI at " "the same time yet." - ) + # if use_ddp is True and new_value is True: + # raise Exception( + # "Cannot use ddp and inference-level MPI at " "the same time yet." + # ) global use_mpi use_mpi = new_value if use_mpi: From 516240486221f5f3bd8fdad476525f1a0cfe3bde Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:17:35 +0100 Subject: [PATCH 11/23] It does not help --- mala/common/parallelizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mala/common/parallelizer.py b/mala/common/parallelizer.py index 8e394999..e59b8a98 100644 --- a/mala/common/parallelizer.py +++ b/mala/common/parallelizer.py @@ -64,10 +64,10 @@ def set_mpi_status(new_value): Value the MPI status has. """ - # if use_ddp is True and new_value is True: - # raise Exception( - # "Cannot use ddp and inference-level MPI at " "the same time yet." - # ) + if use_ddp is True and new_value is True: + raise Exception( + "Cannot use ddp and inference-level MPI at " "the same time yet." 
+ ) global use_mpi use_mpi = new_value if use_mpi: From 399722990ae7f1c969f9bbcab8e06f23ba8e34e8 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:23:08 +0100 Subject: [PATCH 12/23] Getting rid of the parallel modification for now --- mala/descriptors/bispectrum.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index 47865372..8932c61b 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -9,7 +9,7 @@ import numpy as np from scipy.spatial import distance -from mala.common.parallelizer import printout, get_rank, barrier +from mala.common.parallelizer import printout from mala.descriptors.lammps_utils import extract_compute_np from mala.descriptors.descriptor import Descriptor @@ -156,14 +156,9 @@ def __calculate_lammps(self, outdir, **kwargs): lammps_format = "lammps-data" self.setup_lammps_tmp_files("bgrid", outdir) - if get_rank() == 0: - ase.io.write( - self._lammps_temporary_input, - self._atoms, - format=lammps_format, - parallel=False, - ) - barrier() + ase.io.write( + self._lammps_temporary_input, self._atoms, format=lammps_format + ) nx = self.grid_dimensions[0] ny = self.grid_dimensions[1] From 15ff5dc4740ea2629fb459c09a62b4615f895025 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 9 Jan 2025 11:57:00 +0100 Subject: [PATCH 13/23] Shuffling from atomic positions works now --- mala/datahandling/data_handler.py | 55 ++++--------------------- mala/datahandling/data_handler_base.py | 43 +++++++++++++++++++- mala/datahandling/data_shuffler.py | 56 ++++++++++++++++++-------- 3 files changed, 88 insertions(+), 66 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d0ce2ad4..5823d459 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -1,7 +1,6 @@ """DataHandler class that loads and scales data.""" import os -import tempfile import numpy as np import torch @@ -170,21 +169,6 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_inputs(self): - """ - Delete temporary data files. - - These may have been created during a training or testing process - when using atomic positions for on-the-fly calculation of descriptors - rather than precomputed data files. 
- """ - if get_rank() == 0: - for snapshot in self.parameters.snapshot_directories_list: - if snapshot.temporary_input_file is not None: - if os.path.isfile(snapshot.temporary_input_file): - os.remove(snapshot.temporary_input_file) - barrier() - # Preparing data ###################### @@ -349,31 +333,6 @@ def get_snapshot_calculation_output(self, snapshot_number): snapshot_number ].calculation_output - def __calculate_temporary_inputs(self, snapshot: Snapshot): - if snapshot.temporary_input_file is not None: - if not os.path.isfile(snapshot.temporary_input_file): - snapshot.temporary_input_file = None - - if snapshot.temporary_input_file is None: - snapshot.temporary_input_file = tempfile.NamedTemporaryFile( - delete=False, - prefix=snapshot.input_npy_file.split(".")[0], - suffix=".in.npy", - dir=snapshot.input_npy_directory, - ).name - tmp, grid = self.descriptor_calculator.calculate_from_json( - os.path.join( - snapshot.input_npy_directory, - snapshot.input_npy_file, - ) - ) - if self.parameters._configuration["mpi"]: - tmp = self.descriptor_calculator.gather_descriptors(tmp) - - if get_rank() == 0: - np.save(snapshot.temporary_input_file, tmp) - barrier() - # Debugging ###################### @@ -637,7 +596,7 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: file = os.path.join( @@ -768,14 +727,14 @@ def __build_datasets(self): # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -822,7 +781,7 @@ def __build_datasets(self): ) ) if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) if snapshot.snapshot_function == "te": self.test_data_sets.append( @@ -840,7 +799,7 @@ def __build_datasets(self): ) ) if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) else: if self.nr_training_data != 0: @@ -940,7 +899,7 @@ def __parametrize_scalers(self): ) ) elif snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, units=snapshot.input_units, @@ -1040,7 +999,7 @@ def __parametrized_load_training_data(self): snapshot.snapshot_function == "tr" and snapshot.snapshot_type == "json+numpy" ): - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) else: printout( "Data scalers already initilized, loading data to RAM.", diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index 95742633..968e497b 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -2,11 +2,12 @@ from abc import ABC import os +import tempfile import numpy as np from 
mala.common.parameters import ParametersData, Parameters -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, get_rank, barrier from mala.targets.target import Target from mala.descriptors.descriptor import Descriptor from mala.datahandling.snapshot import Snapshot @@ -166,6 +167,21 @@ def clear_data(self): """ self.parameters.snapshot_directories_list = [] + def delete_temporary_inputs(self): + """ + Delete temporary data files. + + These may have been created during a training or testing process + when using atomic positions for on-the-fly calculation of descriptors + rather than precomputed data files. + """ + if get_rank() == 0: + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + barrier() + ############################## # Private methods ############################## @@ -286,3 +302,28 @@ def _check_snapshots(self, comm=None): if firstsnapshot: firstsnapshot = False + + def _calculate_temporary_inputs(self, snapshot: Snapshot): + if snapshot.temporary_input_file is not None: + if not os.path.isfile(snapshot.temporary_input_file): + snapshot.temporary_input_file = None + + if snapshot.temporary_input_file is None: + snapshot.temporary_input_file = tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + tmp, grid = self.descriptor_calculator.calculate_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + if self.parameters._configuration["mpi"]: + tmp = self.descriptor_calculator.gather_descriptors(tmp) + + if get_rank() == 0: + np.save(snapshot.temporary_input_file, tmp) + barrier() diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index be859f5a..231fd646 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -45,14 +45,6 @@ def __init__( target_calculator=target_calculator, descriptor_calculator=descriptor_calculator, ) - if self.descriptor_calculator.parameters.descriptors_contain_xyz: - printout( - "Disabling XYZ-cutting from descriptor data for " - "shuffling. If needed, please re-enable afterwards." - ) - self.descriptor_calculator.parameters.descriptors_contain_xyz = ( - False - ) self._data_points_to_remove = None def add_snapshot( @@ -112,15 +104,29 @@ def __shuffle_numpy( for idx, snapshot in enumerate( self.parameters.snapshot_directories_list ): - # TODO: Use descriptor and target calculator for this. - descriptor_data.append( - np.load( - os.path.join( - snapshot.input_npy_directory, snapshot.input_npy_file - ), - mmap_mode="r", + if snapshot.snapshot_type == "numpy": + # TODO: Use descriptor and target calculator for this. + descriptor_data.append( + np.load( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + mmap_mode="r", + ) ) - ) + elif snapshot.snapshot_type == "json+numpy": + descriptor_data.append( + np.load( + snapshot.temporary_input_file, + mmap_mode="r", + ) + ) + else: + raise Exception( + "Invalid snapshot for numpy shuffling " "selected." + ) + target_data.append( np.load( os.path.join( @@ -262,6 +268,8 @@ def __shuffle_numpy( internal_iteration_number=i, ) + self.delete_temporary_inputs() + # The function __shuffle_openpmd can be used to shuffle descriptor data and # target data. 
     # It will be executed one after another for both of them.
@@ -692,11 +700,13 @@ def shuffle_snapshots(
         else:
             file_ending = "npy"
 
+        old_xyz = self.descriptor_calculator.parameters.descriptors_contain_xyz
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = False
         if self.parameters._configuration["mpi"]:
             self._check_snapshots(comm=get_comm())
         else:
             self._check_snapshots()
-
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = old_xyz
         snapshot_types = {
             snapshot.snapshot_type
             for snapshot in self.parameters.snapshot_directories_list
@@ -785,6 +795,18 @@ def shuffle_snapshots(
                 permutations,
                 file_ending,
             )
+        elif snapshot_type == "json+numpy":
+            for snapshot in self.parameters.snapshot_directories_list:
+                self._calculate_temporary_inputs(snapshot)
+            self.__shuffle_numpy(
+                number_of_shuffled_snapshots,
+                shuffle_dimensions,
+                descriptor_save_path,
+                save_name,
+                target_save_path,
+                permutations,
+                file_ending,
+            )
         elif snapshot_type == "openpmd":
             descriptor = self.__DescriptorOrTarget(
                 descriptor_save_path,

From 8e8cb3fc792978b0c22a0ed453bd5de0b6e367f6 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 16:39:08 +0100
Subject: [PATCH 14/23] Shuffling now works as part of the temporary pipeline

---
 mala/datahandling/data_shuffler.py | 153 ++++++++++++++++++++++++-----
 1 file changed, 129 insertions(+), 24 deletions(-)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 231fd646..9d8bb958 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -3,12 +3,14 @@
 import os
 
 import numpy as np
+import tempfile
 
+import mala
 from mala.common.parameters import (
     Parameters,
     DEFAULT_NP_DATA_DTYPE,
 )
-from mala.common.parallelizer import printout
+from mala.common.parallelizer import printout, parallel_warn
 from mala.common.physical_data import PhysicalData
 from mala.datahandling.data_handler_base import DataHandlerBase
 from mala.common.parallelizer import get_comm
@@ -32,6 +34,18 @@ class DataShuffler(DataHandlerBase):
     target_calculator : mala.targets.target.Target
         Used to do unit conversion on output data.
         If None, then one will be created by this class.
+
+    Attributes
+    ----------
+    temporary_shuffled_snapshots : list
+        A list containing snapshot objects of temporary, snapshot-like
+        shuffled data files. By default, this list is empty. If
+        "shuffle_snapshots" is called with "shuffle_to_temporary=True",
+        it will be populated with temporary files saved to hard drive,
+        which can be deleted after model training. Please note that the
+        "snapshot_function", "input_units", "output_units" and
+        "calculation_output" fields of the snapshots within this list
+        are only filled with placeholder values.
     """
 
     def __init__(
@@ -46,6 +59,7 @@ def __init__(
             descriptor_calculator=descriptor_calculator,
         )
         self._data_points_to_remove = None
+        self.temporary_shuffled_snapshots = []
 
     def add_snapshot(
         self,
@@ -97,6 +111,7 @@ def __shuffle_numpy(
         target_save_path,
         permutations,
         file_ending,
+        temporary,
     ):
         # Load the data (via memmap).
         descriptor_data = []
@@ -168,12 +183,6 @@ def __shuffle_numpy(
             target_data[idx] = current_target[indices]
 
         # Do the actual shuffling. 
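         # Each output snapshot receives an equally sized, contiguous slice
         # of every (already permuted) input snapshot, so all shuffled files
         # mix data from all inputs.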
-        target_name_openpmd = os.path.join(
-            target_save_path, save_name.replace("*", "%T")
-        )
-        descriptor_name_openpmd = os.path.join(
-            descriptor_save_path, save_name.replace("*", "%T")
-        )
         for i in range(0, number_of_new_snapshots):
             new_descriptors = np.zeros(
                 (int(np.prod(shuffle_dimensions)), self.input_dimension),
                 dtype=DEFAULT_NP_DATA_DTYPE,
             )
             new_targets = np.zeros(
                 (int(np.prod(shuffle_dimensions)), self.output_dimension),
                 dtype=DEFAULT_NP_DATA_DTYPE,
             )
             last_start = 0
-            descriptor_name = os.path.join(
-                descriptor_save_path, save_name.replace("*", str(i))
-            )
-            target_name = os.path.join(
-                target_save_path, save_name.replace("*", str(i))
-            )
+
+            # Figure out where to save / how to name things.
+            # TODO: This could probably be shortened.
+            if temporary:
+
+                # Adding "snapshot numbers" here is technically not necessary,
+                # but it also doesn't hurt.
+                if file_ending == "npy":
+                    descriptor_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", str(i)),
+                        suffix=".in.npy",
+                        dir=descriptor_save_path,
+                    ).name
+                    target_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", str(i)),
+                        suffix=".out.npy",
+                        dir=target_save_path,
+                    ).name
+                    snapshot_type = "numpy"
+                else:
+                    descriptor_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", "%T"),
+                        suffix=".in." + file_ending,
+                        dir=descriptor_save_path,
+                    ).name
+                    target_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", "%T"),
+                        suffix=".out." + file_ending,
+                        dir=target_save_path,
+                    ).name
+                    snapshot_type = "openpmd"
+                self.temporary_shuffled_snapshots.append(
+                    mala.Snapshot(
+                        os.path.basename(descriptor_name),
+                        os.path.dirname(descriptor_name),
+                        os.path.basename(target_name),
+                        os.path.dirname(target_name),
+                        snapshot_function="te",
+                        output_units="None",
+                        input_units="None",
+                        calculation_output="",
+                        snapshot_type=snapshot_type,
+                    )
+                )
+            else:
+                if file_ending == "npy":
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", str(i)) + ".in.npy",
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", str(i)) + ".out.npy",
+                    )
+                else:
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", "%T") + ".in." + file_ending,
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", "%T") + ".out." + file_ending,
+                    )
 
             # Each new snapshot gets a number_of_new_snapshots-th share of
             # each snapshot.
@@ -234,10 +304,10 @@ def __shuffle_numpy(
             )
             if file_ending == "npy":
                 self.descriptor_calculator.write_to_numpy_file(
-                    descriptor_name + ".in.npy", new_descriptors
+                    descriptor_name, new_descriptors
                 )
                 self.target_calculator.write_to_numpy_file(
-                    target_name + ".out.npy", new_targets
+                    target_name, new_targets
                 )
             else:
                 # We check above that in the non-numpy case, OpenPMD will work.
@@ -248,7 +318,7 @@ def __shuffle_numpy(
                     shuffle_dimensions
                 )
                 self.descriptor_calculator.write_to_openpmd_file(
-                    descriptor_name_openpmd + ".in." + file_ending,
+                    descriptor_name,
                     new_descriptors,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -258,7 +328,7 @@ def __shuffle_numpy(
                     internal_iteration_number=i,
                 )
                 self.target_calculator.write_to_openpmd_file(
-                    target_name_openpmd + ".out." + file_ending,
+                    target_name,
                     array=new_targets,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -268,8 +338,6 @@ def __shuffle_numpy(
                     internal_iteration_number=i,
                 )
 
-        self.delete_temporary_inputs()
-
 # The function __shuffle_openpmd can be used to shuffle descriptor data and
 # target data.
 # It will be executed one after another for both of them.
@@ -652,6 +720,7 @@ def shuffle_snapshots(
         target_save_path=None,
         save_name="mala_shuffled_snapshot*",
         number_of_shuffled_snapshots=None,
+        shuffle_to_temporary=False,
     ):
         """
         Shuffle the snapshots into new snapshots.
@@ -677,6 +746,12 @@ def shuffle_snapshots(
             If not None, this class will attempt to redistribute the data
             to this amount of snapshots. If None, then the same number of
             snapshots provided will be used.
+
+        shuffle_to_temporary : bool
+            If True, shuffled files will be written to temporary data files.
+            The paths used are consistent with non-temporary usage of this
+            class. The paths and names of these temporary files can then be
+            found in the class attribute temporary_shuffled_snapshots.
         """
         # Check the paths.
         if complete_save_path is not None:
@@ -691,12 +766,23 @@ def shuffle_snapshots(
             file_ending = save_name.split(".")[-1]
             save_name = save_name.split(".")[0]
             if file_ending != "npy":
-                import openpmd_api as io
-
-                if file_ending not in io.file_extensions:
-                    raise Exception(
-                        "Invalid file ending selected: " + file_ending
+                if shuffle_to_temporary:
+                    parallel_warn(
+                        "Shuffling to temporary files currently"
+                        " only works with numpy as an engine for "
+                        "intermediate files. You have selected both"
+                        " openpmd and the temporary file option. "
+                        "Will proceed with numpy instead of "
+                        "openpmd."
                     )
+                    file_ending = "npy"
+                else:
+                    import openpmd_api as io
+
+                    if file_ending not in io.file_extensions:
+                        raise Exception(
+                            "Invalid file ending selected: " + file_ending
+                        )
         else:
             file_ending = "npy"
@@ -794,6 +880,7 @@ def shuffle_snapshots(
                 target_save_path,
                 permutations,
                 file_ending,
+                shuffle_to_temporary,
             )
         elif snapshot_type == "json+numpy":
             for snapshot in self.parameters.snapshot_directories_list:
@@ -806,6 +893,7 @@ def shuffle_snapshots(
                 target_save_path,
                 permutations,
                 file_ending,
+                shuffle_to_temporary,
             )
         elif snapshot_type == "openpmd":
             descriptor = self.__DescriptorOrTarget(
@@ -843,6 +931,23 @@ def shuffle_snapshots(
         else:
             raise Exception("Unknown snapshot type: {}".format(snapshot_type))
 
+        # Deleting temporary files that may have been created.
+        self.delete_temporary_inputs()
+
         # Since no training will be done with this class, we should always
         # clear the data at the end.
         self.clear_data()
+
+    def delete_temporary_shuffled_snapshots(self):
+        for snapshot in self.temporary_shuffled_snapshots:
+            input_file = os.path.join(
+                snapshot.input_npy_directory, snapshot.input_npy_file
+            )
+            if os.path.isfile(input_file):
+                os.remove(input_file)
+            output_file = os.path.join(
+                snapshot.output_npy_directory, snapshot.output_npy_file
+            )
+            if os.path.isfile(output_file):
+                os.remove(output_file)
+        self.temporary_shuffled_snapshots = []

From 615792b242ed3f6833e46422084377a024f606a7 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 17:16:03 +0100
Subject: [PATCH 15/23] Fixed docstrings

---
 mala/datahandling/data_shuffler.py |  7 ++++
 mala/descriptors/descriptor.py     | 53 ++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 9d8bb958..1ba0aab5 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -939,6 +939,13 @@ def shuffle_snapshots(
         self.clear_data()
 
     def delete_temporary_shuffled_snapshots(self):
+        """
+        Delete temporary files created during shuffling of data.
+
+        If shuffling has been done with the option "shuffle_to_temporary",
+        shuffled data will be saved to temporary files which can safely be
+        deleted with this function.
+        """
         for snapshot in self.temporary_shuffled_snapshots:
             input_file = os.path.join(
                 snapshot.input_npy_directory, snapshot.input_npy_file
diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py
index 2794e716..e6fec1d3 100644
--- a/mala/descriptors/descriptor.py
+++ b/mala/descriptors/descriptor.py
@@ -377,6 +377,40 @@ def calculate_from_qe_out(
         return self._calculate(working_directory, **kwargs)
 
     def calculate_from_json(self, json_file, working_directory=".", **kwargs):
+        """
+        Calculate the descriptors based on a MALA generated json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Name of MALA json output file for snapshot.
+
+        working_directory : string
+            A directory in which to write the output of the LAMMPS calculation.
+            Usually the local directory should suffice, given that no
+            multiple instances are running in the same directory.
+
+        kwargs : dict
+            A collection of keyword arguments that are mainly used for
+            debugging and development. Different types of descriptors
+            may support different keyword arguments. Commonly supported
+            are
+
+            - "use_fp64": To enforce floating point 64 precision for
+              descriptors.
+            - "keep_logs": To not delete temporary files created during
+              LAMMPS calculation of descriptors.
+
+        Returns
+        -------
+        descriptors : numpy.array
+            Numpy array containing the descriptors with the dimension
+            (x,y,z,descriptor_dimension)
+
+        """
         if isinstance(json_file, str):
             json_dict = json.load(open(json_file, encoding="utf-8"))
         else:
@@ -585,6 +619,25 @@ def convert_local_to_3d(self, descriptors_np):
         return descriptors_full, local_offset, local_reach
 
     def read_dimensions_from_json(self, json_file):
+        """
+        Read only the descriptor dimensions from a json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Path to the json file.
+
+        Returns
+        -------
+        dimension_info : list
+            A list containing the dimensions of the volumetric descriptor
+            data described by the json file.
+        """
         if isinstance(json_file, str):
             json_dict = json.load(open(json_file, encoding="utf-8"))
         else:

From d0e8de649eea7d7abbe6b706622cff102158cb3a Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 17:33:37 +0100
Subject: [PATCH 16/23] Added automatic snapshot type detection

---
 mala/datahandling/data_handler_base.py | 21 ++++++++++++++++++++-
 mala/datahandling/data_shuffler.py     |  2 +-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py
index 968e497b..a69dd879 100644
--- a/mala/datahandling/data_handler_base.py
+++ b/mala/datahandling/data_handler_base.py
@@ -106,7 +106,7 @@ def add_snapshot(
         output_units="1/(eV*A^3)",
         input_units="None",
         calculation_output_file="",
-        snapshot_type="numpy",
+        snapshot_type=None,
     ):
         """
         Add a snapshot to the data pipeline.
@@ -146,6 +146,25 @@ def add_snapshot(
             Either "numpy" or "openpmd" based on what kind of files you
             want to operate on.
         """
+        # Try to guess snapshot type if no information was provided.
+        if snapshot_type is None:
+            input_file_ending = input_file.split(".")[-1]
+            output_file_ending = output_file.split(".")[-1]
+
+            if input_file_ending == "npy" and output_file_ending == "npy":
+                snapshot_type = "numpy"
+
+            elif input_file_ending == "json" and output_file_ending == "npy":
+                snapshot_type = "json+numpy"
+            else:
+                import openpmd_api as io
+
+                if (
+                    input_file_ending in io.file_extensions
+                    and output_file_ending in io.file_extensions
+                ):
+                    snapshot_type = "openpmd"
+
         snapshot = Snapshot(
             input_file,
             input_directory,
diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 1ba0aab5..269b0342 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -67,7 +67,7 @@ def add_snapshot(
         input_directory,
         output_file,
         output_directory,
-        snapshot_type="numpy",
+        snapshot_type=None,
     ):
         """
         Add a snapshot to the data pipeline.

From 07126f1b7584a48313ad56066012e16338e28417 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 09:20:47 +0100
Subject: [PATCH 17/23] Added new temporary framework to examples

---
 examples/advanced/ex02_shuffle_data.py | 92 +++++++++++++++++++++++++-
 examples/basic/ex01_train_network.py   | 23 ++++++-
 examples/basic/ex02_test_network.py    | 11 ++-
 examples/basic/ex03_preprocess_data.py |  2 +-
 4 files changed, 119 insertions(+), 9 deletions(-)

diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py
index db75d515..7f67f428 100644
--- a/examples/advanced/ex02_shuffle_data.py
+++ b/examples/advanced/ex02_shuffle_data.py
@@ -1,5 +1,3 @@
-import os
-
 import mala
 
 from mala.datahandling.data_repo import data_path
@@ -10,6 +8,96 @@
 easily done in memory.
 """
 
+# There are two ways to perform shuffling. One can either shuffle snapshots
+# on the fly and create snapshot-like, temporary files to be directly used
+# for training, or save them to permanent files and load them as needed.
+# The latter is useful when multiple networks are to be trained with the same
+# shuffled data; the former may save hard-disk space by deleting
+# shuffled data right after usage.
+
+
+# 1. Shuffling snapshots on the fly.
+
+# Define parameters for network training after shuffling.
+parameters = mala.Parameters()
+parameters.verbosity = 1
+parameters.data.input_rescaling_type = "feature-wise-standard"
+parameters.data.output_rescaling_type = "minmax"
+parameters.network.layer_activations = ["ReLU"]
+parameters.running.max_number_epochs = 100
+parameters.running.mini_batch_size = 40
+parameters.running.learning_rate = 0.00001
+parameters.running.optimizer = "Adam"
+parameters.targets.target_type = "LDOS"
+parameters.targets.ldos_gridsize = 11
+parameters.targets.ldos_gridspacing_ev = 2.5
+parameters.targets.ldos_gridoffset_ev = -5
+parameters.data.use_lazy_loading = True
+
+# This ensures reproducibility of the created data sets.
+parameters.data.shuffling_seed = 1234
+
+data_shuffler = mala.DataShuffler(parameters)
+
+# Instead of precomputed snapshots, on-the-fly calculated ones may be used
+# as well. Here, we use precomputed ones.
+data_shuffler.add_snapshot(
+    "Be_snapshot0.in.npy",
+    data_path,
+    "Be_snapshot0.out.npy",
+    data_path,
+)
+data_shuffler.add_snapshot(
+    "Be_snapshot1.in.npy",
+    data_path,
+    "Be_snapshot1.out.npy",
+    data_path,
+)
+
+# Shuffle the snapshots using the "shuffle_to_temporary" option.
+data_shuffler.shuffle_snapshots(
+    complete_save_path=data_path,
+    save_name="Be_shuffled*",
+    shuffle_to_temporary=True,
+    number_of_shuffled_snapshots=2,
+)
+
+# The data shuffler provides a list with these temporary, snapshot-like
+# objects that can then be used in network training.
+
+data_handler = mala.DataHandler(parameters)
+data_handler.add_snapshot(
+    data_shuffler.temporary_shuffled_snapshots[0].input_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[0].input_npy_directory,
+    data_shuffler.temporary_shuffled_snapshots[0].output_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[0].output_npy_directory,
+    "tr",
+)
+data_handler.add_snapshot(
+    data_shuffler.temporary_shuffled_snapshots[1].input_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[1].input_npy_directory,
+    data_shuffler.temporary_shuffled_snapshots[1].output_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[1].output_npy_directory,
+    "va",
+)
+data_handler.prepare_data()
+
+# Now we can train a network in the standard MALA way.
+parameters.network.layer_sizes = [
+    data_handler.input_dimension,
+    100,
+    data_handler.output_dimension,
+]
+network = mala.Network(parameters)
+trainer = mala.Trainer(parameters, network, data_handler)
+trainer.train_network()
+
+# Afterwards, the temporary files should be deleted.
+data_shuffler.delete_temporary_shuffled_snapshots()
+
+# 2. Shuffling snapshots to permanent files.
+# After shuffling, the standard approach to MALA data loading can/should
+# be used to train a network.
 
 parameters = mala.Parameters()
diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py
index c7a5ca78..a2542302 100644
--- a/examples/basic/ex01_train_network.py
+++ b/examples/basic/ex01_train_network.py
@@ -48,16 +48,33 @@
 # Data has to be added to the MALA workflow. The central object for this
 # is the DataHandler class, which takes care of all data needs. After data
 # has been added, it is loaded and scaled with the prepare_data function.
-####################
+#
+# There are two ways to add descriptor (=input) data. One is to provide
+# precomputed numpy files. This makes sense when training multiple models
+# with the same data. Alternatively, MALA generated JSON files containing
+# information about a simulation can be provided, with which MALA will
+# automatically generate volumetric descriptor input.
This is useful when, e.g. +# descriptor hyperparameters are to be optimized. In order to use that +# feature, an existing GPU-ready LAMMPS version is recommended. Creation +# of MALA JSON files is shown in ex03_preprocess_data. +##################### data_handler = mala.DataHandler(parameters) -# Add a snapshot we want to use in to the list. + +# Add precomputed snapshots. data_handler.add_snapshot( "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr" ) +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va" + "Be_snapshot1.info.json", + data_path, + "Be_snapshot1.out.npy", + data_path, + "va", ) + data_handler.prepare_data() #################### diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 0d90dfe7..8675a6dc 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -40,21 +40,26 @@ # When preparing the data, make sure to select "reparametrize_scalers=False", # since data scaling was initialized during model training. #################### + +# Add precomputed snapshots. data_handler.add_snapshot( "Be_snapshot2.in.npy", data_path, "Be_snapshot2.out.npy", data_path, "te", - calculation_output_file=os.path.join(data_path, "Be_snapshot2.out"), + calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), ) + +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot3.in.npy", + "Be_snapshot3.info.json", data_path, "Be_snapshot3.out.npy", data_path, "te", - calculation_output_file=os.path.join(data_path, "Be_snapshot3.out"), + calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), ) data_handler.prepare_data(reparametrize_scaler=False) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index e9f4691a..8476ce49 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -49,7 +49,7 @@ # Further, via the simulation_output_input_* keywords, calculation output # can be processed from the original simulation *.out output files into # more convenient *.json files that can be used in their stead. This saves -# on disk space. +# on disk space and makes the process more reproducible. # To only process parts of the data, omit/add descriptor_input*, target_input_* # and simulation_output_* at your leisure. # Make sure to set the correct units - for QE, this should always be From a7d7bd3ff39cd4efb1f8bac5a68b837b89b4c196 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 11:59:30 +0100 Subject: [PATCH 18/23] Fixed examples --- examples/advanced/ex02_shuffle_data.py | 8 ++++++++ examples/basic/ex01_train_network.py | 24 +++++++++++++++++------- examples/basic/ex02_test_network.py | 24 ++++++++++++++++++++---- examples/basic/ex03_preprocess_data.py | 19 +++++++++++++++++-- 4 files changed, 62 insertions(+), 13 deletions(-) diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py index 7f67f428..4631a934 100644 --- a/examples/advanced/ex02_shuffle_data.py +++ b/examples/advanced/ex02_shuffle_data.py @@ -53,6 +53,14 @@ "Be_snapshot1.out.npy", data_path, ) +# On-the-fly snapshots can be added as well. 
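+# Note that descriptor hyperparameters (e.g. the bispectrum settings on
+# parameters.descriptors) have to be set beforehand in that case, since the
+# descriptors are only computed once shuffling is triggered.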
+# data_shuffler.add_snapshot( +# "Be_snapshot2.info.json", +# data_path, +# "Be_snapshot2.out.npy", +# data_path, +# ) + # Shuffle the snapshots using the "shuffle_to_temporary" option. data_shuffler.shuffle_snapshots( diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index a2542302..600840b5 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -65,15 +65,25 @@ data_handler.add_snapshot( "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr" ) -# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors -# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot1.info.json", - data_path, - "Be_snapshot1.out.npy", - data_path, - "va", + "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va" ) +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". +# data_handler.add_snapshot( +# "Be_snapshot0.info.json", +# data_path, +# "Be_snapshot0.out.npy", +# data_path, +# "tr", +# ) +# data_handler.add_snapshot( +# "Be_snapshot1.info.json", +# data_path, +# "Be_snapshot1.out.npy", +# data_path, +# "va", +# ) data_handler.prepare_data() diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 8675a6dc..d02a5874 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -50,17 +50,33 @@ "te", calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), ) - -# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors -# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot3.info.json", + "Be_snapshot3.in.npy", data_path, "Be_snapshot3.out.npy", data_path, "te", calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), ) + +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". +# data_handler.add_snapshot( +# "Be_snapshot2.info.json", +# data_path, +# "Be_snapshot2.out.npy", +# data_path, +# "te", +# calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), +# ) +# data_handler.add_snapshot( +# "Be_snapshot3.info.json", +# data_path, +# "Be_snapshot3.out.npy", +# data_path, +# "te", +# calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), +# ) data_handler.prepare_data(reparametrize_scaler=False) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index 8476ce49..4f8f014b 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -51,7 +51,9 @@ # more convenient *.json files that can be used in their stead. This saves # on disk space and makes the process more reproducible. # To only process parts of the data, omit/add descriptor_input*, target_input_* -# and simulation_output_* at your leisure. +# and simulation_output_* at your leisure. This is especially useful if you, +# e.g., do not need to convert the descriptor data, since it will be +# calculated on-the-fly during training. # Make sure to set the correct units - for QE, this should always be # 1/(Ry*Bohr^3). #################### @@ -60,6 +62,7 @@ outfile = os.path.join(data_path, "Be_snapshot0.out") ldosfile = os.path.join(data_path, "cubes/tmp.pp*Be_ldos.cube") +# Converting a snapshot for training on precomputed descriptor data. 
 data_converter.add_snapshot(
     descriptor_input_type="espresso-out",
     descriptor_input_path=outfile,
     target_input_type=".cube",
     target_input_path=ldosfile,
     simulation_output_type="espresso-out",
     simulation_output_path=outfile,
     target_units="1/(Ry*Bohr^3)",
 )
 
+# Converting a snapshot for training with on-the-fly descriptor calculation.
+# data_converter.add_snapshot(
+#     target_input_type=".cube",
+#     target_input_path=ldosfile,
+#     simulation_output_type="espresso-out",
+#     simulation_output_path=outfile,
+#     target_units="1/(Ry*Bohr^3)",
+# )
+
+
 ####################
 # 3. Converting the data
 # To convert the data we now simply have to call the convert_snapshot function.
@@ -82,9 +95,11 @@
 ####################
 
 data_converter.convert_snapshots(
-    descriptor_save_path="./",
     target_save_path="./",
     simulation_output_save_path="./",
+    # The next line should be omitted if the descriptor data is to be
+    # calculated on-the-fly during training.
+    descriptor_save_path="./",
     naming_scheme="Be_snapshot*.npy",
     descriptor_calculation_kwargs={"working_directory": data_path},
 )

From f0738ce564d7ce9b5563f13ffcd5094d9b966e1c Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 20:12:17 +0100
Subject: [PATCH 19/23] Added documentation

---
 docs/source/advanced_usage/trainingmodel.rst | 15 +++++++--
 docs/source/basic_usage/more_data.rst        | 13 ++++++--
 docs/source/basic_usage/trainingmodel.rst    | 35 +++++++++++++++++---
 examples/basic/ex05_run_predictions.py       |  2 +-
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst
index 9b118d86..dfa7b38a 100644
--- a/docs/source/advanced_usage/trainingmodel.rst
+++ b/docs/source/advanced_usage/trainingmodel.rst
@@ -170,10 +170,11 @@ data sets have to be saved - in-memory implementations are currently developed.
 
 To use the data shuffling (also shown in example
 ``advanced/ex02_shuffle_data.py``), you can use the ``DataShuffler`` class.
-The syntax is very easy, you create a ``DataShufller`` object,
+The syntax is very easy: you create a ``DataShuffler`` object,
 which provides the same ``add_snapshot`` functionalities as the ``DataHandler``
-object, and shuffle the data once you have added all snapshots in question,
-i.e.,
+object, and shuffle the data once you have added all snapshots in question.
+Just as with the ``DataHandler`` class, on-the-fly calculation of bispectrum
+descriptors is supported.
 
 .. code-block:: python
 
@@ -187,6 +188,14 @@ i.e.,
     data_shuffler.shuffle_snapshots(complete_save_path="../",
                                     save_name="Be_shuffled*")
 
+By using the ``shuffle_to_temporary`` keyword, you can shuffle the data to
+temporary files, which can be deleted after the training run. This is useful
+if you want to shuffle the data right before training and do not plan to re-use
+shuffled data files for multiple training runs. As detailed in
+``advanced/ex02_shuffle_data.py``, access to temporary files is provided via
+``data_shuffler.temporary_shuffled_snapshots[...]``, which is a list containing
+``mala.Snapshot`` objects.
+
 The seed ``parameters.data.shuffling_seed`` ensures reproducibility of data
 sets. The ``shuffle_snapshots`` function has a path handling ability akin to
 the ``DataConverter`` class. Further, via the ``number_of_shuffled_snapshots``
diff --git a/docs/source/basic_usage/more_data.rst b/docs/source/basic_usage/more_data.rst
index 4232a79b..89484510 100644
--- a/docs/source/basic_usage/more_data.rst
+++ b/docs/source/basic_usage/more_data.rst
@@ -49,7 +49,13 @@ MALA can be used to process raw data into ready-to-use data for ML-DFT model
 creation.
 For this, the ``DataConverter`` class can be used, as also shown in
 the example ``basic/ex03_preprocess_data``.
 The first thing when converting data is to select how the data should be
-processed. Up until now, MALA operates with bispectrum descriptors as
+processed. As outlined in :doc:`the training documentation `,
+there are two ways to provide descriptor data to MALA models. One can either
+precompute files containing descriptors with the ``DataConverter`` class or
+compute descriptor data on-the-fly by providing MALA generated JSON files
+containing simulation output information. These JSON files can also be
+generated by the ``DataConverter`` class.
+Up until now, MALA operates with bispectrum descriptors as
 input data (=descriptors) and LDOS as output data (=targets). Their
 calculation is configured via
@@ -73,6 +79,8 @@ values are included in the energy grid upon which the LDOS is sampled,
 ``ldos_gridoffset_ev`` determines the lowest energy value sampled.
 These values are chosen for the ``pp.x`` simulation and have to be given
 here.
+If descriptors are precomputed, then hyperparameters for their calculation
+have to be provided.
 For the bispectrum calculation, ``bispectrum_cutoff`` gives the radius of the
 cutoff sphere from which information on the atomic structure is incorporated
 into the bispectrum descriptor vector at each point in space, whereas
@@ -111,7 +119,8 @@ respectively, and the ``target_units`` will always be ``"1/(Ry*Bohr^3)"``.
 The paths have to be modified accordingly. ``simulation_output_*`` refers
 to the calculation output file - MALA provides an interface to condense the
 entire, verbose simulation output to ``.json`` files for further
-processing. In the preceding section, we had to specify calculation output
+processing or on-the-fly descriptor calculation.
+In the preceding section, we had to specify calculation output
 files a number of times - instead, we can use the reduced ``.json`` files
 if we let them be created by the ``DataConverter`` class.
 
diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst
index 53cb8a8d..a85a4a8f 100644
--- a/docs/source/basic_usage/trainingmodel.rst
+++ b/docs/source/basic_usage/trainingmodel.rst
@@ -89,10 +89,14 @@ As with any ML library, MALA is a data-driven framework. So before we can
 train a model, we need to add data. The central object to manage data for
 any MALA workflow is the ``DataHandler`` class.
 
-MALA manages data "per snapshot". One snapshot is one atomic configuration,
-for which volumetric input and output data has been calculated. Data has to
-be added to the ``DataHandler`` object per snapshot, pointing to the
-where the volumetric data files are saved on disk. This is done via
+MALA manages data "per snapshot". One snapshot is an atomic configuration with
+associated volumetric data. Snapshots have to be added to the ``DataHandler``
+object. There are two ways to provide snapshot data, which are selected by
+providing the respective types of data files.
+
+1. Precomputed descriptors: The LDOS is sampled and the volumetric descriptor
+   data is precomputed into either OpenPMD or numpy files
+   (as described :doc:`here `), and both can be loaded for training.
 
 .. code-block:: python
 
     data_handler = mala.DataHandler(parameters)
     data_handler.add_snapshot("Be_snapshot0.in.npy", data_path,
                               "Be_snapshot0.out.npy", data_path, "tr")
@@ -102,12 +106,33 @@ where the volumetric data files are saved on disk. This is done via
     data_handler.add_snapshot("Be_snapshot1.in.npy", data_path,
                               "Be_snapshot1.out.npy", data_path, "va")
 
+2. On-the-fly descriptors: The LDOS is sampled into either OpenPMD or numpy
+   files, while the volumetric descriptor data is computed on-the-fly during
+   training or shuffling. The starting point for the descriptor calculation
+   in this case is the simulation output saved in a JSON file. This mode is
+   only recommended if a GPU-enabled LAMMPS version is available. If this
+   route is used, then descriptor calculation hyperparameters need to be set
+   before adding snapshots; see the :doc:`data conversion manual ` for
+   details.
+
+   .. code-block:: python
+
+        # Bispectrum parameters.
+        parameters.descriptors.descriptor_type = "Bispectrum"
+        parameters.descriptors.bispectrum_twojmax = 10
+        parameters.descriptors.bispectrum_cutoff = 4.67637
+
+        data_handler = mala.DataHandler(parameters)
+        data_handler.add_snapshot("Be_snapshot0.info.json", data_path,
+                                  "Be_snapshot0.out.npy", data_path, "tr")
+        data_handler.add_snapshot("Be_snapshot1.info.json", data_path,
+                                  "Be_snapshot1.out.npy", data_path, "va")
+
 The ``"tr"`` and ``"va"`` flags signal that the respective snapshots are added
 as training and validation data, respectively. Training data is data the
 model is directly tuned on; validation data is data used to verify the model
 performance during the run time and make sure that no overfitting occurs.
 After data has been added to the ``DataHandler``, it has to be actually loaded
-and scaled via
+(or in the case of on-the-fly usage, computed) and scaled via
 
 .. code-block:: python
 
diff --git a/examples/basic/ex05_run_predictions.py b/examples/basic/ex05_run_predictions.py
index 05deb857..a0b193c9 100644
--- a/examples/basic/ex05_run_predictions.py
+++ b/examples/basic/ex05_run_predictions.py
@@ -11,7 +11,7 @@
 configurations. Either execute ex01 before executing this one or download the
 appropriate model from the provided test data repo.
 
-REQUIRES LAMMPS (and potentiall the total energy module).
+REQUIRES LAMMPS (and potentially the total energy module).
 """
 
 model_name = "Be_model"

From d359057c84c6d0b1a588a7e00f9b08f1de913ccc Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 20:13:56 +0100
Subject: [PATCH 20/23] Small adjustment in example

---
 examples/advanced/ex02_shuffle_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py
index 4631a934..a56ebdfc 100644
--- a/examples/advanced/ex02_shuffle_data.py
+++ b/examples/advanced/ex02_shuffle_data.py
@@ -24,7 +24,9 @@
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
 parameters.network.layer_activations = ["ReLU"]
-parameters.running.max_number_epochs = 100
+
+# No real training, just showing how shuffling directly before training works.
+parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 parameters.running.optimizer = "Adam" From 9d87a2f0debdba4b099711f3513bc7df5d91e699 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 21:06:43 +0100 Subject: [PATCH 21/23] Implemented rudimentary json+openpmd --- mala/datahandling/data_handler.py | 57 +++++++++++++++---- mala/datahandling/data_handler_base.py | 15 ++++- mala/datahandling/data_shuffler.py | 5 ++ mala/datahandling/lazy_load_dataset.py | 12 ++++ mala/datahandling/lazy_load_dataset_single.py | 20 +++++++ .../multi_lazy_load_data_loader.py | 31 ++++++++++ 6 files changed, 128 insertions(+), 12 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 5823d459..35dffcbf 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -595,7 +595,10 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: @@ -623,6 +626,22 @@ def __load_data(self, function, data_type): file, units=units ).reshape([gs_new, feature_dimension]) ) + elif snapshot.snapshot_type == "json+openpmd": + if data_type == "inputs": + calculator.read_from_numpy_file( + file, + units=units, + array=getattr(self, array)[ + gs_old : gs_old + gs_new, : + ], + reshape=True, + ) + else: + getattr(self, array)[gs_old : gs_old + gs_new] = ( + calculator.read_from_openpmd_file( + file, units=units + ).reshape([gs_new, feature_dimension]) + ) else: raise Exception("Unknown snapshot file type.") snapshot_counter += 1 @@ -726,14 +745,20 @@ def __build_datasets(self): # already been built during parametrization, for all other # snapshot types, this has to be done here. 
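+            # (Scaler parametrization only covers the training data, which is
+            # why temporary inputs for validation and test snapshots are
+            # computed at this point.)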
if snapshot.snapshot_function == "va": - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -780,7 +805,10 @@ def __build_datasets(self): self._use_ddp, ) ) - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) if snapshot.snapshot_function == "te": @@ -798,7 +826,10 @@ def __build_datasets(self): input_requires_grad=True, ) ) - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) else: @@ -898,7 +929,10 @@ def __parametrize_scalers(self): ) ) ) - elif snapshot.snapshot_type == "json+numpy": + elif ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, @@ -956,7 +990,10 @@ def __parametrize_scalers(self): ), units=snapshot.output_units, ) - elif snapshot.snapshot_type == "openpmd": + elif ( + snapshot.snapshot_type == "openpmd" + or snapshot.snapshot_type == "json+openpmd" + ): tmp = self.target_calculator.read_from_openpmd_file( os.path.join( snapshot.output_npy_directory, @@ -995,9 +1032,9 @@ def __parametrized_load_training_data(self): ) for snapshot in self.parameters.snapshot_directories_list: # Data scaling is only performed on the training data sets. 
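+            # Only training snapshots therefore need their temporary inputs
+            # created before scaling; validation and test data are handled
+            # when the data sets are built.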
- if ( - snapshot.snapshot_function == "tr" - and snapshot.snapshot_type == "json+numpy" + if snapshot.snapshot_function == "tr" and ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" ): self._calculate_temporary_inputs(snapshot) else: diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index a69dd879..8c2b6141 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -164,6 +164,11 @@ def add_snapshot( and output_file_ending in io.file_extensions ): snapshot_type = "openpmd" + if ( + input_file_ending == "json" + and output_file_ending in io.file_extensions + ): + snapshot_type = "json+openpmd" snapshot = Snapshot( input_file, @@ -242,7 +247,10 @@ def _check_snapshots(self, comm=None): ), comm=comm, ) - elif snapshot.snapshot_type == "json+numpy": + elif ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): tmp_dimension = ( self.descriptor_calculator.read_dimensions_from_json( os.path.join( @@ -291,7 +299,10 @@ def _check_snapshots(self, comm=None): ) ) ) - elif snapshot.snapshot_type == "openpmd": + elif ( + snapshot.snapshot_type == "openpmd" + or snapshot.snapshot_type == "json+openpmd" + ): tmp_dimension = ( self.target_calculator.read_dimensions_from_openpmd_file( os.path.join( diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 269b0342..2e0799c1 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -928,6 +928,11 @@ def shuffle_snapshots( permutations, file_ending, ) + elif snapshot_type == "json+openpmd": + raise Exception( + "Shuffling from JSON files and OpenPMD is " + "currently not supported." + ) else: raise Exception("Unknown snapshot type: {}".format(snapshot_type)) diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index 78d11431..10c00528 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -192,6 +192,18 @@ def get_new_data(self, file_index): ) ) + elif self._snapshot_list[file_index].snapshot_type == "json+openpmd": + self.input_data = self._descriptor_calculator.read_from_numpy_file( + self._snapshot_list[file_index].temporary_input_file, + units=self._snapshot_list[file_index].input_units, + ) + self.output_data = self._target_calculator.read_from_openpmd_file( + os.path.join( + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, + ) + ) + # Transform the data. 
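+        # Both arrays are loaded flat and are reshaped below to
+        # (grid_size, feature_dimension) before further processing.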
self.input_data = self.input_data.reshape( [self._snapshot_list[file_index].grid_size, self._input_dimension] diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 1baef9ea..e775a0cc 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -171,6 +171,26 @@ def allocate_shared_mem(self): ) ) + self.output_shape, self.output_dtype = ( + self.target_calculator.read_dimensions_from_openpmd_file( + os.path.join( + self.snapshot.output_npy_directory, + self.snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) + elif self.snapshot.snapshot_type == "json+openpmd": + self.input_shape = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + self.snapshot.input_npy_directory, + self.snapshot.input_npy_file, + ), + ) + ) + self.input_dtype = np.dtype(np.float32) + self.output_shape, self.output_dtype = ( self.target_calculator.read_dimensions_from_openpmd_file( os.path.join( diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index 35b02601..993c4170 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -183,6 +183,24 @@ def load_snapshot_to_shm( ) ) + output_shape, output_dtype = ( + target_calculator.read_dimensions_from_openpmd_file( + os.path.join( + snapshot.output_npy_directory, + snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) + elif snapshot.snapshot_type == "json+openpmd": + input_shape = descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + ) + input_dtype = np.dtype(np.float32) + output_shape, output_dtype = ( target_calculator.read_dimensions_from_openpmd_file( os.path.join( @@ -267,6 +285,19 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) + elif snapshot.snapshot_type == "json+openpmd": + descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + array=input_data, + ) + target_calculator.read_from_openpmd_file( + os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ), + units=snapshot.output_units, + array=output_data, + ) else: raise Exception("Invalid snapshot type selected.") From d1960d42bd42b874ddf08e49c1c4f8bab6aa3684 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 21:30:02 +0100 Subject: [PATCH 22/23] Added tests for on-the-fly calculations --- test/on_the_fly_test.py | 180 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 test/on_the_fly_test.py diff --git a/test/on_the_fly_test.py b/test/on_the_fly_test.py new file mode 100644 index 00000000..28d20916 --- /dev/null +++ b/test/on_the_fly_test.py @@ -0,0 +1,180 @@ +import os + +import mala +import numpy as np + +from mala.datahandling.data_repo import data_path + +# Control the accuracies for the postprocessing routines. +accuracy_predictions = 1.0 + + +class TestOnTheFly: + """ + Test the on-the-fly descriptor calculation capabilities. + + This relates to training, testing and shuffling descriptor data that has + been computed on the fly. 
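+
+    The bispectrum settings used throughout these tests are deliberately
+    coarse; they carry no physical meaning and only keep the descriptor
+    calculation fast.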
+    """
+
+    @staticmethod
+    def __setup_training(lazy_loading, checkpoint_name=None):
+        test_parameters = mala.Parameters()
+        test_parameters.use_lammps = False
+        test_parameters.data.data_splitting_type = "by_snapshot"
+        test_parameters.data.input_rescaling_type = "feature-wise-standard"
+        test_parameters.data.output_rescaling_type = "minmax"
+        test_parameters.data.use_lazy_loading = lazy_loading
+        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.running.max_number_epochs = 5
+        test_parameters.running.mini_batch_size = 40
+        test_parameters.running.learning_rate = 0.00001
+        test_parameters.running.optimizer = "Adam"
+        if checkpoint_name:
+            test_parameters.running.checkpoints_each_epoch = 5
+            test_parameters.running.checkpoint_name = checkpoint_name
+
+        test_parameters.descriptors.descriptor_type = "Bispectrum"
+        # These values make no physical sense, but are chosen to make the test
+        # fast enough to be executed without LAMMPS.
+        test_parameters.descriptors.bispectrum_twojmax = 2
+        test_parameters.descriptors.bispectrum_cutoff = 1.5
+
+        # Load data.
+        data_handler = mala.DataHandler(test_parameters)
+        data_handler.add_snapshot(
+            "Be_snapshot0.info.json",
+            data_path,
+            "Be_snapshot0.out.npy",
+            data_path,
+            "tr",
+        )
+        data_handler.add_snapshot(
+            "Be_snapshot1.info.json",
+            data_path,
+            "Be_snapshot1.out.npy",
+            data_path,
+            "va",
+        )
+        data_handler.prepare_data()
+
+        # Train a network.
+        test_parameters.network.layer_sizes = [
+            data_handler.input_dimension,
+            100,
+            data_handler.output_dimension,
+        ]
+
+        # Setup network and trainer.
+        test_network = mala.Network(test_parameters)
+        test_trainer = mala.Trainer(
+            test_parameters, test_network, data_handler
+        )
+
+        return test_trainer
+
+    @staticmethod
+    def __resume_checkpoint(checkpoint_name, actual_max_epochs):
+        loaded_params, loaded_network, new_datahandler, new_trainer = (
+            mala.Trainer.load_run(checkpoint_name)
+        )
+        loaded_params.running.max_number_epochs = actual_max_epochs
+        return new_trainer
+
+    def test_training(self):
+        test_trainer = self.__setup_training(False)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading(self):
+        test_trainer = self.__setup_training(True)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading_interrupted(self):
+        test_trainer = self.__setup_training(True, "test_checkpoint_onthefly")
+        test_trainer.train_network()
+        test_trainer.data.delete_temporary_inputs()
+        test_trainer = self.__resume_checkpoint("test_checkpoint_onthefly", 10)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading_uninterrupted(self):
+        test_trainer = self.__setup_training(True, "test_checkpoint_onthefly")
+        test_trainer.train_network()
+        test_trainer = self.__resume_checkpoint("test_checkpoint_onthefly", 10)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_shuffling(self):
+        test_parameters = mala.Parameters()
+        test_parameters.data.shuffling_seed = 1234
+        test_parameters.data.data_splitting_type = "by_snapshot"
+        test_parameters.data.input_rescaling_type = "feature-wise-standard"
+        test_parameters.data.output_rescaling_type = "minmax"
+        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.running.max_number_epochs = 5
+        test_parameters.running.mini_batch_size = 40
+        test_parameters.running.learning_rate = 0.00001
+        test_parameters.running.optimizer = "Adam"
+        test_parameters.verbosity = 1
+        test_parameters.data.use_lazy_loading = True
+        data_shuffler = mala.DataShuffler(test_parameters)
+        test_parameters.descriptors.descriptor_type = "Bispectrum"
+        # These values make no physical sense, but are chosen to make the test
+        # fast enough to be executed without LAMMPS.
+        test_parameters.descriptors.bispectrum_twojmax = 2
+        test_parameters.descriptors.bispectrum_cutoff = 1.5
+
+        # Add the snapshots we want to use to the list.
+        data_shuffler.add_snapshot(
+            "Be_snapshot0.info.json",
+            data_path,
+            "Be_snapshot0.out.npy",
+            data_path,
+        )
+        data_shuffler.add_snapshot(
+            "Be_snapshot1.info.json",
+            data_path,
+            "Be_snapshot1.out.npy",
+            data_path,
+        )
+
+        # After shuffling, these snapshots can be loaded as regular snapshots
+        # for lazily loaded training.
+        data_shuffler.shuffle_snapshots(
+            "./", save_name="Be_shuffled*", shuffle_to_temporary=True
+        )
+        test_parameters.descriptors.descriptors_contain_xyz = True
+
+        # Train with shuffling.
+        data_handler = mala.DataHandler(test_parameters)
+        # Add the shuffled snapshots we want to use to the list.
+        data_handler.add_snapshot(
+            data_shuffler.temporary_shuffled_snapshots[0].input_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[0].input_npy_directory,
+            data_shuffler.temporary_shuffled_snapshots[0].output_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[0].output_npy_directory,
+            "tr",
+        )
+        data_handler.add_snapshot(
+            data_shuffler.temporary_shuffled_snapshots[1].input_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[1].input_npy_directory,
+            data_shuffler.temporary_shuffled_snapshots[1].output_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[1].output_npy_directory,
+            "va",
+        )
+        data_handler.prepare_data()
+
+        test_parameters.network.layer_sizes = [
+            data_handler.input_dimension,
+            100,
+            data_handler.output_dimension,
+        ]
+
+        test_network = mala.Network(test_parameters)
+        test_trainer = mala.Trainer(
+            test_parameters, test_network, data_handler
+        )
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf

From ac255b292ce172a4b674e162f1b325a6f2d788ff Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Wed, 15 Jan 2025 10:31:03 +0100
Subject: [PATCH 23/23] Small adjustment in test

---
 test/on_the_fly_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/on_the_fly_test.py b/test/on_the_fly_test.py
index 28d20916..2074ceb4 100644
--- a/test/on_the_fly_test.py
+++ b/test/on_the_fly_test.py
@@ -20,7 +20,6 @@ class TestOnTheFly:
     @staticmethod
     def __setup_training(lazy_loading, checkpoint_name=None):
         test_parameters = mala.Parameters()
-        test_parameters.use_lammps = False
         test_parameters.data.data_splitting_type = "by_snapshot"
@@ -177,4 +176,5 @@ def test_shuffling(self):
             test_parameters, test_network, data_handler
         )
         test_trainer.train_network()
+        data_shuffler.delete_temporary_shuffled_snapshots()
         assert test_trainer.final_validation_loss < np.inf