From e154d2a49bcab8778f02f04f19802753f5dd85de Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 6 Jan 2025 16:59:49 +0100 Subject: [PATCH 01/23] On-the-fly training works for the RAM case --- mala/datahandling/data_handler.py | 39 +++++++++++++++++++++++- mala/datahandling/data_handler_base.py | 14 ++++++++- mala/datahandling/snapshot.py | 3 ++ mala/descriptors/atomic_density.py | 4 +++ mala/descriptors/bispectrum.py | 32 +++++++++++-------- mala/descriptors/descriptor.py | 27 +++++++++++++++- mala/descriptors/minterpy_descriptors.py | 6 ++++ mala/network/trainer.py | 3 ++ mala/targets/target.py | 2 +- 9 files changed, 113 insertions(+), 17 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 3b9521e4..fa24d3c3 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -1,6 +1,7 @@ """DataHandler class that loads and scales data.""" import os +import tempfile import numpy as np import torch @@ -169,6 +170,19 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() + def delete_temporary_data(self): + """ + Delete temporary data files. + + These may have been created during a training or testing process + when using atomic positions for on-the-fly calculation of descriptors + rather than precomputed data files. + """ + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + # Preparing data ###################### @@ -595,6 +609,26 @@ def __load_data(self, function, data_type): snapshot.input_npy_directory, snapshot.input_npy_file ) units = snapshot.input_units + + # If the input for the descriptors is actually a JSON + # file then we need to calculate the descriptors. 
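+            # The JSON file holds the atomic configuration and grid
+            # information; the computed descriptors are cached in a
+            # temporary *.in.npy file next to the snapshot, so that they
+            # can be re-read like precomputed inputs and removed again
+            # once training or testing is done.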
+ if snapshot.snapshot_type == "json+numpy": + snapshot.temporary_input_file = ( + tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + ) + descriptors, grid = ( + self.descriptor_calculator.calculate_from_json( + file + ) + ) + np.save(snapshot.temporary_input_file, descriptors) + file = snapshot.temporary_input_file + else: file = os.path.join( snapshot.output_npy_directory, @@ -602,7 +636,10 @@ def __load_data(self, function, data_type): ) units = snapshot.output_units - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): calculator.read_from_numpy_file( file, units=units, diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index c141551f..95742633 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -207,6 +207,15 @@ def _check_snapshots(self, comm=None): ), comm=comm, ) + elif snapshot.snapshot_type == "json+numpy": + tmp_dimension = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + ) else: raise Exception("Unknown snapshot file type.") @@ -235,7 +244,10 @@ def _check_snapshots(self, comm=None): snapshot.output_npy_directory, min_verbosity=1, ) - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp_dimension = ( self.target_calculator.read_dimensions_from_numpy_file( os.path.join( diff --git a/mala/datahandling/snapshot.py b/mala/datahandling/snapshot.py index 1bac8488..e4472835 100644 --- a/mala/datahandling/snapshot.py +++ b/mala/datahandling/snapshot.py @@ -133,6 +133,9 @@ def __init__( self.input_dimension = None self.output_dimension = None + # Temporary descriptor files, which may be needed. + self.temporary_input_file = None + @classmethod def from_json(cls, json_dict): """ diff --git a/mala/descriptors/atomic_density.py b/mala/descriptors/atomic_density.py index 4459c838..6a052747 100755 --- a/mala/descriptors/atomic_density.py +++ b/mala/descriptors/atomic_density.py @@ -119,6 +119,10 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + # For now, has to be adapted in the multielement case. 
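+        # Three columns for the x/y/z grid coordinates plus one
+        # Gaussian density value per grid point.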
+ return 4 + def __calculate_lammps(self, outdir, **kwargs): """Perform actual Gaussian descriptor calculation.""" # For version compatibility; older lammps versions (the serial version diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index ab8bbff7..8932c61b 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -120,6 +120,24 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + if self.parameters.descriptors_contain_xyz: + return self.__get_feature_size() - 3 + else: + return self.__get_feature_size() + + def __get_feature_size(self): + ncols0 = 3 + + # Analytical relation for fingerprint length + ncoeff = ( + (self.parameters.bispectrum_twojmax + 2) + * (self.parameters.bispectrum_twojmax + 3) + * (self.parameters.bispectrum_twojmax + 4) + ) + ncoeff = ncoeff // 24 # integer division + return ncols0 + ncoeff + def __calculate_lammps(self, outdir, **kwargs): """ Perform bispectrum calculation using LAMMPS. @@ -173,19 +191,7 @@ def __calculate_lammps(self, outdir, **kwargs): # Do the LAMMPS calculation and clean up. lmp.file(self.parameters.lammps_compute_file) - - # Set things not accessible from LAMMPS - # First 3 cols are x, y, z, coords - ncols0 = 3 - - # Analytical relation for fingerprint length - ncoeff = ( - (self.parameters.bispectrum_twojmax + 2) - * (self.parameters.bispectrum_twojmax + 3) - * (self.parameters.bispectrum_twojmax + 4) - ) - ncoeff = ncoeff // 24 # integer division - self.feature_size = ncols0 + ncoeff + self.feature_size = self.__get_feature_size() # Extract data from LAMMPS calculation. # This is different for the parallel and the serial case. diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py index 041dd4b3..2794e716 100644 --- a/mala/descriptors/descriptor.py +++ b/mala/descriptors/descriptor.py @@ -1,11 +1,12 @@ """Base class for all descriptor calculators.""" from abc import abstractmethod -from functools import cached_property +import json import os import tempfile import ase +from ase.cell import Cell from ase.units import m from ase.neighborlist import NeighborList, NewPrimitiveNeighborList import numpy as np @@ -375,6 +376,16 @@ def calculate_from_qe_out( return self._calculate(working_directory, **kwargs) + def calculate_from_json(self, json_file, working_directory=".", **kwargs): + if isinstance(json_file, str): + json_dict = json.load(open(json_file, encoding="utf-8")) + else: + json_dict = json.load(json_file) + self.grid_dimensions = json_dict["grid_dimensions"] + self._atoms = ase.Atoms.fromdict(json_dict["atoms"]) + self._voxel = Cell(json_dict["voxel"]["array"]) + return self._calculate(working_directory, **kwargs) + def calculate_from_atoms( self, atoms, grid_dimensions, working_directory=".", **kwargs ): @@ -573,6 +584,16 @@ def convert_local_to_3d(self, descriptors_np): ).transpose([2, 1, 0, 3]) return descriptors_full, local_offset, local_reach + def read_dimensions_from_json(self, json_file): + if isinstance(json_file, str): + json_dict = json.load(open(json_file, encoding="utf-8")) + else: + json_dict = json.load(json_file) + grid_dimensions = json_dict["grid_dimensions"] + [ + self._read_feature_dimension_from_json(json_dict) + ] + return grid_dimensions + # Private methods ################# @@ -1021,5 +1042,9 @@ def _grid_to_coord(self, gridpoint): def _calculate(self, outdir, **kwargs): pass + @abstractmethod + def _read_feature_dimension_from_json(self, 
json_dict): + pass + def _set_feature_size_from_array(self, array): self.feature_size = np.shape(array)[-1] diff --git a/mala/descriptors/minterpy_descriptors.py b/mala/descriptors/minterpy_descriptors.py index 2d9d5216..cfedf113 100755 --- a/mala/descriptors/minterpy_descriptors.py +++ b/mala/descriptors/minterpy_descriptors.py @@ -87,6 +87,12 @@ def backconvert_units(array, out_units): else: raise Exception("Unsupported unit for Minterpy descriptors.") + def _read_feature_dimension_from_json(self, json_dict): + raise Exception( + "This feature has not been implemented for Minterpy " + "descriptors." + ) + def _calculate(self, atoms, outdir, grid_dimensions, **kwargs): # For version compatibility; older lammps versions (the serial version # we still use on some machines) have these constants as part of the diff --git a/mala/network/trainer.py b/mala/network/trainer.py index ccd0ab70..b69faae5 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -597,6 +597,9 @@ def train_network(self): ) self.final_validation_loss = vloss + # Cleaning up temporary data files. + self.data.delete_temporary_data() + # Clean-up for pre-fetching lazy loading. if self.data.parameters.use_lazy_loading_prefetch: self._training_data_loaders.cleanup() diff --git a/mala/targets/target.py b/mala/targets/target.py index 10a414c6..17eb5a4d 100644 --- a/mala/targets/target.py +++ b/mala/targets/target.py @@ -653,7 +653,7 @@ def read_additional_calculation_data(self, data, data_type=None): } self.atomic_forces_dft = None self.entropy_contribution_dft_calculation = None - self.grid_dimensions = [0, 0, 0] + self.grid_dimensions = json_dict["grid_dimensions"] self.atoms = None for key in json_dict: From 88a5edb4568e8f15013a7be34a05d95da2e9e2db Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 6 Jan 2025 17:41:46 +0100 Subject: [PATCH 02/23] Lazy Loading training works now --- mala/datahandling/data_handler.py | 51 ++++++++++++++++++-------- mala/datahandling/lazy_load_dataset.py | 12 ++++++ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index fa24d3c3..0a11ff2c 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -350,6 +350,21 @@ def get_snapshot_calculation_output(self, snapshot_number): snapshot_number ].calculation_output + def calculate_temporary_inputs(self, snapshot: Snapshot): + snapshot.temporary_input_file = tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + tmp, grid = self.descriptor_calculator.calculate_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + np.save(snapshot.temporary_input_file, tmp) + # Debugging ###################### @@ -613,22 +628,8 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. 
if snapshot.snapshot_type == "json+numpy": - snapshot.temporary_input_file = ( - tempfile.NamedTemporaryFile( - delete=False, - prefix=snapshot.input_npy_file.split(".")[0], - suffix=".in.npy", - dir=snapshot.input_npy_directory, - ).name - ) - descriptors, grid = ( - self.descriptor_calculator.calculate_from_json( - file - ) - ) - np.save(snapshot.temporary_input_file, descriptors) + self.calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file - else: file = os.path.join( snapshot.output_npy_directory, @@ -753,11 +754,20 @@ def __build_datasets(self): self.training_data_sets[0].add_snapshot_to_dataset( snapshot ) + # For training snapshots, temporary files (if needed) have + # already been built during parametrization, for all other + # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": + if snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": + if snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + self.test_data_sets[0].add_snapshot_to_dataset(snapshot) # I don't think we need to mix them here. We can use the standard @@ -915,6 +925,12 @@ def __parametrize_scalers(self): ) ) ) + elif snapshot.snapshot_type == "json+numpy": + self.calculate_temporary_inputs(snapshot) + tmp = self.descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + ) else: raise Exception("Unknown snapshot file type.") @@ -956,7 +972,10 @@ def __parametrize_scalers(self): for snapshot in self.parameters.snapshot_directories_list: # Data scaling is only performed on the training data sets. if snapshot.snapshot_function == "tr": - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp = self.target_calculator.read_from_numpy_file( os.path.join( snapshot.output_npy_directory, diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index a5a2b1a5..78d11431 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -163,6 +163,18 @@ def get_new_data(self, file_index): ), units=self._snapshot_list[file_index].output_units, ) + elif self._snapshot_list[file_index].snapshot_type == "json+numpy": + self.input_data = self._descriptor_calculator.read_from_numpy_file( + self._snapshot_list[file_index].temporary_input_file, + units=self._snapshot_list[file_index].input_units, + ) + self.output_data = self._target_calculator.read_from_numpy_file( + os.path.join( + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, + ), + units=self._snapshot_list[file_index].output_units, + ) elif self._snapshot_list[file_index].snapshot_type == "openpmd": self.input_data = ( From 0e4ddfea0aed7509688cff339b1edb488cba2bbd Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 11:36:40 +0100 Subject: [PATCH 03/23] Checkpointing works now as well --- mala/datahandling/data_handler.py | 81 ++++++++++++++++++++----------- mala/network/trainer.py | 2 +- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 0a11ff2c..b9127ca4 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -7,7 +7,7 @@ import torch from torch.utils.data import TensorDataset 
-from mala.common.parallelizer import printout, barrier +from mala.common.parallelizer import printout, barrier, get_rank from mala.common.parameters import Parameters, DEFAULT_NP_DATA_DTYPE from mala.datahandling.data_handler_base import DataHandlerBase from mala.datahandling.data_scaler import DataScaler @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_data(self): + def _delete_temporary_data(self): """ Delete temporary data files. @@ -178,10 +178,12 @@ def delete_temporary_data(self): when using atomic positions for on-the-fly calculation of descriptors rather than precomputed data files. """ - for snapshot in self.parameters.snapshot_directories_list: - if snapshot.temporary_input_file is not None: - if os.path.isfile(snapshot.temporary_input_file): - os.remove(snapshot.temporary_input_file) + if get_rank() == 0: + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + barrier() # Preparing data ###################### @@ -241,16 +243,8 @@ def prepare_data(self, reparametrize_scaler=True): printout("Initializing the data scalers.", min_verbosity=1) self.__parametrize_scalers() printout("Data scalers initialized.", min_verbosity=0) - elif ( - self.parameters.use_lazy_loading is False - and self.nr_training_data != 0 - ): - printout( - "Data scalers already initilized, loading data to RAM.", - min_verbosity=0, - ) - self.__load_data("training", "inputs") - self.__load_data("training", "outputs") + elif self.nr_training_data != 0: + self.__parametrized_load_training_data() # Build Datasets. printout("Build datasets.", min_verbosity=1) @@ -267,6 +261,11 @@ def prepare_data(self, reparametrize_scaler=True): # allows for parallel I/O. barrier() + # In the RAM case, there is no reason not to delete all temporary files + # now. + if self.parameters.use_lazy_loading is False: + self._delete_temporary_data() + def prepare_for_testing(self): """ Prepare DataHandler for usage within Tester class. 
@@ -351,19 +350,24 @@ def get_snapshot_calculation_output(self, snapshot_number):
         ].calculation_output
 
     def calculate_temporary_inputs(self, snapshot: Snapshot):
-        snapshot.temporary_input_file = tempfile.NamedTemporaryFile(
-            delete=False,
-            prefix=snapshot.input_npy_file.split(".")[0],
-            suffix=".in.npy",
-            dir=snapshot.input_npy_directory,
-        ).name
-        tmp, grid = self.descriptor_calculator.calculate_from_json(
-            os.path.join(
-                snapshot.input_npy_directory,
-                snapshot.input_npy_file,
+        if snapshot.temporary_input_file is not None:
+            if not os.path.isfile(snapshot.temporary_input_file):
+                snapshot.temporary_input_file = None
+
+        if snapshot.temporary_input_file is None:
+            snapshot.temporary_input_file = tempfile.NamedTemporaryFile(
+                delete=False,
+                prefix=snapshot.input_npy_file.split(".")[0],
+                suffix=".in.npy",
+                dir=snapshot.input_npy_directory,
+            ).name
+            tmp, grid = self.descriptor_calculator.calculate_from_json(
+                os.path.join(
+                    snapshot.input_npy_directory,
+                    snapshot.input_npy_file,
+                )
             )
-        )
-        np.save(snapshot.temporary_input_file, tmp)
+            np.save(snapshot.temporary_input_file, tmp)
 
     # Debugging
     ######################
@@ -1014,6 +1018,27 @@ def __parametrize_scalers(self):
 
         printout("Output scaler parametrized.", min_verbosity=1)
 
+    def __parametrized_load_training_data(self):
+        if self.parameters.use_lazy_loading:
+            printout(
+                "Data scalers already initialized, preparing input data.",
+                min_verbosity=0,
+            )
+            for snapshot in self.parameters.snapshot_directories_list:
+                # Data scaling is only performed on the training data sets.
+                if (
+                    snapshot.snapshot_function == "tr"
+                    and snapshot.snapshot_type == "json+numpy"
+                ):
+                    self.calculate_temporary_inputs(snapshot)
+        else:
+            printout(
+                "Data scalers already initialized, loading data to RAM.",
+                min_verbosity=0,
+            )
+            self.__load_data("training", "inputs")
+            self.__load_data("training", "outputs")
+
     def __raw_numpy_to_converted_numpy(
         self, numpy_array, data_type="in", units=None
     ):
diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index b69faae5..c55e2b34 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -598,7 +598,7 @@ def train_network(self):
         self.final_validation_loss = vloss
 
         # Cleaning up temporary data files.
-        self.data.delete_temporary_data()
+        self.data._delete_temporary_data()
 
From c43fefc51103875f8200b35c6c557710f866182b Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 7 Jan 2025 11:55:54 +0100
Subject: [PATCH 04/23] Made method private

---
 mala/datahandling/data_handler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py
index b9127ca4..d0c8ee1e 100644
--- a/mala/datahandling/data_handler.py
+++ b/mala/datahandling/data_handler.py
@@ -349,7 +349,7 @@ def get_snapshot_calculation_output(self, snapshot_number):
             snapshot_number
         ].calculation_output
 
-    def calculate_temporary_inputs(self, snapshot: Snapshot):
+    def __calculate_temporary_inputs(self, snapshot: Snapshot):
         if snapshot.temporary_input_file is not None:
             if not os.path.isfile(snapshot.temporary_input_file):
                 snapshot.temporary_input_file = None
@@ -632,7 +632,7 @@ def __load_data(self, function, data_type):
 
             # If the input for the descriptors is actually a JSON
            # file then we need to calculate the descriptors. 
if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: file = os.path.join( @@ -763,14 +763,14 @@ def __build_datasets(self): # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": if snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -930,7 +930,7 @@ def __parametrize_scalers(self): ) ) elif snapshot.snapshot_type == "json+numpy": - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, units=snapshot.input_units, @@ -1030,7 +1030,7 @@ def __parametrized_load_training_data(self): snapshot.snapshot_function == "tr" and snapshot.snapshot_type == "json+numpy" ): - self.calculate_temporary_inputs(snapshot) + self.__calculate_temporary_inputs(snapshot) else: printout( "Data scalers already initilized, loading data to RAM.", From 7a90ed5c4cc5704a1dce2f9f3122e6d6fb5a2467 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 13:13:30 +0100 Subject: [PATCH 05/23] Tester class now also works with on-the-fly calculations --- mala/datahandling/data_handler.py | 4 ++-- mala/network/tester.py | 2 ++ mala/network/trainer.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d0c8ee1e..d452b06f 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def _delete_temporary_data(self): + def delete_temporary_data(self): """ Delete temporary data files. @@ -264,7 +264,7 @@ def prepare_data(self, reparametrize_scaler=True): # In the RAM case, there is no reason not to delete all temporary files # now. if self.parameters.use_lazy_loading is False: - self._delete_temporary_data() + self.delete_temporary_data() def prepare_for_testing(self): """ diff --git a/mala/network/tester.py b/mala/network/tester.py index d7c07761..fd1b04c6 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -110,6 +110,8 @@ def test_all_snapshots(self): for observable in self.observables_to_test: results[observable].append(snapshot_result[observable]) + self.data.delete_temporary_data() + if self.output_format == "list": return results diff --git a/mala/network/trainer.py b/mala/network/trainer.py index c55e2b34..b69faae5 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -598,7 +598,7 @@ def train_network(self): self.final_validation_loss = vloss # Cleaning up temporary data files. - self.data._delete_temporary_data() + self.data.delete_temporary_data() # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: From 39a40c364233f67d801455139d2b1198ba0a42f8 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 16:16:39 +0100 Subject: [PATCH 06/23] Prefetching works --- mala/datahandling/data_handler.py | 5 +++ mala/datahandling/lazy_load_dataset_single.py | 20 ++++++++++ .../multi_lazy_load_data_loader.py | 37 ++++++++++++++++++- 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d452b06f..e1611677 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -816,6 +816,9 @@ def __build_datasets(self): self._use_ddp, ) ) + if snapshot.snapshot_type == "json+numpy": + self.__calculate_temporary_inputs(snapshot) + if snapshot.snapshot_function == "te": self.test_data_sets.append( LazyLoadDatasetSingle( @@ -831,6 +834,8 @@ def __build_datasets(self): input_requires_grad=True, ) ) + if snapshot.snapshot_type == "json+numpy": + self.__calculate_temporary_inputs(snapshot) else: if self.nr_training_data != 0: diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 402d149d..1baef9ea 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -180,6 +180,26 @@ def allocate_shared_mem(self): read_dtype=True, ) ) + elif self.snapshot.snapshot_type == "json+numpy": + self.input_shape = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + self.snapshot.input_npy_directory, + self.snapshot.input_npy_file, + ), + ) + ) + self.input_dtype = np.dtype(np.float32) + + self.output_shape, self.output_dtype = ( + self.target_calculator.read_dimensions_from_numpy_file( + os.path.join( + self.snapshot.output_npy_directory, + self.snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index a9aca6af..35b02601 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -192,6 +192,24 @@ def load_snapshot_to_shm( read_dtype=True, ) ) + elif snapshot.snapshot_type == "json+numpy": + input_shape = descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + ) + input_dtype = np.dtype(np.float32) + + output_shape, output_dtype = ( + target_calculator.read_dimensions_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, + snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") @@ -219,7 +237,22 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) - else: + + elif snapshot.snapshot_type == "json+numpy": + descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + array=input_data, + ) + target_calculator.read_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ), + units=snapshot.output_units, + array=output_data, + ) + + elif snapshot.snapshot_type == "openpmd": descriptor_calculator.read_from_openpmd_file( os.path.join( snapshot.input_npy_directory, snapshot.input_npy_file @@ -234,6 +267,8 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) + else: + raise Exception("Invalid snapshot type selected.") # This 
function only loads the numpy data with scaling. Remaining data # preprocessing occurs in __getitem__ of LazyLoadDatasetSingle From 2201e34fcd6995bfb24528ae6e29fcdbd094bd0b Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 16:27:14 +0100 Subject: [PATCH 07/23] Is this already enought to get DDP working? --- mala/datahandling/data_handler.py | 11 ++++++++--- mala/network/tester.py | 2 +- mala/network/trainer.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index e1611677..d0ce2ad4 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -170,7 +170,7 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_data(self): + def delete_temporary_inputs(self): """ Delete temporary data files. @@ -264,7 +264,7 @@ def prepare_data(self, reparametrize_scaler=True): # In the RAM case, there is no reason not to delete all temporary files # now. if self.parameters.use_lazy_loading is False: - self.delete_temporary_data() + self.delete_temporary_inputs() def prepare_for_testing(self): """ @@ -367,7 +367,12 @@ def __calculate_temporary_inputs(self, snapshot: Snapshot): snapshot.input_npy_file, ) ) - np.save(snapshot.temporary_input_file, tmp) + if self.parameters._configuration["mpi"]: + tmp = self.descriptor_calculator.gather_descriptors(tmp) + + if get_rank() == 0: + np.save(snapshot.temporary_input_file, tmp) + barrier() # Debugging ###################### diff --git a/mala/network/tester.py b/mala/network/tester.py index fd1b04c6..b8ee1c70 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -110,7 +110,7 @@ def test_all_snapshots(self): for observable in self.observables_to_test: results[observable].append(snapshot_result[observable]) - self.data.delete_temporary_data() + self.data.delete_temporary_inputs() if self.output_format == "list": return results diff --git a/mala/network/trainer.py b/mala/network/trainer.py index b69faae5..9ff5ad12 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -598,7 +598,7 @@ def train_network(self): self.final_validation_loss = vloss # Cleaning up temporary data files. - self.data.delete_temporary_data() + self.data.delete_temporary_inputs() # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: From c1737c51273f760842c521b25b8a9c03f67325a5 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 7 Jan 2025 17:26:57 +0100 Subject: [PATCH 08/23] Renamed "additional info", since it will be used more regularly with these changes --- docs/source/basic_usage/more_data.rst | 8 +-- .../advanced/ex10_convert_numpy_openpmd.py | 18 +++--- examples/basic/ex03_preprocess_data.py | 10 +-- mala/datahandling/data_converter.py | 64 +++++++++---------- mala/datahandling/data_shuffler.py | 10 ++- test/complete_interfaces_test.py | 12 ++-- 6 files changed, 60 insertions(+), 62 deletions(-) diff --git a/docs/source/basic_usage/more_data.rst b/docs/source/basic_usage/more_data.rst index d643e8c6..4232a79b 100644 --- a/docs/source/basic_usage/more_data.rst +++ b/docs/source/basic_usage/more_data.rst @@ -99,8 +99,8 @@ and fill it with data, e.g., by descriptor_input_path=outfile, target_input_type=".cube", target_input_path=ldosfile, - additional_info_input_type="espresso-out", - additional_info_input_path=outfile, + simulation_output_type="espresso-out", + simulation_output_path=outfile, target_units="1/(Ry*Bohr^3)") The ``add_snapshot`` function can be called multiple times to add @@ -108,7 +108,7 @@ multiple snapshots to MALA. For regular Quantum ESPRESSO calculations, the ``descriptor_input_type`` and ``target_input_type`` will always be ``"espresso-out"`` and ``".cube"``, respectively, and the ``target_units`` will always be ``"1/(Ry*Bohr^3)"``. -The paths have to be modified accordingly. ``additional_info_input_*`` refers +The paths have to be modified accordingly. ``simulation_output_*`` refers to the calculation output file - MALA provides an interface to condense the entire, verbose simulation output to ``.json`` files for further processing. In the preceding section, we had to specify calculation output @@ -121,7 +121,7 @@ Once data is provided, the conversion itself is simple. 
data_converter.convert_snapshots(descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="Be_snapshot*.npy", descriptor_calculation_kwargs= {"working_directory": data_path}) diff --git a/examples/advanced/ex10_convert_numpy_openpmd.py b/examples/advanced/ex10_convert_numpy_openpmd.py index 7ebc22da..22607b30 100644 --- a/examples/advanced/ex10_convert_numpy_openpmd.py +++ b/examples/advanced/ex10_convert_numpy_openpmd.py @@ -20,15 +20,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.npy".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -43,15 +43,15 @@ descriptor_input_path="converted_from_numpy_{}.in.h5".format(snapshot), target_input_type="openpmd", target_input_path="converted_from_numpy_{}.out.h5".format(snapshot), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="verify_against_original_numpy_data_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -84,15 +84,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.h5".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_openpmd_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index b0a10488..e9f4691a 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -46,12 +46,12 @@ # Data conversion itself is simple. We select input and output data # to be converted, add this data snapshot-wise and tell MALA to # convert snapshots. Inputs and outputs can be processed individually. -# Further, via the additional_info_input_* keywords, calculation output +# Further, via the simulation_output_input_* keywords, calculation output # can be processed from the original simulation *.out output files into # more convenient *.json files that can be used in their stead. This saves # on disk space. # To only process parts of the data, omit/add descriptor_input*, target_input_* -# and additional_info_input_* at your leisure. +# and simulation_output_* at your leisure. # Make sure to set the correct units - for QE, this should always be # 1/(Ry*Bohr^3). 
#################### @@ -65,8 +65,8 @@ descriptor_input_path=outfile, target_input_type=".cube", target_input_path=ldosfile, - additional_info_input_type="espresso-out", - additional_info_input_path=outfile, + simulation_output_type="espresso-out", + simulation_output_path=outfile, target_units="1/(Ry*Bohr^3)", ) @@ -84,7 +84,7 @@ data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="Be_snapshot*.npy", descriptor_calculation_kwargs={"working_directory": data_path}, ) diff --git a/mala/datahandling/data_converter.py b/mala/datahandling/data_converter.py index 21fb34cd..89558fec 100644 --- a/mala/datahandling/data_converter.py +++ b/mala/datahandling/data_converter.py @@ -12,7 +12,7 @@ descriptor_input_types = ["espresso-out", "openpmd", "numpy"] target_input_types = [".cube", ".xsf", "openpmd", "numpy"] -additional_info_input_types = ["espresso-out"] +simulation_output_types = ["espresso-out"] class DataConverter: @@ -78,7 +78,7 @@ def __init__( # Keep track of what has to be done by this data converter. self.__process_descriptors = False self.__process_targets = False - self.__process_additional_info = False + self.__process_simulation_output = False def add_snapshot( self, @@ -86,8 +86,8 @@ def add_snapshot( descriptor_input_path=None, target_input_type=None, target_input_path=None, - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, descriptor_units=None, metadata_input_type=None, metadata_input_path=None, @@ -114,22 +114,22 @@ def add_snapshot( target_input_path : string Path of target data to be processed. - additional_info_input_type : string + simulation_output_type : string Type of additional info data to be processed. - See mala.datahandling.data_converter.additional_info_input_types + See mala.datahandling.data_converter.simulation_output_types for options. - additional_info_input_path : string + simulation_output_path : string Path of additional info data to be processed. metadata_input_type : string Type of additional metadata to be processed. - See mala.datahandling.data_converter.additional_info_input_types + See mala.datahandling.data_converter.simulation_output_types for options. - This is essentially the same as additional_info_input_type, + This is essentially the same as simulation_output_type, but will not affect saving; i.e., the data given here will only be saved in OpenPMD files, not saved separately. - If additional_info_input_type is set, this argument will be + If simulation_output_type is set, this argument will be ignored. metadata_input_path : string @@ -161,20 +161,20 @@ def add_snapshot( raise Exception("Cannot process this type of target data.") self.__process_targets = True - if additional_info_input_type is not None: - metadata_input_type = additional_info_input_type - if additional_info_input_path is None: + if simulation_output_type is not None: + metadata_input_type = simulation_output_type + if simulation_output_path is None: raise Exception( "Cannot process additional info data with " "no path given." ) - if additional_info_input_type not in additional_info_input_types: + if simulation_output_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." 
) - self.__process_additional_info = True + self.__process_simulation_output = True - metadata_input_path = additional_info_input_path + metadata_input_path = simulation_output_path if metadata_input_type is not None: if metadata_input_path is None: @@ -182,7 +182,7 @@ def add_snapshot( "Cannot process additional info data with " "no path given." ) - if metadata_input_type not in additional_info_input_types: + if metadata_input_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." ) @@ -192,7 +192,7 @@ def add_snapshot( { "input": descriptor_input_path, "output": target_input_path, - "additional_info": additional_info_input_path, + "simulation_output": simulation_output_path, "metadata": metadata_input_path, } ) @@ -200,7 +200,7 @@ def add_snapshot( { "input": descriptor_input_type, "output": target_input_type, - "additional_info": additional_info_input_type, + "simulation_output": simulation_output_type, "metadata": metadata_input_type, } ) @@ -213,7 +213,7 @@ def convert_snapshots( complete_save_path=None, descriptor_save_path=None, target_save_path=None, - additional_info_save_path=None, + simulation_output_save_path=None, naming_scheme="ELEM_snapshot*.npy", starts_at=0, file_based_communication=False, @@ -231,7 +231,7 @@ def convert_snapshots( complete_save_path : string If not None: the directory in which all snapshots will be saved. Overwrites descriptor_save_path, target_save_path and - additional_info_save_path if set. + simulation_output_save_path if set. descriptor_save_path : string Directory in which to save descriptor data. @@ -239,7 +239,7 @@ def convert_snapshots( target_save_path : string Directory in which to save target data. - additional_info_save_path : string + simulation_output_save_path : string Directory in which to save additional info data. naming_scheme : string @@ -304,7 +304,7 @@ def convert_snapshots( if complete_save_path is not None: descriptor_save_path = complete_save_path target_save_path = complete_save_path - additional_info_save_path = complete_save_path + simulation_output_save_path = complete_save_path else: if self.__process_targets is True and target_save_path is None: raise Exception( @@ -318,8 +318,8 @@ def convert_snapshots( "No descriptor path specified, cannot process data." ) if ( - self.__process_additional_info is True - and additional_info_save_path is None + self.__process_simulation_output is True + and simulation_output_save_path is None ): raise Exception( "No additional info path specified, cannot " @@ -393,9 +393,9 @@ def convert_snapshots( snapshot_name = snapshot_name.replace("*", str(snapshot_number)) # Create the paths as needed. - if self.__process_additional_info: + if self.__process_simulation_output: info_path = os.path.join( - additional_info_save_path, snapshot_name + ".info.json" + simulation_output_save_path, snapshot_name + ".info.json" ) else: info_path = None @@ -454,7 +454,7 @@ def convert_snapshots( use_memmap=memmap, input_iteration=input_iteration, output_iteration=output_iteration, - additional_info_path=info_path, + simulation_output_info_path=info_path, use_fp64=use_fp64, ) @@ -479,7 +479,7 @@ def __convert_single_snapshot( target_calculator_kwargs, input_path=None, output_path=None, - additional_info_path=None, + simulation_output_info_path=None, use_memmap=None, output_iteration=None, input_iteration=None, @@ -719,11 +719,11 @@ def __convert_single_snapshot( del tmp_output # Parse and/or calculate the additional info. 
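         # The verbose simulation output is condensed into a compact
         # .info.json file that can be used in its stead later on.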
-        if description["additional_info"] is not None:
+        if description["simulation_output"] is not None:
             # Parsing and saving is done using the target calculator.
             self.target_calculator.read_additional_calculation_data(
-                snapshot["additional_info"], description["additional_info"]
+                snapshot["simulation_output"], description["simulation_output"]
             )
             self.target_calculator.write_additional_calculation_data(
-                additional_info_path
+                simulation_output_info_path
             )
diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 660a1fec..be859f5a 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -334,9 +334,7 @@ def __init__(self):
         # XXXXXXXOOO
         # OOOOOOOOOO
 
-        def __contiguous_slice_within_ndim_grid_as_blocks(
-            extent, x, y
-        ):
+        def __contiguous_slice_within_ndim_grid_as_blocks(extent, x, y):
             # Used for converting a block defined by inclusive lower and upper
             # coordinate to a chunk as defined by openPMD.
             # The openPMD extent is defined by (to-from)+1 (to make up for the
@@ -426,7 +424,8 @@ def worker(inner_idx, inner_strides):
             if not inner_strides:
                 if inner_idx != 0:
                     raise RuntimeError(
-                        "This cannot happen. There is bug somewhere.")
+                        "This cannot happen. There is a bug somewhere."
+                    )
                 else:
                     return []
             div, mod = divmod(inner_idx, inner_strides[0])
@@ -655,8 +654,7 @@ def shuffle_snapshots(
         ---------
         complete_save_path : string
             If not None: the directory in which all snapshots will be saved.
-            Overwrites descriptor_save_path, target_save_path and
-            additional_info_save_path if set.
+            Overwrites descriptor_save_path and target_save_path if set.
 
         descriptor_save_path : string
             Directory in which to save descriptor data.
diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py
index 6a795665..5288afa8 100644
--- a/test/complete_interfaces_test.py
+++ b/test/complete_interfaces_test.py
@@ -108,15 +108,15 @@ def test_convert_numpy_openpmd(self):
                 target_input_path=os.path.join(
                     data_path, "Be_snapshot{}.out.npy".format(snapshot)
                 ),
-                additional_info_input_type=None,
-                additional_info_input_path=None,
+                simulation_output_type=None,
+                simulation_output_path=None,
                 target_units=None,
             )
 
         data_converter.convert_snapshots(
             descriptor_save_path="./",
             target_save_path="./",
-            additional_info_save_path="./",
+            simulation_output_save_path="./",
             naming_scheme="converted_from_numpy_*.h5",
             descriptor_calculation_kwargs={"working_directory": "./"},
         )
@@ -135,15 +135,15 @@ def test_convert_numpy_openpmd(self):
             descriptor_input_path="converted_from_numpy_{}.in.h5".format(
                 snapshot
             ),
             target_input_path="converted_from_numpy_{}.out.h5".format(
                 snapshot
             ),
-            additional_info_input_type=None,
-            additional_info_input_path=None,
+            simulation_output_type=None,
+            simulation_output_path=None,
             target_units=None,
         )
 
         data_converter.convert_snapshots(
             descriptor_save_path="./",
             target_save_path="./",
-            additional_info_save_path="./",
+            simulation_output_save_path="./",
             naming_scheme="verify_against_original_numpy_data_*.npy",
             descriptor_calculation_kwargs={"working_directory": "./"},
         )

From 4001d0a61cdd492e835aa50dec3dfae2cf803594 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Wed, 8 Jan 2025 14:10:44 +0100
Subject: [PATCH 09/23] Fixing a parallel writing bug

---
 mala/descriptors/bispectrum.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py
index 8932c61b..47865372 100755
--- a/mala/descriptors/bispectrum.py
+++ b/mala/descriptors/bispectrum.py
@@ -9,7 +9,7 @@
 import numpy as np
 from scipy.spatial import distance
 
-from 
mala.common.parallelizer import printout +from mala.common.parallelizer import printout, get_rank, barrier from mala.descriptors.lammps_utils import extract_compute_np from mala.descriptors.descriptor import Descriptor @@ -156,9 +156,14 @@ def __calculate_lammps(self, outdir, **kwargs): lammps_format = "lammps-data" self.setup_lammps_tmp_files("bgrid", outdir) - ase.io.write( - self._lammps_temporary_input, self._atoms, format=lammps_format - ) + if get_rank() == 0: + ase.io.write( + self._lammps_temporary_input, + self._atoms, + format=lammps_format, + parallel=False, + ) + barrier() nx = self.grid_dimensions[0] ny = self.grid_dimensions[1] From e6a072302c9a23d2b8b22001b8d32f84cbefaa09 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:15:28 +0100 Subject: [PATCH 10/23] Can I use DDP and MPI at the same time? --- mala/common/parallelizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mala/common/parallelizer.py b/mala/common/parallelizer.py index e59b8a98..8e394999 100644 --- a/mala/common/parallelizer.py +++ b/mala/common/parallelizer.py @@ -64,10 +64,10 @@ def set_mpi_status(new_value): Value the MPI status has. """ - if use_ddp is True and new_value is True: - raise Exception( - "Cannot use ddp and inference-level MPI at " "the same time yet." - ) + # if use_ddp is True and new_value is True: + # raise Exception( + # "Cannot use ddp and inference-level MPI at " "the same time yet." + # ) global use_mpi use_mpi = new_value if use_mpi: From 516240486221f5f3bd8fdad476525f1a0cfe3bde Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:17:35 +0100 Subject: [PATCH 11/23] It does not help --- mala/common/parallelizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mala/common/parallelizer.py b/mala/common/parallelizer.py index 8e394999..e59b8a98 100644 --- a/mala/common/parallelizer.py +++ b/mala/common/parallelizer.py @@ -64,10 +64,10 @@ def set_mpi_status(new_value): Value the MPI status has. """ - # if use_ddp is True and new_value is True: - # raise Exception( - # "Cannot use ddp and inference-level MPI at " "the same time yet." - # ) + if use_ddp is True and new_value is True: + raise Exception( + "Cannot use ddp and inference-level MPI at " "the same time yet." 
+ ) global use_mpi use_mpi = new_value if use_mpi: From 399722990ae7f1c969f9bbcab8e06f23ba8e34e8 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 8 Jan 2025 14:23:08 +0100 Subject: [PATCH 12/23] Getting rid of the parallel modification for now --- mala/descriptors/bispectrum.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index 47865372..8932c61b 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -9,7 +9,7 @@ import numpy as np from scipy.spatial import distance -from mala.common.parallelizer import printout, get_rank, barrier +from mala.common.parallelizer import printout from mala.descriptors.lammps_utils import extract_compute_np from mala.descriptors.descriptor import Descriptor @@ -156,14 +156,9 @@ def __calculate_lammps(self, outdir, **kwargs): lammps_format = "lammps-data" self.setup_lammps_tmp_files("bgrid", outdir) - if get_rank() == 0: - ase.io.write( - self._lammps_temporary_input, - self._atoms, - format=lammps_format, - parallel=False, - ) - barrier() + ase.io.write( + self._lammps_temporary_input, self._atoms, format=lammps_format + ) nx = self.grid_dimensions[0] ny = self.grid_dimensions[1] From 15ff5dc4740ea2629fb459c09a62b4615f895025 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 9 Jan 2025 11:57:00 +0100 Subject: [PATCH 13/23] Shuffling from atomic positions works now --- mala/datahandling/data_handler.py | 55 ++++--------------------- mala/datahandling/data_handler_base.py | 43 +++++++++++++++++++- mala/datahandling/data_shuffler.py | 56 ++++++++++++++++++-------- 3 files changed, 88 insertions(+), 66 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index d0ce2ad4..5823d459 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -1,7 +1,6 @@ """DataHandler class that loads and scales data.""" import os -import tempfile import numpy as np import torch @@ -170,21 +169,6 @@ def clear_data(self): self.output_data_scaler.reset() super(DataHandler, self).clear_data() - def delete_temporary_inputs(self): - """ - Delete temporary data files. - - These may have been created during a training or testing process - when using atomic positions for on-the-fly calculation of descriptors - rather than precomputed data files. 
- """ - if get_rank() == 0: - for snapshot in self.parameters.snapshot_directories_list: - if snapshot.temporary_input_file is not None: - if os.path.isfile(snapshot.temporary_input_file): - os.remove(snapshot.temporary_input_file) - barrier() - # Preparing data ###################### @@ -349,31 +333,6 @@ def get_snapshot_calculation_output(self, snapshot_number): snapshot_number ].calculation_output - def __calculate_temporary_inputs(self, snapshot: Snapshot): - if snapshot.temporary_input_file is not None: - if not os.path.isfile(snapshot.temporary_input_file): - snapshot.temporary_input_file = None - - if snapshot.temporary_input_file is None: - snapshot.temporary_input_file = tempfile.NamedTemporaryFile( - delete=False, - prefix=snapshot.input_npy_file.split(".")[0], - suffix=".in.npy", - dir=snapshot.input_npy_directory, - ).name - tmp, grid = self.descriptor_calculator.calculate_from_json( - os.path.join( - snapshot.input_npy_directory, - snapshot.input_npy_file, - ) - ) - if self.parameters._configuration["mpi"]: - tmp = self.descriptor_calculator.gather_descriptors(tmp) - - if get_rank() == 0: - np.save(snapshot.temporary_input_file, tmp) - barrier() - # Debugging ###################### @@ -637,7 +596,7 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: file = os.path.join( @@ -768,14 +727,14 @@ def __build_datasets(self): # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -822,7 +781,7 @@ def __build_datasets(self): ) ) if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) if snapshot.snapshot_function == "te": self.test_data_sets.append( @@ -840,7 +799,7 @@ def __build_datasets(self): ) ) if snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) else: if self.nr_training_data != 0: @@ -940,7 +899,7 @@ def __parametrize_scalers(self): ) ) elif snapshot.snapshot_type == "json+numpy": - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, units=snapshot.input_units, @@ -1040,7 +999,7 @@ def __parametrized_load_training_data(self): snapshot.snapshot_function == "tr" and snapshot.snapshot_type == "json+numpy" ): - self.__calculate_temporary_inputs(snapshot) + self._calculate_temporary_inputs(snapshot) else: printout( "Data scalers already initilized, loading data to RAM.", diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index 95742633..968e497b 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -2,11 +2,12 @@ from abc import ABC import os +import tempfile import numpy as np from 
mala.common.parameters import ParametersData, Parameters -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, get_rank, barrier from mala.targets.target import Target from mala.descriptors.descriptor import Descriptor from mala.datahandling.snapshot import Snapshot @@ -166,6 +167,21 @@ def clear_data(self): """ self.parameters.snapshot_directories_list = [] + def delete_temporary_inputs(self): + """ + Delete temporary data files. + + These may have been created during a training or testing process + when using atomic positions for on-the-fly calculation of descriptors + rather than precomputed data files. + """ + if get_rank() == 0: + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + barrier() + ############################## # Private methods ############################## @@ -286,3 +302,28 @@ def _check_snapshots(self, comm=None): if firstsnapshot: firstsnapshot = False + + def _calculate_temporary_inputs(self, snapshot: Snapshot): + if snapshot.temporary_input_file is not None: + if not os.path.isfile(snapshot.temporary_input_file): + snapshot.temporary_input_file = None + + if snapshot.temporary_input_file is None: + snapshot.temporary_input_file = tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + tmp, grid = self.descriptor_calculator.calculate_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + if self.parameters._configuration["mpi"]: + tmp = self.descriptor_calculator.gather_descriptors(tmp) + + if get_rank() == 0: + np.save(snapshot.temporary_input_file, tmp) + barrier() diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index be859f5a..231fd646 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -45,14 +45,6 @@ def __init__( target_calculator=target_calculator, descriptor_calculator=descriptor_calculator, ) - if self.descriptor_calculator.parameters.descriptors_contain_xyz: - printout( - "Disabling XYZ-cutting from descriptor data for " - "shuffling. If needed, please re-enable afterwards." - ) - self.descriptor_calculator.parameters.descriptors_contain_xyz = ( - False - ) self._data_points_to_remove = None def add_snapshot( @@ -112,15 +104,29 @@ def __shuffle_numpy( for idx, snapshot in enumerate( self.parameters.snapshot_directories_list ): - # TODO: Use descriptor and target calculator for this. - descriptor_data.append( - np.load( - os.path.join( - snapshot.input_npy_directory, snapshot.input_npy_file - ), - mmap_mode="r", + if snapshot.snapshot_type == "numpy": + # TODO: Use descriptor and target calculator for this. + descriptor_data.append( + np.load( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + mmap_mode="r", + ) ) - ) + elif snapshot.snapshot_type == "json+numpy": + descriptor_data.append( + np.load( + snapshot.temporary_input_file, + mmap_mode="r", + ) + ) + else: + raise Exception( + "Invalid snapshot for numpy shuffling " "selected." + ) + target_data.append( np.load( os.path.join( @@ -262,6 +268,8 @@ def __shuffle_numpy( internal_iteration_number=i, ) + self.delete_temporary_inputs() + # The function __shuffle_openpmd can be used to shuffle descriptor data and # target data. 
     # It will be executed one after another for both of them.
@@ -692,11 +700,13 @@ def shuffle_snapshots(
         else:
             file_ending = "npy"
 
+        old_xyz = self.descriptor_calculator.parameters.descriptors_contain_xyz
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = False
         if self.parameters._configuration["mpi"]:
             self._check_snapshots(comm=get_comm())
         else:
             self._check_snapshots()
-
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = old_xyz
         snapshot_types = {
             snapshot.snapshot_type
             for snapshot in self.parameters.snapshot_directories_list
@@ -785,6 +795,18 @@ def shuffle_snapshots(
                 permutations,
                 file_ending,
             )
+        elif snapshot_type == "json+numpy":
+            for snapshot in self.parameters.snapshot_directories_list:
+                self._calculate_temporary_inputs(snapshot)
+            self.__shuffle_numpy(
+                number_of_shuffled_snapshots,
+                shuffle_dimensions,
+                descriptor_save_path,
+                save_name,
+                target_save_path,
+                permutations,
+                file_ending,
+            )
         elif snapshot_type == "openpmd":
             descriptor = self.__DescriptorOrTarget(
                 descriptor_save_path,

From 8e8cb3fc792978b0c22a0ed453bd5de0b6e367f6 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 16:39:08 +0100
Subject: [PATCH 14/23] Shuffling now works as part of the temporary pipeline

---
 mala/datahandling/data_shuffler.py | 153 ++++++++++++++++++++++++-----
 1 file changed, 129 insertions(+), 24 deletions(-)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 231fd646..9d8bb958 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -3,12 +3,14 @@
 import os
 
 import numpy as np
+import tempfile
 
+import mala
 from mala.common.parameters import (
     Parameters,
     DEFAULT_NP_DATA_DTYPE,
 )
-from mala.common.parallelizer import printout
+from mala.common.parallelizer import printout, parallel_warn
 from mala.common.physical_data import PhysicalData
 from mala.datahandling.data_handler_base import DataHandlerBase
 from mala.common.parallelizer import get_comm
@@ -32,6 +34,18 @@ class DataShuffler(DataHandlerBase):
     target_calculator : mala.targets.target.Target
         Used to do unit conversion on output data.
         If None, then one will be created by this class.
+
+    Attributes
+    ----------
+    temporary_shuffled_snapshots : list
+        A list containing snapshot objects of temporary, snapshot-like
+        shuffled data files. By default, this list is empty. If
+        "shuffle_snapshots" is called with "shuffle_to_temporary=True",
+        it will be populated with temporary files saved to hard drive,
+        which can be deleted after model training. Please note that the
+        "snapshot_function", "input_units", "output_units" and
+        "calculation_output" fields of the snapshots within this list
+        are only filled with placeholder values.
     """
 
     def __init__(
@@ -46,6 +59,7 @@ def __init__(
             descriptor_calculator=descriptor_calculator,
         )
         self._data_points_to_remove = None
+        self.temporary_shuffled_snapshots = []
 
     def add_snapshot(
         self,
@@ -97,6 +111,7 @@ def __shuffle_numpy(
         target_save_path,
         permutations,
         file_ending,
+        temporary,
     ):
         # Load the data (via memmap).
         descriptor_data = []
@@ -168,12 +183,6 @@ def __shuffle_numpy(
             target_data[idx] = current_target[indices]
 
         # Do the actual shuffling. 
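         # Each output snapshot receives an equally sized, contiguous slice
         # of every (already permuted) input snapshot, so all shuffled files
         # mix data from all inputs.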
-        target_name_openpmd = os.path.join(
-            target_save_path, save_name.replace("*", "%T")
-        )
-        descriptor_name_openpmd = os.path.join(
-            descriptor_save_path, save_name.replace("*", "%T")
-        )
         for i in range(0, number_of_new_snapshots):
             new_descriptors = np.zeros(
                 (int(np.prod(shuffle_dimensions)), self.input_dimension),
                 dtype=DEFAULT_NP_DATA_DTYPE,
             )
             new_targets = np.zeros(
                 (int(np.prod(shuffle_dimensions)), self.output_dimension),
                 dtype=DEFAULT_NP_DATA_DTYPE,
             )
             last_start = 0
-            descriptor_name = os.path.join(
-                descriptor_save_path, save_name.replace("*", str(i))
-            )
-            target_name = os.path.join(
-                target_save_path, save_name.replace("*", str(i))
-            )
+
+            # Figure out where to save / how to name things.
+            # TODO: This could probably be shortened.
+            if temporary:
+
+                # Adding "snapshot numbers" here is technically not necessary,
+                # but it also doesn't hurt.
+                if file_ending == "npy":
+                    descriptor_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", str(i)),
+                        suffix=".in.npy",
+                        dir=descriptor_save_path,
+                    ).name
+                    target_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", str(i)),
+                        suffix=".out.npy",
+                        dir=target_save_path,
+                    ).name
+                    snapshot_type = "numpy"
+                else:
+                    descriptor_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", "%T"),
+                        suffix=".in." + file_ending,
+                        dir=descriptor_save_path,
+                    ).name
+                    target_name = tempfile.NamedTemporaryFile(
+                        delete=False,
+                        prefix=save_name.replace("*", "%T"),
+                        suffix=".out." + file_ending,
+                        dir=target_save_path,
+                    ).name
+                    snapshot_type = "openpmd"
+                self.temporary_shuffled_snapshots.append(
+                    mala.Snapshot(
+                        os.path.basename(descriptor_name),
+                        os.path.dirname(descriptor_name),
+                        os.path.basename(target_name),
+                        os.path.dirname(target_name),
+                        snapshot_function="te",
+                        output_units="None",
+                        input_units="None",
+                        calculation_output="",
+                        snapshot_type=snapshot_type,
+                    )
+                )
+            else:
+                if file_ending == "npy":
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", str(i)) + ".in.npy",
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", str(i)) + ".out.npy",
+                    )
+                else:
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", "%T") + ".in." + file_ending,
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", "%T") + ".out." + file_ending,
+                    )
 
             # Each new snapshot gets a number_of_new_snapshots-th share of
             # each snapshot.
@@ -234,10 +304,10 @@ def __shuffle_numpy(
             )
             if file_ending == "npy":
                 self.descriptor_calculator.write_to_numpy_file(
-                    descriptor_name + ".in.npy", new_descriptors
+                    descriptor_name, new_descriptors
                 )
                 self.target_calculator.write_to_numpy_file(
-                    target_name + ".out.npy", new_targets
+                    target_name, new_targets
                 )
             else:
                 # We check above that in the non-numpy case, OpenPMD will work.
@@ -248,7 +318,7 @@ def __shuffle_numpy(
                     shuffle_dimensions
                 )
                 self.descriptor_calculator.write_to_openpmd_file(
-                    descriptor_name_openpmd + ".in." + file_ending,
+                    descriptor_name,
                     new_descriptors,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -258,7 +328,7 @@ def __shuffle_numpy(
                     internal_iteration_number=i,
                 )
                 self.target_calculator.write_to_openpmd_file(
-                    target_name_openpmd + ".out." + file_ending,
+                    target_name,
                     array=new_targets,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -268,8 +338,6 @@ def __shuffle_numpy(
                     internal_iteration_number=i,
                 )
 
-        self.delete_temporary_inputs()
-
 # The function __shuffle_openpmd can be used to shuffle descriptor data and
 # target data.
 # It will be executed one after another for both of them.
@@ -652,6 +720,7 @@ def shuffle_snapshots(
         target_save_path=None,
         save_name="mala_shuffled_snapshot*",
         number_of_shuffled_snapshots=None,
+        shuffle_to_temporary=False,
     ):
         """
         Shuffle the snapshots into new snapshots.
@@ -677,6 +746,12 @@ def shuffle_snapshots(
             If not None, this class will attempt to redistribute the data
             to this amount of snapshots. If None, then the same number of
             snapshots provided will be used.
+
+        shuffle_to_temporary : bool
+            If True, shuffled files will be written to temporary data files.
+            The paths used are consistent with non-temporary usage of this
+            class. The paths and names of these temporary files can then be
+            found in the class attribute temporary_shuffled_snapshots.
         """
         # Check the paths.
         if complete_save_path is not None:
@@ -691,12 +766,23 @@ def shuffle_snapshots(
             file_ending = save_name.split(".")[-1]
             save_name = save_name.split(".")[0]
             if file_ending != "npy":
-                import openpmd_api as io
-
-                if file_ending not in io.file_extensions:
-                    raise Exception(
-                        "Invalid file ending selected: " + file_ending
+                if shuffle_to_temporary:
+                    parallel_warn(
+                        "Shuffling to temporary files currently"
+                        " only works with numpy as an engine for "
+                        "intermediate files. You have selected both"
+                        " openpmd and the temporary file option. "
+                        "Will proceed with numpy instead of "
+                        "openpmd."
                     )
+                    file_ending = "npy"
+                else:
+                    import openpmd_api as io
+
+                    if file_ending not in io.file_extensions:
+                        raise Exception(
+                            "Invalid file ending selected: " + file_ending
+                        )
         else:
             file_ending = "npy"
@@ -794,6 +880,7 @@ def shuffle_snapshots(
                 target_save_path,
                 permutations,
                 file_ending,
+                shuffle_to_temporary,
             )
         elif snapshot_type == "json+numpy":
             for snapshot in self.parameters.snapshot_directories_list:
@@ -806,6 +893,7 @@ def shuffle_snapshots(
                 target_save_path,
                 permutations,
                 file_ending,
+                shuffle_to_temporary,
             )
         elif snapshot_type == "openpmd":
             descriptor = self.__DescriptorOrTarget(
@@ -843,6 +931,23 @@ def shuffle_snapshots(
         else:
             raise Exception("Unknown snapshot type: {}".format(snapshot_type))
 
+        # Deleting temporary files that may have been created.
+        self.delete_temporary_inputs()
+
         # Since no training will be done with this class, we should always
         # clear the data at the end.
         self.clear_data()
+
+    def delete_temporary_shuffled_snapshots(self):
+        for snapshot in self.temporary_shuffled_snapshots:
+            input_file = os.path.join(
+                snapshot.input_npy_directory, snapshot.input_npy_file
+            )
+            if os.path.isfile(input_file):
+                os.remove(input_file)
+            output_file = os.path.join(
+                snapshot.output_npy_directory, snapshot.output_npy_file
+            )
+            if os.path.isfile(output_file):
+                os.remove(output_file)
+        self.temporary_shuffled_snapshots = []

From 615792b242ed3f6833e46422084377a024f606a7 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 17:16:03 +0100
Subject: [PATCH 15/23] Fixed docstrings

---
 mala/datahandling/data_shuffler.py |  7 ++++
 mala/descriptors/descriptor.py     | 53 ++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 9d8bb958..1ba0aab5 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -939,6 +939,13 @@ def shuffle_snapshots(
         self.clear_data()
 
     def delete_temporary_shuffled_snapshots(self):
+        """
+        Delete temporary files created during shuffling of data.
+
+        If shuffling has been done with the option "shuffle_to_temporary",
+        shuffled data will be saved to temporary files which can safely be
+        deleted with this function.
+        """
         for snapshot in self.temporary_shuffled_snapshots:
             input_file = os.path.join(
                 snapshot.input_npy_directory, snapshot.input_npy_file
diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py
index 2794e716..e6fec1d3 100644
--- a/mala/descriptors/descriptor.py
+++ b/mala/descriptors/descriptor.py
@@ -377,6 +377,40 @@ def calculate_from_qe_out(
         return self._calculate(working_directory, **kwargs)
 
     def calculate_from_json(self, json_file, working_directory=".", **kwargs):
+        """
+        Calculate the descriptors based on a MALA generated json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Name of MALA json output file for snapshot.
+
+        working_directory : string
+            A directory in which to write the output of the LAMMPS calculation.
+            Usually the local directory should suffice, given that no
+            multiple instances are running in the same directory.
+
+        kwargs : dict
+            A collection of keyword arguments that are mainly used for
+            debugging and development. Different types of descriptors
+            may support different keyword arguments. Commonly supported
+            are
+
+            - "use_fp64": To enforce floating point 64 precision for
+              descriptors.
+            - "keep_logs": To not delete temporary files created during
+              LAMMPS calculation of descriptors.
+
+        Returns
+        -------
+        descriptors : numpy.array
+            Numpy array containing the descriptors with the dimension
+            (x,y,z,descriptor_dimension)
+
+        """
         if isinstance(json_file, str):
             json_dict = json.load(open(json_file, encoding="utf-8"))
         else:
@@ -585,6 +619,25 @@ def convert_local_to_3d(self, descriptors_np):
         return descriptors_full, local_offset, local_reach
 
     def read_dimensions_from_json(self, json_file):
+        """
+        Read only the descriptor dimensions from a json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Path to the json file.
+
+        Returns
+        -------
+        dimension_info : list
+            A list containing the dimensions of the volumetric descriptor
+            data described by the json file.
+        """
         if isinstance(json_file, str):
             json_dict = json.load(open(json_file, encoding="utf-8"))
         else:

From d0e8de649eea7d7abbe6b706622cff102158cb3a Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Thu, 9 Jan 2025 17:33:37 +0100
Subject: [PATCH 16/23] Added automatic snapshot type detection

---
 mala/datahandling/data_handler_base.py | 21 ++++++++++++++++++++-
 mala/datahandling/data_shuffler.py     |  2 +-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py
index 968e497b..a69dd879 100644
--- a/mala/datahandling/data_handler_base.py
+++ b/mala/datahandling/data_handler_base.py
@@ -106,7 +106,7 @@ def add_snapshot(
         output_units="1/(eV*A^3)",
         input_units="None",
         calculation_output_file="",
-        snapshot_type="numpy",
+        snapshot_type=None,
     ):
         """
         Add a snapshot to the data pipeline.
@@ -146,6 +146,25 @@ def add_snapshot(
             Either "numpy" or "openpmd" based on what kind of files you
             want to operate on.
         """
+        # Try to guess snapshot type if no information was provided.
+        if snapshot_type is None:
+            input_file_ending = input_file.split(".")[-1]
+            output_file_ending = output_file.split(".")[-1]
+
+            if input_file_ending == "npy" and output_file_ending == "npy":
+                snapshot_type = "numpy"
+
+            elif input_file_ending == "json" and output_file_ending == "npy":
+                snapshot_type = "json+numpy"
+            else:
+                import openpmd_api as io
+
+                if (
+                    input_file_ending in io.file_extensions
+                    and output_file_ending in io.file_extensions
+                ):
+                    snapshot_type = "openpmd"
+
         snapshot = Snapshot(
             input_file,
             input_directory,
diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 1ba0aab5..269b0342 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -67,7 +67,7 @@ def add_snapshot(
         input_directory,
         output_file,
         output_directory,
-        snapshot_type="numpy",
+        snapshot_type=None,
     ):
         """
         Add a snapshot to the data pipeline.

From 07126f1b7584a48313ad56066012e16338e28417 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 09:20:47 +0100
Subject: [PATCH 17/23] Added new temporary framework to examples

---
 examples/advanced/ex02_shuffle_data.py | 92 +++++++++++++++++++++++++-
 examples/basic/ex01_train_network.py   | 23 ++++++-
 examples/basic/ex02_test_network.py    | 11 ++-
 examples/basic/ex03_preprocess_data.py |  2 +-
 4 files changed, 119 insertions(+), 9 deletions(-)

diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py
index db75d515..7f67f428 100644
--- a/examples/advanced/ex02_shuffle_data.py
+++ b/examples/advanced/ex02_shuffle_data.py
@@ -1,5 +1,3 @@
-import os
-
 import mala
 
 from mala.datahandling.data_repo import data_path
@@ -10,6 +8,96 @@
 easily done in memory.
 """
 
+# There are two ways to perform shuffling. One can either shuffle snapshots
+# on the fly and create snapshot-like, temporary files to be directly used
+# for training, or save them to permanent files and load them as needed.
+# The latter is useful when multiple networks are to be trained with the same
+# shuffled data; the former may save hard-disk space by deleting
+# shuffled data right after usage.
+
+
+# 1. Shuffling snapshots on the fly.
+
+# Define parameters for network training after shuffling.
+parameters = mala.Parameters()
+parameters.verbosity = 1
+parameters.data.input_rescaling_type = "feature-wise-standard"
+parameters.data.output_rescaling_type = "minmax"
+parameters.network.layer_activations = ["ReLU"]
+parameters.running.max_number_epochs = 100
+parameters.running.mini_batch_size = 40
+parameters.running.learning_rate = 0.00001
+parameters.running.optimizer = "Adam"
+parameters.targets.target_type = "LDOS"
+parameters.targets.ldos_gridsize = 11
+parameters.targets.ldos_gridspacing_ev = 2.5
+parameters.targets.ldos_gridoffset_ev = -5
+parameters.data.use_lazy_loading = True
+
+# This ensures reproducibility of the created data sets.
+parameters.data.shuffling_seed = 1234
+
+data_shuffler = mala.DataShuffler(parameters)
+
+# Instead of precomputed snapshots, on-the-fly calculated ones may be used
+# as well. Here, we use precomputed ones.
+data_shuffler.add_snapshot(
+    "Be_snapshot0.in.npy",
+    data_path,
+    "Be_snapshot0.out.npy",
+    data_path,
+)
+data_shuffler.add_snapshot(
+    "Be_snapshot1.in.npy",
+    data_path,
+    "Be_snapshot1.out.npy",
+    data_path,
+)
+
+# Shuffle the snapshots using the "shuffle_to_temporary" option.
+data_shuffler.shuffle_snapshots(
+    complete_save_path=data_path,
+    save_name="Be_shuffled*",
+    shuffle_to_temporary=True,
+    number_of_shuffled_snapshots=2,
+)
+
+# The data shuffler provides a list with these temporary, snapshot-like
+# objects that can then be used in network training.
+
+data_handler = mala.DataHandler(parameters)
+data_handler.add_snapshot(
+    data_shuffler.temporary_shuffled_snapshots[0].input_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[0].input_npy_directory,
+    data_shuffler.temporary_shuffled_snapshots[0].output_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[0].output_npy_directory,
+    "tr",
+)
+data_handler.add_snapshot(
+    data_shuffler.temporary_shuffled_snapshots[1].input_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[1].input_npy_directory,
+    data_shuffler.temporary_shuffled_snapshots[1].output_npy_file,
+    data_shuffler.temporary_shuffled_snapshots[1].output_npy_directory,
+    "va",
+)
+data_handler.prepare_data()
+
+# Now we can train a network in the standard MALA way.
+parameters.network.layer_sizes = [
+    data_handler.input_dimension,
+    100,
+    data_handler.output_dimension,
+]
+network = mala.Network(parameters)
+trainer = mala.Trainer(parameters, network, data_handler)
+trainer.train_network()
+
+# Afterwards, the temporary files should be deleted.
+data_shuffler.delete_temporary_shuffled_snapshots()
+
+# 2. Shuffling snapshots to permanent files.
+# After shuffling, the standard approach to MALA data loading can/should
+# be used to train a network.
 
 parameters = mala.Parameters()
diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py
index c7a5ca78..a2542302 100644
--- a/examples/basic/ex01_train_network.py
+++ b/examples/basic/ex01_train_network.py
@@ -48,16 +48,33 @@
 # Data has to be added to the MALA workflow. The central object for this
 # is the DataHandler class, which takes care of all data needs. After data
 # has been added, it is loaded and scaled with the prepare_data function.
-####################
+#
+# There are two ways to add descriptor (=input) data. One is to provide
+# precomputed numpy files. This makes sense when training multiple models
+# with the same data. Alternatively, MALA generated JSON files containing
+# information about a simulation can be provided, with which MALA will
+# automatically generate volumetric descriptor input.
This is useful when, e.g. +# descriptor hyperparameters are to be optimized. In order to use that +# feature, an existing GPU-ready LAMMPS version is recommended. Creation +# of MALA JSON files is shown in ex03_preprocess_data. +##################### data_handler = mala.DataHandler(parameters) -# Add a snapshot we want to use in to the list. + +# Add precomputed snapshots. data_handler.add_snapshot( "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr" ) +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va" + "Be_snapshot1.info.json", + data_path, + "Be_snapshot1.out.npy", + data_path, + "va", ) + data_handler.prepare_data() #################### diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 0d90dfe7..8675a6dc 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -40,21 +40,26 @@ # When preparing the data, make sure to select "reparametrize_scalers=False", # since data scaling was initialized during model training. #################### + +# Add precomputed snapshots. data_handler.add_snapshot( "Be_snapshot2.in.npy", data_path, "Be_snapshot2.out.npy", data_path, "te", - calculation_output_file=os.path.join(data_path, "Be_snapshot2.out"), + calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), ) + +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot3.in.npy", + "Be_snapshot3.info.json", data_path, "Be_snapshot3.out.npy", data_path, "te", - calculation_output_file=os.path.join(data_path, "Be_snapshot3.out"), + calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), ) data_handler.prepare_data(reparametrize_scaler=False) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index e9f4691a..8476ce49 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -49,7 +49,7 @@ # Further, via the simulation_output_input_* keywords, calculation output # can be processed from the original simulation *.out output files into # more convenient *.json files that can be used in their stead. This saves -# on disk space. +# on disk space and makes the process more reproducible. # To only process parts of the data, omit/add descriptor_input*, target_input_* # and simulation_output_* at your leisure. # Make sure to set the correct units - for QE, this should always be From a7d7bd3ff39cd4efb1f8bac5a68b837b89b4c196 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 11:59:30 +0100 Subject: [PATCH 18/23] Fixed examples --- examples/advanced/ex02_shuffle_data.py | 8 ++++++++ examples/basic/ex01_train_network.py | 24 +++++++++++++++++------- examples/basic/ex02_test_network.py | 24 ++++++++++++++++++++---- examples/basic/ex03_preprocess_data.py | 19 +++++++++++++++++-- 4 files changed, 62 insertions(+), 13 deletions(-) diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py index 7f67f428..4631a934 100644 --- a/examples/advanced/ex02_shuffle_data.py +++ b/examples/advanced/ex02_shuffle_data.py @@ -53,6 +53,14 @@ "Be_snapshot1.out.npy", data_path, ) +# On-the-fly snapshots can be added as well. 
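+# Note that descriptor hyperparameters (e.g. the bispectrum settings on
+# parameters.descriptors) have to be set beforehand in that case, since the
+# descriptors are only computed once shuffling is triggered.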
+# data_shuffler.add_snapshot( +# "Be_snapshot2.info.json", +# data_path, +# "Be_snapshot2.out.npy", +# data_path, +# ) + # Shuffle the snapshots using the "shuffle_to_temporary" option. data_shuffler.shuffle_snapshots( diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index a2542302..600840b5 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -65,15 +65,25 @@ data_handler.add_snapshot( "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr" ) -# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors -# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot1.info.json", - data_path, - "Be_snapshot1.out.npy", - data_path, - "va", + "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va" ) +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". +# data_handler.add_snapshot( +# "Be_snapshot0.info.json", +# data_path, +# "Be_snapshot0.out.npy", +# data_path, +# "tr", +# ) +# data_handler.add_snapshot( +# "Be_snapshot1.info.json", +# data_path, +# "Be_snapshot1.out.npy", +# data_path, +# "va", +# ) data_handler.prepare_data() diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 8675a6dc..d02a5874 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -50,17 +50,33 @@ "te", calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), ) - -# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors -# will be performed "on-the-fly". data_handler.add_snapshot( - "Be_snapshot3.info.json", + "Be_snapshot3.in.npy", data_path, "Be_snapshot3.out.npy", data_path, "te", calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), ) + +# Add snapshots with "raw" (=MALA formatted) JSON, computation of descriptors +# will be performed "on-the-fly". +# data_handler.add_snapshot( +# "Be_snapshot2.info.json", +# data_path, +# "Be_snapshot2.out.npy", +# data_path, +# "te", +# calculation_output_file=os.path.join(data_path, "Be_snapshot2.info.json"), +# ) +# data_handler.add_snapshot( +# "Be_snapshot3.info.json", +# data_path, +# "Be_snapshot3.out.npy", +# data_path, +# "te", +# calculation_output_file=os.path.join(data_path, "Be_snapshot3.info.json"), +# ) data_handler.prepare_data(reparametrize_scaler=False) diff --git a/examples/basic/ex03_preprocess_data.py b/examples/basic/ex03_preprocess_data.py index 8476ce49..4f8f014b 100644 --- a/examples/basic/ex03_preprocess_data.py +++ b/examples/basic/ex03_preprocess_data.py @@ -51,7 +51,9 @@ # more convenient *.json files that can be used in their stead. This saves # on disk space and makes the process more reproducible. # To only process parts of the data, omit/add descriptor_input*, target_input_* -# and simulation_output_* at your leisure. +# and simulation_output_* at your leisure. This is especially useful if you, +# e.g., do not need to convert the descriptor data, since it will be +# calculated on-the-fly during training. # Make sure to set the correct units - for QE, this should always be # 1/(Ry*Bohr^3). #################### @@ -60,6 +62,7 @@ outfile = os.path.join(data_path, "Be_snapshot0.out") ldosfile = os.path.join(data_path, "cubes/tmp.pp*Be_ldos.cube") +# Converting a snapshot for training on precomputed descriptor data. 
 data_converter.add_snapshot(
     descriptor_input_type="espresso-out",
     descriptor_input_path=outfile,
     target_input_type=".cube",
     target_input_path=ldosfile,
     simulation_output_type="espresso-out",
     simulation_output_path=outfile,
     target_units="1/(Ry*Bohr^3)",
 )
 
+# Converting a snapshot for training with on-the-fly descriptor calculation.
+# data_converter.add_snapshot(
+#     target_input_type=".cube",
+#     target_input_path=ldosfile,
+#     simulation_output_type="espresso-out",
+#     simulation_output_path=outfile,
+#     target_units="1/(Ry*Bohr^3)",
+# )
+
+
 ####################
 # 3. Converting the data
 # To convert the data we now simply have to call the convert_snapshot function.
@@ -82,9 +95,11 @@
 ####################
 
 data_converter.convert_snapshots(
-    descriptor_save_path="./",
     target_save_path="./",
     simulation_output_save_path="./",
+    # The next line should be omitted if the descriptor data is to be
+    # calculated on-the-fly during training.
+    descriptor_save_path="./",
     naming_scheme="Be_snapshot*.npy",
     descriptor_calculation_kwargs={"working_directory": data_path},
 )

From f0738ce564d7ce9b5563f13ffcd5094d9b966e1c Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 20:12:17 +0100
Subject: [PATCH 19/23] Added documentation

---
 docs/source/advanced_usage/trainingmodel.rst | 15 +++++++--
 docs/source/basic_usage/more_data.rst        | 13 ++++++--
 docs/source/basic_usage/trainingmodel.rst    | 35 +++++++++++++++++---
 examples/basic/ex05_run_predictions.py       |  2 +-
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst
index 9b118d86..dfa7b38a 100644
--- a/docs/source/advanced_usage/trainingmodel.rst
+++ b/docs/source/advanced_usage/trainingmodel.rst
@@ -170,10 +170,11 @@ data sets have to be saved - in-memory implementations are currently developed.
 
 To use the data shuffling (also shown in example
 ``advanced/ex02_shuffle_data.py``), you can use the ``DataShuffler`` class.
-The syntax is very easy, you create a ``DataShufller`` object,
+The syntax is very easy: you create a ``DataShuffler`` object,
 which provides the same ``add_snapshot`` functionalities as the ``DataHandler``
-object, and shuffle the data once you have added all snapshots in question,
-i.e.,
+object, and shuffle the data once you have added all snapshots in question.
+Just as with the ``DataHandler`` class, on-the-fly calculation of bispectrum
+descriptors is supported.
 
 .. code-block:: python
 
@@ -187,6 +188,14 @@ i.e.,
     data_shuffler.shuffle_snapshots(complete_save_path="../",
                                     save_name="Be_shuffled*")
 
+By using the ``shuffle_to_temporary`` keyword, you can shuffle the data to
+temporary files, which can be deleted after the training run. This is useful
+if you want to shuffle the data right before training and do not plan to re-use
+shuffled data files for multiple training runs. As detailed in
+``advanced/ex02_shuffle_data.py``, access to temporary files is provided via
+``data_shuffler.temporary_shuffled_snapshots[...]``, which is a list containing
+``mala.Snapshot`` objects.
+
 The seed ``parameters.data.shuffling_seed`` ensures reproducibility of data
 sets. The ``shuffle_snapshots`` function has a path handling ability akin to
 the ``DataConverter`` class. Further, via the ``number_of_shuffled_snapshots``
diff --git a/docs/source/basic_usage/more_data.rst b/docs/source/basic_usage/more_data.rst
index 4232a79b..89484510 100644
--- a/docs/source/basic_usage/more_data.rst
+++ b/docs/source/basic_usage/more_data.rst
@@ -49,7 +49,13 @@ MALA can be used to process raw data into ready-to-use data for ML-DFT model
 creation.
 For this, the ``DataConverter`` class can be used, as also shown in
 the example ``basic/ex03_preprocess_data``.
 The first thing when converting data is to select how the data should be
-processed. Up until now, MALA operates with bispectrum descriptors as
+processed. As outlined in :doc:`the training documentation `,
+there are two ways to provide descriptor data to MALA models. One can either
+precompute files containing descriptors with the ``DataConverter`` class or
+compute descriptor data on-the-fly by providing MALA generated JSON files
+containing simulation output information. These JSON files can also be
+generated by the ``DataConverter`` class.
+Up until now, MALA operates with bispectrum descriptors as
 input data (=descriptors) and LDOS as output data (=targets). Their
 calculation is configured via
@@ -73,6 +79,8 @@ values are included in the energy grid upon which the LDOS is sampled,
 ``ldos_gridoffset_ev`` determines the lowest energy value sampled.
 These values are chosen for the ``pp.x`` simulation and have to be given
 here.
+If descriptors are precomputed, then hyperparameters for their calculation
+have to be provided.
 For the bispectrum calculation, ``bispectrum_cutoff`` gives the radius of the
 cutoff sphere from which information on the atomic structure is incorporated
 into the bispectrum descriptor vector at each point in space, whereas
@@ -111,7 +119,8 @@ respectively, and the ``target_units`` will always be ``"1/(Ry*Bohr^3)"``.
 The paths have to be modified accordingly. ``simulation_output_*`` refers
 to the calculation output file - MALA provides an interface to condense the
 entire, verbose simulation output to ``.json`` files for further
-processing. In the preceding section, we had to specify calculation output
+processing or on-the-fly descriptor calculation.
+In the preceding section, we had to specify calculation output
 files a number of times - instead, we can use the reduced ``.json`` files
 if we let them be created by the ``DataConverter`` class.
 
diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst
index 53cb8a8d..a85a4a8f 100644
--- a/docs/source/basic_usage/trainingmodel.rst
+++ b/docs/source/basic_usage/trainingmodel.rst
@@ -89,10 +89,14 @@ As with any ML library, MALA is a data-driven framework. So before we can
 train a model, we need to add data. The central object to manage data for
 any MALA workflow is the ``DataHandler`` class.
 
-MALA manages data "per snapshot". One snapshot is one atomic configuration,
-for which volumetric input and output data has been calculated. Data has to
-be added to the ``DataHandler`` object per snapshot, pointing to the
-where the volumetric data files are saved on disk. This is done via
+MALA manages data "per snapshot". One snapshot is an atomic configuration with
+associated volumetric data. Snapshots have to be added to the ``DataHandler``
+object. There are two ways to provide snapshot data, which are selected by
+providing the respective types of data files.
+
+1. Precomputed descriptors: The LDOS is sampled and the volumetric descriptor
+   data is precomputed into either OpenPMD or numpy files
+   (as described :doc:`here `), and both can be loaded for training.
 
 .. code-block:: python
 
     data_handler = mala.DataHandler(parameters)
     data_handler.add_snapshot("Be_snapshot0.in.npy", data_path,
                               "Be_snapshot0.out.npy", data_path, "tr")
@@ -102,12 +106,33 @@ where the volumetric data files are saved on disk. This is done via
     data_handler.add_snapshot("Be_snapshot1.in.npy", data_path,
                               "Be_snapshot1.out.npy", data_path, "va")
 
+2. On-the-fly descriptors: The LDOS is sampled into either OpenPMD or numpy
+   files, while the volumetric descriptor data is computed on-the-fly during
+   training or shuffling. The starting point for the descriptor calculation
+   in this case is the simulation output saved in a JSON file. This mode is
+   only recommended if a GPU-enabled LAMMPS version is available. If this
+   route is used, then descriptor calculation hyperparameters need to be set
+   before adding snapshots; see the :doc:`data conversion manual ` for
+   details.
+
+   .. code-block:: python
+
+        # Bispectrum parameters.
+        parameters.descriptors.descriptor_type = "Bispectrum"
+        parameters.descriptors.bispectrum_twojmax = 10
+        parameters.descriptors.bispectrum_cutoff = 4.67637
+
+        data_handler = mala.DataHandler(parameters)
+        data_handler.add_snapshot("Be_snapshot0.info.json", data_path,
+                                  "Be_snapshot0.out.npy", data_path, "tr")
+        data_handler.add_snapshot("Be_snapshot1.info.json", data_path,
+                                  "Be_snapshot1.out.npy", data_path, "va")
+
 The ``"tr"`` and ``"va"`` flags signal that the respective snapshots are added
 as training and validation data, respectively. Training data is data the
 model is directly tuned on; validation data is data used to verify the model
 performance during the run time and make sure that no overfitting occurs.
 After data has been added to the ``DataHandler``, it has to be actually loaded
-and scaled via
+(or in the case of on-the-fly usage, computed) and scaled via
 
 .. code-block:: python
 
diff --git a/examples/basic/ex05_run_predictions.py b/examples/basic/ex05_run_predictions.py
index 05deb857..a0b193c9 100644
--- a/examples/basic/ex05_run_predictions.py
+++ b/examples/basic/ex05_run_predictions.py
@@ -11,7 +11,7 @@
 configurations. Either execute ex01 before executing this one or download the
 appropriate model from the provided test data repo.
 
-REQUIRES LAMMPS (and potentiall the total energy module).
+REQUIRES LAMMPS (and potentially the total energy module).
 """
 
 model_name = "Be_model"

From d359057c84c6d0b1a588a7e00f9b08f1de913ccc Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Tue, 14 Jan 2025 20:13:56 +0100
Subject: [PATCH 20/23] Small adjustment in example

---
 examples/advanced/ex02_shuffle_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/advanced/ex02_shuffle_data.py b/examples/advanced/ex02_shuffle_data.py
index 4631a934..a56ebdfc 100644
--- a/examples/advanced/ex02_shuffle_data.py
+++ b/examples/advanced/ex02_shuffle_data.py
@@ -24,7 +24,9 @@
 parameters.data.input_rescaling_type = "feature-wise-standard"
 parameters.data.output_rescaling_type = "minmax"
 parameters.network.layer_activations = ["ReLU"]
-parameters.running.max_number_epochs = 100
+
+# No real training, just showing how shuffling directly before training works.
+parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 parameters.running.optimizer = "Adam" From 9d87a2f0debdba4b099711f3513bc7df5d91e699 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 21:06:43 +0100 Subject: [PATCH 21/23] Implemented rudimentary json+openpmd --- mala/datahandling/data_handler.py | 57 +++++++++++++++---- mala/datahandling/data_handler_base.py | 15 ++++- mala/datahandling/data_shuffler.py | 5 ++ mala/datahandling/lazy_load_dataset.py | 12 ++++ mala/datahandling/lazy_load_dataset_single.py | 20 +++++++ .../multi_lazy_load_data_loader.py | 31 ++++++++++ 6 files changed, 128 insertions(+), 12 deletions(-) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 5823d459..35dffcbf 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -595,7 +595,10 @@ def __load_data(self, function, data_type): # If the input for the descriptors is actually a JSON # file then we need to calculate the descriptors. - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) file = snapshot.temporary_input_file else: @@ -623,6 +626,22 @@ def __load_data(self, function, data_type): file, units=units ).reshape([gs_new, feature_dimension]) ) + elif snapshot.snapshot_type == "json+openpmd": + if data_type == "inputs": + calculator.read_from_numpy_file( + file, + units=units, + array=getattr(self, array)[ + gs_old : gs_old + gs_new, : + ], + reshape=True, + ) + else: + getattr(self, array)[gs_old : gs_old + gs_new] = ( + calculator.read_from_openpmd_file( + file, units=units + ).reshape([gs_new, feature_dimension]) + ) else: raise Exception("Unknown snapshot file type.") snapshot_counter += 1 @@ -726,14 +745,20 @@ def __build_datasets(self): # already been built during parametrization, for all other # snapshot types, this has to be done here. 
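+            # (Scaler parametrization only covers the training data, which is
+            # why temporary inputs for validation and test snapshots are
+            # computed at this point.)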
if snapshot.snapshot_function == "va": - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) self.test_data_sets[0].add_snapshot_to_dataset(snapshot) @@ -780,7 +805,10 @@ def __build_datasets(self): self._use_ddp, ) ) - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) if snapshot.snapshot_function == "te": @@ -798,7 +826,10 @@ def __build_datasets(self): input_requires_grad=True, ) ) - if snapshot.snapshot_type == "json+numpy": + if ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) else: @@ -898,7 +929,10 @@ def __parametrize_scalers(self): ) ) ) - elif snapshot.snapshot_type == "json+numpy": + elif ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): self._calculate_temporary_inputs(snapshot) tmp = self.descriptor_calculator.read_from_numpy_file( snapshot.temporary_input_file, @@ -956,7 +990,10 @@ def __parametrize_scalers(self): ), units=snapshot.output_units, ) - elif snapshot.snapshot_type == "openpmd": + elif ( + snapshot.snapshot_type == "openpmd" + or snapshot.snapshot_type == "json+openpmd" + ): tmp = self.target_calculator.read_from_openpmd_file( os.path.join( snapshot.output_npy_directory, @@ -995,9 +1032,9 @@ def __parametrized_load_training_data(self): ) for snapshot in self.parameters.snapshot_directories_list: # Data scaling is only performed on the training data sets. 
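+            # Only training snapshots therefore need their temporary inputs
+            # created before scaling; validation and test data are handled
+            # when the data sets are built.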
- if ( - snapshot.snapshot_function == "tr" - and snapshot.snapshot_type == "json+numpy" + if snapshot.snapshot_function == "tr" and ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" ): self._calculate_temporary_inputs(snapshot) else: diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index a69dd879..8c2b6141 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -164,6 +164,11 @@ def add_snapshot( and output_file_ending in io.file_extensions ): snapshot_type = "openpmd" + if ( + input_file_ending == "json" + and output_file_ending in io.file_extensions + ): + snapshot_type = "json+openpmd" snapshot = Snapshot( input_file, @@ -242,7 +247,10 @@ def _check_snapshots(self, comm=None): ), comm=comm, ) - elif snapshot.snapshot_type == "json+numpy": + elif ( + snapshot.snapshot_type == "json+numpy" + or snapshot.snapshot_type == "json+openpmd" + ): tmp_dimension = ( self.descriptor_calculator.read_dimensions_from_json( os.path.join( @@ -291,7 +299,10 @@ def _check_snapshots(self, comm=None): ) ) ) - elif snapshot.snapshot_type == "openpmd": + elif ( + snapshot.snapshot_type == "openpmd" + or snapshot.snapshot_type == "json+openpmd" + ): tmp_dimension = ( self.target_calculator.read_dimensions_from_openpmd_file( os.path.join( diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 269b0342..2e0799c1 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -928,6 +928,11 @@ def shuffle_snapshots( permutations, file_ending, ) + elif snapshot_type == "json+openpmd": + raise Exception( + "Shuffling from JSON files and OpenPMD is " + "currently not supported." + ) else: raise Exception("Unknown snapshot type: {}".format(snapshot_type)) diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index 78d11431..10c00528 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -192,6 +192,18 @@ def get_new_data(self, file_index): ) ) + elif self._snapshot_list[file_index].snapshot_type == "json+openpmd": + self.input_data = self._descriptor_calculator.read_from_numpy_file( + self._snapshot_list[file_index].temporary_input_file, + units=self._snapshot_list[file_index].input_units, + ) + self.output_data = self._target_calculator.read_from_openpmd_file( + os.path.join( + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, + ) + ) + # Transform the data. 
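+        # Both arrays are loaded flat and are reshaped below to
+        # (grid_size, feature_dimension) before further processing.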
self.input_data = self.input_data.reshape( [self._snapshot_list[file_index].grid_size, self._input_dimension] diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 1baef9ea..e775a0cc 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -171,6 +171,26 @@ def allocate_shared_mem(self): ) ) + self.output_shape, self.output_dtype = ( + self.target_calculator.read_dimensions_from_openpmd_file( + os.path.join( + self.snapshot.output_npy_directory, + self.snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) + elif self.snapshot.snapshot_type == "json+openpmd": + self.input_shape = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + self.snapshot.input_npy_directory, + self.snapshot.input_npy_file, + ), + ) + ) + self.input_dtype = np.dtype(np.float32) + self.output_shape, self.output_dtype = ( self.target_calculator.read_dimensions_from_openpmd_file( os.path.join( diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index 35b02601..993c4170 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -183,6 +183,24 @@ def load_snapshot_to_shm( ) ) + output_shape, output_dtype = ( + target_calculator.read_dimensions_from_openpmd_file( + os.path.join( + snapshot.output_npy_directory, + snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) + elif snapshot.snapshot_type == "json+openpmd": + input_shape = descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + ) + input_dtype = np.dtype(np.float32) + output_shape, output_dtype = ( target_calculator.read_dimensions_from_openpmd_file( os.path.join( @@ -267,6 +285,19 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) + elif snapshot.snapshot_type == "json+openpmd": + descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + array=input_data, + ) + target_calculator.read_from_openpmd_file( + os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ), + units=snapshot.output_units, + array=output_data, + ) else: raise Exception("Invalid snapshot type selected.") From d1960d42bd42b874ddf08e49c1c4f8bab6aa3684 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 14 Jan 2025 21:30:02 +0100 Subject: [PATCH 22/23] Added tests for on-the-fly calculations --- test/on_the_fly_test.py | 180 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 test/on_the_fly_test.py diff --git a/test/on_the_fly_test.py b/test/on_the_fly_test.py new file mode 100644 index 00000000..28d20916 --- /dev/null +++ b/test/on_the_fly_test.py @@ -0,0 +1,180 @@ +import os + +import mala +import numpy as np + +from mala.datahandling.data_repo import data_path + +# Control the accuracies for the postprocessing routines. +accuracy_predictions = 1.0 + + +class TestOnTheFly: + """ + Test the on-the-fly descriptor calculation capabilities. + + This relates to training, testing and shuffling descriptor data that has + been computed on the fly. 
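+
+    The bispectrum settings used throughout these tests are deliberately
+    coarse; they carry no physical meaning and only keep the descriptor
+    calculation fast.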
+    """
+
+    @staticmethod
+    def __setup_training(lazy_loading, checkpoint_name=None):
+        test_parameters = mala.Parameters()
+        test_parameters.use_lammps = False
+        test_parameters.data.data_splitting_type = "by_snapshot"
+        test_parameters.data.input_rescaling_type = "feature-wise-standard"
+        test_parameters.data.output_rescaling_type = "minmax"
+        test_parameters.data.use_lazy_loading = lazy_loading
+        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.running.max_number_epochs = 5
+        test_parameters.running.mini_batch_size = 40
+        test_parameters.running.learning_rate = 0.00001
+        test_parameters.running.optimizer = "Adam"
+        if checkpoint_name:
+            test_parameters.running.checkpoints_each_epoch = 5
+            test_parameters.running.checkpoint_name = checkpoint_name
+
+        test_parameters.descriptors.descriptor_type = "Bispectrum"
+        # These values make no physical sense, but are chosen to make the test
+        # fast enough to be executed without LAMMPS.
+        test_parameters.descriptors.bispectrum_twojmax = 2
+        test_parameters.descriptors.bispectrum_cutoff = 1.5
+
+        # Load data.
+        data_handler = mala.DataHandler(test_parameters)
+        data_handler.add_snapshot(
+            "Be_snapshot0.info.json",
+            data_path,
+            "Be_snapshot0.out.npy",
+            data_path,
+            "tr",
+        )
+        data_handler.add_snapshot(
+            "Be_snapshot1.info.json",
+            data_path,
+            "Be_snapshot1.out.npy",
+            data_path,
+            "va",
+        )
+        data_handler.prepare_data()
+
+        # Train a network.
+        test_parameters.network.layer_sizes = [
+            data_handler.input_dimension,
+            100,
+            data_handler.output_dimension,
+        ]
+
+        # Setup network and trainer.
+        test_network = mala.Network(test_parameters)
+        test_trainer = mala.Trainer(
+            test_parameters, test_network, data_handler
+        )
+
+        return test_trainer
+
+    @staticmethod
+    def __resume_checkpoint(checkpoint_name, actual_max_epochs):
+        loaded_params, loaded_network, new_datahandler, new_trainer = (
+            mala.Trainer.load_run(checkpoint_name)
+        )
+        loaded_params.running.max_number_epochs = actual_max_epochs
+        return new_trainer
+
+    def test_training(self):
+        test_trainer = self.__setup_training(False)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading(self):
+        test_trainer = self.__setup_training(True)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading_interrupted(self):
+        test_trainer = self.__setup_training(True, "test_checkpoint_onthefly")
+        test_trainer.train_network()
+        test_trainer.data.delete_temporary_inputs()
+        test_trainer = self.__resume_checkpoint("test_checkpoint_onthefly", 10)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_training_lazy_loading_uninterrupted(self):
+        test_trainer = self.__setup_training(True, "test_checkpoint_onthefly")
+        test_trainer.train_network()
+        test_trainer = self.__resume_checkpoint("test_checkpoint_onthefly", 10)
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf
+
+    def test_shuffling(self):
+        test_parameters = mala.Parameters()
+        test_parameters.data.shuffling_seed = 1234
+        test_parameters.data.data_splitting_type = "by_snapshot"
+        test_parameters.data.input_rescaling_type = "feature-wise-standard"
+        test_parameters.data.output_rescaling_type = "minmax"
+        test_parameters.network.layer_activations = ["ReLU"]
+        test_parameters.running.max_number_epochs = 5
+        test_parameters.running.mini_batch_size = 40
+        test_parameters.running.learning_rate = 0.00001
+        test_parameters.running.optimizer = "Adam"
+        test_parameters.verbosity = 1
+        test_parameters.data.use_lazy_loading = True
+        data_shuffler = mala.DataShuffler(test_parameters)
+        test_parameters.descriptors.descriptor_type = "Bispectrum"
+        # These values make no physical sense, but are chosen to make the test
+        # fast enough to be executed without LAMMPS.
+        test_parameters.descriptors.bispectrum_twojmax = 2
+        test_parameters.descriptors.bispectrum_cutoff = 1.5
+
+        # Add the snapshots we want to use to the list.
+        data_shuffler.add_snapshot(
+            "Be_snapshot0.info.json",
+            data_path,
+            "Be_snapshot0.out.npy",
+            data_path,
+        )
+        data_shuffler.add_snapshot(
+            "Be_snapshot1.info.json",
+            data_path,
+            "Be_snapshot1.out.npy",
+            data_path,
+        )
+
+        # After shuffling, these snapshots can be loaded as regular snapshots
+        # for lazily loaded training.
+        data_shuffler.shuffle_snapshots(
+            "./", save_name="Be_shuffled*", shuffle_to_temporary=True
+        )
+        test_parameters.descriptors.descriptors_contain_xyz = True
+
+        # Train with shuffling.
+        data_handler = mala.DataHandler(test_parameters)
+        # Add the shuffled snapshots we want to use to the list.
+        data_handler.add_snapshot(
+            data_shuffler.temporary_shuffled_snapshots[0].input_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[0].input_npy_directory,
+            data_shuffler.temporary_shuffled_snapshots[0].output_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[0].output_npy_directory,
+            "tr",
+        )
+        data_handler.add_snapshot(
+            data_shuffler.temporary_shuffled_snapshots[1].input_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[1].input_npy_directory,
+            data_shuffler.temporary_shuffled_snapshots[1].output_npy_file,
+            data_shuffler.temporary_shuffled_snapshots[1].output_npy_directory,
+            "va",
+        )
+        data_handler.prepare_data()
+
+        test_parameters.network.layer_sizes = [
+            data_handler.input_dimension,
+            100,
+            data_handler.output_dimension,
+        ]
+
+        test_network = mala.Network(test_parameters)
+        test_trainer = mala.Trainer(
+            test_parameters, test_network, data_handler
+        )
+        test_trainer.train_network()
+        assert test_trainer.final_validation_loss < np.inf

From ac255b292ce172a4b674e162f1b325a6f2d788ff Mon Sep 17 00:00:00 2001
From: Lenz Fiedler
Date: Wed, 15 Jan 2025 10:31:03 +0100
Subject: [PATCH 23/23] Small adjustment in test

---
 test/on_the_fly_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/on_the_fly_test.py b/test/on_the_fly_test.py
index 28d20916..2074ceb4 100644
--- a/test/on_the_fly_test.py
+++ b/test/on_the_fly_test.py
@@ -20,7 +20,6 @@ class TestOnTheFly:
     @staticmethod
     def __setup_training(lazy_loading, checkpoint_name=None):
         test_parameters = mala.Parameters()
-        test_parameters.use_lammps = False
         test_parameters.data.data_splitting_type = "by_snapshot"
@@ -177,4 +176,5 @@ def test_shuffling(self):
             test_parameters, test_network, data_handler
         )
         test_trainer.train_network()
+        data_shuffler.delete_temporary_shuffled_snapshots()
         assert test_trainer.final_validation_loss < np.inf