diff --git a/docs/source/basic_usage/more_data.rst b/docs/source/basic_usage/more_data.rst index d643e8c6c..4232a79b7 100644 --- a/docs/source/basic_usage/more_data.rst +++ b/docs/source/basic_usage/more_data.rst @@ -99,8 +99,8 @@ and fill it with data, e.g., by descriptor_input_path=outfile, target_input_type=".cube", target_input_path=ldosfile, - additional_info_input_type="espresso-out", - additional_info_input_path=outfile, + simulation_output_type="espresso-out", + simulation_output_path=outfile, target_units="1/(Ry*Bohr^3)") The ``add_snapshot`` function can be called multiple times to add @@ -108,7 +108,7 @@ multiple snapshots to MALA. For regular Quantum ESPRESSO calculations, the ``descriptor_input_type`` and ``target_input_type`` will always be ``"espresso-out"`` and ``".cube"``, respectively, and the ``target_units`` will always be ``"1/(Ry*Bohr^3)"``. -The paths have to be modified accordingly. ``additional_info_input_*`` refers +The paths have to be modified accordingly. ``simulation_output_*`` refers to the calculation output file - MALA provides an interface to condense the entire, verbose simulation output to ``.json`` files for further processing. In the preceding section, we had to specify calculation output @@ -121,7 +121,7 @@ Once data is provided, the conversion itself is simple. data_converter.convert_snapshots(descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="Be_snapshot*.npy", descriptor_calculation_kwargs= {"working_directory": data_path}) diff --git a/examples/advanced/ex10_convert_numpy_openpmd.py b/examples/advanced/ex10_convert_numpy_openpmd.py index 7ebc22daa..22607b304 100644 --- a/examples/advanced/ex10_convert_numpy_openpmd.py +++ b/examples/advanced/ex10_convert_numpy_openpmd.py @@ -20,15 +20,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.npy".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -43,15 +43,15 @@ descriptor_input_path="converted_from_numpy_{}.in.h5".format(snapshot), target_input_type="openpmd", target_input_path="converted_from_numpy_{}.out.h5".format(snapshot), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="verify_against_original_numpy_data_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -84,15 +84,15 @@ target_input_path=os.path.join( data_path, "Be_snapshot{}.out.h5".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_openpmd_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, ) diff --git a/examples/basic/ex03_preprocess_data.py 
b/examples/basic/ex03_preprocess_data.py
index b0a104885..e9f4691af 100644
--- a/examples/basic/ex03_preprocess_data.py
+++ b/examples/basic/ex03_preprocess_data.py
@@ -46,12 +46,12 @@
 # Data conversion itself is simple. We select input and output data
 # to be converted, add this data snapshot-wise and tell MALA to
 # convert snapshots. Inputs and outputs can be processed individually.
-# Further, via the additional_info_input_* keywords, calculation output
+# Further, via the simulation_output_* keywords, calculation output
 # can be processed from the original simulation *.out output files into
 # more convenient *.json files that can be used in their stead. This saves
 # on disk space.
 # To only process parts of the data, omit/add descriptor_input*, target_input_*
-# and additional_info_input_* at your leisure.
+# and simulation_output_* at your leisure.
 # Make sure to set the correct units - for QE, this should always be
 # 1/(Ry*Bohr^3).
 ####################
@@ -65,8 +65,8 @@
     descriptor_input_path=outfile,
     target_input_type=".cube",
     target_input_path=ldosfile,
-    additional_info_input_type="espresso-out",
-    additional_info_input_path=outfile,
+    simulation_output_type="espresso-out",
+    simulation_output_path=outfile,
     target_units="1/(Ry*Bohr^3)",
 )
@@ -84,7 +84,7 @@
 data_converter.convert_snapshots(
     descriptor_save_path="./",
     target_save_path="./",
-    additional_info_save_path="./",
+    simulation_output_save_path="./",
     naming_scheme="Be_snapshot*.npy",
     descriptor_calculation_kwargs={"working_directory": data_path},
 )
diff --git a/mala/datahandling/data_converter.py b/mala/datahandling/data_converter.py
index 21fb34cdb..89558fecf 100644
--- a/mala/datahandling/data_converter.py
+++ b/mala/datahandling/data_converter.py
@@ -12,7 +12,7 @@
 descriptor_input_types = ["espresso-out", "openpmd", "numpy"]
 target_input_types = [".cube", ".xsf", "openpmd", "numpy"]
-additional_info_input_types = ["espresso-out"]
+simulation_output_types = ["espresso-out"]
 
 
 class DataConverter:
@@ -78,7 +78,7 @@
         # Keep track of what has to be done by this data converter.
         self.__process_descriptors = False
         self.__process_targets = False
-        self.__process_additional_info = False
+        self.__process_simulation_output = False
 
     def add_snapshot(
         self,
@@ -86,8 +86,8 @@
         descriptor_input_path=None,
         target_input_type=None,
         target_input_path=None,
-        additional_info_input_type=None,
-        additional_info_input_path=None,
+        simulation_output_type=None,
+        simulation_output_path=None,
         descriptor_units=None,
         metadata_input_type=None,
         metadata_input_path=None,
@@ -114,22 +114,22 @@
         target_input_path : string
             Path of target data to be processed.
 
-        additional_info_input_type : string
+        simulation_output_type : string
             Type of additional info data to be processed.
-            See mala.datahandling.data_converter.additional_info_input_types
+            See mala.datahandling.data_converter.simulation_output_types
             for options.
 
-        additional_info_input_path : string
+        simulation_output_path : string
             Path of additional info data to be processed.
 
         metadata_input_type : string
             Type of additional metadata to be processed.
-            See mala.datahandling.data_converter.additional_info_input_types
+            See mala.datahandling.data_converter.simulation_output_types
             for options.
-            This is essentially the same as additional_info_input_type,
+            This is essentially the same as simulation_output_type,
             but will not affect saving; i.e., the data given here will
             only be saved in OpenPMD files, not saved separately.
- If additional_info_input_type is set, this argument will be + If simulation_output_type is set, this argument will be ignored. metadata_input_path : string @@ -161,20 +161,20 @@ def add_snapshot( raise Exception("Cannot process this type of target data.") self.__process_targets = True - if additional_info_input_type is not None: - metadata_input_type = additional_info_input_type - if additional_info_input_path is None: + if simulation_output_type is not None: + metadata_input_type = simulation_output_type + if simulation_output_path is None: raise Exception( "Cannot process additional info data with " "no path given." ) - if additional_info_input_type not in additional_info_input_types: + if simulation_output_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." ) - self.__process_additional_info = True + self.__process_simulation_output = True - metadata_input_path = additional_info_input_path + metadata_input_path = simulation_output_path if metadata_input_type is not None: if metadata_input_path is None: @@ -182,7 +182,7 @@ def add_snapshot( "Cannot process additional info data with " "no path given." ) - if metadata_input_type not in additional_info_input_types: + if metadata_input_type not in simulation_output_types: raise Exception( "Cannot process this type of additional info data." ) @@ -192,7 +192,7 @@ def add_snapshot( { "input": descriptor_input_path, "output": target_input_path, - "additional_info": additional_info_input_path, + "simulation_output": simulation_output_path, "metadata": metadata_input_path, } ) @@ -200,7 +200,7 @@ def add_snapshot( { "input": descriptor_input_type, "output": target_input_type, - "additional_info": additional_info_input_type, + "simulation_output": simulation_output_type, "metadata": metadata_input_type, } ) @@ -213,7 +213,7 @@ def convert_snapshots( complete_save_path=None, descriptor_save_path=None, target_save_path=None, - additional_info_save_path=None, + simulation_output_save_path=None, naming_scheme="ELEM_snapshot*.npy", starts_at=0, file_based_communication=False, @@ -231,7 +231,7 @@ def convert_snapshots( complete_save_path : string If not None: the directory in which all snapshots will be saved. Overwrites descriptor_save_path, target_save_path and - additional_info_save_path if set. + simulation_output_save_path if set. descriptor_save_path : string Directory in which to save descriptor data. @@ -239,7 +239,7 @@ def convert_snapshots( target_save_path : string Directory in which to save target data. - additional_info_save_path : string + simulation_output_save_path : string Directory in which to save additional info data. naming_scheme : string @@ -304,7 +304,7 @@ def convert_snapshots( if complete_save_path is not None: descriptor_save_path = complete_save_path target_save_path = complete_save_path - additional_info_save_path = complete_save_path + simulation_output_save_path = complete_save_path else: if self.__process_targets is True and target_save_path is None: raise Exception( @@ -318,8 +318,8 @@ def convert_snapshots( "No descriptor path specified, cannot process data." ) if ( - self.__process_additional_info is True - and additional_info_save_path is None + self.__process_simulation_output is True + and simulation_output_save_path is None ): raise Exception( "No additional info path specified, cannot " @@ -393,9 +393,9 @@ def convert_snapshots( snapshot_name = snapshot_name.replace("*", str(snapshot_number)) # Create the paths as needed. 
- if self.__process_additional_info: + if self.__process_simulation_output: info_path = os.path.join( - additional_info_save_path, snapshot_name + ".info.json" + simulation_output_save_path, snapshot_name + ".info.json" ) else: info_path = None @@ -454,7 +454,7 @@ def convert_snapshots( use_memmap=memmap, input_iteration=input_iteration, output_iteration=output_iteration, - additional_info_path=info_path, + simulation_output_info_path=info_path, use_fp64=use_fp64, ) @@ -479,7 +479,7 @@ def __convert_single_snapshot( target_calculator_kwargs, input_path=None, output_path=None, - additional_info_path=None, + simulation_output_info_path=None, use_memmap=None, output_iteration=None, input_iteration=None, @@ -719,11 +719,11 @@ def __convert_single_snapshot( del tmp_output # Parse and/or calculate the additional info. - if description["additional_info"] is not None: + if description["simulation_output"] is not None: # Parsing and saving is done using the target calculator. self.target_calculator.read_additional_calculation_data( - snapshot["additional_info"], description["additional_info"] + snapshot["simulation_output"], description["simulation_output"] ) self.target_calculator.write_additional_calculation_data( - additional_info_path + simulation_output_info_path ) diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 3b9521e44..5823d459d 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -6,7 +6,7 @@ import torch from torch.utils.data import TensorDataset -from mala.common.parallelizer import printout, barrier +from mala.common.parallelizer import printout, barrier, get_rank from mala.common.parameters import Parameters, DEFAULT_NP_DATA_DTYPE from mala.datahandling.data_handler_base import DataHandlerBase from mala.datahandling.data_scaler import DataScaler @@ -227,16 +227,8 @@ def prepare_data(self, reparametrize_scaler=True): printout("Initializing the data scalers.", min_verbosity=1) self.__parametrize_scalers() printout("Data scalers initialized.", min_verbosity=0) - elif ( - self.parameters.use_lazy_loading is False - and self.nr_training_data != 0 - ): - printout( - "Data scalers already initilized, loading data to RAM.", - min_verbosity=0, - ) - self.__load_data("training", "inputs") - self.__load_data("training", "outputs") + elif self.nr_training_data != 0: + self.__parametrized_load_training_data() # Build Datasets. printout("Build datasets.", min_verbosity=1) @@ -253,6 +245,11 @@ def prepare_data(self, reparametrize_scaler=True): # allows for parallel I/O. barrier() + # In the RAM case, there is no reason not to delete all temporary files + # now. + if self.parameters.use_lazy_loading is False: + self.delete_temporary_inputs() + def prepare_for_testing(self): """ Prepare DataHandler for usage within Tester class. @@ -595,6 +592,12 @@ def __load_data(self, function, data_type): snapshot.input_npy_directory, snapshot.input_npy_file ) units = snapshot.input_units + + # If the input for the descriptors is actually a JSON + # file then we need to calculate the descriptors. 
+ if snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) + file = snapshot.temporary_input_file else: file = os.path.join( snapshot.output_npy_directory, @@ -602,7 +605,10 @@ def __load_data(self, function, data_type): ) units = snapshot.output_units - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): calculator.read_from_numpy_file( file, units=units, @@ -716,11 +722,20 @@ def __build_datasets(self): self.training_data_sets[0].add_snapshot_to_dataset( snapshot ) + # For training snapshots, temporary files (if needed) have + # already been built during parametrization, for all other + # snapshot types, this has to be done here. if snapshot.snapshot_function == "va": + if snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) + self.validation_data_sets[0].add_snapshot_to_dataset( snapshot ) if snapshot.snapshot_function == "te": + if snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) + self.test_data_sets[0].add_snapshot_to_dataset(snapshot) # I don't think we need to mix them here. We can use the standard @@ -765,6 +780,9 @@ def __build_datasets(self): self._use_ddp, ) ) + if snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) + if snapshot.snapshot_function == "te": self.test_data_sets.append( LazyLoadDatasetSingle( @@ -780,6 +798,8 @@ def __build_datasets(self): input_requires_grad=True, ) ) + if snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) else: if self.nr_training_data != 0: @@ -878,6 +898,12 @@ def __parametrize_scalers(self): ) ) ) + elif snapshot.snapshot_type == "json+numpy": + self._calculate_temporary_inputs(snapshot) + tmp = self.descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + ) else: raise Exception("Unknown snapshot file type.") @@ -919,7 +945,10 @@ def __parametrize_scalers(self): for snapshot in self.parameters.snapshot_directories_list: # Data scaling is only performed on the training data sets. if snapshot.snapshot_function == "tr": - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp = self.target_calculator.read_from_numpy_file( os.path.join( snapshot.output_npy_directory, @@ -958,6 +987,27 @@ def __parametrize_scalers(self): printout("Output scaler parametrized.", min_verbosity=1) + def __parametrized_load_training_data(self): + if self.parameters.use_lazy_loading: + printout( + "Data scalers already initilized, preparing input data.", + min_verbosity=0, + ) + for snapshot in self.parameters.snapshot_directories_list: + # Data scaling is only performed on the training data sets. 
+ if ( + snapshot.snapshot_function == "tr" + and snapshot.snapshot_type == "json+numpy" + ): + self._calculate_temporary_inputs(snapshot) + else: + printout( + "Data scalers already initilized, loading data to RAM.", + min_verbosity=0, + ) + self.__load_data("training", "inputs") + self.__load_data("training", "outputs") + def __raw_numpy_to_converted_numpy( self, numpy_array, data_type="in", units=None ): diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index c141551fa..a69dd8790 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -2,11 +2,12 @@ from abc import ABC import os +import tempfile import numpy as np from mala.common.parameters import ParametersData, Parameters -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, get_rank, barrier from mala.targets.target import Target from mala.descriptors.descriptor import Descriptor from mala.datahandling.snapshot import Snapshot @@ -105,7 +106,7 @@ def add_snapshot( output_units="1/(eV*A^3)", input_units="None", calculation_output_file="", - snapshot_type="numpy", + snapshot_type=None, ): """ Add a snapshot to the data pipeline. @@ -145,6 +146,25 @@ def add_snapshot( Either "numpy" or "openpmd" based on what kind of files you want to operate on. """ + # Try to guess snapshot type if no information was provided. + if snapshot_type is None: + input_file_ending = input_file.split(".")[-1] + output_file_ending = output_file.split(".")[-1] + + if input_file_ending == "npy" and output_file_ending == "npy": + snapshot_type = "numpy" + + elif input_file_ending == "json" and output_file_ending == "npy": + snapshot_type = "json+numpy" + else: + import openpmd_api as io + + if ( + input_file_ending in io.file_extensions + and output_file_ending in io.file_extensions + ): + snapshot_type = "openpmd" + snapshot = Snapshot( input_file, input_directory, @@ -166,6 +186,21 @@ def clear_data(self): """ self.parameters.snapshot_directories_list = [] + def delete_temporary_inputs(self): + """ + Delete temporary data files. + + These may have been created during a training or testing process + when using atomic positions for on-the-fly calculation of descriptors + rather than precomputed data files. 
+ """ + if get_rank() == 0: + for snapshot in self.parameters.snapshot_directories_list: + if snapshot.temporary_input_file is not None: + if os.path.isfile(snapshot.temporary_input_file): + os.remove(snapshot.temporary_input_file) + barrier() + ############################## # Private methods ############################## @@ -207,6 +242,15 @@ def _check_snapshots(self, comm=None): ), comm=comm, ) + elif snapshot.snapshot_type == "json+numpy": + tmp_dimension = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + ) else: raise Exception("Unknown snapshot file type.") @@ -235,7 +279,10 @@ def _check_snapshots(self, comm=None): snapshot.output_npy_directory, min_verbosity=1, ) - if snapshot.snapshot_type == "numpy": + if ( + snapshot.snapshot_type == "numpy" + or snapshot.snapshot_type == "json+numpy" + ): tmp_dimension = ( self.target_calculator.read_dimensions_from_numpy_file( os.path.join( @@ -274,3 +321,28 @@ def _check_snapshots(self, comm=None): if firstsnapshot: firstsnapshot = False + + def _calculate_temporary_inputs(self, snapshot: Snapshot): + if snapshot.temporary_input_file is not None: + if not os.path.isfile(snapshot.temporary_input_file): + snapshot.temporary_input_file = None + + if snapshot.temporary_input_file is None: + snapshot.temporary_input_file = tempfile.NamedTemporaryFile( + delete=False, + prefix=snapshot.input_npy_file.split(".")[0], + suffix=".in.npy", + dir=snapshot.input_npy_directory, + ).name + tmp, grid = self.descriptor_calculator.calculate_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ) + ) + if self.parameters._configuration["mpi"]: + tmp = self.descriptor_calculator.gather_descriptors(tmp) + + if get_rank() == 0: + np.save(snapshot.temporary_input_file, tmp) + barrier() diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 660a1fec4..269b0342c 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -3,12 +3,14 @@ import os import numpy as np +import tempfile +import mala from mala.common.parameters import ( Parameters, DEFAULT_NP_DATA_DTYPE, ) -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, parallel_warn from mala.common.physical_data import PhysicalData from mala.datahandling.data_handler_base import DataHandlerBase from mala.common.parallelizer import get_comm @@ -32,6 +34,17 @@ class DataShuffler(DataHandlerBase): target_calculator : mala.targets.target.Target Used to do unit conversion on output data. If None, then one will be created by this class. + + Attributes + ---------- + temporary_shuffled_snapshots : list + A list containing snapshot objects of temporary, snapshot-like + shuffled data files. By default, this list is empty. If the + function "shuffle_snapshots_temporary" is used, it will be populated + with temporary files saved to hard drive, which can be deleted + after model training. Please note that the "snapshot_function", + "input_units", "output_units" and "calculation_output" fields of the + snapshots within this list """ def __init__( @@ -45,15 +58,8 @@ def __init__( target_calculator=target_calculator, descriptor_calculator=descriptor_calculator, ) - if self.descriptor_calculator.parameters.descriptors_contain_xyz: - printout( - "Disabling XYZ-cutting from descriptor data for " - "shuffling. If needed, please re-enable afterwards." 
- ) - self.descriptor_calculator.parameters.descriptors_contain_xyz = ( - False - ) self._data_points_to_remove = None + self.temporary_shuffled_snapshots = [] def add_snapshot( self, @@ -61,7 +67,7 @@ def add_snapshot( input_directory, output_file, output_directory, - snapshot_type="numpy", + snapshot_type=None, ): """ Add a snapshot to the data pipeline. @@ -105,6 +111,7 @@ def __shuffle_numpy( target_save_path, permutations, file_ending, + temporary, ): # Load the data (via memmap). descriptor_data = [] @@ -112,15 +119,29 @@ def __shuffle_numpy( for idx, snapshot in enumerate( self.parameters.snapshot_directories_list ): - # TODO: Use descriptor and target calculator for this. - descriptor_data.append( - np.load( - os.path.join( - snapshot.input_npy_directory, snapshot.input_npy_file - ), - mmap_mode="r", + if snapshot.snapshot_type == "numpy": + # TODO: Use descriptor and target calculator for this. + descriptor_data.append( + np.load( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + mmap_mode="r", + ) ) - ) + elif snapshot.snapshot_type == "json+numpy": + descriptor_data.append( + np.load( + snapshot.temporary_input_file, + mmap_mode="r", + ) + ) + else: + raise Exception( + "Invalid snapshot for numpy shuffling " "selected." + ) + target_data.append( np.load( os.path.join( @@ -162,12 +183,6 @@ def __shuffle_numpy( target_data[idx] = current_target[indices] # Do the actual shuffling. - target_name_openpmd = os.path.join( - target_save_path, save_name.replace("*", "%T") - ) - descriptor_name_openpmd = os.path.join( - descriptor_save_path, save_name.replace("*", "%T") - ) for i in range(0, number_of_new_snapshots): new_descriptors = np.zeros( (int(np.prod(shuffle_dimensions)), self.input_dimension), @@ -178,12 +193,73 @@ def __shuffle_numpy( dtype=DEFAULT_NP_DATA_DTYPE, ) last_start = 0 - descriptor_name = os.path.join( - descriptor_save_path, save_name.replace("*", str(i)) - ) - target_name = os.path.join( - target_save_path, save_name.replace("*", str(i)) - ) + + # Figure out where to save / how to name things. + # TODO: This could probably be shortened. + if temporary: + + # Adding "snapshot numbers" here is technically not necessary + # I think, but it also doesn't hurt. + if file_ending == "npy": + descriptor_name = tempfile.NamedTemporaryFile( + delete=False, + prefix=save_name.replace("*", str(i)), + suffix=".in.npy", + dir=descriptor_save_path, + ).name + target_name = tempfile.NamedTemporaryFile( + delete=False, + prefix=save_name.replace("*", str(i)), + suffix=".out.npy", + dir=target_save_path, + ).name + snapshot_type = "numpy" + else: + descriptor_name = tempfile.NamedTemporaryFile( + delete=False, + prefix=save_name.replace("*", "%T"), + suffix=".in." + file_ending, + dir=descriptor_save_path, + ).name + target_name = tempfile.NamedTemporaryFile( + delete=False, + prefix=save_name.replace("*", "%T"), + suffix=".out." 
+ file_ending,
+                        dir=target_save_path,
+                    ).name
+                    snapshot_type = "openpmd"
+                self.temporary_shuffled_snapshots.append(
+                    mala.Snapshot(
+                        os.path.basename(descriptor_name),
+                        os.path.dirname(descriptor_name),
+                        os.path.basename(target_name),
+                        os.path.dirname(target_name),
+                        snapshot_function="te",
+                        output_units="None",
+                        input_units="None",
+                        calculation_output="",
+                        snapshot_type=snapshot_type,
+                    )
+                )
+            else:
+                if file_ending == "npy":
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", str(i)) + ".in.npy",
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", str(i)) + ".out.npy",
+                    )
+                else:
+                    descriptor_name = os.path.join(
+                        descriptor_save_path,
+                        save_name.replace("*", "%T") + ".in." + file_ending,
+                    )
+                    target_name = os.path.join(
+                        target_save_path,
+                        save_name.replace("*", "%T") + ".out." + file_ending,
+                    )
 
         # Each new snapshot gets an number_of_new_snapshots-th from each
         # snapshot.
@@ -228,10 +304,10 @@
             )
             if file_ending == "npy":
                 self.descriptor_calculator.write_to_numpy_file(
-                    descriptor_name + ".in.npy", new_descriptors
+                    descriptor_name, new_descriptors
                 )
                 self.target_calculator.write_to_numpy_file(
-                    target_name + ".out.npy", new_targets
+                    target_name, new_targets
                 )
             else:
                 # We check above that in the non-numpy case, OpenPMD will work.
                 self.descriptor_calculator.grid_dimensions = list(
                     shuffle_dimensions
                 )
                 self.descriptor_calculator.write_to_openpmd_file(
-                    descriptor_name_openpmd + ".in." + file_ending,
+                    descriptor_name,
                     new_descriptors,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -252,7 +328,7 @@
                     internal_iteration_number=i,
                 )
                 self.target_calculator.write_to_openpmd_file(
-                    target_name_openpmd + ".out." + file_ending,
+                    target_name,
                     array=new_targets,
                     additional_attributes={
                         "global_shuffling_seed": self.parameters.shuffling_seed,
@@ -334,9 +410,7 @@ def __init__(self):
         # XXXXXXXOOO
         # OOOOOOOOOO
-        def __contiguous_slice_within_ndim_grid_as_blocks(
-            extent, x, y
-        ):
+        def __contiguous_slice_within_ndim_grid_as_blocks(extent, x, y):
             # Used for converting a block defined by inclusive lower and upper
             # coordinate to a chunk as defined by openPMD.
             # The openPMD extent is defined by (to-from)+1 (to make up for the
@@ -426,7 +500,8 @@ def worker(inner_idx, inner_strides):
                 if not inner_strides:
                     if inner_idx != 0:
                         raise RuntimeError(
-                            "This cannot happen. There is bug somewhere.")
+                            "This cannot happen. There is a bug somewhere."
+                        )
                     else:
                         return []
                 div, mod = divmod(inner_idx, inner_strides[0])
@@ -645,6 +720,7 @@ def shuffle_snapshots(
         target_save_path=None,
         save_name="mala_shuffled_snapshot*",
         number_of_shuffled_snapshots=None,
+        shuffle_to_temporary=False,
     ):
         """
         Shuffle the snapshots into new snapshots.
...
         ----------
         complete_save_path : string
             If not None: the directory in which all snapshots will be saved.
-            Overwrites descriptor_save_path, target_save_path and
-            additional_info_save_path if set.
+            Overwrites descriptor_save_path and target_save_path if set.
 
         descriptor_save_path : string
             Directory in which to save descriptor data.
...
         number_of_shuffled_snapshots : int
             If not None, this class will attempt to redistribute the data
             to this amount of snapshots. If None, then the same number of
             snapshots provided will be used.
+
+        shuffle_to_temporary : bool
+            If True, shuffled files will be written to temporary data files.
+            The paths used are consistent with non-temporary usage of this
+            class. The paths and names of these temporary files can then be
+            found in the class attribute temporary_shuffled_snapshots.
         """
         # Check the paths.
         if complete_save_path is not None:
@@ -685,20 +766,33 @@
         file_ending = save_name.split(".")[-1]
         save_name = save_name.split(".")[0]
         if file_ending != "npy":
-            import openpmd_api as io
-
-            if file_ending not in io.file_extensions:
-                raise Exception(
-                    "Invalid file ending selected: " + file_ending
+            if shuffle_to_temporary:
+                parallel_warn(
+                    "Shuffling to temporary files currently"
+                    " only works with numpy as an engine for "
+                    "intermediate files. You have selected both"
+                    " openpmd and the temporary file option. "
+                    "Will proceed with numpy instead of "
+                    "openpmd."
                 )
+                file_ending = "npy"
+            else:
+                import openpmd_api as io
+
+                if file_ending not in io.file_extensions:
+                    raise Exception(
+                        "Invalid file ending selected: " + file_ending
+                    )
         else:
             file_ending = "npy"
 
+        old_xyz = self.descriptor_calculator.parameters.descriptors_contain_xyz
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = False
         if self.parameters._configuration["mpi"]:
             self._check_snapshots(comm=get_comm())
         else:
             self._check_snapshots()
-
+        self.descriptor_calculator.parameters.descriptors_contain_xyz = old_xyz
         snapshot_types = {
             snapshot.snapshot_type
             for snapshot in self.parameters.snapshot_directories_list
@@ -786,6 +880,20 @@
                 target_save_path,
                 permutations,
                 file_ending,
+                shuffle_to_temporary,
             )
+        elif snapshot_type == "json+numpy":
+            for snapshot in self.parameters.snapshot_directories_list:
+                self._calculate_temporary_inputs(snapshot)
+            self.__shuffle_numpy(
+                number_of_shuffled_snapshots,
+                shuffle_dimensions,
+                descriptor_save_path,
+                save_name,
+                target_save_path,
+                permutations,
+                file_ending,
+                shuffle_to_temporary,
+            )
         elif snapshot_type == "openpmd":
             descriptor = self.__DescriptorOrTarget(
@@ -823,6 +931,30 @@
         else:
             raise Exception("Unknown snapshot type: {}".format(snapshot_type))
 
+        # Deleting temporary files that may have been created.
+        self.delete_temporary_inputs()
+
         # Since no training will be done with this class, we should always
         # clear the data at the end.
         self.clear_data()
+
+    def delete_temporary_shuffled_snapshots(self):
+        """
+        Delete temporary files created during shuffling of data.
+
+        If shuffling has been done with the option "shuffle_to_temporary",
+        shuffled data will be saved to temporary files which can safely be
+        deleted with this function.
+ """ + for snapshot in self.temporary_shuffled_snapshots: + input_file = os.path.join( + snapshot.input_npy_directory, snapshot.input_npy_file + ) + if os.path.isfile(input_file): + os.remove(input_file) + output_file = os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ) + if os.path.isfile(output_file): + os.remove(output_file) + self.temporary_shuffled_snapshots = [] diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index a5a2b1a50..78d114313 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -163,6 +163,18 @@ def get_new_data(self, file_index): ), units=self._snapshot_list[file_index].output_units, ) + elif self._snapshot_list[file_index].snapshot_type == "json+numpy": + self.input_data = self._descriptor_calculator.read_from_numpy_file( + self._snapshot_list[file_index].temporary_input_file, + units=self._snapshot_list[file_index].input_units, + ) + self.output_data = self._target_calculator.read_from_numpy_file( + os.path.join( + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, + ), + units=self._snapshot_list[file_index].output_units, + ) elif self._snapshot_list[file_index].snapshot_type == "openpmd": self.input_data = ( diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 402d149de..1baef9eac 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -180,6 +180,26 @@ def allocate_shared_mem(self): read_dtype=True, ) ) + elif self.snapshot.snapshot_type == "json+numpy": + self.input_shape = ( + self.descriptor_calculator.read_dimensions_from_json( + os.path.join( + self.snapshot.input_npy_directory, + self.snapshot.input_npy_file, + ), + ) + ) + self.input_dtype = np.dtype(np.float32) + + self.output_shape, self.output_dtype = ( + self.target_calculator.read_dimensions_from_numpy_file( + os.path.join( + self.snapshot.output_npy_directory, + self.snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index a9aca6afc..35b02601e 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -192,6 +192,24 @@ def load_snapshot_to_shm( read_dtype=True, ) ) + elif snapshot.snapshot_type == "json+numpy": + input_shape = descriptor_calculator.read_dimensions_from_json( + os.path.join( + snapshot.input_npy_directory, + snapshot.input_npy_file, + ), + ) + input_dtype = np.dtype(np.float32) + + output_shape, output_dtype = ( + target_calculator.read_dimensions_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, + snapshot.output_npy_file, + ), + read_dtype=True, + ) + ) else: raise Exception("Invalid snapshot type selected.") @@ -219,7 +237,22 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) - else: + + elif snapshot.snapshot_type == "json+numpy": + descriptor_calculator.read_from_numpy_file( + snapshot.temporary_input_file, + units=snapshot.input_units, + array=input_data, + ) + target_calculator.read_from_numpy_file( + os.path.join( + snapshot.output_npy_directory, snapshot.output_npy_file + ), + units=snapshot.output_units, + array=output_data, + ) + + elif snapshot.snapshot_type == "openpmd": descriptor_calculator.read_from_openpmd_file( 
os.path.join( snapshot.input_npy_directory, snapshot.input_npy_file @@ -234,6 +267,8 @@ def load_snapshot_to_shm( units=snapshot.output_units, array=output_data, ) + else: + raise Exception("Invalid snapshot type selected.") # This function only loads the numpy data with scaling. Remaining data # preprocessing occurs in __getitem__ of LazyLoadDatasetSingle diff --git a/mala/datahandling/snapshot.py b/mala/datahandling/snapshot.py index 1bac8488c..e44728352 100644 --- a/mala/datahandling/snapshot.py +++ b/mala/datahandling/snapshot.py @@ -133,6 +133,9 @@ def __init__( self.input_dimension = None self.output_dimension = None + # Temporary descriptor files, which may be needed. + self.temporary_input_file = None + @classmethod def from_json(cls, json_dict): """ diff --git a/mala/descriptors/atomic_density.py b/mala/descriptors/atomic_density.py index 4459c838b..6a0527479 100755 --- a/mala/descriptors/atomic_density.py +++ b/mala/descriptors/atomic_density.py @@ -119,6 +119,10 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + # For now, has to be adapted in the multielement case. + return 4 + def __calculate_lammps(self, outdir, **kwargs): """Perform actual Gaussian descriptor calculation.""" # For version compatibility; older lammps versions (the serial version diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index ab8bbff7f..8932c61be 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -120,6 +120,24 @@ def _calculate(self, outdir, **kwargs): else: return self.__calculate_python(**kwargs) + def _read_feature_dimension_from_json(self, json_dict): + if self.parameters.descriptors_contain_xyz: + return self.__get_feature_size() - 3 + else: + return self.__get_feature_size() + + def __get_feature_size(self): + ncols0 = 3 + + # Analytical relation for fingerprint length + ncoeff = ( + (self.parameters.bispectrum_twojmax + 2) + * (self.parameters.bispectrum_twojmax + 3) + * (self.parameters.bispectrum_twojmax + 4) + ) + ncoeff = ncoeff // 24 # integer division + return ncols0 + ncoeff + def __calculate_lammps(self, outdir, **kwargs): """ Perform bispectrum calculation using LAMMPS. @@ -173,19 +191,7 @@ def __calculate_lammps(self, outdir, **kwargs): # Do the LAMMPS calculation and clean up. lmp.file(self.parameters.lammps_compute_file) - - # Set things not accessible from LAMMPS - # First 3 cols are x, y, z, coords - ncols0 = 3 - - # Analytical relation for fingerprint length - ncoeff = ( - (self.parameters.bispectrum_twojmax + 2) - * (self.parameters.bispectrum_twojmax + 3) - * (self.parameters.bispectrum_twojmax + 4) - ) - ncoeff = ncoeff // 24 # integer division - self.feature_size = ncols0 + ncoeff + self.feature_size = self.__get_feature_size() # Extract data from LAMMPS calculation. # This is different for the parallel and the serial case. 
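
Note: taken together, the snapshot-type guessing above and the json interface below enable training directly from the ``.info.json`` files written by DataConverter. The following is an illustrative sketch, not part of the patch; ``data_path``, the file names, and the bispectrum hyperparameters are assumptions.

    import mala

    data_path = "./"  # assumed location of DataConverter output

    parameters = mala.Parameters()
    parameters.descriptors.descriptor_type = "Bispectrum"
    parameters.descriptors.bispectrum_twojmax = 10  # placeholder value
    parameters.descriptors.bispectrum_cutoff = 4.67637  # placeholder value

    data_handler = mala.DataHandler(parameters)

    # With snapshot_type=None (the new default), the type is guessed from the
    # file endings: a .json input plus a .npy output resolves to "json+numpy",
    # so descriptors are computed on the fly into a temporary *.in.npy file.
    data_handler.add_snapshot(
        "Be_snapshot0.info.json", data_path, "Be_snapshot0.out.npy", data_path, "tr"
    )
    data_handler.add_snapshot(
        "Be_snapshot1.info.json", data_path, "Be_snapshot1.out.npy", data_path, "va"
    )
    data_handler.prepare_data()

    # Temporary descriptor files are removed automatically by Trainer/Tester,
    # or manually:
    data_handler.delete_temporary_inputs()
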
diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py
index 041dd4b3f..e6fec1d35 100644
--- a/mala/descriptors/descriptor.py
+++ b/mala/descriptors/descriptor.py
@@ -1,11 +1,12 @@
 """Base class for all descriptor calculators."""
 
 from abc import abstractmethod
-from functools import cached_property
+import json
 import os
 import tempfile
 
 import ase
+from ase.cell import Cell
 from ase.units import m
 from ase.neighborlist import NeighborList, NewPrimitiveNeighborList
 import numpy as np
@@ -375,6 +376,50 @@
 
         return self._calculate(working_directory, **kwargs)
 
+    def calculate_from_json(self, json_file, working_directory=".", **kwargs):
+        """
+        Calculate the descriptors based on a MALA-generated json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Name of MALA json output file for snapshot.
+
+        working_directory : string
+            A directory in which to write the output of the LAMMPS calculation.
+            Usually the local directory should suffice, provided that no
+            other instances are running in the same directory.
+
+        kwargs : dict
+            A collection of keyword arguments that are mainly used for
+            debugging and development. Different types of descriptors
+            may support different keyword arguments. Commonly supported
+            are
+
+            - "use_fp64": To enforce floating point 64 precision for
+              descriptors.
+            - "keep_logs": To not delete temporary files created during
+              LAMMPS calculation of descriptors.
+
+        Returns
+        -------
+        descriptors : numpy.array
+            Numpy array containing the descriptors with the dimension
+            (x,y,z,descriptor_dimension)
+
+        """
+        if isinstance(json_file, str):
+            json_dict = json.load(open(json_file, encoding="utf-8"))
+        else:
+            json_dict = json.load(json_file)
+        self.grid_dimensions = json_dict["grid_dimensions"]
+        self._atoms = ase.Atoms.fromdict(json_dict["atoms"])
+        self._voxel = Cell(json_dict["voxel"]["array"])
+        return self._calculate(working_directory, **kwargs)
+
     def calculate_from_atoms(
         self, atoms, grid_dimensions, working_directory=".", **kwargs
     ):
@@ -573,6 +618,35 @@
         ).transpose([2, 1, 0, 3])
         return descriptors_full, local_offset, local_reach
 
+    def read_dimensions_from_json(self, json_file):
+        """
+        Read only the descriptor dimensions from a json file.
+
+        These json files are generated by the MALA DataConverter class
+        and bundle information about a DFT simulation.
+
+        Parameters
+        ----------
+        json_file : string
+            Path to the json file.
+
+        Returns
+        -------
+        dimension_info : list
+            A list containing the dimensions of the data described by the
+            json file, i.e., the grid dimensions plus the feature dimension.
+ """ + if isinstance(json_file, str): + json_dict = json.load(open(json_file, encoding="utf-8")) + else: + json_dict = json.load(json_file) + grid_dimensions = json_dict["grid_dimensions"] + [ + self._read_feature_dimension_from_json(json_dict) + ] + return grid_dimensions + # Private methods ################# @@ -1021,5 +1095,9 @@ def _grid_to_coord(self, gridpoint): def _calculate(self, outdir, **kwargs): pass + @abstractmethod + def _read_feature_dimension_from_json(self, json_dict): + pass + def _set_feature_size_from_array(self, array): self.feature_size = np.shape(array)[-1] diff --git a/mala/descriptors/minterpy_descriptors.py b/mala/descriptors/minterpy_descriptors.py index 2d9d52168..cfedf1135 100755 --- a/mala/descriptors/minterpy_descriptors.py +++ b/mala/descriptors/minterpy_descriptors.py @@ -87,6 +87,12 @@ def backconvert_units(array, out_units): else: raise Exception("Unsupported unit for Minterpy descriptors.") + def _read_feature_dimension_from_json(self, json_dict): + raise Exception( + "This feature has not been implemented for Minterpy " + "descriptors." + ) + def _calculate(self, atoms, outdir, grid_dimensions, **kwargs): # For version compatibility; older lammps versions (the serial version # we still use on some machines) have these constants as part of the diff --git a/mala/network/tester.py b/mala/network/tester.py index d7c07761a..b8ee1c70e 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -110,6 +110,8 @@ def test_all_snapshots(self): for observable in self.observables_to_test: results[observable].append(snapshot_result[observable]) + self.data.delete_temporary_inputs() + if self.output_format == "list": return results diff --git a/mala/network/trainer.py b/mala/network/trainer.py index ccd0ab70c..9ff5ad126 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -597,6 +597,9 @@ def train_network(self): ) self.final_validation_loss = vloss + # Cleaning up temporary data files. + self.data.delete_temporary_inputs() + # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: self._training_data_loaders.cleanup() diff --git a/mala/targets/target.py b/mala/targets/target.py index 10a414c6c..17eb5a4d3 100644 --- a/mala/targets/target.py +++ b/mala/targets/target.py @@ -653,7 +653,7 @@ def read_additional_calculation_data(self, data, data_type=None): } self.atomic_forces_dft = None self.entropy_contribution_dft_calculation = None - self.grid_dimensions = [0, 0, 0] + self.grid_dimensions = json_dict["grid_dimensions"] self.atoms = None for key in json_dict: diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py index 6a7956656..5288afa81 100644 --- a/test/complete_interfaces_test.py +++ b/test/complete_interfaces_test.py @@ -108,15 +108,15 @@ def test_convert_numpy_openpmd(self): target_input_path=os.path.join( data_path, "Be_snapshot{}.out.npy".format(snapshot) ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -135,15 +135,15 @@ def test_convert_numpy_openpmd(self): target_input_path="converted_from_numpy_{}.out.h5".format( snapshot ), - additional_info_input_type=None, - additional_info_input_path=None, + simulation_output_type=None, + simulation_output_path=None, target_units=None, ) data_converter.convert_snapshots( descriptor_save_path="./", target_save_path="./", - additional_info_save_path="./", + simulation_output_save_path="./", naming_scheme="verify_against_original_numpy_data_*.npy", descriptor_calculation_kwargs={"working_directory": "./"}, )
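
For completeness, a sketch of how the new temporary shuffling path is meant to be used. This is illustrative only; ``data_path`` and the file names are assumptions, and the snapshot types are inferred via the new file-ending guessing.

    import mala

    data_path = "./"  # assumed location of precomputed snapshot data

    parameters = mala.Parameters()
    data_shuffler = mala.DataShuffler(parameters)

    # snapshot_type=None (the new default) guesses the type from the file
    # endings, so *.in.npy/*.out.npy pairs resolve to "numpy" snapshots here.
    data_shuffler.add_snapshot(
        "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path
    )
    data_shuffler.add_snapshot(
        "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path
    )

    # Shuffle into anonymous temporary files instead of permanently named ones.
    data_shuffler.shuffle_snapshots(
        complete_save_path="./",
        save_name="Be_shuffled*",
        shuffle_to_temporary=True,
    )

    # The shuffled data is tracked as Snapshot objects with placeholder
    # metadata fields ...
    for snapshot in data_shuffler.temporary_shuffled_snapshots:
        print(snapshot.input_npy_file, snapshot.output_npy_file)

    # ... and can be cleaned up once it is no longer needed.
    data_shuffler.delete_temporary_shuffled_snapshots()
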