diff --git a/docs/source/advanced_usage/predictions.rst b/docs/source/advanced_usage/predictions.rst
index a16ece7bd..3246526c8 100644
--- a/docs/source/advanced_usage/predictions.rst
+++ b/docs/source/advanced_usage/predictions.rst
@@ -81,11 +81,13 @@ Gaussian representation of atomic positions.
 In this algorithm, most of the
 computational overhead of the total energy calculation is offloaded to the
 computation of this Gaussian representation. This calculation is realized via
 LAMMPS and can therefore be GPU accelerated (parallelized) in the same fashion
-as the bispectrum descriptor calculation. Simply activate this option via
+as the bispectrum descriptor calculation. If a GPU is activated (and LAMMPS
+is available), this option will be used by default. It can also be activated
+manually via

 .. code-block:: python

-    parameters.descriptors.use_atomic_density_energy_formula = True
+    parameters.use_atomic_density_formula = True

 The Gaussian representation algorithm is described in the publication
 `Predicting electronic structures at any length scale with machine learning `_.
diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst
index 290aa15f3..9b118d86b 100644
--- a/docs/source/advanced_usage/trainingmodel.rst
+++ b/docs/source/advanced_usage/trainingmodel.rst
@@ -194,22 +194,64 @@ keyword, you can fine-tune the number of new snapshots being created. By
 default, the same number of snapshots as had been provided will be created
 (if possible).

-Using tensorboard
-******************
+Logging metrics during training
+*******************************
+
+Training progress in MALA can be visualized via tensorboard or wandb, as also shown
+in the file ``advanced/ex03_tensor_board``. Simply select a logger prior to training via
+
+  .. code-block:: python
+
+    parameters.running.logger = "tensorboard"
+    parameters.running.logging_dir = "mala_vis"
+
-Training routines in MALA can be visualized via tensorboard, as also shown
-in the file ``advanced/ex03_tensor_board``. Simply enable tensorboard
-visualization prior to training via
+or

 .. code-block:: python

-    # 0: No visualizatuon, 1: loss and learning rate, 2: like 1,
-    # but additionally weights and biases are saved
-    parameters.running.logging = 1
+    import wandb
+    wandb.init(
+        project="mala_training",
+        entity="your_wandb_entity"
+    )
+    parameters.running.logger = "wandb"

     parameters.running.logging_dir = "mala_vis"

 where ``logging_dir`` specifies some directory in which to save the
-MALA logging data. Afterwards, you can run the training without any
+MALA logging data. You can also select which metrics to record via
+
+  .. code-block:: python
+
+    parameters.validation_metrics = ["ldos", "dos", "density", "total_energy"]
+
+Full list of available metrics:
+    - "ldos": MSE of the LDOS.
+    - "band_energy": Band energy.
+    - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+    - "total_energy": Total energy.
+    - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+    - "fermi_energy": Fermi energy.
+    - "density": Electron density.
+    - "density_relative": Electron density (Mean Absolute Percentage Error).
+    - "dos": Density of states.
+    - "dos_relative": Density of states (Mean Absolute Percentage Error).
+
+To save time and resources you can specify the logging interval via
+
+  .. code-block:: python
+
+    parameters.running.validate_every_n_epochs = 10
+
+If you want to monitor the degree to which the model overfits to the training data,
+you can use the option
+
+  .. code-block:: python
+
+    parameters.running.validate_on_training_data = True
+
+MALA will evaluate the validation metrics on the training set as well as the validation set.
+
+Afterwards, you can run the training without any
 other modifications. Once training
 is finished (or during training, in case you want to use tensorboard to monitor progress), you can
 launch tensorboard via
@@ -221,6 +263,7 @@ via
 The full path for ``path_to_log_directory`` can be accessed via
 ``trainer.full_logging_path``.

+If you're using wandb, you can monitor the training progress on the wandb website.

 Training in parallel
 ********************
diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst
index e6bc8c967..53cb8a8df 100644
--- a/docs/source/basic_usage/trainingmodel.rst
+++ b/docs/source/basic_usage/trainingmodel.rst
@@ -28,7 +28,7 @@ options to train a simple network with example data, namely

     parameters = mala.Parameters()
     parameters.data.input_rescaling_type = "feature-wise-standard"
-    parameters.data.output_rescaling_type = "normal"
+    parameters.data.output_rescaling_type = "minmax"

     parameters.network.layer_activations = ["ReLU"]
@@ -43,15 +43,18 @@ sub-objects dealing with the individual aspects of the workflow. In the first
 two lines, we specify which data scaling MALA should employ. Scaling data greatly
 improves the performance of NN based ML models. Options are

-* ``None``: No normalization is applied.
+* ``None``: No scaling is applied.

-* ``standard``: Standardization (Scale to mean 0, standard deviation 1)
+* ``standard``: Standardization (Scale to mean 0, standard deviation 1) is
+  applied to the entire array.

-* ``normal``: Min-Max scaling (Scale to be in range 0...1)
+* ``minmax``: Min-Max scaling (Scale to be in range 0...1) is applied to the
+  entire array.

-* ``feature-wise-standard``: Row Standardization (Scale to mean 0, standard deviation 1)
+* ``feature-wise-standard``: Standardization (Scale to mean 0, standard
+  deviation 1) is applied to each feature dimension individually.

-* ``feature-wise-normal``: Row Min-Max scaling (Scale to be in range 0...1)
+* ``feature-wise-minmax``: Min-Max scaling (Scale to be in range 0...1) is
+  applied to each feature dimension individually.
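To make the difference between the global and feature-wise variants above concrete, the
following small sketch shows the arithmetic behind ``minmax`` and ``feature-wise-standard``.
It is purely illustrative: it uses plain numpy rather than MALA's internal ``DataScaler``,
and the array values are made up.

  .. code-block:: python

    import numpy as np

    # Toy training data with d=4 samples and f=2 features.
    data = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]])

    # "feature-wise-standard": each of the f columns is scaled to
    # mean 0 and standard deviation 1 individually.
    feature_wise_standard = (data - data.mean(axis=0)) / data.std(axis=0)

    # "minmax": the entire array is scaled to the range 0...1 at once.
    minmax = (data - data.min()) / (data.max() - data.min())

    print(feature_wise_standard)
    print(minmax)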
Here, we specify that MALA should standardize the input (=descriptors) by feature (i.e., each entry of the vector separately on the grid) and diff --git a/docs/source/conf.py b/docs/source/conf.py index 1225852c5..7c205cba0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,6 @@ "scipy", "oapackage", "matplotlib", - "horovod", "lammps", "total_energy", "pqkmeans", diff --git a/examples/advanced/ex01_checkpoint_training.py b/examples/advanced/ex01_checkpoint_training.py index 5222a5232..af8ee5687 100644 --- a/examples/advanced/ex01_checkpoint_training.py +++ b/examples/advanced/ex01_checkpoint_training.py @@ -21,7 +21,7 @@ def initial_setup(): parameters = mala.Parameters() parameters.data.data_splitting_type = "by_snapshot" parameters.data.input_rescaling_type = "feature-wise-standard" - parameters.data.output_rescaling_type = "normal" + parameters.data.output_rescaling_type = "minmax" parameters.network.layer_activations = ["ReLU"] parameters.running.max_number_epochs = 9 parameters.running.mini_batch_size = 8 diff --git a/examples/advanced/ex03_tensor_board.py b/examples/advanced/ex03_tensor_board.py index 97bc781cf..cf1e884a7 100644 --- a/examples/advanced/ex03_tensor_board.py +++ b/examples/advanced/ex03_tensor_board.py @@ -13,7 +13,7 @@ parameters = mala.Parameters() parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" +parameters.data.output_rescaling_type = "minmax" parameters.targets.ldos_gridsize = 11 parameters.targets.ldos_gridspacing_ev = 2.5 parameters.targets.ldos_gridoffset_ev = -5 @@ -32,11 +32,19 @@ data_handler = mala.DataHandler(parameters) data_handler.add_snapshot( - "Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr", + "Be_snapshot0.in.npy", + data_path, + "Be_snapshot0.out.npy", + data_path, + "tr", calculation_output_file=os.path.join(data_path, "Be_snapshot0.out"), ) data_handler.add_snapshot( - "Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va", + "Be_snapshot1.in.npy", + data_path, + "Be_snapshot1.out.npy", + data_path, + "va", calculation_output_file=os.path.join(data_path, "Be_snapshot1.out"), ) data_handler.prepare_data() diff --git a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py index 99a92fa35..7680c7a91 100644 --- a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py +++ b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py @@ -17,7 +17,7 @@ def initial_setup(): parameters = mala.Parameters() parameters.data.input_rescaling_type = "feature-wise-standard" - parameters.data.output_rescaling_type = "normal" + parameters.data.output_rescaling_type = "minmax" parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 diff --git a/examples/advanced/ex06_distributed_hyperparameter_optimization.py b/examples/advanced/ex06_distributed_hyperparameter_optimization.py index 215dd1ab2..4a6e42f9b 100644 --- a/examples/advanced/ex06_distributed_hyperparameter_optimization.py +++ b/examples/advanced/ex06_distributed_hyperparameter_optimization.py @@ -24,7 +24,7 @@ parameters = mala.Parameters() # Specify the data scaling. 
parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" +parameters.data.output_rescaling_type = "minmax" parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 diff --git a/examples/advanced/ex07_advanced_hyperparameter_optimization.py b/examples/advanced/ex07_advanced_hyperparameter_optimization.py index 242ffd7dd..0072ed3a0 100644 --- a/examples/advanced/ex07_advanced_hyperparameter_optimization.py +++ b/examples/advanced/ex07_advanced_hyperparameter_optimization.py @@ -17,7 +17,7 @@ def optimize_hyperparameters(hyper_optimizer): parameters = mala.Parameters() parameters.data.input_rescaling_type = "feature-wise-standard" - parameters.data.output_rescaling_type = "normal" + parameters.data.output_rescaling_type = "minmax" parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 diff --git a/examples/advanced/ex10_convert_numpy_openpmd.py b/examples/advanced/ex10_convert_numpy_openpmd.py index 45369ff89..7ebc22daa 100644 --- a/examples/advanced/ex10_convert_numpy_openpmd.py +++ b/examples/advanced/ex10_convert_numpy_openpmd.py @@ -29,7 +29,7 @@ descriptor_save_path="./", target_save_path="./", additional_info_save_path="./", - naming_scheme="converted_from_numpy_*.bp5", + naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -40,11 +40,9 @@ for snapshot in range(2): data_converter.add_snapshot( descriptor_input_type="openpmd", - descriptor_input_path="converted_from_numpy_{}.in.bp5".format( - snapshot - ), + descriptor_input_path="converted_from_numpy_{}.in.h5".format(snapshot), target_input_type="openpmd", - target_input_path="converted_from_numpy_{}.out.bp5".format(snapshot), + target_input_path="converted_from_numpy_{}.out.h5".format(snapshot), additional_info_input_type=None, additional_info_input_path=None, target_units=None, diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index 1eca8c6b7..c7a5ca782 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -20,7 +20,7 @@ # Specify the data scaling. For regular bispectrum and LDOS data, # these have proven successful. parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" +parameters.data.output_rescaling_type = "minmax" # Specify the used activation function. parameters.network.layer_activations = ["ReLU"] # Specify the training parameters. 
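The example scripts above all switch from the old ``"normal"`` scaling name to ``"minmax"``.
A minimal sketch of the updated setup is shown below; it assumes a working MALA installation,
and, according to the updated parameter documentation further down, the old names remain
available as deprecated aliases, so existing scripts should keep running.

  .. code-block:: python

    import mala

    parameters = mala.Parameters()
    # Renamed option: "minmax" replaces the deprecated "normal".
    parameters.data.input_rescaling_type = "feature-wise-standard"
    parameters.data.output_rescaling_type = "minmax"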
diff --git a/examples/basic/ex04_hyperparameter_optimization.py b/examples/basic/ex04_hyperparameter_optimization.py index cebb4c42e..3160206c3 100644 --- a/examples/basic/ex04_hyperparameter_optimization.py +++ b/examples/basic/ex04_hyperparameter_optimization.py @@ -19,7 +19,7 @@ #################### parameters = mala.Parameters() parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" +parameters.data.output_rescaling_type = "minmax" parameters.running.max_number_epochs = 20 parameters.running.mini_batch_size = 40 parameters.running.optimizer = "Adam" diff --git a/mala/__init__.py b/mala/__init__.py index 6646077b5..5c578bf3b 100644 --- a/mala/__init__.py +++ b/mala/__init__.py @@ -40,9 +40,10 @@ HyperparameterOAT, HyperparameterNASWOT, HyperparameterOptuna, - HyperparameterACSD, + HyperparameterDescriptorScoring, ACSDAnalyzer, Runner, + MutualInformationAnalyzer, ) from .targets import LDOS, DOS, Density, fermi_function, AtomicForce, Target from .interfaces import MALA diff --git a/mala/common/parallelizer.py b/mala/common/parallelizer.py index 160695a42..e59b8a984 100644 --- a/mala/common/parallelizer.py +++ b/mala/common/parallelizer.py @@ -5,7 +5,6 @@ import os import warnings -import torch import torch.distributed as dist use_ddp = False @@ -154,6 +153,11 @@ def get_local_rank(): LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Returns + ------- + local_rank : int + The local rank of the current thread. """ if use_ddp: return int(os.environ.get("LOCAL_RANK")) @@ -189,7 +193,6 @@ def get_size(): return comm.Get_size() -# TODO: This is hacky, improve it. def get_comm(): """ Return the MPI communicator, if MPI is being used. @@ -197,7 +200,7 @@ def get_comm(): Returns ------- comm : MPI.COMM_WORLD - A MPI communicator. + An MPI communicator. """ return comm @@ -221,7 +224,7 @@ def printout(*values, sep=" ", min_verbosity=0): Parameters ---------- - values + values : object Values to be printed. sep : string @@ -245,7 +248,7 @@ def parallel_warn(warning, min_verbosity=0, category=UserWarning): Parameters ---------- - warning + warning : str Warning to be printed. min_verbosity : int Minimum number of verbosity for this output to still be printed. diff --git a/mala/common/parameters.py b/mala/common/parameters.py index be8484626..1d2ba9d96 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -40,6 +40,7 @@ def __init__( "openpmd_configuration": {}, "openpmd_granularity": 1, "lammps": True, + "atomic_density_formula": False, } pass @@ -88,6 +89,11 @@ def _update_openpmd_granularity(self, new_granularity): def _update_lammps(self, new_lammps): self._configuration["lammps"] = new_lammps + def _update_atomic_density_formula(self, new_atomic_density_formula): + self._configuration["atomic_density_formula"] = ( + new_atomic_density_formula + ) + @staticmethod def _member_to_json(member): if isinstance(member, (int, float, type(None), str)): @@ -163,7 +169,7 @@ def _json_to_member(json_value): @classmethod def from_json(cls, json_dict): """ - Read this object from a dictionary saved in a JSON file. + Read parameters from a dictionary saved in a JSON file. 
Parameters ---------- @@ -270,6 +276,10 @@ class ParametersNetwork(ParametersBase): Number of heads to be used in Multi head attention network This should be a divisor of input dimension Default: None + + dropout : float + Dropout rate for positional encoding in transformer. + Default: 0.1 """ def __init__(self): @@ -279,13 +289,13 @@ def __init__(self): self.layer_activations = ["Sigmoid"] self.loss_function_type = "mse" - # for LSTM/Gru + Transformer - self.num_hidden_layers = 1 - # for LSTM/Gru self.no_hidden_state = False self.bidirection = False + # for LSTM/Gru + Transformer + self.num_hidden_layers = 1 + # for transformer net self.dropout = 0.1 self.num_heads = 10 @@ -310,23 +320,52 @@ class ParametersDescriptors(ParametersBase): bispectrum descriptors. Default value for jmax is 5, so default value for twojmax is 10. - lammps_compute_file : string - Bispectrum calculation: LAMMPS input file that is used to calculate the - Bispectrum descriptors. If this string is empty, the standard LAMMPS input - file found in this repository will be used (recommended). - descriptors_contain_xyz : bool Legacy option. If True, it is assumed that the first three entries of the descriptor vector are the xyz coordinates and they are cut from the descriptor vector. If False, no such cutting is peformed. atomic_density_sigma : float - Sigma used for the calculation of the Gaussian descriptors. - - use_atomic_density_energy_formula : bool - If True, Gaussian descriptors will be calculated for the - calculation of the Ewald sum as part of the total energy module. - Default is False. + Sigma (=width) used for the calculation of the Gaussian descriptors. + Explicitly setting this value is discouraged if the atomic density is + used only during the total energy calculation and, e.g., bispectrum + descriptors are used for models. In this case, the width will + automatically be set correctly during inference based on model + parameters. This parameter mainly exists for debugging purposes. + If the atomic density is instead used for model training itself, this + parameter needs to be set. + + atomic_density_cutoff : float + Cutoff radius used for atomic density calculation. Explicitly setting + this value is discouraged if the atomic density is used only during the + total energy calculation and, e.g., bispectrum descriptors are used + for models. In this case, the cutoff will automatically be set + correctly during inference based on model parameters. This parameter + mainly exists for debugging purposes. If the atomic density is instead + used for model training itself, this parameter needs to be set. + + lammps_compute_file : str + Path to a LAMMPS compute file for the bispectrum descriptor + calculation. MALA has its own collection of compute files which are + used by default. Setting this parameter is thus not necessarys for + model training and inference, and it exists mainly for debugging + purposes. + + minterpy_cutoff_cube_size : float + WILL BE DEPRECATED IN MALA v1.4.0 - size of cube for minterpy + descriptor calculation. + + minterpy_lp_norm : int + WILL BE DEPRECATED IN MALA v1.4.0 - LP norm for minterpy + descriptor calculation. + + minterpy_point_list : list + WILL BE DEPRECATED IN MALA v1.4.0 - list of points for minterpy + descriptor calculation. + + minterpy_polynomial_degree : int + WILL BE DEPRECATED IN MALA v1.4.0 - polynomial degree for minterpy + descriptor calculation. """ def __init__(self): @@ -356,7 +395,6 @@ def __init__(self): # atomic density may be used at the same time, if e.g. 
bispectrum
         # descriptors are used for a full inference, which then uses the atomic
         # density for the calculation of the Ewald sum.
-        self.use_atomic_density_energy_formula = False
         self.atomic_density_sigma = None
         self.atomic_density_cutoff = None
@@ -556,11 +594,6 @@ class ParametersData(ParametersBase):
     Attributes
     ----------
-    descriptors_contain_xyz : bool
-        Legacy option. If True, it is assumed that the first three entries of
-        the descriptor vector are the xyz coordinates and they are cut from the
-        descriptor vector. If False, no such cutting is peformed.
-
     snapshot_directories_list : list
         A list of all added snapshots.
@@ -573,27 +606,45 @@ class ParametersData(ParametersBase):
         Specifies how input quantities are normalized.
         Options:

-            - "None": No normalization is applied.
-            - "standard": Standardization (Scale to mean 0, standard
-              deviation 1)
-            - "normal": Min-Max scaling (Scale to be in range 0...1)
-            - "feature-wise-standard": Row Standardization (Scale to mean 0,
-              standard deviation 1)
-            - "feature-wise-normal": Row Min-Max scaling (Scale to be in range
-              0...1)
+            - "None": No scaling is applied.
+            - "standard": Standardization (Scale to mean 0,
+              standard deviation 1) is applied to the entire array.
+            - "minmax": Min-Max scaling (Scale to be in range 0...1) is applied
+              to the entire array.
+            - "feature-wise-standard": Standardization (Scale to mean 0,
+              standard deviation 1) is applied to each feature dimension
+              individually.
+              I.e., if your training data has dimensions (d,f), then each
+              of the f columns with d entries is scaled individually.
+            - "feature-wise-minmax": Min-Max scaling (Scale to be in range
+              0...1) is applied to each feature dimension individually.
+              I.e., if your training data has dimensions (d,f), then each
+              of the f columns with d entries is scaled individually.
+            - "normal": (DEPRECATED) Old name for "minmax".
+            - "feature-wise-normal": (DEPRECATED) Old name for
+              "feature-wise-minmax"

     output_rescaling_type : string
         Specifies how output quantities are normalized.
         Options:

-            - "None": No normalization is applied.
+            - "None": No scaling is applied.
             - "standard": Standardization (Scale to mean 0,
-              standard deviation 1)
-            - "normal": Min-Max scaling (Scale to be in range 0...1)
-            - "feature-wise-standard": Row Standardization (Scale to mean 0,
-              standard deviation 1)
-            - "feature-wise-normal": Row Min-Max scaling (Scale to be in
-              range 0...1)
+              standard deviation 1) is applied to the entire array.
+            - "minmax": Min-Max scaling (Scale to be in range 0...1) is applied
+              to the entire array.
+            - "feature-wise-standard": Standardization (Scale to mean 0,
+              standard deviation 1) is applied to each feature dimension
+              individually.
+              I.e., if your training data has dimensions (d,f), then each
+              of the f columns with d entries is scaled individually.
+            - "feature-wise-minmax": Min-Max scaling (Scale to be in range
+              0...1) is applied to each feature dimension individually.
+              I.e., if your training data has dimensions (d,f), then each
+              of the f columns with d entries is scaled individually.
+            - "normal": (DEPRECATED) Old name for "minmax".
+            - "feature-wise-normal": (DEPRECATED) Old name for
+              "feature-wise-minmax"

     use_lazy_loading : bool
         If True, data is lazily loaded, i.e. only the snapshots that are
@@ -693,13 +744,16 @@ class ParametersRunning(ParametersBase):
         a "by snapshot" basis.

     checkpoints_each_epoch : int
-        If not 0, checkpoint files will be saved after eac
+        If not 0, checkpoint files will be saved after each
         checkpoints_each_epoch epoch.
checkpoint_name : string
         Name used for the checkpoints. Using this, multiple runs
         can be performed in the same directory.

+    run_name : string
+        Name of the run used for logging.
+
     logging_dir : string
         Name of the folder that logging files will be saved to.
@@ -708,12 +762,33 @@
         in a subfolder of logging_dir labelled with the starting date
         of the logging, to avoid having to change input scripts often.

-    inference_data_grid : list
-        List holding the grid to be used for inference in the form of
-        [x,y,z].
+    logger : string
+        Name of the logger to be used.
+        Currently supported are:

-    use_mixed_precision : bool
-        If True, mixed precision computation (via AMP) will be used.
+            - "tensorboard": Tensorboard logger.
+            - "wandb": Weights and Biases logger.
+
+    validation_metrics : list
+        List of metrics to be used for validation. Default is ["ldos"].
+        Possible options are:
+
+            - "ldos": MSE of the LDOS.
+            - "band_energy": Band energy.
+            - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+            - "total_energy": Total energy.
+            - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+            - "fermi_energy": Fermi energy.
+            - "density": Electron density.
+            - "density_relative": Electron density (MAPE).
+            - "dos": Density of states.
+            - "dos_relative": Density of states (MAPE).
+
+    validate_on_training_data : bool
+        Whether to validate on the training data as well. Default is False.
+
+    validate_every_n_epochs : int
+        Determines how often validation is performed. Default is 1.

     training_log_interval : int
         Determines how often detailed performance info is printed during
@@ -723,23 +798,36 @@
         List with two entries determining with which batch/iteration number
         the CUDA profiler will start and stop profiling. Please note that
         this option only holds significance if the nsys profiler is used.
+
+    inference_data_grid : list
+        Grid dimensions used during inference. Typically, these are automatically
+        determined by DFT reference data, and this parameter does not need to
+        be set. Thus, this parameter mainly exists for debugging purposes.
+
+    use_mixed_precision : bool
+        If True, mixed precision computation (via AMP) will be used.
+
+    l2_regularization : float
+        Weight decay rate for NN optimizer.
+
+    dropout : float
+        Dropout rate for positional encoding in transformer net.
""" def __init__(self): super(ParametersRunning, self).__init__() self.optimizer = "Adam" self.learning_rate = 10 ** (-5) - self.learning_rate_embedding = 10 ** (-4) + # self.learning_rate_embedding = 10 ** (-4) self.max_number_epochs = 100 - self.verbosity = True self.mini_batch_size = 10 - self.snapshots_per_epoch = -1 + # self.snapshots_per_epoch = -1 - self.l1_regularization = 0.0 + # self.l1_regularization = 0.0 self.l2_regularization = 0.0 self.dropout = 0.0 - self.batch_norm = False - self.input_noise = 0.0 + # self.batch_norm = False + # self.input_noise = 0.0 self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 @@ -748,11 +836,11 @@ def __init__(self): self.learning_rate_patience = 0 self._during_training_metric = "ldos" self._after_training_metric = "ldos" - self.use_compression = False + # self.use_compression = False self.num_workers = 0 self.use_shuffling_for_samplers = True self.checkpoints_each_epoch = 0 - self.checkpoint_best_so_far = False + # self.checkpoint_best_so_far = False self.checkpoint_name = "checkpoint_mala" self.run_name = "" self.logging_dir = "./mala_logging" @@ -980,6 +1068,15 @@ class ParametersHyperparameterOptimization(ParametersBase): not recommended because it is file based and can lead to errors; With a suitable timeout it can be used somewhat stable though and help in HPC settings. + + acsd_points : int + Parameter of the ACSD HyperparamterOptimization scheme. Controls + the number of point-pairs which are used to compute the ACSD. + An array of acsd_points*acsd_points will be computed, i.e., if + acsd_points=100, 100 points will be drawn at random, and thereafter + each of these 100 points will be compared with a new, random set + of 100 points, leading to 10000 points in total for the calculation + of the ACSD. """ def __init__(self): @@ -1004,6 +1101,7 @@ def __init__(self): # For accelerated hyperparameter optimization. self.acsd_points = 100 + self.mutual_information_points = 20000 @property def rdb_storage_heartbeat(self): @@ -1186,12 +1284,12 @@ class Parameters: hyperparameters : ParametersHyperparameterOptimization Parameters used for hyperparameter optimization. - debug : ParametersDebug - Container for all debugging parameters. - manual_seed: int If not none, this value is used as manual seed for the neural networks. Can be used to make experiments comparable. Default: None. + + datageneration : ParametersDataGeneration + Parameters used for data generation routines. """ def __init__(self): @@ -1220,6 +1318,7 @@ def __init__(self): # different. self.openpmd_granularity = 1 self.use_lammps = True + self.use_atomic_density_formula = False @property def openpmd_granularity(self): @@ -1271,7 +1370,7 @@ def verbosity(self, value): @property def use_gpu(self): - """Control whether or not a GPU is used (provided there is one).""" + """Control whether a GPU is used (provided there is one).""" return self._use_gpu @use_gpu.setter @@ -1286,6 +1385,12 @@ def use_gpu(self, value): "GPU requested, but no GPU found. MALA will " "operate with CPU only." ) + if self._use_gpu and self.use_lammps: + printout( + "Enabling atomic density formula because LAMMPS and GPU " + "are used." + ) + self.use_atomic_density_formula = True # Invalidate, will be updated in setter. 
self.device = None
@@ -1298,7 +1403,7 @@
     @property
     def use_ddp(self):
-        """Control whether or not dd is used for parallel training."""
+        """Control whether ddp is used for parallel training."""
         return self._use_ddp

     @use_ddp.setter
@@ -1349,7 +1454,7 @@ def device(self, value):
     @property
     def use_mpi(self):
-        """Control whether or not MPI is used for paralle inference."""
+        """Control whether MPI is used for parallel inference."""
         return self._use_mpi

     @use_mpi.setter
@@ -1393,12 +1498,18 @@ def openpmd_configuration(self, value):
     @property
     def use_lammps(self):
-        """Control whether or not to use LAMMPS for descriptor calculation."""
+        """Control whether to use LAMMPS for descriptor calculation."""
         return self._use_lammps

     @use_lammps.setter
     def use_lammps(self, value):
         self._use_lammps = value
+        if self.use_gpu and value:
+            printout(
+                "Enabling atomic density formula because LAMMPS and GPU "
+                "are used."
+            )
+            self.use_atomic_density_formula = True
         self.network._update_lammps(self.use_lammps)
         self.descriptors._update_lammps(self.use_lammps)
         self.targets._update_lammps(self.use_lammps)
@@ -1406,6 +1517,48 @@
         self.running._update_lammps(self.use_lammps)
         self.hyperparameters._update_lammps(self.use_lammps)

+    @property
+    def use_atomic_density_formula(self):
+        """Control whether to use the atomic density formula.
+
+        This formula uses a Gaussian representation of the atomic density
+        to calculate the structure factor and with it, the Ewald energy
+        and parts of the exchange-correlation energy. By using it, one can
+        go from N^2 to NlogN scaling and offload most of the computational
+        overhead of energy calculation from QE to LAMMPS. This is beneficial
+        since LAMMPS can benefit from GPU acceleration (QE GPU acceleration
+        is not used in the portion of the QE code MALA employs). If set
+        to True, this means MALA will perform another LAMMPS calculation
+        during inference. The hyperparameters for this atomic density
+        calculation are set via the parameters.descriptors object.
+        Default is False, except when both use_gpu and use_lammps
+        are True, in which case this value will be set to True as well.
+        """
+        return self._use_atomic_density_formula
+
+    @use_atomic_density_formula.setter
+    def use_atomic_density_formula(self, value):
+        self._use_atomic_density_formula = value
+
+        self.network._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+        self.descriptors._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+        self.targets._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+        self.data._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+        self.running._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+        self.hyperparameters._update_atomic_density_formula(
+            self.use_atomic_density_formula
+        )
+
     def show(self):
         """Print name and values of all attributes of this object."""
         printout(
@@ -1598,6 +1751,18 @@
                     ].from_json(json_dict[key])
                     setattr(loaded_parameters, key, sub_parameters)

+                    # Backwards compatibility:
+                    if key == "descriptors":
+                        if (
+                            "use_atomic_density_energy_formula"
+                            in json_dict[key]
+                        ):
+                            loaded_parameters.use_atomic_density_formula = (
+                                json_dict[key][
+                                    "use_atomic_density_energy_formula"
+                                ]
+                            )
+
             # We iterate a second time, to set global values, so that they
             # are properly forwarded.
for key in json_dict:
@@ -1611,6 +1776,13 @@
                     setattr(loaded_parameters, key, json_dict[key])
             if no_snapshots is True:
                 loaded_parameters.data.snapshot_directories_list = []
+            # Backwards compatibility: since the transfer of old property
+            # to new property happens _before_ all children descriptor classes
+            # are instantiated, it is not properly propagated. Thus, we
+            # simply have to set it to its own value again.
+            loaded_parameters.use_atomic_density_formula = (
+                loaded_parameters.use_atomic_density_formula
+            )
         else:
             raise Exception("Unsupported parameter save format.")
diff --git a/mala/common/physical_data.py b/mala/common/physical_data.py
index 629378829..c7dd08f40 100644
--- a/mala/common/physical_data.py
+++ b/mala/common/physical_data.py
@@ -12,10 +12,27 @@
 class PhysicalData(ABC):
     """
-    Base class for physical data.
+    Base class for volumetric physical data.

     Implements general framework to read and write such data to and from
-    files.
+    files. Volumetric data is assumed to exist on a 3D grid. As such it
+    has the dimensions [x,y,z,f], where f is the feature dimension.
+    All loading functions within this class assume such a 4D array. Within
+    MALA, occasionally 2D arrays of dimension [x*y*z,f] are used and reshaped
+    accordingly.
+
+    Parameters
+    ----------
+    parameters : mala.Parameters
+        MALA Parameters object used to create this class.
+
+    Attributes
+    ----------
+    parameters : mala.Parameters
+        MALA parameters object.
+
+    grid_dimensions : list
+        List of the grid dimensions (x,y,z).
     """

     ##############################
@@ -86,6 +103,9 @@ def read_from_numpy_file(
             If not None, the array to save the data into. The array
             has to be 4-dimensional.

+        reshape : bool
+            If True, the loaded 4D array will be reshaped into a 2D array.
+
         Returns
         -------
         data : numpy.ndarray or None
@@ -263,6 +283,14 @@ def read_dimensions_from_numpy_file(self, path, read_dtype=False):
         read_dtype : bool
             If True, the dtype is read alongside the dimensions.
+
+        Returns
+        -------
+        dimension_info : list or tuple
+            If read_dtype is False, then only a list containing the dimensions
+            of the saved array is returned. If read_dtype is True, a tuple
+            containing this list of dimensions and the dtype of the array will
+            be returned.
         """
         loaded_array = np.load(path, mmap_mode="r")
         if read_dtype:
@@ -286,6 +314,14 @@ def read_dimensions_from_openpmd_file(
         read_dtype : bool
             If True, the dtype is read alongside the dimensions.
+
+        comm : MPI.Comm
+            An MPI communicator to be used for parallelized I/O.
+
+        Returns
+        -------
+        dimension_info : list
+            A list containing the dimensions of the saved array.
         """
         if comm is None or comm.rank == 0:
             import openpmd_api as io
@@ -379,6 +415,22 @@ class SkipArrayWriting:
     In order to provide this data, the numpy array can be replaced with an
     instance of the class SkipArrayWriting.
+
+    Parameters
+    ----------
+    dataset : openpmd_api.Dataset
+        OpenPMD Data set to eventually write to.
+
+    feature_size : int
+        Size of the feature dimension.
+
+    Attributes
+    ----------
+    dataset : openpmd_api.Dataset
+        OpenPMD Data set to eventually write to.
+
+    feature_size : int
+        Size of the feature dimension.
     """

     # dataset has type openpmd_api.Dataset (not adding a type hint to avoid
@@ -408,7 +460,7 @@ def write_to_openpmd_file(
             the openPMD structure.

         additional_attributes : dict
-            Dict containing additional attributes to be saved.
+            Dictionary containing additional attributes to be saved.

         internal_iteration_number : int
             Internal OpenPMD iteration number.
Ideally, this number should @@ -489,6 +541,22 @@ def write_to_openpmd_iteration( If not None, and the selected class implements it, additional metadata will be read from this source. This metadata will then, depending on the class, be saved in the OpenPMD file. + + local_offset : list + [x,y,z] value from which to start writing the array. + + local_reach : list + [x,y,z] value until which to read the array. + + feature_from : int + Value from which to start writing in the feature dimension. With + this parameter and feature_to, one can parallelize over the feature + dimension. + + feature_to : int + Value until which to write in the feature dimension. With + this parameter and feature_from, one can parallelize over the feature + dimension. """ import openpmd_api as io diff --git a/mala/datageneration/ofdft_initializer.py b/mala/datageneration/ofdft_initializer.py index 2086b8dbb..0f932f8c5 100644 --- a/mala/datageneration/ofdft_initializer.py +++ b/mala/datageneration/ofdft_initializer.py @@ -22,12 +22,26 @@ class OFDFTInitializer: Parameters ---------- - parameters : mala.common.parameters.Parameters - Parameters object used to create this instance. + parameters : mala.Parameters + MALA parameters object used to create this instance. atoms : ase.Atoms Initial atomic configuration for which an equilibrated configuration is to be created. + + + Attributes + ---------- + parameters : mala.mala.common.parameters.ParametersDataGeneration + MALA data generation parameters object. + + atoms : ase.Atoms + Initial atomic configuration for which an + equilibrated configuration is to be created. + + dftpy_configuration : dict + Dictionary containing the DFTpy configuration. Will partially be + populated via the MALA parameters object. """ def __init__(self, parameters, atoms): @@ -37,7 +51,7 @@ def __init__(self, parameters, atoms): "large changes." ) self.atoms = atoms - self.params = parameters.datageneration + self.parameters = parameters.datageneration # Check that only one element is used in the atoms. number_of_elements = len(set([x.symbol for x in self.atoms])) @@ -47,11 +61,13 @@ def __init__(self, parameters, atoms): ) self.dftpy_configuration = DefaultOption() - self.dftpy_configuration["PATH"]["pppath"] = self.params.local_psp_path + self.dftpy_configuration["PATH"][ + "pppath" + ] = self.parameters.local_psp_path self.dftpy_configuration["PP"][ self.atoms[0].symbol - ] = self.params.local_psp_name - self.dftpy_configuration["OPT"]["method"] = self.params.ofdft_kedf + ] = self.parameters.local_psp_name + self.dftpy_configuration["OPT"]["method"] = self.parameters.ofdft_kedf self.dftpy_configuration["KEDF"]["kedf"] = "WT" self.dftpy_configuration["JOB"]["calctype"] = "Energy Force" @@ -64,6 +80,11 @@ def get_equilibrated_configuration(self, logging_period=None): logging_period : int If not None, a .log and .traj file will be filled with snapshot information every logging_period steps. + + Returns + ------- + equilibrated_configuration : ase.Atoms + Equilibrated atomic configuration. """ # Set the DFTPy configuration. conf = OptionFormat(self.dftpy_configuration) @@ -75,14 +96,14 @@ def get_equilibrated_configuration(self, logging_period=None): # Create the initial velocities, and dynamics object. 
MaxwellBoltzmannDistribution( self.atoms, - temperature_K=self.params.ofdft_temperature, + temperature_K=self.parameters.ofdft_temperature, force_temp=True, ) dyn = Langevin( self.atoms, - self.params.ofdft_timestep * units.fs, - temperature_K=self.params.ofdft_temperature, - friction=self.params.ofdft_friction, + self.parameters.ofdft_timestep * units.fs, + temperature_K=self.parameters.ofdft_temperature, + friction=self.parameters.ofdft_friction, ) # If logging is desired, do the logging. @@ -105,5 +126,5 @@ def get_equilibrated_configuration(self, logging_period=None): # Let the OF-DFT-MD run. ase.io.write("POSCAR_initial", self.atoms, "vasp") - dyn.run(self.params.ofdft_number_of_timesteps) + dyn.run(self.parameters.ofdft_number_of_timesteps) ase.io.write("POSCAR_equilibrated", self.atoms, "vasp") diff --git a/mala/datageneration/trajectory_analyzer.py b/mala/datageneration/trajectory_analyzer.py index 4de1a8d1d..fa0493af7 100644 --- a/mala/datageneration/trajectory_analyzer.py +++ b/mala/datageneration/trajectory_analyzer.py @@ -29,6 +29,44 @@ class TrajectoryAnalyzer: target_calculator : mala.targets.target.Target A target calculator to calculate e.g. the RDF. If None is provided, one will be generated ad-hoc (recommended). + + temperatures : string or numpy.ndarray + Array holding the temperatures for the trajectory or path to numpy + file containing temperatures. + + target_temperature : float + Target temperature for equilibration. + + malada_compatability : bool + If True, twice the radius set by the minimum imaging convention (MIC) + will be used for RDF calculation. This is generally discouraged, + but some older malada calculations have been performed with it, so + this parameter provides reproducibility. + + Attributes + ---------- + parameters : mala.common.parameters.ParametersDataGeneration + MALA data generation parameters. + + average_distance_equilibrated : float + Distance threshold for determination of first equilibrated snapshot. + + distance_metrics_denoised : numpy.ndarray + RDF based distance metrics used for equilibration analysis. + + distances_realspace : numpy.ndarray + Realspace distance metrics used to sample snapshots. + + first_considered_snapshot : int + First snapshot to be considered during equilibration analysis (i.e., + after pruning). + + last_considered_snapshot : int + Last snapshot to be considered during equilibration analysis (i.e., + after pruning). + + target_calculator : mala.targets.target.Target + Target calculator used for computing RDFs. """ def __init__( @@ -46,7 +84,7 @@ def __init__( "large changes." ) - self.params: ParametersDataGeneration = parameters.datageneration + self.parameters: ParametersDataGeneration = parameters.datageneration # If needed, read the trajectory self.trajectory = None @@ -58,12 +96,12 @@ def __init__( raise Exception("Incompatible trajectory format provided.") # If needed, read the temperature files - self.temperatures = None + self._temperatures = None if temperatures is not None: if isinstance(temperatures, np.ndarray): - self.temperatures = temperatures + self._temperatures = temperatures elif isinstance(temperatures, str): - self.temperatures = np.load(temperatures) + self._temperatures = np.load(temperatures) else: raise Exception("Incompatible temperature format provided.") @@ -76,7 +114,7 @@ def __init__( self.target_calculator.temperature = target_temperature # Initialize variables. 
- self.distance_metrics = [] + self._distance_metrics = [] self.distance_metrics_denoised = [] self.average_distance_equilibrated = None self.__saved_rdf = None @@ -150,11 +188,11 @@ def get_first_snapshot( # First, we ned to calculate the reduced metrics for the trajectory. # For this, we calculate the distance between all the snapshots # and the last one. - self.distance_metrics = [] + self._distance_metrics = [] if equilibrated_snapshot is None: equilibrated_snapshot = self.trajectory[-1] for idx, step in enumerate(self.trajectory): - self.distance_metrics.append( + self._distance_metrics.append( self._calculate_distance_between_snapshots( equilibrated_snapshot, step, @@ -165,16 +203,16 @@ def get_first_snapshot( ) # Now, we denoise the distance metrics. - self.distance_metrics_denoised = self.__denoise(self.distance_metrics) + self.distance_metrics_denoised = self.__denoise(self._distance_metrics) # Which snapshots are considered depends on how we denoise the # distance metrics. self.first_considered_snapshot = ( - self.params.trajectory_analysis_denoising_width + self.parameters.trajectory_analysis_denoising_width ) self.last_considered_snapshot = ( np.shape(self.distance_metrics_denoised)[0] - - self.params.trajectory_analysis_denoising_width + - self.parameters.trajectory_analysis_denoising_width ) considered_length = ( self.last_considered_snapshot - self.first_considered_snapshot @@ -189,7 +227,7 @@ def get_first_snapshot( self.distance_metrics_denoised[ considered_length - int( - self.params.trajectory_analysis_estimated_equilibrium + self.parameters.trajectory_analysis_estimated_equilibrium * considered_length ) : self.last_considered_snapshot ] @@ -212,7 +250,7 @@ def get_first_snapshot( is_below = False if ( counter - == self.params.trajectory_analysis_below_average_counter + == self.parameters.trajectory_analysis_below_average_counter ): first_snapshot = idx break @@ -242,10 +280,12 @@ def get_snapshot_correlation_cutoff(self): to each other to a degree that suggests temporal neighborhood. 
""" - if self.params.trajectory_analysis_correlation_metric_cutoff < 0: + if self.parameters.trajectory_analysis_correlation_metric_cutoff < 0: return self._analyze_distance_metric(self.trajectory) else: - return self.params.trajectory_analysis_correlation_metric_cutoff + return ( + self.parameters.trajectory_analysis_correlation_metric_cutoff + ) def get_uncorrelated_snapshots(self, filename_uncorrelated_snapshots): """ @@ -265,7 +305,8 @@ def get_uncorrelated_snapshots(self, filename_uncorrelated_snapshots): filename_uncorrelated_snapshots ).split(".")[0] allowed_temp_diff_K = ( - self.params.trajectory_analysis_temperature_tolerance_percent / 100 + self.parameters.trajectory_analysis_temperature_tolerance_percent + / 100 ) * self.target_calculator.temperature current_snapshot = self.first_snapshot begin_snapshot = self.first_snapshot + 1 @@ -275,9 +316,9 @@ def get_uncorrelated_snapshots(self, filename_uncorrelated_snapshots): for i in range(begin_snapshot, end_snapshot): if self.__check_if_snapshot_is_valid( self.trajectory[i], - self.temperatures[i], + self._temperatures[i], self.trajectory[current_snapshot], - self.temperatures[current_snapshot], + self._temperatures[current_snapshot], self.snapshot_correlation_cutoff, allowed_temp_diff_K, ): @@ -316,7 +357,7 @@ def _analyze_distance_metric(self, trajectory): + self.first_snapshot ) width = int( - self.params.trajectory_analysis_estimated_equilibrium + self.parameters.trajectory_analysis_estimated_equilibrium * np.shape(self.distance_metrics_denoised)[0] ) self.distances_realspace = [] @@ -403,8 +444,8 @@ def _calculate_distance_between_snapshots( def __denoise(self, signal): denoised_signal = np.convolve( signal, - np.ones(self.params.trajectory_analysis_denoising_width) - / self.params.trajectory_analysis_denoising_width, + np.ones(self.parameters.trajectory_analysis_denoising_width) + / self.parameters.trajectory_analysis_denoising_width, mode="same", ) return denoised_signal diff --git a/mala/datahandling/data_converter.py b/mala/datahandling/data_converter.py index 5b22e2293..21fb34cdb 100644 --- a/mala/datahandling/data_converter.py +++ b/mala/datahandling/data_converter.py @@ -43,6 +43,13 @@ class DataConverter: target_calculator : mala.targets.target.Target Target calculator used for parsing/converting target data. + + parameters : mala.common.parameters.ParametersData + MALA data handling parameters object. + + parameters_full : mala.common.parameters.Parameters + MALA parameters object. The full object is necessary for some data + handling tasks. """ def __init__( @@ -69,9 +76,9 @@ def __init__( self.__snapshot_units = [] # Keep track of what has to be done by this data converter. 
- self.process_descriptors = False - self.process_targets = False - self.process_additional_info = False + self.__process_descriptors = False + self.__process_targets = False + self.__process_additional_info = False def add_snapshot( self, @@ -143,7 +150,7 @@ def add_snapshot( ) if descriptor_input_type not in descriptor_input_types: raise Exception("Cannot process this type of descriptor data.") - self.process_descriptors = True + self.__process_descriptors = True if target_input_type is not None: if target_input_path is None: @@ -152,7 +159,7 @@ def add_snapshot( ) if target_input_type not in target_input_types: raise Exception("Cannot process this type of target data.") - self.process_targets = True + self.__process_targets = True if additional_info_input_type is not None: metadata_input_type = additional_info_input_type @@ -165,7 +172,7 @@ def add_snapshot( raise Exception( "Cannot process this type of additional info data." ) - self.process_additional_info = True + self.__process_additional_info = True metadata_input_path = additional_info_input_path @@ -299,19 +306,19 @@ def convert_snapshots( target_save_path = complete_save_path additional_info_save_path = complete_save_path else: - if self.process_targets is True and target_save_path is None: + if self.__process_targets is True and target_save_path is None: raise Exception( "No target path specified, cannot process data." ) if ( - self.process_descriptors is True + self.__process_descriptors is True and descriptor_save_path is None ): raise Exception( "No descriptor path specified, cannot process data." ) if ( - self.process_additional_info is True + self.__process_additional_info is True and additional_info_save_path is None ): raise Exception( @@ -323,7 +330,7 @@ def convert_snapshots( snapshot_name = naming_scheme series_name = snapshot_name.replace("*", str("%01T")) - if self.process_descriptors: + if self.__process_descriptors: if self.parameters._configuration["mpi"]: input_series = io.Series( os.path.join( @@ -351,7 +358,7 @@ def convert_snapshots( input_series.set_software(name="MALA", version="x.x.x") input_series.author = "..." - if self.process_targets: + if self.__process_targets: if self.parameters._configuration["mpi"]: output_series = io.Series( os.path.join( @@ -386,7 +393,7 @@ def convert_snapshots( snapshot_name = snapshot_name.replace("*", str(snapshot_number)) # Create the paths as needed. - if self.process_additional_info: + if self.__process_additional_info: info_path = os.path.join( additional_info_save_path, snapshot_name + ".info.json" ) @@ -397,7 +404,7 @@ def convert_snapshots( if file_ending == "npy": # Create the actual paths, if needed. - if self.process_descriptors: + if self.__process_descriptors: descriptor_path = os.path.join( descriptor_save_path, snapshot_name + ".in." + file_ending, @@ -406,7 +413,7 @@ def convert_snapshots( descriptor_path = None memmap = None - if self.process_targets: + if self.__process_targets: target_path = os.path.join( target_save_path, snapshot_name + ".out." 
+ file_ending, @@ -425,13 +432,13 @@ def convert_snapshots( descriptor_path = None target_path = None memmap = None - if self.process_descriptors: + if self.__process_descriptors: input_iteration = input_series.write_iterations()[ i + starts_at ] input_iteration.dt = i + starts_at input_iteration.time = 0 - if self.process_targets: + if self.__process_targets: output_iteration = output_series.write_iterations()[ i + starts_at ] @@ -460,9 +467,9 @@ def convert_snapshots( # Properly close series if file_ending != "npy": - if self.process_descriptors: + if self.__process_descriptors: del input_series - if self.process_targets: + if self.__process_targets: del output_series def __convert_single_snapshot( @@ -500,9 +507,6 @@ def __convert_single_snapshot( output_path : string If not None, outputs will be saved in this file. - return_data : bool - If True, inputs and outputs will be returned directly. - target_calculator_kwargs : dict Dictionary with additional keyword arguments for the calculation or parsing of the target quantities. @@ -636,10 +640,8 @@ def __convert_single_snapshot( snapshot["output"], units=original_units["output"] ) elif description["output"] == "numpy": - tmp_output = ( - self.target_calculator.read_from_numpy_file( - snapshot["output"], units=original_units["output"] - ) + tmp_output = self.target_calculator.read_from_numpy_file( + snapshot["output"], units=original_units["output"] ) elif description["output"] is None: @@ -687,10 +689,8 @@ def __convert_single_snapshot( snapshot["output"], units=original_units["output"] ) elif description["output"] == "numpy": - tmp_output = ( - self.target_calculator.read_from_numpy_file( - snapshot["output"] - ) + tmp_output = self.target_calculator.read_from_numpy_file( + snapshot["output"] ) elif description["output"] is None: diff --git a/mala/datahandling/data_handler.py b/mala/datahandling/data_handler.py index 7b8fc2a43..3b9521e44 100644 --- a/mala/datahandling/data_handler.py +++ b/mala/datahandling/data_handler.py @@ -18,10 +18,10 @@ class DataHandler(DataHandlerBase): """ - Loads and scales data. Can only process numpy arrays at the moment. + Loads and scales data. Can load from numpy or OpenPMD files. - Data that is not in a numpy array can be converted using the DataConverter - class. + Data that is not saved as numpy or OpenPMD file can be converted using the + DataConverter class. Parameters ---------- @@ -47,6 +47,41 @@ class DataHandler(DataHandlerBase): clear_data : bool If true (default), the data list will be cleared upon creation of the object. + + Attributes + ---------- + input_data_scaler : mala.datahandling.data_scaler.DataScaler + Used to scale the input data. + + nr_test_data : int + Number of test data points. + + nr_test_snapshots : int + Number of test snapshots. + + nr_training_data : int + Number of training data points. + + nr_training_snapshots : int + Number of training snapshots. + + nr_validation_data : int + Number of validation data points. + + nr_validation_snapshots : int + Number of validation snapshots. + + output_data_scaler : mala.datahandling.data_scaler.DataScaler + Used to scale the output data. + + test_data_sets : list + List containing torch data sets for test data. + + training_data_sets : list + List containing torch data sets for training data. + + validation_data_sets : list + List containing torch data sets for validation data. 
""" ############################## @@ -72,14 +107,14 @@ def __init__( if self.input_data_scaler is None: self.input_data_scaler = DataScaler( self.parameters.input_rescaling_type, - use_ddp=self.use_ddp, + use_ddp=self._use_ddp, ) self.output_data_scaler = output_data_scaler if self.output_data_scaler is None: self.output_data_scaler = DataScaler( self.parameters.output_rescaling_type, - use_ddp=self.use_ddp, + use_ddp=self._use_ddp, ) # Actual data points in the different categories. @@ -93,18 +128,18 @@ def __init__( self.nr_validation_snapshots = 0 # Arrays and data sets containing the actual data. - self.training_data_inputs = torch.empty(0) - self.validation_data_inputs = torch.empty(0) - self.test_data_inputs = torch.empty(0) - self.training_data_outputs = torch.empty(0) - self.validation_data_outputs = torch.empty(0) - self.test_data_outputs = torch.empty(0) + self._training_data_inputs = torch.empty(0) + self._validation_data_inputs = torch.empty(0) + self._test_data_inputs = torch.empty(0) + self._training_data_outputs = torch.empty(0) + self._validation_data_outputs = torch.empty(0) + self._test_data_outputs = torch.empty(0) self.training_data_sets = [] self.validation_data_sets = [] self.test_data_sets = [] # Needed for the fast tensor data sets. - self.mini_batch_size = parameters.running.mini_batch_size + self._mini_batch_size = parameters.running.mini_batch_size if clear_data: self.clear_data() @@ -130,6 +165,8 @@ def clear_data(self): self.nr_training_snapshots = 0 self.nr_test_snapshots = 0 self.nr_validation_snapshots = 0 + self.input_data_scaler.reset() + self.output_data_scaler.reset() super(DataHandler, self).clear_data() # Preparing data @@ -258,7 +295,7 @@ def get_test_input_gradient(self, snapshot_number): Returns ------- - torch.Tensor + gradient : torch.Tensor Tensor holding the gradient. """ @@ -274,7 +311,7 @@ def get_test_input_gradient(self, snapshot_number): ) return self.test_data_sets[0].input_data.grad else: - return self.test_data_inputs.grad[ + return self._test_data_inputs.grad[ snapshot.grid_size * snapshot_number : snapshot.grid_size * (snapshot_number + 1) @@ -303,7 +340,10 @@ def get_snapshot_calculation_output(self, snapshot_number): ###################### def raw_numpy_to_converted_scaled_tensor( - self, numpy_array, data_type, units, convert3Dto1D=False + self, + numpy_array, + data_type, + units, ): """ Transform a raw numpy array into a scaled torch tensor. @@ -315,14 +355,13 @@ def raw_numpy_to_converted_scaled_tensor( ---------- numpy_array : np.array Array that is to be converted. + data_type : string Either "in" or "out", depending if input or output data is + processed. units : string Units of the data that is processed. - convert3Dto1D : bool - If True (default: False), then a (x,y,z,dim) array is transformed - into a (x*y*z,dim) array. Returns ------- @@ -341,12 +380,12 @@ def raw_numpy_to_converted_scaled_tensor( ) # If desired, the dimensions can be changed. 
- if convert3Dto1D: + if len(np.shape(numpy_array)) == 4: if data_type == "in": data_dimension = self.input_dimension else: data_dimension = self.output_dimension - grid_size = np.prod(numpy_array[0:3]) + grid_size = np.prod(np.shape(numpy_array)[0:3]) desired_dimensions = [grid_size, data_dimension] else: desired_dimensions = None @@ -479,31 +518,31 @@ def _check_snapshots(self): def __allocate_arrays(self): if self.nr_training_data > 0: - self.training_data_inputs = np.zeros( + self._training_data_inputs = np.zeros( (self.nr_training_data, self.input_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) - self.training_data_outputs = np.zeros( + self._training_data_outputs = np.zeros( (self.nr_training_data, self.output_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) if self.nr_validation_data > 0: - self.validation_data_inputs = np.zeros( + self._validation_data_inputs = np.zeros( (self.nr_validation_data, self.input_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) - self.validation_data_outputs = np.zeros( + self._validation_data_outputs = np.zeros( (self.nr_validation_data, self.output_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) if self.nr_test_data > 0: - self.test_data_inputs = np.zeros( + self._test_data_inputs = np.zeros( (self.nr_test_data, self.input_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) - self.test_data_outputs = np.zeros( + self._test_data_outputs = np.zeros( (self.nr_test_data, self.output_dimension), dtype=DEFAULT_NP_DATA_DTYPE, ) @@ -531,7 +570,7 @@ def __load_data(self, function, data_type): raise Exception("Unknown data type detected.") # Extracting all the information pertaining to the data set. - array = function + "_data_" + data_type + array = "_" + function + "_data_" + data_type if data_type == "inputs": calculator = self.descriptor_calculator else: @@ -593,34 +632,34 @@ def __load_data(self, function, data_type): # all ears. 
if data_type == "inputs": if function == "training": - self.training_data_inputs = torch.from_numpy( - self.training_data_inputs + self._training_data_inputs = torch.from_numpy( + self._training_data_inputs ).float() if function == "validation": - self.validation_data_inputs = torch.from_numpy( - self.validation_data_inputs + self._validation_data_inputs = torch.from_numpy( + self._validation_data_inputs ).float() if function == "test": - self.test_data_inputs = torch.from_numpy( - self.test_data_inputs + self._test_data_inputs = torch.from_numpy( + self._test_data_inputs ).float() if data_type == "outputs": if function == "training": - self.training_data_outputs = torch.from_numpy( - self.training_data_outputs + self._training_data_outputs = torch.from_numpy( + self._training_data_outputs ).float() if function == "validation": - self.validation_data_outputs = torch.from_numpy( - self.validation_data_outputs + self._validation_data_outputs = torch.from_numpy( + self._validation_data_outputs ).float() if function == "test": - self.test_data_outputs = torch.from_numpy( - self.test_data_outputs + self._test_data_outputs = torch.from_numpy( + self._test_data_outputs ).float() def __build_datasets(self): @@ -639,7 +678,7 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, self.parameters._configuration["device"], ) ) @@ -651,7 +690,7 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, self.parameters._configuration["device"], ) ) @@ -665,7 +704,7 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, self.parameters._configuration["device"], input_requires_grad=True, ) @@ -701,7 +740,7 @@ def __build_datasets(self): if snapshot.snapshot_function == "tr": self.training_data_sets.append( LazyLoadDatasetSingle( - self.mini_batch_size, + self._mini_batch_size, snapshot, self.input_dimension, self.output_dimension, @@ -709,13 +748,13 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, ) ) if snapshot.snapshot_function == "va": self.validation_data_sets.append( LazyLoadDatasetSingle( - self.mini_batch_size, + self._mini_batch_size, snapshot, self.input_dimension, self.output_dimension, @@ -723,13 +762,13 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, ) ) if snapshot.snapshot_function == "te": self.test_data_sets.append( LazyLoadDatasetSingle( - self.mini_batch_size, + self._mini_batch_size, snapshot, self.input_dimension, self.output_dimension, @@ -737,65 +776,67 @@ def __build_datasets(self): self.output_data_scaler, self.descriptor_calculator, self.target_calculator, - self.use_ddp, + self._use_ddp, input_requires_grad=True, ) ) else: if self.nr_training_data != 0: - self.input_data_scaler.transform(self.training_data_inputs) - self.output_data_scaler.transform(self.training_data_outputs) + self.input_data_scaler.transform(self._training_data_inputs) + self.output_data_scaler.transform(self._training_data_outputs) if self.parameters.use_fast_tensor_data_set: printout("Using FastTensorDataset.", min_verbosity=2) self.training_data_sets.append( FastTensorDataset( - self.mini_batch_size, - self.training_data_inputs, - self.training_data_outputs, + 
self._mini_batch_size, + self._training_data_inputs, + self._training_data_outputs, ) ) else: self.training_data_sets.append( TensorDataset( - self.training_data_inputs, - self.training_data_outputs, + self._training_data_inputs, + self._training_data_outputs, ) ) if self.nr_validation_data != 0: self.__load_data("validation", "inputs") - self.input_data_scaler.transform(self.validation_data_inputs) + self.input_data_scaler.transform(self._validation_data_inputs) self.__load_data("validation", "outputs") - self.output_data_scaler.transform(self.validation_data_outputs) + self.output_data_scaler.transform( + self._validation_data_outputs + ) if self.parameters.use_fast_tensor_data_set: printout("Using FastTensorDataset.", min_verbosity=2) self.validation_data_sets.append( FastTensorDataset( - self.mini_batch_size, - self.validation_data_inputs, - self.validation_data_outputs, + self._mini_batch_size, + self._validation_data_inputs, + self._validation_data_outputs, ) ) else: self.validation_data_sets.append( TensorDataset( - self.validation_data_inputs, - self.validation_data_outputs, + self._validation_data_inputs, + self._validation_data_outputs, ) ) if self.nr_test_data != 0: self.__load_data("test", "inputs") - self.input_data_scaler.transform(self.test_data_inputs) - self.test_data_inputs.requires_grad = True + self.input_data_scaler.transform(self._test_data_inputs) + self._test_data_inputs.requires_grad = True self.__load_data("test", "outputs") - self.output_data_scaler.transform(self.test_data_outputs) + self.output_data_scaler.transform(self._test_data_outputs) self.test_data_sets.append( TensorDataset( - self.test_data_inputs, self.test_data_outputs + self._test_data_inputs, self._test_data_outputs ) ) @@ -815,7 +856,6 @@ def __parametrize_scalers(self): # scaling. This should save some performance. if self.parameters.use_lazy_loading: - self.input_data_scaler.start_incremental_fitting() # We need to perform the data scaling over the entirety of the # training data. for snapshot in self.parameters.snapshot_directories_list: @@ -853,13 +893,11 @@ def __parametrize_scalers(self): [snapshot.grid_size, self.input_dimension] ) tmp = torch.from_numpy(tmp).float() - self.input_data_scaler.incremental_fit(tmp) - - self.input_data_scaler.finish_incremental_fitting() + self.input_data_scaler.partial_fit(tmp) else: self.__load_data("training", "inputs") - self.input_data_scaler.fit(self.training_data_inputs) + self.input_data_scaler.fit(self._training_data_inputs) printout("Input scaler parametrized.", min_verbosity=1) @@ -876,7 +914,6 @@ def __parametrize_scalers(self): if self.parameters.use_lazy_loading: i = 0 - self.output_data_scaler.start_incremental_fitting() # We need to perform the data scaling over the entirety of the # training data. 
for snapshot in self.parameters.snapshot_directories_list: @@ -912,13 +949,12 @@ def __parametrize_scalers(self): [snapshot.grid_size, self.output_dimension] ) tmp = torch.from_numpy(tmp).float() - self.output_data_scaler.incremental_fit(tmp) + self.output_data_scaler.partial_fit(tmp) i += 1 - self.output_data_scaler.finish_incremental_fitting() else: self.__load_data("training", "outputs") - self.output_data_scaler.fit(self.training_data_outputs) + self.output_data_scaler.fit(self._training_data_outputs) printout("Output scaler parametrized.", min_verbosity=1) diff --git a/mala/datahandling/data_handler_base.py b/mala/datahandling/data_handler_base.py index 54e27e959..c141551fa 100644 --- a/mala/datahandling/data_handler_base.py +++ b/mala/datahandling/data_handler_base.py @@ -28,6 +28,20 @@ class DataHandlerBase(ABC): target_calculator : mala.targets.target.Target Used to do unit conversion on output data. If None, then one will be created by this class. + + Attributes + ---------- + descriptor_calculator + Used to do unit conversion on input data. + + nr_snapshots : int + Number of snapshots loaded. + + parameters : mala.common.parameters.ParametersData + MALA data handling parameters. + + target_calculator + Used to do unit conversion on output data. """ def __init__( @@ -37,7 +51,7 @@ def __init__( descriptor_calculator=None, ): self.parameters: ParametersData = parameters.data - self.use_ddp = parameters.use_ddp + self._use_ddp = parameters.use_ddp # Calculators used to parse data from compatible files. self.target_calculator = target_calculator diff --git a/mala/datahandling/data_scaler.py b/mala/datahandling/data_scaler.py index e3c8a5328..5840e8816 100644 --- a/mala/datahandling/data_scaler.py +++ b/mala/datahandling/data_scaler.py @@ -6,13 +6,19 @@ import torch.distributed as dist from mala.common.parameters import printout +from mala.common.parallelizer import parallel_warn +# IMPORTANT: If you change the docstrings, make sure to also change them +# in the ParametersData subclass, because users do usually not interact +# with this class directly. class DataScaler: """Scales input and output data. Sort of emulates the functionality of the scikit-learn library, but by - implementing the class by ourselves we have more freedom. + implementing the class by ourselves we have more freedom. Specifically + assumes data of the form (d,f), where d=x*y*z, i.e., the product of spatial + dimensions, and f is the feature dimension. Parameters ---------- @@ -20,25 +26,81 @@ class DataScaler: Specifies how scaling should be performed. Options: - - "None": No normalization is applied. + - "None": No scaling is applied. - "standard": Standardization (Scale to mean 0, - standard deviation 1) - - "normal": Min-Max scaling (Scale to be in range 0...1) - - "feature-wise-standard": Row Standardization (Scale to mean 0, - standard deviation 1) - - "feature-wise-normal": Row Min-Max scaling (Scale to be in range - 0...1) + standard deviation 1) is applied to the entire array. + - "minmax": Min-Max scaling (Scale to be in range 0...1) is applied + to the entire array. + - "feature-wise-standard": Standardization (Scale to mean 0, + standard deviation 1) is applied to each feature dimension + individually. + I.e., if your training data has dimensions (d,f), then each + of the f columns with d entries is scaled indiviually. + - "feature-wise-minmax": Min-Max scaling (Scale to be in range + 0...1) is applied to each feature dimension individually. 
+ I.e., if your training data has dimensions (d,f), then each + of the f columns with d entries is scaled indiviually. + - "normal": (DEPRECATED) Old name for "minmax". + - "feature-wise-normal": (DEPRECATED) Old name for + "feature-wise-minmax" use_ddp : bool If True, the DataScaler will use ddp to check that data is only saved on the root process in parallel execution. + + Attributes + ---------- + cantransform : bool + If True, this scaler is set up to perform scaling. + + feature_wise : bool + (Managed internally, not set to private due to legacy issues) + + maxs : torch.Tensor + (Managed internally, not set to private due to legacy issues) + + means : torch.Tensor + (Managed internally, not set to private due to legacy issues) + + mins : torch.Tensor + (Managed internally, not set to private due to legacy issues) + + scale_minmax : bool + (Managed internally, not set to private due to legacy issues) + + scale_standard : bool + (Managed internally, not set to private due to legacy issues) + + stds : torch.Tensor + (Managed internally, not set to private due to legacy issues) + + total_data_count : int + (Managed internally, not set to private due to legacy issues) + + total_max : float + (Managed internally, not set to private due to legacy issues) + + total_mean : float + (Managed internally, not set to private due to legacy issues) + + total_min : float + (Managed internally, not set to private due to legacy issues) + + total_std : float + (Managed internally, not set to private due to legacy issues) + + typestring : str + (Managed internally, not set to private due to legacy issues) + + use_ddp : bool + (Managed internally, not set to private due to legacy issues) """ def __init__(self, typestring, use_ddp=False): self.use_ddp = use_ddp self.typestring = typestring self.scale_standard = False - self.scale_normal = False + self.scale_minmax = False self.feature_wise = False self.cantransform = False self.__parse_typestring() @@ -57,23 +119,32 @@ def __init__(self, typestring, use_ddp=False): def __parse_typestring(self): """Parse the typestring to class attributes.""" self.scale_standard = False - self.scale_normal = False + self.scale_minmax = False self.feature_wise = False if "standard" in self.typestring: self.scale_standard = True if "normal" in self.typestring: - self.scale_normal = True + parallel_warn( + "Options 'normal' and 'feature-wise-normal' will be " + "deprecated, starting in MALA v1.4.0. Please use 'minmax' and " + "'feature-wise-minmax' instead.", + min_verbosity=0, + category=FutureWarning, + ) + self.scale_minmax = True + if "minmax" in self.typestring: + self.scale_minmax = True if "feature-wise" in self.typestring: self.feature_wise = True - if self.scale_standard is False and self.scale_normal is False: + if self.scale_standard is False and self.scale_minmax is False: printout("No data rescaling will be performed.", min_verbosity=1) self.cantransform = True return - if self.scale_standard is True and self.scale_normal is True: + if self.scale_standard is True and self.scale_minmax is True: raise Exception("Invalid input data rescaling.") - def start_incremental_fitting(self): + def reset(self): """ Start the incremental calculation of scaling parameters. @@ -81,7 +152,7 @@ def start_incremental_fitting(self): """ self.total_data_count = 0 - def incremental_fit(self, unscaled): + def partial_fit(self, unscaled): """ Add data to the incremental calculation of scaling parameters. 
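The renamed incremental-fitting API above (``reset``/``partial_fit`` replacing ``start_incremental_fitting``/``incremental_fit``/``finish_incremental_fitting``) can be exercised roughly as in the following sketch. This is a minimal illustration with synthetic data, assuming the module path ``mala.datahandling.data_scaler``; as the added checks show, the scaler now insists on 2D arrays of shape (points, features), and ``transform``/``inverse_transform`` modify data in place unless ``copy=True`` is passed.

.. code-block:: python

    import torch
    from mala.datahandling.data_scaler import DataScaler

    # Feature-wise standardization, fitted incrementally over two chunks
    # (e.g. two lazily loaded snapshots); chunks must be 2D.
    scaler = DataScaler("feature-wise-standard")
    scaler.reset()                           # formerly start_incremental_fitting()
    scaler.partial_fit(torch.rand(100, 4))   # formerly incremental_fit()
    scaler.partial_fit(torch.rand(100, 4))   # no finish_* call is needed anymore

    data = torch.rand(10, 4)
    scaled = scaler.transform(data, copy=True)              # leaves `data` untouched
    recovered = scaler.inverse_transform(scaled, copy=True)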
@@ -93,7 +164,16 @@ def incremental_fit(self, unscaled): Data that is to be added to the fit. """ - if self.scale_standard is False and self.scale_normal is False: + if len(unscaled.size()) != 2: + raise ValueError( + "MALA DataScaler are designed for 2D-arrays, " + "while a {0}D-array has been provided.".format( + len(unscaled.size()) + ) + ) + + if self.scale_standard is False and self.scale_minmax is False: + self.cantransform = True return else: with torch.no_grad(): @@ -142,7 +222,7 @@ def incremental_fit(self, unscaled): self.stds = new_std self.total_data_count += current_data_count - if self.scale_normal: + if self.scale_minmax: new_maxs = torch.max(unscaled, 0, keepdim=True) if list(self.maxs.size())[0] > 0: for i in range(list(new_maxs.values.size())[1]): @@ -205,7 +285,7 @@ def incremental_fit(self, unscaled): self.total_std = torch.sqrt(self.total_std) self.total_data_count += current_data_count - if self.scale_normal: + if self.scale_minmax: new_max = torch.max(unscaled) if new_max > self.total_max: self.total_max = new_max @@ -213,13 +293,6 @@ def incremental_fit(self, unscaled): new_min = torch.min(unscaled) if new_min < self.total_min: self.total_min = new_min - - def finish_incremental_fitting(self): - """ - Indicate that all data has been added to the incremental calculation. - - This is necessary for lazy loading. - """ self.cantransform = True def fit(self, unscaled): @@ -232,7 +305,15 @@ def fit(self, unscaled): Data that on which the scaling will be calculated. """ - if self.scale_standard is False and self.scale_normal is False: + if len(unscaled.size()) != 2: + raise ValueError( + "MALA DataScaler are designed for 2D-arrays, " + "while a {0}D-array has been provided.".format( + len(unscaled.size()) + ) + ) + + if self.scale_standard is False and self.scale_minmax is False: return else: with torch.no_grad(): @@ -246,7 +327,7 @@ def fit(self, unscaled): self.means = torch.mean(unscaled, 0, keepdim=True) self.stds = torch.std(unscaled, 0, keepdim=True) - if self.scale_normal: + if self.scale_minmax: self.maxs = torch.max(unscaled, 0, keepdim=True).values self.mins = torch.min(unscaled, 0, keepdim=True).values @@ -260,13 +341,13 @@ def fit(self, unscaled): self.total_mean = torch.mean(unscaled) self.total_std = torch.std(unscaled) - if self.scale_normal: + if self.scale_minmax: self.total_max = torch.max(unscaled) self.total_min = torch.min(unscaled) self.cantransform = True - def transform(self, unscaled): + def transform(self, unscaled, copy=False): """ Transform data from unscaled to scaled. @@ -278,13 +359,29 @@ def transform(self, unscaled): unscaled : torch.Tensor Real world data. + copy : bool + If False, data is modified in-place. If True, a copy of the + data is modified. Default is False. + Returns ------- scaled : torch.Tensor Scaled data. """ + if len(unscaled.size()) != 2: + raise ValueError( + "MALA DataScaler are designed for 2D-arrays, " + "while a {0}D-array has been provided.".format( + len(unscaled.size()) + ) + ) + + # Backward compatability. + if not hasattr(self, "scale_minmax") and hasattr(self, "scale_normal"): + self.scale_minmax = self.scale_normal + # First we need to find out if we even have to do anything. - if self.scale_standard is False and self.scale_normal is False: + if self.scale_standard is False and self.scale_minmax is False: pass elif self.cantransform is False: @@ -295,6 +392,8 @@ def transform(self, unscaled): # Perform the actual scaling, but use no_grad to make sure # that the next couple of iterations stay untracked. 
+ scaled = unscaled.clone() if copy else unscaled + with torch.no_grad(): if self.feature_wise: @@ -303,12 +402,12 @@ def transform(self, unscaled): ########################## if self.scale_standard: - unscaled -= self.means - unscaled /= self.stds + scaled -= self.means + scaled /= self.stds - if self.scale_normal: - unscaled -= self.mins - unscaled /= self.maxs - self.mins + if self.scale_minmax: + scaled -= self.mins + scaled /= self.maxs - self.mins else: @@ -317,14 +416,16 @@ def transform(self, unscaled): ########################## if self.scale_standard: - unscaled -= self.total_mean - unscaled /= self.total_std + scaled -= self.total_mean + scaled /= self.total_std - if self.scale_normal: - unscaled -= self.total_min - unscaled /= self.total_max - self.total_min + if self.scale_minmax: + scaled -= self.total_min + scaled /= self.total_max - self.total_min - def inverse_transform(self, scaled, as_numpy=False): + return scaled + + def inverse_transform(self, scaled, copy=False, as_numpy=False): """ Transform data from scaled to unscaled. @@ -337,7 +438,11 @@ def inverse_transform(self, scaled, as_numpy=False): Scaled data. as_numpy : bool - If True, a numpy array is returned, otherwsie. + If True, a numpy array is returned, otherwise a torch tensor. + + copy : bool + If False, data is modified in-place. If True, a copy of the + data is modified. Default is False. Returns ------- @@ -345,9 +450,25 @@ def inverse_transform(self, scaled, as_numpy=False): Real world data. """ + if len(scaled.size()) != 2: + raise ValueError( + "MALA DataScaler are designed for 2D-arrays, " + "while a {0}D-array has been provided.".format( + len(scaled.size()) + ) + ) + + # Backward compatability. + if not hasattr(self, "scale_minmax") and hasattr(self, "scale_normal"): + self.scale_minmax = self.scale_normal + + # Perform the actual scaling, but use no_grad to make sure + # that the next couple of iterations stay untracked. + unscaled = scaled.clone() if copy else scaled + # First we need to find out if we even have to do anything. 
- if self.scale_standard is False and self.scale_normal is False: - unscaled = scaled + if self.scale_standard is False and self.scale_minmax is False: + pass else: if self.cantransform is False: @@ -366,12 +487,12 @@ def inverse_transform(self, scaled, as_numpy=False): ########################## if self.scale_standard: - unscaled = (scaled * self.stds) + self.means + unscaled *= self.stds + unscaled += self.means - if self.scale_normal: - unscaled = ( - scaled * (self.maxs - self.mins) - ) + self.mins + if self.scale_minmax: + unscaled *= self.maxs - self.mins + unscaled += self.mins else: @@ -380,13 +501,13 @@ def inverse_transform(self, scaled, as_numpy=False): ########################## if self.scale_standard: - unscaled = (scaled * self.total_std) + self.total_mean + unscaled *= self.total_std + unscaled += self.total_mean + + if self.scale_minmax: + unscaled *= self.total_max - self.total_min + unscaled += self.total_min - if self.scale_normal: - unscaled = ( - scaled * (self.total_max - self.total_min) - ) + self.total_min - # if as_numpy: return unscaled.detach().numpy().astype(np.float64) else: diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 223f51b99..9303b0ee7 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -53,6 +53,7 @@ def __init__( self.descriptor_calculator.parameters.descriptors_contain_xyz = ( False ) + self._data_points_to_remove = None def add_snapshot( self, @@ -133,10 +134,17 @@ def __shuffle_numpy( # if the number of new snapshots is not a divisor of the grid size # then we have to trim the original snapshots to size # the indicies to be removed are selected at random - if self.data_points_to_remove is not None: + if ( + self._data_points_to_remove is not None + and np.sum(self._data_points_to_remove) > 0 + ): if self.parameters.shuffling_seed is not None: np.random.seed(idx * self.parameters.shuffling_seed) - ngrid = descriptor_data[idx].shape[0] + ngrid = ( + descriptor_data[idx].shape[0] + * descriptor_data[idx].shape[1] + * descriptor_data[idx].shape[2] + ) n_descriptor = descriptor_data[idx].shape[-1] n_target = target_data[idx].shape[-1] @@ -146,8 +154,8 @@ def __shuffle_numpy( ) indices = np.random.choice( - ngrid**3, - size=ngrid**3 - self.data_points_to_remove[idx], + ngrid, + size=ngrid - self._data_points_to_remove[idx], ) descriptor_data[idx] = current_descriptor[indices] @@ -532,121 +540,75 @@ def shuffle_snapshots( snapshot_type = snapshot_types.pop() del snapshot_types - snapshot_size_list = [ - snapshot.grid_size - for snapshot in self.parameters.snapshot_directories_list - ] + # Set the defaults, these may be changed below as needed. + snapshot_size_list = np.array( + [ + snapshot.grid_size + for snapshot in self.parameters.snapshot_directories_list + ] + ) number_of_data_points = np.sum(snapshot_size_list) - - self.data_points_to_remove = None - + self._data_points_to_remove = None if number_of_shuffled_snapshots is None: - # If the user does not tell us how many snapshots to use, - # we have to check if the number of snapshots is straightforward. - # If all snapshots have the same size, we can just replicate the - # snapshot structure. - if np.max(snapshot_size_list) == np.min(snapshot_size_list): - shuffle_dimensions = self.parameters.snapshot_directories_list[ - 0 - ].grid_dimension - number_of_new_snapshots = self.nr_snapshots - else: - # If the snapshots have different sizes we simply create - # (x, 1, 1) snapshots big enough to hold the data. 
- number_of_new_snapshots = self.nr_snapshots - while number_of_data_points % number_of_new_snapshots != 0: - number_of_new_snapshots += 1 - # If they do have different sizes, we start with the smallest - # snapshot, there is some padding down below anyhow. - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, + number_of_shuffled_snapshots = self.nr_snapshots + + # Currently, the openPMD interface is not feature-complete. + if snapshot_type == "openpmd" and np.any( + np.array( + [ + snapshot.grid_dimension[0] % number_of_shuffled_snapshots + for snapshot in self.parameters.snapshot_directories_list ] + ) + != 0 + ): + raise ValueError( + "Shuffling from OpenPMD files currently only " + "supported if first dimension of all snapshots " + "can evenly be divided by number of snapshots. " + "Please select a different number of shuffled " + "snapshots or use the numpy interface. " + ) - if snapshot_type == "openpmd": - import math - import functools - - number_of_new_snapshots = functools.reduce( - math.gcd, - [ - snapshot.grid_dimension[0] - for snapshot in self.parameters.snapshot_directories_list - ], - number_of_new_snapshots, - ) - else: - number_of_new_snapshots = number_of_shuffled_snapshots - - if snapshot_type == "openpmd": - import math - import functools - - specified_number_of_new_snapshots = number_of_new_snapshots - number_of_new_snapshots = functools.reduce( - math.gcd, - [ - snapshot.grid_dimension[0] - for snapshot in self.parameters.snapshot_directories_list - ], - number_of_new_snapshots, - ) - if ( - number_of_new_snapshots - != specified_number_of_new_snapshots - ): - print( - f"[openPMD shuffling] Reduced the number of output snapshots to " - f"{number_of_new_snapshots} because of the dataset dimensions." - ) - del specified_number_of_new_snapshots - - if number_of_data_points % number_of_new_snapshots != 0: - if snapshot_type == "numpy": - self.data_points_to_remove = [] - for i in range(0, self.nr_snapshots): - gridsize = self.parameters.snapshot_directories_list[ - i - ].grid_size - shuffled_gridsize = int( - gridsize / number_of_new_snapshots - ) - self.data_points_to_remove.append( - gridsize - - shuffled_gridsize * number_of_new_snapshots - ) - tot_points_missing = sum(self.data_points_to_remove) + shuffled_gridsizes = snapshot_size_list // number_of_shuffled_snapshots - printout( - "Warning: number of requested snapshots is not a divisor of", - "the original grid sizes.\n", - f"{tot_points_missing} / {number_of_data_points} data points", - "will be left out of the shuffled snapshots." - ) + if np.any( + np.array(snapshot_size_list) + - ( + (np.array(snapshot_size_list) // number_of_shuffled_snapshots) + * number_of_shuffled_snapshots + ) + > 0 + ): + number_of_data_points = int( + np.sum(shuffled_gridsizes) * number_of_shuffled_snapshots + ) - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, - ] + self._data_points_to_remove = [] + for i in range(0, self.nr_snapshots): + self._data_points_to_remove.append( + snapshot_size_list[i] + - shuffled_gridsizes[i] * number_of_shuffled_snapshots + ) + tot_points_missing = sum(self._data_points_to_remove) - elif snapshot_type == "openpmd": - # TODO implement arbitrary grid sizes for openpmd - raise Exception( - "Cannot create this number of snapshots " - "from data provided." 
- ) - else: - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, - ] + if tot_points_missing > 0: + printout( + "Warning: number of requested snapshots is not a divisor of", + "the original grid sizes.\n", + f"{tot_points_missing} / {number_of_data_points} data points", + "will be left out of the shuffled snapshots.", + ) + + shuffle_dimensions = [ + int(number_of_data_points / number_of_shuffled_snapshots), + 1, + 1, + ] printout( "Data shuffler will generate", - number_of_new_snapshots, + number_of_shuffled_snapshots, "new snapshots.", ) printout("Shuffled snapshot dimension will be ", shuffle_dimensions) @@ -654,7 +616,7 @@ def shuffle_snapshots( # Prepare permutations. permutations = [] seeds = [] - for i in range(0, number_of_new_snapshots): + for i in range(0, number_of_shuffled_snapshots): # This makes the shuffling deterministic, if specified by the user. if self.parameters.shuffling_seed is not None: np.random.seed(i * self.parameters.shuffling_seed) @@ -664,7 +626,7 @@ def shuffle_snapshots( if snapshot_type == "numpy": self.__shuffle_numpy( - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, descriptor_save_path, save_name, @@ -683,7 +645,7 @@ def shuffle_snapshots( ) self.__shuffle_openpmd( descriptor, - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, save_name, permutations, @@ -699,7 +661,7 @@ def shuffle_snapshots( ) self.__shuffle_openpmd( target, - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, save_name, permutations, diff --git a/mala/datahandling/fast_tensor_dataset.py b/mala/datahandling/fast_tensor_dataset.py index 6b38477d5..50f2679a0 100644 --- a/mala/datahandling/fast_tensor_dataset.py +++ b/mala/datahandling/fast_tensor_dataset.py @@ -10,15 +10,28 @@ class FastTensorDataset(torch.utils.data.Dataset): This version of TensorDataset gathers data using a single call within __getitem__. A bit more tricky to manage but is faster still. + + Parameters + ---------- + batch_size : int + Batch size to be used with this data set. + + tensors : object + Torch tensors for this data set. + + Attributes + ---------- + batch_size : int + Batch size to be used with this data set. """ def __init__(self, batch_size, *tensors): super(FastTensorDataset).__init__() self.batch_size = batch_size - self.tensors = tensors + self._tensors = tensors total_samples = tensors[0].shape[0] - self.indices = np.arange(total_samples) - self.len = total_samples // self.batch_size + self._indices = np.arange(total_samples) + self._len = total_samples // self.batch_size def __getitem__(self, idx): """ @@ -36,16 +49,16 @@ def __getitem__(self, idx): batch : tuple The data tuple for this batch. """ - batch = self.indices[ + batch = self._indices[ idx * self.batch_size : (idx + 1) * self.batch_size ] - rv = tuple(t[batch, ...] for t in self.tensors) + rv = tuple(t[batch, ...] for t in self._tensors) return rv def __len__(self): """Get the length of the data set.""" - return self.len + return self._len def shuffle(self): """Shuffle the data set.""" - np.random.shuffle(self.indices) + np.random.shuffle(self._indices) diff --git a/mala/datahandling/lazy_load_dataset.py b/mala/datahandling/lazy_load_dataset.py index 00810beb3..a5a2b1a50 100644 --- a/mala/datahandling/lazy_load_dataset.py +++ b/mala/datahandling/lazy_load_dataset.py @@ -48,6 +48,17 @@ class LazyLoadDataset(Dataset): input_requires_grad : bool If True, then the gradient is stored for the inputs. 
+ + Attributes + ---------- + currently_loaded_file : int + Index of currently loaded file. + + input_data : torch.Tensor + Input data tensor. + + output_data : torch.Tensor + Output data tensor. """ def __init__( @@ -62,25 +73,22 @@ def __init__( device, input_requires_grad=False, ): - self.snapshot_list = [] - self.input_dimension = input_dimension - self.output_dimension = output_dimension - self.input_data_scaler = input_data_scaler - self.output_data_scaler = output_data_scaler - self.descriptor_calculator = descriptor_calculator - self.target_calculator = target_calculator - self.number_of_snapshots = 0 - self.total_size = 0 - self.descriptors_contain_xyz = ( - self.descriptor_calculator.descriptors_contain_xyz - ) + self._snapshot_list = [] + self._input_dimension = input_dimension + self._output_dimension = output_dimension + self._input_data_scaler = input_data_scaler + self._output_data_scaler = output_data_scaler + self._descriptor_calculator = descriptor_calculator + self._target_calculator = target_calculator + self._number_of_snapshots = 0 + self._total_size = 0 self.currently_loaded_file = None self.input_data = np.empty(0) self.output_data = np.empty(0) - self.use_ddp = use_ddp + self._use_ddp = use_ddp self.return_outputs_directly = False - self.input_requires_grad = input_requires_grad - self.device = device + self._input_requires_grad = input_requires_grad + self._device = device @property def return_outputs_directly(self): @@ -108,9 +116,9 @@ def add_snapshot_to_dataset(self, snapshot: Snapshot): Snapshot that is to be added to this DataSet. """ - self.snapshot_list.append(snapshot) - self.number_of_snapshots += 1 - self.total_size += snapshot.grid_size + self._snapshot_list.append(snapshot) + self._number_of_snapshots += 1 + self._total_size += snapshot.grid_size def mix_datasets(self): """ @@ -118,16 +126,16 @@ def mix_datasets(self): With this, there can be some variance between runs. """ - used_perm = torch.randperm(self.number_of_snapshots) + used_perm = torch.randperm(self._number_of_snapshots) barrier() - if self.use_ddp: - used_perm = used_perm.to(device=self.device) + if self._use_ddp: + used_perm = used_perm.to(device=self._device) dist.broadcast(used_perm, 0) - self.snapshot_list = [ - self.snapshot_list[i] for i in used_perm.to("cpu") + self._snapshot_list = [ + self._snapshot_list[i] for i in used_perm.to("cpu") ] else: - self.snapshot_list = [self.snapshot_list[i] for i in used_perm] + self._snapshot_list = [self._snapshot_list[i] for i in used_perm] self.get_new_data(0) def get_new_data(self, file_index): @@ -140,50 +148,50 @@ def get_new_data(self, file_index): File to be read. """ # Load the data into RAM. 
- if self.snapshot_list[file_index].snapshot_type == "numpy": - self.input_data = self.descriptor_calculator.read_from_numpy_file( + if self._snapshot_list[file_index].snapshot_type == "numpy": + self.input_data = self._descriptor_calculator.read_from_numpy_file( os.path.join( - self.snapshot_list[file_index].input_npy_directory, - self.snapshot_list[file_index].input_npy_file, + self._snapshot_list[file_index].input_npy_directory, + self._snapshot_list[file_index].input_npy_file, ), - units=self.snapshot_list[file_index].input_units, + units=self._snapshot_list[file_index].input_units, ) - self.output_data = self.target_calculator.read_from_numpy_file( + self.output_data = self._target_calculator.read_from_numpy_file( os.path.join( - self.snapshot_list[file_index].output_npy_directory, - self.snapshot_list[file_index].output_npy_file, + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, ), - units=self.snapshot_list[file_index].output_units, + units=self._snapshot_list[file_index].output_units, ) - elif self.snapshot_list[file_index].snapshot_type == "openpmd": + elif self._snapshot_list[file_index].snapshot_type == "openpmd": self.input_data = ( - self.descriptor_calculator.read_from_openpmd_file( + self._descriptor_calculator.read_from_openpmd_file( os.path.join( - self.snapshot_list[file_index].input_npy_directory, - self.snapshot_list[file_index].input_npy_file, + self._snapshot_list[file_index].input_npy_directory, + self._snapshot_list[file_index].input_npy_file, ) ) ) - self.output_data = self.target_calculator.read_from_openpmd_file( + self.output_data = self._target_calculator.read_from_openpmd_file( os.path.join( - self.snapshot_list[file_index].output_npy_directory, - self.snapshot_list[file_index].output_npy_file, + self._snapshot_list[file_index].output_npy_directory, + self._snapshot_list[file_index].output_npy_file, ) ) # Transform the data. self.input_data = self.input_data.reshape( - [self.snapshot_list[file_index].grid_size, self.input_dimension] + [self._snapshot_list[file_index].grid_size, self._input_dimension] ) if self.input_data.dtype != DEFAULT_NP_DATA_DTYPE: self.input_data = self.input_data.astype(DEFAULT_NP_DATA_DTYPE) self.input_data = torch.from_numpy(self.input_data).float() - self.input_data_scaler.transform(self.input_data) - self.input_data.requires_grad = self.input_requires_grad + self._input_data_scaler.transform(self.input_data) + self.input_data.requires_grad = self._input_requires_grad self.output_data = self.output_data.reshape( - [self.snapshot_list[file_index].grid_size, self.output_dimension] + [self._snapshot_list[file_index].grid_size, self._output_dimension] ) if self.return_outputs_directly is False: self.output_data = np.array(self.output_data) @@ -192,7 +200,7 @@ def get_new_data(self, file_index): DEFAULT_NP_DATA_DTYPE ) self.output_data = torch.from_numpy(self.output_data).float() - self.output_data_scaler.transform(self.output_data) + self._output_data_scaler.transform(self.output_data) # Save which data we have currently loaded. self.currently_loaded_file = file_index @@ -201,28 +209,28 @@ def _get_file_index(self, idx, is_slice=False, is_start=False): file_index = None index_in_file = idx if is_slice: - for i in range(len(self.snapshot_list)): - if index_in_file - self.snapshot_list[i].grid_size <= 0: + for i in range(len(self._snapshot_list)): + if index_in_file - self._snapshot_list[i].grid_size <= 0: file_index = i # From the end of previous file to beginning of new. 
if ( - index_in_file == self.snapshot_list[i].grid_size + index_in_file == self._snapshot_list[i].grid_size and is_start ): file_index = i + 1 index_in_file = 0 break else: - index_in_file -= self.snapshot_list[i].grid_size + index_in_file -= self._snapshot_list[i].grid_size return file_index, index_in_file else: - for i in range(len(self.snapshot_list)): - if index_in_file - self.snapshot_list[i].grid_size < 0: + for i in range(len(self._snapshot_list)): + if index_in_file - self._snapshot_list[i].grid_size < 0: file_index = i break else: - index_in_file -= self.snapshot_list[i].grid_size + index_in_file -= self._snapshot_list[i].grid_size return file_index, index_in_file def __getitem__(self, idx): @@ -266,7 +274,7 @@ def __getitem__(self, idx): # the stop index will point to the wrong file. if file_index_start != file_index_stop: if index_in_file_stop == 0: - index_in_file_stop = self.snapshot_list[ + index_in_file_stop = self._snapshot_list[ file_index_stop ].grid_size else: @@ -297,4 +305,4 @@ def __len__(self): length : int Number of data points in DataSet. """ - return self.total_size + return self._total_size diff --git a/mala/datahandling/lazy_load_dataset_single.py b/mala/datahandling/lazy_load_dataset_single.py index 33d7fee87..402d149de 100644 --- a/mala/datahandling/lazy_load_dataset_single.py +++ b/mala/datahandling/lazy_load_dataset_single.py @@ -44,6 +44,56 @@ class LazyLoadDatasetSingle(Dataset): input_requires_grad : bool If True, then the gradient is stored for the inputs. + + Attributes + ---------- + allocated : bool + True if dataset is allocated. + + currently_loaded_file : int + Index of currently loaded file + + descriptor_calculator : mala.descriptors.descriptor.Descriptor + Used to do unit conversion on input data. + + input_data : torch.Tensor + Input data tensor. + + input_dtype : numpy.dtype + Input data type. + + input_shape : list + Input data dimensions + + input_shm_name : str + Name of shared memory allocated for input data + + loaded : bool + True if data has been loaded to shared memory. + + output_data : torch.Tensor + Output data tensor. + + output_dtype : numpy.dtype + Output data dtype. + + output_shape : list + Output data dimensions. + + output_shm_name : str + Name of shared memory allocated for output data. + + return_outputs_directly : bool + + Control whether outputs are actually transformed. + Has to be False for training. In the testing case, + Numerical errors are smaller if set to True. + + snapshot : mala.datahandling.snapshot.Snapshot + Currently loaded snapshot object. + + target_calculator : mala.targets.target.Target or derivative + Used to do unit conversion on output data. 
""" def __init__( @@ -60,27 +110,24 @@ def __init__( input_requires_grad=False, ): self.snapshot = snapshot - self.input_dimension = input_dimension - self.output_dimension = output_dimension - self.input_data_scaler = input_data_scaler - self.output_data_scaler = output_data_scaler + self._input_dimension = input_dimension + self._output_dimension = output_dimension + self._input_data_scaler = input_data_scaler + self._output_data_scaler = output_data_scaler self.descriptor_calculator = descriptor_calculator self.target_calculator = target_calculator - self.number_of_snapshots = 0 - self.total_size = 0 - self.descriptors_contain_xyz = ( - self.descriptor_calculator.descriptors_contain_xyz - ) + self._number_of_snapshots = 0 + self._total_size = 0 self.currently_loaded_file = None self.input_data = np.empty(0) self.output_data = np.empty(0) - self.use_ddp = use_ddp + self._use_ddp = use_ddp self.return_outputs_directly = False - self.input_requires_grad = input_requires_grad + self._input_requires_grad = input_requires_grad - self.batch_size = batch_size - self.len = int(np.ceil(snapshot.grid_size / self.batch_size)) - self.indices = np.arange(snapshot.grid_size) + self._batch_size = batch_size + self._len = int(np.ceil(snapshot.grid_size / self._batch_size)) + self._indices = np.arange(snapshot.grid_size) self.input_shm_name = None self.output_shm_name = None self.loaded = False @@ -197,20 +244,20 @@ def __getitem__(self, idx): output_shm = shared_memory.SharedMemory(name=self.output_shm_name) input_data = np.ndarray( - shape=[self.snapshot.grid_size, self.input_dimension], + shape=[self.snapshot.grid_size, self._input_dimension], dtype=np.float32, buffer=input_shm.buf, ) output_data = np.ndarray( - shape=[self.snapshot.grid_size, self.output_dimension], + shape=[self.snapshot.grid_size, self._output_dimension], dtype=np.float32, buffer=output_shm.buf, ) - if idx == self.len - 1: - batch = self.indices[idx * self.batch_size :] + if idx == self._len - 1: + batch = self._indices[idx * self._batch_size :] else: - batch = self.indices[ - idx * self.batch_size : (idx + 1) * self.batch_size + batch = self._indices[ + idx * self._batch_size : (idx + 1) * self._batch_size ] # print(batch.shape) @@ -219,12 +266,12 @@ def __getitem__(self, idx): # Perform conversion to tensor and perform transforms input_batch = torch.from_numpy(input_batch) - self.input_data_scaler.transform(input_batch) - input_batch.requires_grad = self.input_requires_grad + self._input_data_scaler.transform(input_batch) + input_batch.requires_grad = self._input_requires_grad if self.return_outputs_directly is False: output_batch = torch.from_numpy(output_batch) - self.output_data_scaler.transform(output_batch) + self._output_data_scaler.transform(output_batch) input_shm.close() output_shm.close() @@ -240,7 +287,7 @@ def __len__(self): length : int Number of data points in DataSet. """ - return self.len + return self._len def mix_datasets(self): """ @@ -257,4 +304,4 @@ def mix_datasets(self): avoid erroneously overwriting shared memory data in cases where a single dataset object is used back to back. """ - np.random.shuffle(self.indices) + np.random.shuffle(self._indices) diff --git a/mala/datahandling/ldos_aligner.py b/mala/datahandling/ldos_aligner.py index 892a94fbf..acc712094 100644 --- a/mala/datahandling/ldos_aligner.py +++ b/mala/datahandling/ldos_aligner.py @@ -33,6 +33,11 @@ class LDOSAligner(DataHandlerBase): target_calculator : mala.targets.target.Target Used to do unit conversion on output data. 
If None, then one will be created by this class. + + Attributes + ---------- + ldos_parameters : mala.common.parameters.ParametersTargets + MALA target calculation parameters. """ def __init__( @@ -85,8 +90,6 @@ def add_snapshot( def align_ldos_to_ref( self, - save_path=None, - save_name=None, save_path_ext="aligned/", reference_index=0, zero_tol=1e-5, @@ -96,35 +99,34 @@ def align_ldos_to_ref( n_shift_mse=None, ): """ - Add a snapshot to the data pipeline. + Align LDOS to reference. Parameters ---------- - save_path : string - path to save the aligned LDOS vectors - save_name : string - naming convention for the aligned LDOS vectors save_path_ext : string - additional path for the LDOS vectors (useful if - save_path is left as default None) + Extra path to be added to the input path before saving. + By default, new snapshot files are saved into exactly the + same directory they were read from with exactly the same name. + reference_index : int the snapshot number (in the snapshot directory list) to which all other LDOS vectors are aligned + zero_tol : float the "zero" value for alignment / left side truncation always scaled by norm of reference LDOS mean + left_truncate : bool whether to truncate the zero values on the LHS + right_truncate_value : float right-hand energy value (based on reference LDOS vector) to which truncate LDOS vectors if None, no right-side truncation - egrid_spacing_ev : float - spacing of energy grid - egrid_offset_ev : float - original offset of energy grid + number_of_electrons : float / int if not None, computes the energy shift relative to QE energies + n_shift_mse : int how many energy grid points to consider when aligning LDOS vectors based on mean-squared error @@ -304,10 +306,9 @@ def align_ldos_to_ref( json.dump(ldos_shift_info, f, indent=2) barrier() - + @staticmethod def calc_optimal_ldos_shift( - e_grid, ldos_mean, ldos_mean_ref, left_index, @@ -322,8 +323,6 @@ def calc_optimal_ldos_shift( Parameters ---------- - e_grid : array_like - energy grid ldos_mean : array_like mean of LDOS vector for shifting ldos_mean_ref : array_like diff --git a/mala/datahandling/multi_lazy_load_data_loader.py b/mala/datahandling/multi_lazy_load_data_loader.py index ed0154e32..a9aca6afc 100644 --- a/mala/datahandling/multi_lazy_load_data_loader.py +++ b/mala/datahandling/multi_lazy_load_data_loader.py @@ -20,23 +20,23 @@ class MultiLazyLoadDataLoader: """ def __init__(self, datasets, **kwargs): - self.datasets = datasets - self.loaders = [] + self._datasets = datasets + self._loaders = [] for d in datasets: - self.loaders.append( + self._loaders.append( DataLoader(d, batch_size=None, **kwargs, shuffle=False) ) # Create single process pool for prefetching # Can use ThreadPoolExecutor for debugging. # self.pool = concurrent.futures.ThreadPoolExecutor(1) - self.pool = concurrent.futures.ProcessPoolExecutor(1) + self._pool = concurrent.futures.ProcessPoolExecutor(1) # Allocate shared memory and commence file load for first # dataset in list - dset = self.datasets[0] + dset = self._datasets[0] dset.allocate_shared_mem() - self.load_future = self.pool.submit( + self._load_future = self._pool.submit( self.load_snapshot_to_shm, dset.snapshot, dset.descriptor_calculator, @@ -54,7 +54,7 @@ def __len__(self): length : int Number of datasets/snapshots contained within this loader. """ - return len(self.loaders) + return len(self._loaders) def __iter__(self): """ @@ -66,7 +66,7 @@ def __iter__(self): An iterator over the individual datasets/snapshots in this object. 
""" - self.count = 0 + self._count = 0 return self def __next__(self): @@ -78,25 +78,25 @@ def __next__(self): iterator: DataLoader The next data loader. """ - self.count += 1 - if self.count > len(self.loaders): + self._count += 1 + if self._count > len(self._loaders): raise StopIteration else: # Wait on last prefetch - if self.count - 1 >= 0: - if not self.datasets[self.count - 1].loaded: - self.load_future.result() - self.datasets[self.count - 1].loaded = True + if self._count - 1 >= 0: + if not self._datasets[self._count - 1].loaded: + self._load_future.result() + self._datasets[self._count - 1].loaded = True # Delete last - if self.count - 2 >= 0: - self.datasets[self.count - 2].delete_data() + if self._count - 2 >= 0: + self._datasets[self._count - 2].delete_data() # Prefetch next file (looping around epoch boundary) - dset = self.datasets[self.count % len(self.loaders)] + dset = self._datasets[self._count % len(self._loaders)] if not dset.loaded: dset.allocate_shared_mem() - self.load_future = self.pool.submit( + self._load_future = self._pool.submit( self.load_snapshot_to_shm, dset.snapshot, dset.descriptor_calculator, @@ -106,7 +106,7 @@ def __next__(self): ) # Return current - return self.loaders[self.count - 1] + return self._loaders[self._count - 1] # TODO: Without this function, I get 2 times the number of snapshots # memory leaks after shutdown. With it, I get 1 times the number of @@ -114,9 +114,9 @@ def __next__(self): # enough? I am not sure where the memory leak is coming from. def cleanup(self): """Deallocate arrays still left in memory.""" - for dset in self.datasets: + for dset in self._datasets: dset.deallocate_shared_mem() - self.pool.shutdown() + self._pool.shutdown() # Worker function to load data into shared memory (limited to numpy files # only for now) diff --git a/mala/datahandling/snapshot.py b/mala/datahandling/snapshot.py index 8f6bc4666..1bac8488c 100644 --- a/mala/datahandling/snapshot.py +++ b/mala/datahandling/snapshot.py @@ -43,8 +43,54 @@ class Snapshot(JSONSerializable): - tr: This snapshot will be a training snapshot. - va: This snapshot will be a validation snapshot. - Replaces the old approach of MALA to have a separate list. - Default is None. + Attributes + ---------- + grid_dimensions : list + Grid dimension [x,y,z]. + + grid_size : int + Number of grid points in total. + + input_dimension : int + Input feature dimension. + + output_dimension : int + Output feature dimension + + input_npy_file : string + File with saved numpy input array. + + input_npy_directory : string + Directory containing input_npy_directory. + + output_npy_file : string + File with saved numpy output array. + + output_npy_directory : string + Directory containing output_npy_file. + + input_units : string + Units of input data. See descriptor classes to see which units are + supported. + + output_units : string + Units of output data. See target classes to see which units are + supported. + + calculation_output : string + File with the output of the original snapshot calculation. This is + only needed when testing multiple snapshots. + + snapshot_function : string + "Function" of the snapshot in the MALA workflow. + + - te: This snapshot will be a testing snapshot. + - tr: This snapshot will be a training snapshot. + - va: This snapshot will be a validation snapshot. + + snapshot_type : string + Can be either "numpy" or "openpmd" and denotes which type of files + this snapshot contains. 
""" def __init__( diff --git a/mala/descriptors/atomic_density.py b/mala/descriptors/atomic_density.py index 6c5a7acac..4459c838b 100755 --- a/mala/descriptors/atomic_density.py +++ b/mala/descriptors/atomic_density.py @@ -31,18 +31,12 @@ class AtomicDensity(Descriptor): def __init__(self, parameters): super(AtomicDensity, self).__init__(parameters) - self.verbosity = parameters.verbosity @property def data_name(self): """Get a string that describes the target (for e.g. metadata).""" return "AtomicDensity" - @property - def feature_size(self): - """Get the feature dimension of this data.""" - return self.fingerprint_length - @staticmethod def convert_units(array, in_units="None"): """ @@ -140,7 +134,7 @@ def __calculate_lammps(self, outdir, **kwargs): self.setup_lammps_tmp_files("ggrid", outdir) ase.io.write( - self.lammps_temporary_input, self.atoms, format=lammps_format + self._lammps_temporary_input, self._atoms, format=lammps_format ) nx = self.grid_dimensions[0] @@ -151,7 +145,7 @@ def __calculate_lammps(self, outdir, **kwargs): if self.parameters.atomic_density_sigma is None: self.grid_dimensions = [nx, ny, nz] self.parameters.atomic_density_sigma = self.get_optimal_sigma( - self.voxel + self._voxel ) # Create LAMMPS instance. @@ -213,7 +207,7 @@ def __calculate_lammps(self, outdir, **kwargs): if return_directly: return gaussian_descriptors_np else: - self.fingerprint_length = 4 + self.feature_size = 4 return gaussian_descriptors_np, nrows_ggrid else: # Since the atomic density may be directly fed back into QE @@ -238,10 +232,10 @@ def __calculate_lammps(self, outdir, **kwargs): [2, 1, 0, 3] ) if self.parameters.descriptors_contain_xyz: - self.fingerprint_length = 4 + self.feature_size = 4 return gaussian_descriptors_np[:, :, :, 3:], nx * ny * nz else: - self.fingerprint_length = 1 + self.feature_size = 1 return gaussian_descriptors_np[:, :, :, 6:], nx * ny * nz def __calculate_python(self, **kwargs): @@ -281,7 +275,7 @@ def __calculate_python(self, **kwargs): # This follows the implementation in the LAMMPS code. if self.parameters.atomic_density_sigma is None: self.parameters.atomic_density_sigma = self.get_optimal_sigma( - self.voxel + self._voxel ) cutoff_squared = ( self.parameters.atomic_density_cutoff @@ -329,10 +323,10 @@ def __calculate_python(self, **kwargs): ) if self.parameters.descriptors_contain_xyz: - self.fingerprint_length = 4 + self.feature_size = 4 return gaussian_descriptors_np, np.prod(self.grid_dimensions) else: - self.fingerprint_length = 1 + self.feature_size = 1 return gaussian_descriptors_np[:, :, :, 3:], np.prod( self.grid_dimensions ) diff --git a/mala/descriptors/bispectrum.py b/mala/descriptors/bispectrum.py index 207fac341..ab8bbff7f 100755 --- a/mala/descriptors/bispectrum.py +++ b/mala/descriptors/bispectrum.py @@ -57,11 +57,6 @@ def data_name(self): """Get a string that describes the target (for e.g. 
metadata).""" return "Bispectrum" - @property - def feature_size(self): - """Get the feature dimension of this data.""" - return self.fingerprint_length - @staticmethod def convert_units(array, in_units="None"): """ @@ -144,7 +139,7 @@ def __calculate_lammps(self, outdir, **kwargs): self.setup_lammps_tmp_files("bgrid", outdir) ase.io.write( - self.lammps_temporary_input, self.atoms, format=lammps_format + self._lammps_temporary_input, self._atoms, format=lammps_format ) nx = self.grid_dimensions[0] @@ -190,7 +185,7 @@ def __calculate_lammps(self, outdir, **kwargs): * (self.parameters.bispectrum_twojmax + 4) ) ncoeff = ncoeff // 24 # integer division - self.fingerprint_length = ncols0 + ncoeff + self.feature_size = ncols0 + ncoeff # Extract data from LAMMPS calculation. # This is different for the parallel and the serial case. @@ -210,7 +205,7 @@ def __calculate_lammps(self, outdir, **kwargs): lammps_constants.LMP_STYLE_LOCAL, lammps_constants.LMP_SIZE_COLS, ) - if ncols_local != self.fingerprint_length + 3: + if ncols_local != self.feature_size + 3: raise Exception("Inconsistent number of features.") snap_descriptors_np = extract_compute_np( @@ -235,7 +230,7 @@ def __calculate_lammps(self, outdir, **kwargs): "bgrid", 0, 2, - (nz, ny, nx, self.fingerprint_length), + (nz, ny, nx, self.feature_size), use_fp64=use_fp64, ) @@ -297,13 +292,13 @@ def __calculate_python(self, **kwargs): * (self.parameters.bispectrum_twojmax + 4) ) ncoeff = ncoeff // 24 # integer division - self.fingerprint_length = ncoeff + 3 + self.feature_size = ncoeff + 3 bispectrum_np = np.zeros( ( self.grid_dimensions[0], self.grid_dimensions[1], self.grid_dimensions[2], - self.fingerprint_length, + self.feature_size, ), dtype=np.float64, ) @@ -313,16 +308,16 @@ def __calculate_python(self, **kwargs): # These are technically hyperparameters. We currently simply set them # to set values for everything. - self.rmin0 = 0.0 - self.rfac0 = 0.99363 - self.bzero_flag = False - self.wselfall_flag = False + self._rmin0 = 0.0 + self._rfac0 = 0.99363 + self._bzero_flag = False + self._wselfall_flag = False # Currently not supported - self.bnorm_flag = False + self._bnorm_flag = False # Currently not supported - self.quadraticflag = False - self.number_elements = 1 - self.wself = 1.0 + self._quadraticflag = False + self._python_calculation_number_elements = 1 + self._wself = 1.0 # What follows is the python implementation of the # bispectrum descriptor calculation. 
@@ -496,7 +491,7 @@ def __calculate_python(self, **kwargs): if self.parameters.descriptors_contain_xyz: return bispectrum_np, np.prod(self.grid_dimensions) else: - self.fingerprint_length -= 3 + self.feature_size -= 3 return bispectrum_np[:, :, :, 3:], np.prod(self.grid_dimensions) ######## @@ -902,10 +897,10 @@ def __compute_ui(self, nr_atoms, atoms_cutoff, distances_cutoff, grid): """ # Precompute and prepare ui stuff theta0 = ( - (distances_cutoff - self.rmin0) - * self.rfac0 + (distances_cutoff - self._rmin0) + * self._rfac0 * np.pi - / (self.parameters.bispectrum_cutoff - self.rmin0) + / (self.parameters.bispectrum_cutoff - self._rmin0) ) z0 = np.squeeze(distances_cutoff / np.tan(theta0)) @@ -986,13 +981,14 @@ def __compute_ui(self, nr_atoms, atoms_cutoff, distances_cutoff, grid): sfac += 1.0 else: rcutfac = np.pi / ( - self.parameters.bispectrum_cutoff - self.rmin0 + self.parameters.bispectrum_cutoff - self._rmin0 ) if nr_atoms > 1: sfac = 0.5 * ( - np.cos((distances_cutoff - self.rmin0) * rcutfac) + 1.0 + np.cos((distances_cutoff - self._rmin0) * rcutfac) + + 1.0 ) - sfac[np.where(distances_cutoff <= self.rmin0)] = 1.0 + sfac[np.where(distances_cutoff <= self._rmin0)] = 1.0 sfac[ np.where( distances_cutoff @@ -1000,8 +996,8 @@ def __compute_ui(self, nr_atoms, atoms_cutoff, distances_cutoff, grid): ) ] = 0.0 else: - sfac = 1.0 if distances_cutoff <= self.rmin0 else sfac - sfac = 0.0 if distances_cutoff <= self.rmin0 else sfac + sfac = 1.0 if distances_cutoff <= self._rmin0 else sfac + sfac = 0.0 if distances_cutoff <= self._rmin0 else sfac # sfac technically has to be weighted according to the chemical # species. But this is a minimal implementation only for a single @@ -1099,12 +1095,12 @@ def __compute_bi(self, ulisttot_r, ulisttot_i, zlist_r, zlist_i): itriple = 0 idouble = 0 - if self.bzero_flag: + if self._bzero_flag: wself = 1.0 bzero = np.zeros(self.parameters.bispectrum_twojmax + 1) www = wself * wself * wself for j in range(self.parameters.bispectrum_twojmax + 1): - if self.bnorm_flag: + if self._bnorm_flag: bzero[j] = www else: bzero[j] = www * (j + 1) @@ -1158,8 +1154,8 @@ def __compute_bi(self, ulisttot_r, ulisttot_i, zlist_r, zlist_i): itriple += 1 idouble += 1 - if self.bzero_flag: - if not self.wselfall_flag: + if self._bzero_flag: + if not self._wselfall_flag: itriple = ( ielem * number_elements + ielem ) * number_elements + ielem @@ -1179,9 +1175,9 @@ def __compute_bi(self, ulisttot_r, ulisttot_i, zlist_r, zlist_i): itriple += 1 # Untested & Unoptimized - if self.quadraticflag: + if self._quadraticflag: xyz_length = 3 if self.parameters.descriptors_contain_xyz else 0 - ncount = self.fingerprint_length - xyz_length + ncount = self.feature_size - xyz_length for icoeff in range(ncount): bveci = blist[icoeff] blist[3 + ncount] = 0.5 * bveci * bveci diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py index 17cd9e5b0..041dd4b3f 100644 --- a/mala/descriptors/descriptor.py +++ b/mala/descriptors/descriptor.py @@ -36,6 +36,10 @@ class Descriptor(PhysicalData): parameters : mala.common.parameters.Parameters Parameters object used to create this object. + Attributes + ---------- + parameters: mala.common.parameters.ParametersDescriptors + MALA descriptor calculation parameters. 
""" ############################## @@ -106,17 +110,16 @@ def __getnewargs__(self): def __init__(self, parameters): super(Descriptor, self).__init__(parameters) self.parameters: ParametersDescriptors = parameters.descriptors - self.fingerprint_length = 0 # so iterations will fail - self.verbosity = parameters.verbosity - self.in_format_ase = "" - self.atoms = None - self.voxel = None + self.feature_size = 0 # so iterations will fail + self._in_format_ase = "" + self._atoms = None + self._voxel = None # If we ever have NON LAMMPS descriptors, these parameters have no # meaning anymore and should probably be moved to an intermediate # DescriptorsLAMMPS class, from which the LAMMPS descriptors inherit. - self.lammps_temporary_input = None - self.lammps_temporary_log = None + self._lammps_temporary_input = None + self._lammps_temporary_log = None ############################## # Properties @@ -182,6 +185,15 @@ def convert_units(array, in_units="1/eV"): " descriptor type." ) + @property + def feature_size(self): + """Get the feature dimension of this data.""" + return self._feature_size + + @feature_size.setter + def feature_size(self, value): + self._feature_size = value + @staticmethod def backconvert_units(array, out_units): """ @@ -227,24 +239,24 @@ def setup_lammps_tmp_files(self, lammps_type, outdir): lammps_tmp_input_file = tempfile.NamedTemporaryFile( delete=False, prefix=prefix_inp_str, suffix="_.tmp", dir=outdir ) - self.lammps_temporary_input = lammps_tmp_input_file.name + self._lammps_temporary_input = lammps_tmp_input_file.name lammps_tmp_input_file.close() lammps_tmp_log_file = tempfile.NamedTemporaryFile( delete=False, prefix=prefix_log_str, suffix="_.tmp", dir=outdir ) - self.lammps_temporary_log = lammps_tmp_log_file.name + self._lammps_temporary_log = lammps_tmp_log_file.name lammps_tmp_log_file.close() else: - self.lammps_temporary_input = None - self.lammps_temporary_log = None + self._lammps_temporary_input = None + self._lammps_temporary_log = None if self.parameters._configuration["mpi"]: - self.lammps_temporary_input = get_comm().bcast( - self.lammps_temporary_input, root=0 + self._lammps_temporary_input = get_comm().bcast( + self._lammps_temporary_input, root=0 ) - self.lammps_temporary_log = get_comm().bcast( - self.lammps_temporary_log, root=0 + self._lammps_temporary_log = get_comm().bcast( + self._lammps_temporary_log, root=0 ) # Calculations @@ -328,13 +340,13 @@ def calculate_from_qe_out( (x,y,z,descriptor_dimension) """ - self.in_format_ase = "espresso-out" + self._in_format_ase = "espresso-out" printout("Calculating descriptors from", qe_out_file, min_verbosity=0) # We get the atomic information by using ASE. - self.atoms = ase.io.read(qe_out_file, format=self.in_format_ase) + self._atoms = ase.io.read(qe_out_file, format=self._in_format_ase) # Enforcing / Checking PBC on the read atoms. - self.atoms = self.enforce_pbc(self.atoms) + self._atoms = self.enforce_pbc(self._atoms) # Get the grid dimensions. 
if "grid_dimensions" in kwargs.keys(): @@ -356,10 +368,10 @@ def calculate_from_qe_out( self.grid_dimensions[2] = int(tmp.split(",")[2]) break - self.voxel = self.atoms.cell.copy() - self.voxel[0] = self.voxel[0] / (self.grid_dimensions[0]) - self.voxel[1] = self.voxel[1] / (self.grid_dimensions[1]) - self.voxel[2] = self.voxel[2] / (self.grid_dimensions[2]) + self._voxel = self._atoms.cell.copy() + self._voxel[0] = self._voxel[0] / (self.grid_dimensions[0]) + self._voxel[1] = self._voxel[1] / (self.grid_dimensions[1]) + self._voxel[2] = self._voxel[2] / (self.grid_dimensions[2]) return self._calculate(working_directory, **kwargs) @@ -400,12 +412,12 @@ def calculate_from_atoms( (x,y,z,descriptor_dimension) """ # Enforcing / Checking PBC on the input atoms. - self.atoms = self.enforce_pbc(atoms) + self._atoms = self.enforce_pbc(atoms) self.grid_dimensions = grid_dimensions - self.voxel = self.atoms.cell.copy() - self.voxel[0] = self.voxel[0] / (self.grid_dimensions[0]) - self.voxel[1] = self.voxel[1] / (self.grid_dimensions[1]) - self.voxel[2] = self.voxel[2] / (self.grid_dimensions[2]) + self._voxel = self._atoms.cell.copy() + self._voxel[0] = self._voxel[0] / (self.grid_dimensions[0]) + self._voxel[1] = self._voxel[1] / (self.grid_dimensions[1]) + self._voxel[2] = self._voxel[2] / (self.grid_dimensions[2]) return self._calculate(working_directory, **kwargs) def gather_descriptors(self, descriptors_np, use_pickled_comm=False): @@ -445,7 +457,7 @@ def gather_descriptors(self, descriptors_np, use_pickled_comm=False): sendcounts = np.array( comm.gather(np.shape(descriptors_np)[0], root=0) ) - raw_feature_length = self.fingerprint_length + 3 + raw_feature_length = self.feature_size + 3 if get_rank() == 0: # print("sendcounts: {}, total: {}".format(sendcounts, @@ -490,7 +502,7 @@ def gather_descriptors(self, descriptors_np, use_pickled_comm=False): nx = self.grid_dimensions[0] ny = self.grid_dimensions[1] nz = self.grid_dimensions[2] - descriptors_full = np.zeros([nx, ny, nz, self.fingerprint_length]) + descriptors_full = np.zeros([nx, ny, nz, self.feature_size]) # Fill the full bispectrum descriptors array. for idx, local_grid in enumerate(all_descriptors_list): # We glue the individual cells back together, and transpose. @@ -508,7 +520,7 @@ def gather_descriptors(self, descriptors_np, use_pickled_comm=False): last_z - first_z, last_y - first_y, last_x - first_x, - self.fingerprint_length, + self.feature_size, ], ).transpose( [2, 1, 0, 3] @@ -554,10 +566,10 @@ def convert_local_to_3d(self, descriptors_np): ny = local_reach[1] - local_offset[1] nz = local_reach[2] - local_offset[2] - descriptors_full = np.zeros([nx, ny, nz, self.fingerprint_length]) + descriptors_full = np.zeros([nx, ny, nz, self.feature_size]) descriptors_full[0:nx, 0:ny, 0:nz] = np.reshape( - descriptors_np[:, 3:], [nz, ny, nx, self.fingerprint_length] + descriptors_np[:, 3:], [nz, ny, nx, self.feature_size] ).transpose([2, 1, 0, 3]) return descriptors_full, local_offset, local_reach @@ -580,20 +592,20 @@ def _process_loaded_dimensions(self, array_dimensions): def _set_geometry_info(self, mesh): # Geometry: Save the cell parameters and angles of the grid. 
- if self.atoms is not None: + if self._atoms is not None: import openpmd_api as io - self.voxel = self.atoms.cell.copy() - self.voxel[0] = self.voxel[0] / (self.grid_dimensions[0]) - self.voxel[1] = self.voxel[1] / (self.grid_dimensions[1]) - self.voxel[2] = self.voxel[2] / (self.grid_dimensions[2]) + self._voxel = self._atoms.cell.copy() + self._voxel[0] = self._voxel[0] / (self.grid_dimensions[0]) + self._voxel[1] = self._voxel[1] / (self.grid_dimensions[1]) + self._voxel[2] = self._voxel[2] / (self.grid_dimensions[2]) mesh.geometry = io.Geometry.cartesian - mesh.grid_spacing = self.voxel.cellpar()[0:3] - mesh.set_attribute("angles", self.voxel.cellpar()[3:]) + mesh.grid_spacing = self._voxel.cellpar()[0:3] + mesh.set_attribute("angles", self._voxel.cellpar()[3:]) def _get_atoms(self): - return self.atoms + return self._atoms def _feature_mask(self): if self.descriptors_contain_xyz: @@ -614,9 +626,9 @@ def _setup_lammps(self, nx, ny, nz, lammps_dict): "-screen", "none", "-log", - self.lammps_temporary_log, + self._lammps_temporary_log, ] - lammps_dict["atom_config_fname"] = self.lammps_temporary_input + lammps_dict["atom_config_fname"] = self._lammps_temporary_input if self.parameters._configuration["mpi"]: size = get_size() @@ -833,8 +845,8 @@ def _clean_calculation(self, lmp, keep_logs): lmp.close() if not keep_logs: if get_rank() == 0: - os.remove(self.lammps_temporary_log) - os.remove(self.lammps_temporary_input) + os.remove(self._lammps_temporary_log) + os.remove(self._lammps_temporary_input) def _setup_atom_list(self): """ @@ -847,7 +859,7 @@ def _setup_atom_list(self): FURTHER OPTIMIZATION: Probably not that much, this mostly already uses optimized python functions. """ - if np.any(self.atoms.pbc): + if np.any(self._atoms.pbc): # To determine the list of relevant atoms we first take the edges # of the simulation cell and use them to determine all cells @@ -874,19 +886,19 @@ def _setup_atom_list(self): for edge in edges: edge_point = self._grid_to_coord(edge) neighborlist = NeighborList( - np.zeros(len(self.atoms) + 1) + np.zeros(len(self._atoms) + 1) + [self.parameters.atomic_density_cutoff], bothways=True, self_interaction=False, primitive=NewPrimitiveNeighborList, ) - atoms_with_grid_point = self.atoms.copy() + atoms_with_grid_point = self._atoms.copy() # Construct a ghost atom representing the grid point. atoms_with_grid_point.append(ase.Atom("H", edge_point)) neighborlist.update(atoms_with_grid_point) - indices, offsets = neighborlist.get_neighbors(len(self.atoms)) + indices, offsets = neighborlist.get_neighbors(len(self._atoms)) # Incrementally fill the list containing all cells to be # considered. @@ -911,18 +923,18 @@ def _setup_atom_list(self): # First, instantiate it by filling it will all atoms from all # potentiall relevant cells, as identified above. all_atoms = None - for a in range(0, len(self.atoms)): + for a in range(0, len(self._atoms)): if all_atoms is None: all_atoms = ( - self.atoms.positions[a] - + all_cells @ self.atoms.get_cell() + self._atoms.positions[a] + + all_cells @ self._atoms.get_cell() ) else: all_atoms = np.concatenate( ( all_atoms, - self.atoms.positions[a] - + all_cells @ self.atoms.get_cell(), + self._atoms.positions[a] + + all_cells @ self._atoms.get_cell(), ) ) @@ -975,11 +987,11 @@ def _setup_atom_list(self): :, ] ) - return np.concatenate((all_atoms, self.atoms.positions)) + return np.concatenate((all_atoms, self._atoms.positions)) else: # If no PBC are used, only consider a single cell. 
- return self.atoms.positions + return self._atoms.positions def _grid_to_coord(self, gridpoint): # Convert grid indices to real space grid point. @@ -989,20 +1001,20 @@ def _grid_to_coord(self, gridpoint): # Orthorhombic cells and triclinic ones have # to be treated differently, see domain.cpp - if self.atoms.cell.orthorhombic: - return np.diag(self.voxel) * [i, j, k] + if self._atoms.cell.orthorhombic: + return np.diag(self._voxel) * [i, j, k] else: ret = [0, 0, 0] ret[0] = ( - i / self.grid_dimensions[0] * self.atoms.cell[0, 0] - + j / self.grid_dimensions[1] * self.atoms.cell[1, 0] - + k / self.grid_dimensions[2] * self.atoms.cell[2, 0] + i / self.grid_dimensions[0] * self._atoms.cell[0, 0] + + j / self.grid_dimensions[1] * self._atoms.cell[1, 0] + + k / self.grid_dimensions[2] * self._atoms.cell[2, 0] ) ret[1] = ( - j / self.grid_dimensions[1] * self.atoms.cell[1, 1] - + k / self.grid_dimensions[2] * self.atoms.cell[1, 2] + j / self.grid_dimensions[1] * self._atoms.cell[1, 1] + + k / self.grid_dimensions[2] * self._atoms.cell[1, 2] ) - ret[2] = k / self.grid_dimensions[2] * self.atoms.cell[2, 2] + ret[2] = k / self.grid_dimensions[2] * self._atoms.cell[2, 2] return np.array(ret) @abstractmethod @@ -1010,4 +1022,4 @@ def _calculate(self, outdir, **kwargs): pass def _set_feature_size_from_array(self, array): - self.fingerprint_length = np.shape(array)[-1] + self.feature_size = np.shape(array)[-1] diff --git a/mala/descriptors/minterpy_descriptors.py b/mala/descriptors/minterpy_descriptors.py index 55fd69de4..2d9d52168 100755 --- a/mala/descriptors/minterpy_descriptors.py +++ b/mala/descriptors/minterpy_descriptors.py @@ -1,4 +1,4 @@ -"""Gaussian descriptor class.""" +"""Minterpy descriptor class.""" import os @@ -10,10 +10,14 @@ from mala.descriptors.lammps_utils import extract_compute_np from mala.descriptors.descriptor import Descriptor from mala.descriptors.atomic_density import AtomicDensity +from mala.common.parallelizer import parallel_warn class MinterpyDescriptors(Descriptor): - """Class for calculation and parsing of Gaussian descriptors. + """ + Class for calculation and parsing of Minterpy descriptors. + + Marked for deprecation. Parameters ---------- @@ -23,18 +27,16 @@ class MinterpyDescriptors(Descriptor): def __init__(self, parameters): super(MinterpyDescriptors, self).__init__(parameters) - self.verbosity = parameters.verbosity + parallel_warn( + "Minterpy descriptors will be deprecated starting with MALA v1.4.0", + category=FutureWarning, + ) @property def data_name(self): """Get a string that describes the target (for e.g. metadata).""" return "Minterpy" - @property - def feature_size(self): - """Get the feature dimension of this data.""" - return self.fingerprint_length - @staticmethod def convert_units(array, in_units="None"): """ @@ -149,11 +151,11 @@ def _calculate(self, atoms, outdir, grid_dimensions, **kwargs): ], dtype=np.float64, ) - self.fingerprint_length = ( + self.feature_size = ( len(self.parameters.minterpy_point_list) + coord_length ) - self.fingerprint_length = len(self.parameters.minterpy_point_list) + self.feature_size = len(self.parameters.minterpy_point_list) # Perform one LAMMPS call for each point in the Minterpy point list. 
for idx, point in enumerate(self.parameters.minterpy_point_list): # Shift the atoms in negative direction of the point(s) we actually @@ -166,7 +168,7 @@ def _calculate(self, atoms, outdir, grid_dimensions, **kwargs): self.setup_lammps_tmp_files("minterpy", outdir) ase.io.write( - self.lammps_temporary_input, self.atoms, format=lammps_format + self._lammps_temporary_input, self._atoms, format=lammps_format ) # Create LAMMPS instance. diff --git a/mala/interfaces/ase_calculator.py b/mala/interfaces/ase_calculator.py index 1ccd73d3a..66e548dfe 100644 --- a/mala/interfaces/ase_calculator.py +++ b/mala/interfaces/ase_calculator.py @@ -34,9 +34,21 @@ class MALA(Calculator): the neural network), calculator can access all important data such as temperature, number of electrons, etc. that might not be known simply from the atomic positions. + + predictor : mala.network.predictor.Predictor + A Predictor class object to be used for the underlying MALA + predictions. + + Attributes + ---------- + mala_parameters : mala.common.parameters.Parameters + MALA parameters used for predictions. + + last_energy_contributions : dict + Contains all total energy contributions for the last prediction. """ - implemented_properties = ["energy", "forces"] + implemented_properties = ["energy"] def __init__( self, @@ -55,21 +67,21 @@ def __init__( "The MALA calculator currently only works with the LDOS." ) - self.network: Network = network - self.data_handler: DataHandler = data + self._network: Network = network + self._data_handler: DataHandler = data # Prepare for prediction. if predictor is None: - self.predictor = Predictor( - self.mala_parameters, self.network, self.data_handler + self._predictor = Predictor( + self.mala_parameters, self._network, self._data_handler ) else: - self.predictor = predictor + self._predictor = predictor if reference_data is not None: # Get critical values from a reference file (cutoff, # temperature, etc.) - self.data_handler.target_calculator.read_additional_calculation_data( + self._data_handler.target_calculator.read_additional_calculation_data( reference_data ) @@ -91,6 +103,11 @@ def load_model(cls, run_name, path="./"): path : str Path where the model is saved. + + Returns + ------- + calculator : mala.interfaces.calculator.Calculator + The calculator object. """ parallel_warn( "MALA.load_model() will be deprecated in MALA v1.4.0." @@ -115,6 +132,11 @@ def load_run(cls, run_name, path="./"): path : str Path where the model is saved. + + Returns + ------- + calculator : mala.interfaces.calculator.Calculator + The calculator object. """ loaded_params, loaded_network, new_datahandler, loaded_runner = ( Predictor.load_run(run_name, path=path) @@ -152,10 +174,10 @@ def calculate( Calculator.calculate(self, atoms, properties, system_changes) # Get the LDOS from the NN. - ldos = self.predictor.predict_for_atoms(atoms) + ldos = self._predictor.predict_for_atoms(atoms) # Use the LDOS determined DOS and density to get energy and forces. 
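A minimal usage sketch of the calculator after these changes; the run name, model path and structure file are placeholders. Since forces were dropped from ``implemented_properties``, only the total energy is requested.

.. code-block:: python

    from ase.io import read

    from mala.interfaces.ase_calculator import MALA

    # Load a previously saved MALA run as an ASE calculator.
    calculator = MALA.load_run("be_model", path="./models")

    atoms = read("Be_structure.xyz")
    atoms.calc = calculator

    # Only the total energy is advertised by this calculator.
    print(atoms.get_potential_energy())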
- ldos_calculator: LDOS = self.data_handler.target_calculator + ldos_calculator: LDOS = self._data_handler.target_calculator ldos_calculator.read_from_array(ldos) self.results["energy"] = ldos_calculator.total_energy energy, self.last_energy_contributions = ( @@ -197,19 +219,19 @@ def calculate_properties(self, atoms, properties): if "rdf" in properties: self.results["rdf"] = ( - self.data_handler.target_calculator.get_radial_distribution_function( + self._data_handler.target_calculator.get_radial_distribution_function( atoms ) ) if "tpcf" in properties: self.results["tpcf"] = ( - self.data_handler.target_calculator.get_three_particle_correlation_function( + self._data_handler.target_calculator.get_three_particle_correlation_function( atoms ) ) if "static_structure_factor" in properties: self.results["static_structure_factor"] = ( - self.data_handler.target_calculator.get_static_structure_factor( + self._data_handler.target_calculator.get_static_structure_factor( atoms ) ) @@ -233,6 +255,6 @@ def save_calculator(self, filename, path="./"): Path where the calculator should be saved. """ - self.predictor.save_run( + self._predictor.save_run( filename, path=path, additional_calculation_data=True ) diff --git a/mala/network/__init__.py b/mala/network/__init__.py index eaa50c125..e058688aa 100644 --- a/mala/network/__init__.py +++ b/mala/network/__init__.py @@ -11,6 +11,7 @@ from .hyperparameter_oat import HyperparameterOAT from .hyperparameter_naswot import HyperparameterNASWOT from .hyperparameter_optuna import HyperparameterOptuna -from .hyperparameter_acsd import HyperparameterACSD +from .hyperparameter_descriptor_scoring import HyperparameterDescriptorScoring from .acsd_analyzer import ACSDAnalyzer +from .mutual_information_analyzer import MutualInformationAnalyzer from .runner import Runner diff --git a/mala/network/acsd_analyzer.py b/mala/network/acsd_analyzer.py index b9bcba60a..049a1d824 100644 --- a/mala/network/acsd_analyzer.py +++ b/mala/network/acsd_analyzer.py @@ -1,28 +1,17 @@ """Class for performing a full ACSD analysis.""" -import itertools -import os - import numpy as np from mala.datahandling.data_converter import ( descriptor_input_types, target_input_types, ) -from mala.descriptors.descriptor import Descriptor -from mala.targets.target import Target -from mala.network.hyperparameter import Hyperparameter -from mala.network.hyper_opt import HyperOpt -from mala.common.parallelizer import get_rank, printout -from mala.descriptors.bispectrum import Bispectrum -from mala.descriptors.atomic_density import AtomicDensity -from mala.descriptors.minterpy_descriptors import MinterpyDescriptors - -descriptor_input_types_acsd = descriptor_input_types + ["numpy", "openpmd"] -target_input_types_acsd = target_input_types + ["numpy", "openpmd"] +from mala.network.descriptor_scoring_optimizer import ( + DescriptorScoringOptimizer, +) -class ACSDAnalyzer(HyperOpt): +class ACSDAnalyzer(DescriptorScoringOptimizer): """ Analyzer based on the ACSD analysis. @@ -47,669 +36,23 @@ class ACSDAnalyzer(HyperOpt): def __init__( self, params, target_calculator=None, descriptor_calculator=None ): - super(ACSDAnalyzer, self).__init__(params) - # Calculators used to parse data from compatible files. 
- self.target_calculator = target_calculator - if self.target_calculator is None: - self.target_calculator = Target(params) - self.descriptor_calculator = descriptor_calculator - if self.descriptor_calculator is None: - self.descriptor_calculator = Descriptor(params) - if ( - not isinstance(self.descriptor_calculator, Bispectrum) - and not isinstance(self.descriptor_calculator, AtomicDensity) - and not isinstance(self.descriptor_calculator, MinterpyDescriptors) - ): - raise Exception( - "Cannot calculate ACSD for the selected descriptors." - ) - - # Internal variables. - self.__snapshots = [] - self.__snapshot_description = [] - self.__snapshot_units = [] - - # Filled after the analysis. - self.labels = [] - self.study = [] - self.reduced_study = None - self.internal_hyperparam_list = None - - def add_snapshot( - self, - descriptor_input_type=None, - descriptor_input_path=None, - target_input_type=None, - target_input_path=None, - descriptor_units=None, - target_units=None, - ): - """ - Add a snapshot to be processed. - - Parameters - ---------- - descriptor_input_type : string - Type of descriptor data to be processed. - See mala.datahandling.data_converter.descriptor_input_types - for options. - - descriptor_input_path : string - Path of descriptor data to be processed. - - target_input_type : string - Type of target data to be processed. - See mala.datahandling.data_converter.target_input_types - for options. - - target_input_path : string - Path of target data to be processed. - - descriptor_units : string - Units for descriptor data processing. - - target_units : string - Units for target data processing. - """ - # Check the input. - if descriptor_input_type is not None: - if descriptor_input_path is None: - raise Exception( - "Cannot process descriptor data with no path given." - ) - if descriptor_input_type not in descriptor_input_types_acsd: - raise Exception("Cannot process this type of descriptor data.") - else: - raise Exception("Cannot calculate ACSD without descriptor data.") - - if target_input_type is not None: - if target_input_path is None: - raise Exception( - "Cannot process target data with no path given." - ) - if target_input_type not in target_input_types_acsd: - raise Exception("Cannot process this type of target data.") - else: - raise Exception("Cannot calculate ACSD without target data.") - - # Assign info. - self.__snapshots.append( - {"input": descriptor_input_path, "output": target_input_path} - ) - self.__snapshot_description.append( - {"input": descriptor_input_type, "output": target_input_type} - ) - self.__snapshot_units.append( - {"input": descriptor_units, "output": target_units} - ) - - def add_hyperparameter(self, name, choices): - """ - Add a hyperparameter to the current investigation. - - Parameters - ---------- - name : string - Name of the hyperparameter. Please note that these names always - have to be the same as the parameter names in - ParametersDescriptors. - - choices : - List of possible choices. 
- """ - if name not in [ - "bispectrum_twojmax", - "bispectrum_cutoff", - "atomic_density_sigma", - "atomic_density_cutoff", - "minterpy_cutoff_cube_size", - "minterpy_polynomial_degree", - "minterpy_lp_norm", - ]: - raise Exception("Unkown hyperparameter for ACSD analysis entered.") - - self.params.hyperparameters.hlist.append( - Hyperparameter( - hotype="acsd", - name=name, - choices=choices, - opttype="categorical", - ) + super(ACSDAnalyzer, self).__init__( + params, + target_calculator=target_calculator, + descriptor_calculator=descriptor_calculator, ) - def perform_study( - self, file_based_communication=False, return_plotting=False - ): - """ - Perform the study, i.e. the optimization. + def _update_logging(self, score, index): + if self.best_score is None: + self.best_score = score + self.best_trial_index = index + elif score < self.best_score: + self.best_score = score + self.best_trial_index = index - This is done by sampling different descriptors, calculated with - different hyperparameters and then calculating the ACSD. - """ - # Prepare the hyperparameter lists. - self._construct_hyperparam_list() - hyperparameter_tuples = list( - itertools.product(*self.internal_hyperparam_list) - ) - - # Perform the ACSD analysis separately for each snapshot. - best_acsd = None - best_trial = None - for i in range(0, len(self.__snapshots)): - printout( - "Starting ACSD analysis of snapshot", str(i), min_verbosity=1 - ) - current_list = [] - target = self._load_target( - self.__snapshots[i], - self.__snapshot_description[i], - self.__snapshot_units[i], - file_based_communication, - ) - - for idx, hyperparameter_tuple in enumerate(hyperparameter_tuples): - if isinstance(self.descriptor_calculator, Bispectrum): - self.params.descriptors.bispectrum_cutoff = ( - hyperparameter_tuple[0] - ) - self.params.descriptors.bispectrum_twojmax = ( - hyperparameter_tuple[1] - ) - elif isinstance(self.descriptor_calculator, AtomicDensity): - self.params.descriptors.atomic_density_cutoff = ( - hyperparameter_tuple[0] - ) - self.params.descriptors.atomic_density_sigma = ( - hyperparameter_tuple[1] - ) - elif isinstance( - self.descriptor_calculator, MinterpyDescriptors - ): - self.params.descriptors.atomic_density_cutoff = ( - hyperparameter_tuple[0] - ) - self.params.descriptors.atomic_density_sigma = ( - hyperparameter_tuple[1] - ) - self.params.descriptors.minterpy_cutoff_cube_size = ( - hyperparameter_tuple[2] - ) - self.params.descriptors.minterpy_polynomial_degree = ( - hyperparameter_tuple[3] - ) - self.params.descriptors.minterpy_lp_norm = ( - hyperparameter_tuple[4] - ) - - descriptor = self._calculate_descriptors( - self.__snapshots[i], - self.__snapshot_description[i], - self.__snapshot_units[i], - ) - if get_rank() == 0: - acsd = self._calculate_acsd( - descriptor, - target, - self.params.hyperparameters.acsd_points, - descriptor_vectors_contain_xyz=self.params.descriptors.descriptors_contain_xyz, - ) - if not np.isnan(acsd): - if best_acsd is None: - best_acsd = acsd - best_trial = idx - elif acsd < best_acsd: - best_acsd = acsd - best_trial = idx - current_list.append( - list(hyperparameter_tuple) + [acsd] - ) - else: - current_list.append( - list(hyperparameter_tuple) + [np.inf] - ) - - outstring = "[" - for label_id, label in enumerate(self.labels): - outstring += ( - label + ": " + str(hyperparameter_tuple[label_id]) - ) - if label_id < len(self.labels) - 1: - outstring += ", " - outstring += "]" - best_trial_string = ". No suitable trial found yet." 
- if best_acsd is not None: - best_trial_string = ( - ". Best trial is " - + str(best_trial) - + " with " - + str(best_acsd) - ) - - printout( - "Trial", - idx, - "finished with ACSD=" + str(acsd), - "and parameters:", - outstring + best_trial_string, - min_verbosity=1, - ) - - if get_rank() == 0: - self.study.append(current_list) - - if get_rank() == 0: - self.study = np.mean(self.study, axis=0) - - # TODO: Does this even make sense for the minterpy descriptors? - if return_plotting: - results_to_plot = [] - if len(self.internal_hyperparam_list) == 2: - len_first_dim = len(self.internal_hyperparam_list[0]) - len_second_dim = len(self.internal_hyperparam_list[1]) - for i in range(0, len_first_dim): - results_to_plot.append( - self.study[ - i * len_second_dim : (i + 1) * len_second_dim, - 2:, - ] - ) - - if isinstance(self.descriptor_calculator, Bispectrum): - return results_to_plot, { - "twojmax": self.internal_hyperparam_list[1], - "cutoff": self.internal_hyperparam_list[0], - } - if isinstance(self.descriptor_calculator, AtomicDensity): - return results_to_plot, { - "sigma": self.internal_hyperparam_list[1], - "cutoff": self.internal_hyperparam_list[0], - } - - def set_optimal_parameters(self): - """ - Set the optimal parameters found in the present study. - - The parameters will be written to the parameter object with which the - hyperparameter optimizer was created. - """ - if get_rank() == 0: - minimum_acsd = self.study[np.argmin(self.study[:, -1])] - if len(self.internal_hyperparam_list) == 2: - if isinstance(self.descriptor_calculator, Bispectrum): - self.params.descriptors.bispectrum_cutoff = minimum_acsd[0] - self.params.descriptors.bispectrum_twojmax = int( - minimum_acsd[1] - ) - printout( - "ACSD analysis finished, optimal parameters: ", - ) - printout( - "Bispectrum twojmax: ", - self.params.descriptors.bispectrum_twojmax, - ) - printout( - "Bispectrum cutoff: ", - self.params.descriptors.bispectrum_cutoff, - ) - if isinstance(self.descriptor_calculator, AtomicDensity): - self.params.descriptors.atomic_density_cutoff = ( - minimum_acsd[0] - ) - self.params.descriptors.atomic_density_sigma = ( - minimum_acsd[1] - ) - printout( - "ACSD analysis finished, optimal parameters: ", - ) - printout( - "Atomic density sigma: ", - self.params.descriptors.atomic_density_sigma, - ) - printout( - "Atomic density cutoff: ", - self.params.descriptors.atomic_density_cutoff, - ) - elif len(self.internal_hyperparam_list) == 5: - if isinstance(self.descriptor_calculator, MinterpyDescriptors): - self.params.descriptors.atomic_density_cutoff = ( - minimum_acsd[0] - ) - self.params.descriptors.atomic_density_sigma = ( - minimum_acsd[1] - ) - self.params.descriptors.minterpy_cutoff_cube_size = ( - minimum_acsd[2] - ) - self.params.descriptors.minterpy_polynomial_degree = int( - minimum_acsd[3] - ) - self.params.descriptors.minterpy_lp_norm = int( - minimum_acsd[4] - ) - printout( - "ACSD analysis finished, optimal parameters: ", - ) - printout( - "Atomic density sigma: ", - self.params.descriptors.atomic_density_sigma, - ) - printout( - "Atomic density cutoff: ", - self.params.descriptors.atomic_density_cutoff, - ) - printout( - "Minterpy cube cutoff: ", - self.params.descriptors.minterpy_cutoff_cube_size, - ) - printout( - "Minterpy polynomial degree: ", - self.params.descriptors.minterpy_polynomial_degree, - ) - printout( - "Minterpy LP norm degree: ", - self.params.descriptors.minterpy_lp_norm, - ) - - def _construct_hyperparam_list(self): - if isinstance(self.descriptor_calculator, Bispectrum): - 
if ( - list( - map( - lambda p: "bispectrum_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - first_dim_list = [self.params.descriptors.bispectrum_cutoff] - else: - first_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "bispectrum_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - if ( - list( - map( - lambda p: "bispectrum_twojmax" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - second_dim_list = [self.params.descriptors.bispectrum_twojmax] - else: - second_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "bispectrum_twojmax" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - self.internal_hyperparam_list = [first_dim_list, second_dim_list] - self.labels = ["cutoff", "twojmax"] - - elif isinstance(self.descriptor_calculator, AtomicDensity): - if ( - list( - map( - lambda p: "atomic_density_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - first_dim_list = [ - self.params.descriptors.atomic_density_cutoff - ] - else: - first_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "atomic_density_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - if ( - list( - map( - lambda p: "atomic_density_sigma" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - second_dim_list = [ - self.params.descriptors.atomic_density_sigma - ] - else: - second_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "atomic_density_sigma" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - self.internal_hyperparam_list = [first_dim_list, second_dim_list] - self.labels = ["cutoff", "sigma"] - - elif isinstance(self.descriptor_calculator, MinterpyDescriptors): - if ( - list( - map( - lambda p: "atomic_density_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - first_dim_list = [ - self.params.descriptors.atomic_density_cutoff - ] - else: - first_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "atomic_density_cutoff" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - if ( - list( - map( - lambda p: "atomic_density_sigma" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - second_dim_list = [ - self.params.descriptors.atomic_density_sigma - ] - else: - second_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "atomic_density_sigma" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - if ( - list( - map( - lambda p: "minterpy_cutoff_cube_size" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - third_dim_list = [ - self.params.descriptors.minterpy_cutoff_cube_size - ] - else: - third_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "minterpy_cutoff_cube_size" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - if ( - list( - map( - lambda p: "minterpy_polynomial_degree" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - fourth_dim_list = [ - self.params.descriptors.minterpy_polynomial_degree - ] - else: - fourth_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "minterpy_polynomial_degree" in p.name, - self.params.hyperparameters.hlist, - ) 
- ).index(True) - ].choices - - if ( - list( - map( - lambda p: "minterpy_lp_norm" in p.name, - self.params.hyperparameters.hlist, - ) - ).count(True) - == 0 - ): - fifth_dim_list = [self.params.descriptors.minterpy_lp_norm] - else: - fifth_dim_list = self.params.hyperparameters.hlist[ - list( - map( - lambda p: "minterpy_lp_norm" in p.name, - self.params.hyperparameters.hlist, - ) - ).index(True) - ].choices - - self.internal_hyperparam_list = [ - first_dim_list, - second_dim_list, - third_dim_list, - fourth_dim_list, - fifth_dim_list, - ] - self.labels = [ - "cutoff", - "sigma", - "minterpy_cutoff", - "minterpy_polynomial_degree", - "minterpy_lp_norm", - ] - - else: - raise Exception( - "Unkown descriptor calculator selected. Cannot " - "calculate ACSD." - ) - - def _calculate_descriptors(self, snapshot, description, original_units): - descriptor_calculation_kwargs = {} - tmp_input = None - if description["input"] == "espresso-out": - descriptor_calculation_kwargs["units"] = original_units["input"] - tmp_input, local_size = ( - self.descriptor_calculator.calculate_from_qe_out( - snapshot["input"], **descriptor_calculation_kwargs - ) - ) - - elif description["input"] is None: - # In this case, only the output is processed. - pass - - else: - raise Exception( - "Unknown file extension, cannot convert descriptor" - ) - if self.params.descriptors._configuration["mpi"]: - tmp_input = self.descriptor_calculator.gather_descriptors( - tmp_input - ) - - return tmp_input - - def _load_target( - self, snapshot, description, original_units, file_based_communication - ): - memmap = None - if ( - self.params.descriptors._configuration["mpi"] - and file_based_communication - ): - memmap = "acsd.out.npy_temp" - - target_calculator_kwargs = {} - - # Read the output data - tmp_output = None - if description["output"] == ".cube": - target_calculator_kwargs["units"] = original_units["output"] - target_calculator_kwargs["use_memmap"] = memmap - # If no units are provided we just assume standard units. - tmp_output = self.target_calculator.read_from_cube( - snapshot["output"], **target_calculator_kwargs - ) - - elif description["output"] == ".xsf": - target_calculator_kwargs["units"] = original_units["output"] - target_calculator_kwargs["use_memmap"] = memmap - # If no units are provided we just assume standard units. 
- tmp_output = self.target_calculator.read_from_xsf( - snapshot["output"], **target_calculator_kwargs - ) - - elif description["output"] == "numpy": - if get_rank() == 0: - tmp_output = self.target_calculator.read_from_numpy_file( - snapshot["output"], units=original_units["output"] - ) - - elif description["output"] == "openpmd": - if get_rank() == 0: - tmp_output = self.target_calculator.read_from_numpy_file( - snapshot["output"], units=original_units["output"] - ) - else: - raise Exception("Unknown file extension, cannot convert target") - - if get_rank() == 0: - if ( - self.params.targets._configuration["mpi"] - and file_based_communication - ): - os.remove(memmap) - - return tmp_output + def _get_best_trial(self): + """Determine the best trial as given by this study.""" + return self._study[np.argmin(self._study[:, -1])] @staticmethod def _calculate_cosine_similarities( @@ -796,6 +139,14 @@ def _calculate_cosine_similarities( return np.array(similarity_array) + def _calculate_score(self, descriptor, target): + return self._calculate_acsd( + descriptor, + target, + self.params.hyperparameters.acsd_points, + descriptor_vectors_contain_xyz=self.params.descriptors.descriptors_contain_xyz, + ) + @staticmethod def _calculate_acsd( descriptor_data, @@ -833,10 +184,6 @@ def _calculate_acsd( The average cosine similarity distance. """ - - def distance_between_points(x1, y1, x2, y2): - return np.sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)) - similarity_data = ACSDAnalyzer._calculate_cosine_similarities( descriptor_data, ldos_data, @@ -844,16 +191,10 @@ def distance_between_points(x1, y1, x2, y2): descriptor_vectors_contain_xyz=descriptor_vectors_contain_xyz, ) data_size = np.shape(similarity_data)[0] - distances = [] - for i in range(0, data_size): - distances.append( - distance_between_points( - similarity_data[i, 0], - similarity_data[i, 1], - similarity_data[i, 0], - similarity_data[i, 0], - ) - ) + + # Subtracting LDOS similarities from bispectrum similiarities. + distances = similarity_data[:, 0] - similarity_data[:, 1] + distances = distances.clip(min=0) return np.mean(distances) @staticmethod diff --git a/mala/network/descriptor_scoring_optimizer.py b/mala/network/descriptor_scoring_optimizer.py new file mode 100644 index 000000000..11608f5d3 --- /dev/null +++ b/mala/network/descriptor_scoring_optimizer.py @@ -0,0 +1,554 @@ +"""Base class for ACSD, mutual information and related methods.""" + +from abc import abstractmethod, ABC +import itertools +import os + +import numpy as np + +from mala.datahandling.data_converter import ( + descriptor_input_types, + target_input_types, +) +from mala.descriptors.descriptor import Descriptor +from mala.targets.target import Target +from mala.network.hyperparameter import Hyperparameter +from mala.network.hyper_opt import HyperOpt +from mala.common.parallelizer import get_rank, printout +from mala.descriptors.bispectrum import Bispectrum +from mala.descriptors.atomic_density import AtomicDensity +from mala.descriptors.minterpy_descriptors import MinterpyDescriptors + +descriptor_input_types_descriptor_scoring = descriptor_input_types + [ + "numpy", + "openpmd", +] +target_input_types_descriptor_scoring = target_input_types + [ + "numpy", + "openpmd", +] + + +class DescriptorScoringOptimizer(HyperOpt, ABC): + """ + Base class for all training-free descriptor hyperparameter optimizers. + + These optimizer use alternative metrics ACSD, mutual information, etc. + to tune descriptor hyperparameters. 
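A minimal sketch of how this base class is driven through one of its subclasses, here the ACSD analyzer; the snapshot paths and hyperparameter choices are placeholders, and ``parameters.descriptors.descriptor_type`` is assumed to select the bispectrum descriptor.

.. code-block:: python

    from mala.common.parameters import Parameters
    from mala.network.acsd_analyzer import ACSDAnalyzer

    parameters = Parameters()
    parameters.descriptors.descriptor_type = "Bispectrum"

    analyzer = ACSDAnalyzer(parameters)
    analyzer.add_snapshot(
        descriptor_input_type="espresso-out",
        descriptor_input_path="Be_snapshot.pw.scf.out",
        target_input_type="numpy",
        target_input_path="Be_snapshot.out.npy",
    )
    analyzer.add_hyperparameter("bispectrum_cutoff", [3.0, 4.0, 5.0])
    analyzer.add_hyperparameter("bispectrum_twojmax", [4, 6, 8])

    # Scores every hyperparameter combination and writes the best one back
    # into the Parameters object.
    analyzer.perform_study()
    analyzer.set_optimal_parameters()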
+ + Parameters + ---------- + params : mala.common.parametes.Parameters + Parameters used to create this hyperparameter optimizer. + + descriptor_calculator : mala.descriptors.descriptor.Descriptor + The descriptor calculator used for parsing/converting fingerprint + data. If None, the descriptor calculator will be created by this + object using the parameters provided. Default: None + + target_calculator : mala.targets.target.Target + Target calculator used for parsing/converting target data. If None, + the target calculator will be created by this object using the + parameters provided. Default: None + + Attributes + ---------- + best_score : float + Score associated with best-performing trial. + + best_trial_index : int + Index of best-performing trial + """ + + def __init__( + self, params, target_calculator=None, descriptor_calculator=None + ): + super(DescriptorScoringOptimizer, self).__init__(params) + # Calculators used to parse data from compatible files. + self._target_calculator = target_calculator + if self._target_calculator is None: + self._target_calculator = Target(params) + self._descriptor_calculator = descriptor_calculator + if self._descriptor_calculator is None: + self._descriptor_calculator = Descriptor(params) + if not isinstance( + self._descriptor_calculator, Bispectrum + ) and not isinstance(self._descriptor_calculator, AtomicDensity): + raise Exception("Unsupported descriptor type selected.") + + # Internal variables. + self._snapshots = [] + self._snapshot_description = [] + self._snapshot_units = [] + + # Filled after the analysis. + self._labels = [] + self._study = [] + self._reduced_study = None + self._internal_hyperparam_list = None + + # Logging metrics. + self.best_score = None + self.best_trial_index = None + + def add_snapshot( + self, + descriptor_input_type=None, + descriptor_input_path=None, + target_input_type=None, + target_input_path=None, + descriptor_units=None, + target_units=None, + ): + """ + Add a snapshot to be processed. + + Parameters + ---------- + descriptor_input_type : string + Type of descriptor data to be processed. + See mala.datahandling.data_converter.descriptor_input_types + for options. + + descriptor_input_path : string + Path of descriptor data to be processed. + + target_input_type : string + Type of target data to be processed. + See mala.datahandling.data_converter.target_input_types + for options. + + target_input_path : string + Path of target data to be processed. + + descriptor_units : string + Units for descriptor data processing. + + target_units : string + Units for target data processing. + """ + # Check the input. + if descriptor_input_type is not None: + if descriptor_input_path is None: + raise Exception( + "Cannot process descriptor data with no path given." + ) + if ( + descriptor_input_type + not in descriptor_input_types_descriptor_scoring + ): + raise Exception("Cannot process this type of descriptor data.") + else: + raise Exception( + "Cannot calculate scoring metrics without descriptor data." + ) + + if target_input_type is not None: + if target_input_path is None: + raise Exception( + "Cannot process target data with no path given." + ) + if target_input_type not in target_input_types_descriptor_scoring: + raise Exception("Cannot process this type of target data.") + else: + raise Exception( + "Cannot calculate scoring metrics without target data." + ) + + # Assign info. 
+ self._snapshots.append( + {"input": descriptor_input_path, "output": target_input_path} + ) + self._snapshot_description.append( + {"input": descriptor_input_type, "output": target_input_type} + ) + self._snapshot_units.append( + {"input": descriptor_units, "output": target_units} + ) + + def add_hyperparameter(self, name, choices): + """ + Add a hyperparameter to the current investigation. + + Parameters + ---------- + name : string + Name of the hyperparameter. Please note that these names always + have to be the same as the parameter names in + ParametersDescriptors. + + choices : + List of possible choices. + """ + if name not in [ + "bispectrum_twojmax", + "bispectrum_cutoff", + "atomic_density_sigma", + "atomic_density_cutoff", + ]: + raise Exception( + "Unkown hyperparameter for training free descriptor" + "hyperparameter optimization entered." + ) + + self.params.hyperparameters.hlist.append( + Hyperparameter( + hotype="descriptor_scoring", + name=name, + choices=choices, + opttype="categorical", + ) + ) + + def perform_study( + self, file_based_communication=False, return_plotting=False + ): + """ + Perform the study, i.e. the optimization. + + This is done by sampling different descriptors, calculated with + different hyperparameters and then calculating some surrogate score. + """ + # Prepare the hyperparameter lists. + self._construct_hyperparam_list() + hyperparameter_tuples = list( + itertools.product(*self._internal_hyperparam_list) + ) + + # Perform the descriptor scoring analysis separately for each snapshot. + for i in range(0, len(self._snapshots)): + self.best_trial_index = None + self.best_score = None + printout( + "Starting descriptor scoring analysis of snapshot", + str(i), + min_verbosity=1, + ) + current_list = [] + target = self._load_target( + self._snapshots[i], + self._snapshot_description[i], + self._snapshot_units[i], + file_based_communication, + ) + + for idx, hyperparameter_tuple in enumerate(hyperparameter_tuples): + if isinstance(self._descriptor_calculator, Bispectrum): + self.params.descriptors.bispectrum_cutoff = ( + hyperparameter_tuple[0] + ) + self.params.descriptors.bispectrum_twojmax = ( + hyperparameter_tuple[1] + ) + elif isinstance(self._descriptor_calculator, AtomicDensity): + self.params.descriptors.atomic_density_cutoff = ( + hyperparameter_tuple[0] + ) + self.params.descriptors.atomic_density_sigma = ( + hyperparameter_tuple[1] + ) + + descriptor = self._calculate_descriptors( + self._snapshots[i], + self._snapshot_description[i], + self._snapshot_units[i], + ) + if get_rank() == 0: + score = self._calculate_score( + descriptor, + target, + ) + if not np.isnan(score): + self._update_logging(score, idx) + current_list.append( + list(hyperparameter_tuple) + [score] + ) + else: + current_list.append( + list(hyperparameter_tuple) + [np.inf] + ) + + outstring = "[" + for label_id, label in enumerate(self._labels): + outstring += ( + label + ": " + str(hyperparameter_tuple[label_id]) + ) + if label_id < len(self._labels) - 1: + outstring += ", " + outstring += "]" + best_trial_string = ". No suitable trial found yet." + if self.best_score is not None: + best_trial_string = ( + ". 
Best trial is " + + str(self.best_trial_index) + + " with " + + str(self.best_score) + ) + + printout( + "Trial", + idx, + "finished with score=" + str(score), + "and parameters:", + outstring + best_trial_string, + min_verbosity=1, + ) + + if get_rank() == 0: + self._study.append(current_list) + + if get_rank() == 0: + self._study = np.mean(self._study, axis=0) + + if return_plotting: + results_to_plot = [] + if len(self._internal_hyperparam_list) == 2: + len_first_dim = len(self._internal_hyperparam_list[0]) + len_second_dim = len(self._internal_hyperparam_list[1]) + for i in range(0, len_first_dim): + results_to_plot.append( + self._study[ + i * len_second_dim : (i + 1) * len_second_dim, + 2:, + ] + ) + + if isinstance(self._descriptor_calculator, Bispectrum): + return results_to_plot, { + "twojmax": self._internal_hyperparam_list[1], + "cutoff": self._internal_hyperparam_list[0], + } + if isinstance(self._descriptor_calculator, AtomicDensity): + return results_to_plot, { + "sigma": self._internal_hyperparam_list[1], + "cutoff": self._internal_hyperparam_list[0], + } + + def set_optimal_parameters(self): + """ + Set optimal parameters. + + This function will write the determined hyperparameters directly to + MALA parameters object referenced in this class. + """ + if get_rank() == 0: + best_trial = self._get_best_trial() + minimum_score = self._study[np.argmin(self._study[:, -1])] + if isinstance(self._descriptor_calculator, Bispectrum): + self.params.descriptors.bispectrum_cutoff = best_trial[0] + self.params.descriptors.bispectrum_twojmax = int(best_trial[1]) + printout( + "Descriptor scoring analysis finished, optimal parameters: ", + ) + printout( + "Bispectrum twojmax: ", + self.params.descriptors.bispectrum_twojmax, + ) + printout( + "Bispectrum cutoff: ", + self.params.descriptors.bispectrum_cutoff, + ) + if isinstance(self._descriptor_calculator, AtomicDensity): + self.params.descriptors.atomic_density_cutoff = best_trial[0] + self.params.descriptors.atomic_density_sigma = best_trial[1] + printout( + "Descriptor scoring analysis finished, optimal parameters: ", + ) + printout( + "Atomic density sigma: ", + self.params.descriptors.atomic_density_sigma, + ) + printout( + "Atomic density cutoff: ", + self.params.descriptors.atomic_density_cutoff, + ) + + @abstractmethod + def _get_best_trial(self): + """Determine the best trial as given by this study.""" + pass + + def _construct_hyperparam_list(self): + if isinstance(self._descriptor_calculator, Bispectrum): + if ( + list( + map( + lambda p: "bispectrum_cutoff" in p.name, + self.params.hyperparameters.hlist, + ) + ).count(True) + == 0 + ): + first_dim_list = [self.params.descriptors.bispectrum_cutoff] + else: + first_dim_list = self.params.hyperparameters.hlist[ + list( + map( + lambda p: "bispectrum_cutoff" in p.name, + self.params.hyperparameters.hlist, + ) + ).index(True) + ].choices + + if ( + list( + map( + lambda p: "bispectrum_twojmax" in p.name, + self.params.hyperparameters.hlist, + ) + ).count(True) + == 0 + ): + second_dim_list = [self.params.descriptors.bispectrum_twojmax] + else: + second_dim_list = self.params.hyperparameters.hlist[ + list( + map( + lambda p: "bispectrum_twojmax" in p.name, + self.params.hyperparameters.hlist, + ) + ).index(True) + ].choices + + self._internal_hyperparam_list = [first_dim_list, second_dim_list] + self._labels = ["cutoff", "twojmax"] + + elif isinstance(self._descriptor_calculator, AtomicDensity): + if ( + list( + map( + lambda p: "atomic_density_cutoff" in p.name, + 
self.params.hyperparameters.hlist, + ) + ).count(True) + == 0 + ): + first_dim_list = [ + self.params.descriptors.atomic_density_cutoff + ] + else: + first_dim_list = self.params.hyperparameters.hlist[ + list( + map( + lambda p: "atomic_density_cutoff" in p.name, + self.params.hyperparameters.hlist, + ) + ).index(True) + ].choices + + if ( + list( + map( + lambda p: "atomic_density_sigma" in p.name, + self.params.hyperparameters.hlist, + ) + ).count(True) + == 0 + ): + second_dim_list = [ + self.params.descriptors.atomic_density_sigma + ] + else: + second_dim_list = self.params.hyperparameters.hlist[ + list( + map( + lambda p: "atomic_density_sigma" in p.name, + self.params.hyperparameters.hlist, + ) + ).index(True) + ].choices + self._internal_hyperparam_list = [first_dim_list, second_dim_list] + self._labels = ["cutoff", "sigma"] + + else: + raise Exception( + "Unkown descriptor calculator selected. Cannot " + "perform descriptor scoring optimization." + ) + + def _calculate_descriptors(self, snapshot, description, original_units): + descriptor_calculation_kwargs = {} + tmp_input = None + if description["input"] == "espresso-out": + descriptor_calculation_kwargs["units"] = original_units["input"] + tmp_input, local_size = ( + self._descriptor_calculator.calculate_from_qe_out( + snapshot["input"], **descriptor_calculation_kwargs + ) + ) + + elif description["input"] is None: + # In this case, only the output is processed. + pass + + else: + raise Exception( + "Unknown file extension, cannot convert descriptor" + ) + if self.params.descriptors._configuration["mpi"]: + tmp_input = self._descriptor_calculator.gather_descriptors( + tmp_input + ) + + return tmp_input + + def _load_target( + self, snapshot, description, original_units, file_based_communication + ): + memmap = None + if ( + self.params.descriptors._configuration["mpi"] + and file_based_communication + ): + memmap = "descriptor_scoring.out.npy_temp" + + target_calculator_kwargs = {} + + # Read the output data + tmp_output = None + if description["output"] == ".cube": + target_calculator_kwargs["units"] = original_units["output"] + target_calculator_kwargs["use_memmap"] = memmap + # If no units are provided we just assume standard units. + tmp_output = self._target_calculator.read_from_cube( + snapshot["output"], **target_calculator_kwargs + ) + + elif description["output"] == ".xsf": + target_calculator_kwargs["units"] = original_units["output"] + target_calculator_kwargs["use_memmap"] = memmap + # If no units are provided we just assume standard units. 
+ tmp_output = self._target_calculator.read_from_xsf( + snapshot["output"], **target_calculator_kwargs + ) + + elif description["output"] == "numpy": + if get_rank() == 0: + tmp_output = self._target_calculator.read_from_numpy_file( + snapshot["output"], units=original_units["output"] + ) + + elif description["output"] == "openpmd": + if get_rank() == 0: + tmp_output = self._target_calculator.read_from_numpy_file( + snapshot["output"], units=original_units["output"] + ) + else: + raise Exception("Unknown file extension, cannot convert target") + + if get_rank() == 0: + if ( + self.params.targets._configuration["mpi"] + and file_based_communication + ): + os.remove(memmap) + + return tmp_output + + @abstractmethod + def _update_logging(self, score, index): + pass + + @abstractmethod + def _calculate_score(self, descriptor, target): + pass diff --git a/mala/network/hyper_opt.py b/mala/network/hyper_opt.py index c26e93a81..2311c2ad1 100644 --- a/mala/network/hyper_opt.py +++ b/mala/network/hyper_opt.py @@ -24,6 +24,11 @@ class HyperOpt(ABC): use_pkl_checkpoints : bool If true, .pkl checkpoints will be created. + + Attributes + ---------- + params : mala.common.parametes.Parameters + MALA Parameters object. """ def __new__(cls, params: Parameters, data=None, use_pkl_checkpoints=False): @@ -73,9 +78,9 @@ def __init__( self, params: Parameters, data=None, use_pkl_checkpoints=False ): self.params: Parameters = params - self.data_handler = data - self.objective = ObjectiveBase(self.params, self.data_handler) - self.use_pkl_checkpoints = use_pkl_checkpoints + self._data_handler = data + self._objective = ObjectiveBase(self.params, self._data_handler) + self._use_pkl_checkpoints = use_pkl_checkpoints def add_hyperparameter( self, opttype="float", name="", low=0, high=0, choices=None @@ -153,7 +158,7 @@ def set_parameters(self, trial): The parameters will be written to the parameter object with which the hyperparameter optimizer was created. """ - self.objective.parse_trial(trial) + self._objective.parse_trial(trial) def _save_params_and_scaler(self): # Saving the Scalers is straight forward. @@ -163,12 +168,12 @@ def _save_params_and_scaler(self): oscaler_name = ( self.params.hyperparameters.checkpoint_name + "_oscaler.pkl" ) - self.data_handler.input_data_scaler.save(iscaler_name) - self.data_handler.output_data_scaler.save(oscaler_name) + self._data_handler.input_data_scaler.save(iscaler_name) + self._data_handler.output_data_scaler.save(oscaler_name) # For the parameters we have to make sure we choose the correct # format. - if self.use_pkl_checkpoints: + if self._use_pkl_checkpoints: param_name = ( self.params.hyperparameters.checkpoint_name + "_params.pkl" ) @@ -198,7 +203,6 @@ def checkpoint_exists(cls, checkpoint_name, use_pkl_checkpoints=False): ------- checkpoint_exists : bool True if the checkpoint exists, False otherwise. 
- """ iscaler_name = checkpoint_name + "_iscaler.pkl" oscaler_name = checkpoint_name + "_oscaler.pkl" diff --git a/mala/network/hyper_opt_naswot.py b/mala/network/hyper_opt_naswot.py index 9a11e1ca0..0d57bedbf 100644 --- a/mala/network/hyper_opt_naswot.py +++ b/mala/network/hyper_opt_naswot.py @@ -2,8 +2,9 @@ import itertools -import optuna +from functools import cached_property import numpy as np +import optuna from mala.common.parallelizer import ( printout, @@ -11,6 +12,7 @@ get_size, get_comm, barrier, + parallel_warn, ) from mala.network.hyper_opt import HyperOpt from mala.network.objective_naswot import ObjectiveNASWOT @@ -33,11 +35,10 @@ class HyperOptNASWOT(HyperOpt): def __init__(self, params, data): super(HyperOptNASWOT, self).__init__(params, data) - self.objective = None - self.trial_losses = None - self.best_trial = None - self.trial_list = None - self.ignored_hyperparameters = [ + self._objective = None + self._trial_losses = None + self._trial_list = None + self._ignored_hyperparameters = [ "learning_rate", "optimizer", "mini_batch_size", @@ -47,8 +48,68 @@ def __init__(self, params, data): ] # For parallelization. - self.first_trial = None - self.last_trial = None + self._first_trial = None + self._last_trial = None + + @property + def best_trial_index(self): + """ + Get the index and loss of best trial determined in this NASWOT run. + + This property is read only, and will be recomputed. + + Returns + ------- + best_trial_index : list + A list containing [0] the best trial index and [1] the best + trial loss. + """ + if self._trial_losses is None: + parallel_warn( + "Trial list is not yet computed, cannot determine " + "best trial." + ) + return [-1, np.inf] + + if self.params.use_mpi: + comm = get_comm() + local_result = np.array( + [ + float(np.argmax(self._trial_losses) + self._first_trial), + np.max(self._trial_losses), + ] + ) + all_results = comm.allgather(local_result) + max_on_node = np.argmax(np.array(all_results)[:, 1]) + return [ + int(all_results[max_on_node][0]), + all_results[max_on_node][1], + ] + else: + return [np.argmax(self._trial_losses), np.max(self._trial_losses)] + + @best_trial_index.setter + def best_trial_index(self, value): + pass + + @property + def best_trial(self): + """ + Get the best trial determined in this NASWOT run. + + This property is read only, and will be recomputed. + """ + if self._trial_losses is None: + parallel_warn( + "Trial list is not yet computed, cannot determine " + "best trial." + ) + return None + return self._trial_list[self.best_trial_index[0]] + + @best_trial.setter + def best_trial(self, value): + pass def perform_study(self, trial_list=None): """ @@ -62,6 +123,11 @@ def perform_study(self, trial_list=None): ---------- trial_list : list A list containing trials from either HyperOptOptuna or HyperOptOAT. + + Returns + ------- + best_trial_loss : float + Loss of the best trial. """ # The minibatch size can not vary in the analysis. # This check ensures that e.g. optuna results can be used. @@ -76,29 +142,29 @@ def perform_study(self, trial_list=None): # Ideally, this type of HO is called with a list of trials for which # the parameter has to be identified. - self.trial_list = trial_list - if self.trial_list is None: + self._trial_list = trial_list + if self._trial_list is None: printout( "No trial list provided, one will be created using all " "possible permutations of hyperparameters. 
" "The following hyperparameters will be ignored:", min_verbosity=0, ) - printout(self.ignored_hyperparameters) + printout(self._ignored_hyperparameters) # Please note for the parallel case: The trial list returned # here is deterministic. - self.trial_list = self.__all_combinations() + self._trial_list = self.__all_combinations() if self.params.use_mpi: trials_per_rank = int( - np.floor((len(self.trial_list) / get_size())) + np.floor((len(self._trial_list) / get_size())) ) - self.first_trial = get_rank() * trials_per_rank - self.last_trial = (get_rank() + 1) * trials_per_rank + self._first_trial = get_rank() * trials_per_rank + self._last_trial = (get_rank() + 1) * trials_per_rank if get_size() == get_rank() + 1: - trials_per_rank += len(self.trial_list) % get_size() - self.last_trial += len(self.trial_list) % get_size() + trials_per_rank += len(self._trial_list) % get_size() + self._last_trial += len(self._trial_list) % get_size() # We currently do not support checkpointing in parallel mode # for performance reasons. @@ -109,78 +175,58 @@ def perform_study(self, trial_list=None): ) self.params.hyperparameters.checkpoints_each_trial = 0 else: - self.first_trial = 0 - self.last_trial = len(self.trial_list) + self._first_trial = 0 + self._last_trial = len(self._trial_list) # TODO: For now. Needs some refinements later. if isinstance( - self.trial_list[0], optuna.trial.FrozenTrial - ) or isinstance(self.trial_list[0], optuna.trial.FixedTrial): + self._trial_list[0], optuna.trial.FrozenTrial + ) or isinstance(self._trial_list[0], optuna.trial.FixedTrial): trial_type = "optuna" else: trial_type = "oat" - self.objective = ObjectiveNASWOT( - self.params, self.data_handler, trial_type + self._objective = ObjectiveNASWOT( + self.params, self._data_handler, trial_type ) printout( "Starting NASWOT hyperparameter optimization,", - len(self.trial_list), + len(self._trial_list), "trials will be performed.", min_verbosity=0, ) - self.trial_losses = [] + self._trial_losses = [] for idx, row in enumerate( - self.trial_list[self.first_trial : self.last_trial] + self._trial_list[self._first_trial : self._last_trial] ): - trial_loss = self.objective(row) - self.trial_losses.append(trial_loss) + trial_loss = self._objective(row) + self._trial_losses.append(trial_loss) # Output diagnostic information. if self.params.use_mpi: print( "Trial number", - idx + self.first_trial, + idx + self._first_trial, "finished with:", - self.trial_losses[idx], + self._trial_losses[idx], ) else: - best_trial = self.get_best_trial_results() printout( "Trial number", idx, "finished with:", - self.trial_losses[idx], + self._trial_losses[idx], ", best is trial", - best_trial[0], + self.best_trial_index[0], "with", - best_trial[1], + self.best_trial_index[1], min_verbosity=0, ) barrier() # Return the best loss value we could achieve. 
- return self.get_best_trial_results()[1] - - def get_best_trial_results(self): - """Get the best trial out of the list, including the value.""" - if self.params.use_mpi: - comm = get_comm() - local_result = np.array( - [ - float(np.argmax(self.trial_losses) + self.first_trial), - np.max(self.trial_losses), - ] - ) - all_results = comm.allgather(local_result) - max_on_node = np.argmax(np.array(all_results)[:, 1]) - return [ - int(all_results[max_on_node][0]), - all_results[max_on_node][1], - ] - else: - return [np.argmax(self.trial_losses), np.max(self.trial_losses)] + return self.best_trial_index[1] def set_optimal_parameters(self): """ @@ -189,29 +235,13 @@ def set_optimal_parameters(self): The parameters will be written to the parameter object with which the hyperparameter optimizer was created. """ - # Getting the best trial based on the test errors - if self.params.use_mpi: - comm = get_comm() - local_result = np.array( - [ - float(np.argmax(self.trial_losses) + self.first_trial), - np.max(self.trial_losses), - ] - ) - all_results = comm.allgather(local_result) - max_on_node = np.argmax(np.array(all_results)[:, 1]) - idx = int(all_results[max_on_node][0]) - else: - idx = self.trial_losses.index(max(self.trial_losses)) - - self.best_trial = self.trial_list[idx] - self.objective.parse_trial(self.best_trial) + self._objective.parse_trial(self.best_trial) def __all_combinations(self): # First, remove all the hyperparameters we don't actually need. indices_to_remove = [] for idx, par in enumerate(self.params.hyperparameters.hlist): - if par.name in self.ignored_hyperparameters: + if par.name in self._ignored_hyperparameters: indices_to_remove.append(idx) for index in sorted(indices_to_remove, reverse=True): del self.params.hyperparameters.hlist[index] diff --git a/mala/network/hyper_opt_oat.py b/mala/network/hyper_opt_oat.py index 4fcf85808..4642320db 100644 --- a/mala/network/hyper_opt_oat.py +++ b/mala/network/hyper_opt_oat.py @@ -14,7 +14,7 @@ from mala.network.hyper_opt import HyperOpt from mala.network.objective_base import ObjectiveBase from mala.network.hyperparameter_oat import HyperparameterOAT -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, parallel_warn class HyperOptOAT(HyperOpt): @@ -38,22 +38,55 @@ def __init__(self, params, data, use_pkl_checkpoints=False): super(HyperOptOAT, self).__init__( params, data, use_pkl_checkpoints=use_pkl_checkpoints ) - self.objective = None - self.optimal_params = None - self.checkpoint_counter = 0 + self._objective = None + self._optimal_params = None + self._checkpoint_counter = 0 # Related to the OA itself. - self.importance = None - self.n_factors = None - self.factor_levels = None - self.strength = None - self.N_runs = None + self._importance = None + self._n_factors = None + self._factor_levels = None + self._strength = None + self._N_runs = None self.__OA = None # Tracking the trial progress. - self.sorted_num_choices = [] - self.current_trial = 0 - self.trial_losses = None + self._sorted_num_choices = [] + self._current_trial = 0 + self._trial_losses = None + + @property + def best_trial_index(self): + """ + Get the index and loss of best trial determined in this NASWOT run. + + This property is read only, and will be recomputed. + + Returns + ------- + best_trial_index : list + A list containing [0] the best trial index and [1] the best + trial loss. + """ + if self._trial_losses is None: + parallel_warn( + "Trial list is not yet computed, cannot determine " + "best trial." 
+ ) + return [-1, np.inf] + + if self.params.hyperparameters.direction == "minimize": + return [np.argmin(self._trial_losses), np.min(self._trial_losses)] + elif self.params.hyperparameters.direction == "maximize": + return [np.argmax(self._trial_losses), np.max(self._trial_losses)] + else: + raise Exception( + "Invalid direction for hyperparameter optimization selected." + ) + + @best_trial_index.setter + def best_trial_index(self, value): + pass def add_hyperparameter( self, opttype="categorical", name="", choices=None, **kwargs @@ -70,15 +103,15 @@ def add_hyperparameter( Datatype of the hyperparameter. Follows optuna's naming conventions, but currently only supports "categorical" (a list). """ - if not self.sorted_num_choices: # if empty + if not self._sorted_num_choices: # if empty super(HyperOptOAT, self).add_hyperparameter( opttype=opttype, name=name, choices=choices ) - self.sorted_num_choices.append(len(choices)) + self._sorted_num_choices.append(len(choices)) else: - index = bisect(self.sorted_num_choices, len(choices)) - self.sorted_num_choices.insert(index, len(choices)) + index = bisect(self._sorted_num_choices, len(choices)) + self._sorted_num_choices.insert(index, len(choices)) self.params.hyperparameters.hlist.insert( index, HyperparameterOAT(opttype=opttype, name=name, choices=choices), @@ -88,50 +121,50 @@ def perform_study(self): """ Perform the study, i.e. the optimization. - Uses Optunas TPE sampler. + Internally constructs an orthogonal array and performs trial NN + trainings based on it. """ if self.__OA is None: - self.__OA = self.get_orthogonal_array() + self.__OA = self._get_orthogonal_array() print(self.__OA) - if self.trial_losses is None: - self.trial_losses = np.zeros(self.__OA.shape[0]) + float("inf") + if self._trial_losses is None: + self._trial_losses = np.zeros(self.__OA.shape[0]) + float("inf") printout( "Performing", - self.N_runs, + self._N_runs, "trials, starting with trial number", - self.current_trial, + self._current_trial, min_verbosity=0, ) # The parameters could have changed. - self.objective = ObjectiveBase(self.params, self.data_handler) + self._objective = ObjectiveBase(self.params, self._data_handler) # Iterate over the OA and perform the trials. - for i in range(self.current_trial, self.N_runs): + for i in range(self._current_trial, self._N_runs): row = self.__OA[i] - self.trial_losses[self.current_trial] = self.objective(row) + self._trial_losses[self._current_trial] = self._objective(row) # Output diagnostic information. - best_trial = self.get_best_trial_results() printout( "Trial number", - self.current_trial, + self._current_trial, "finished with:", - self.trial_losses[self.current_trial], + self._trial_losses[self._current_trial], ", best is trial", - best_trial[0], + self.best_trial_index[0], "with", - best_trial[1], + self.best_trial_index[1], min_verbosity=0, ) - self.current_trial += 1 + self._current_trial += 1 self.__create_checkpointing(row) # Perform Range Analysis - self.get_optimal_parameters() + self._range_analysis() - def get_optimal_parameters(self): + def _range_analysis(self): """ Find the optimal set of hyperparameters by doing range analysis. 
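
The range analysis performed by ``_range_analysis`` (hunk below) averages the
recorded trial losses over every level of each factor, picks the level with the
lowest mean loss as the optimal choice, and uses the spread of those means to
rank the hyperparameters by importance. A minimal, self-contained sketch of
that idea, using a hypothetical two-factor orthogonal array and made-up losses
rather than MALA API calls:

.. code-block:: python

    import numpy as np

    # Hypothetical 4-run orthogonal array (two factors, two levels each)
    # and the loss observed for each run.
    oa = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    losses = np.array([0.30, 0.25, 0.45, 0.40])

    level_means = []
    for factor in range(oa.shape[1]):
        means = [losses[oa[:, factor] == level].mean() for level in (0, 1)]
        level_means.append(means)

    # Optimal level per factor = level with the smallest mean loss.
    optimal_levels = [int(np.argmin(m)) for m in level_means]
    # Importance ranking = spread (max - min) of the level means, ascending.
    importance = np.argsort([max(m) - min(m) for m in level_means])
    print(optimal_levels, importance)  # [0, 1] [1 0]
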
@@ -143,15 +176,15 @@ def indices(idx, val): return np.where(self.__OA[:, idx] == val)[0] R = [ - [self.trial_losses[indices(idx, l)].sum() for l in range(levels)] - for (idx, levels) in enumerate(self.factor_levels) + [self._trial_losses[indices(idx, l)].sum() for l in range(levels)] + for (idx, levels) in enumerate(self._factor_levels) ] A = [[i / len(j) for i in j] for j in R] # Taking loss as objective to minimise - self.optimal_params = np.array([i.index(min(i)) for i in A]) - self.importance = np.argsort([max(i) - min(i) for i in A]) + self._optimal_params = np.array([i.index(min(i)) for i in A]) + self._importance = np.argsort([max(i) - min(i) for i in A]) def show_order_of_importance(self): """Print the order of importance of the hyperparameters.""" @@ -159,7 +192,7 @@ def show_order_of_importance(self): printout( *[ self.params.hyperparameters.hlist[idx].name - for idx in self.importance + for idx in self._importance ], sep=" < ", min_verbosity=0 @@ -172,23 +205,23 @@ def set_optimal_parameters(self): The parameters will be written to the parameter object with which the hyperparameter optimizer was created. """ - self.objective.parse_trial_oat(self.optimal_params) + self._objective.parse_trial_oat(self._optimal_params) - def get_orthogonal_array(self): + def _get_orthogonal_array(self): """ Generate the best OA used for optimal hyperparameter sampling. This is function is taken from the example notebook of OApackage. """ self.__check_factor_levels() - print("Sorted factor levels:", self.sorted_num_choices) - self.n_factors = len(self.params.hyperparameters.hlist) + print("Sorted factor levels:", self._sorted_num_choices) + self._n_factors = len(self.params.hyperparameters.hlist) - self.factor_levels = [ + self._factor_levels = [ par.num_choices for par in self.params.hyperparameters.hlist ] - self.strength = 2 + self._strength = 2 arraylist = None # This is a little bit hacky. @@ -200,11 +233,14 @@ def get_orthogonal_array(self): # holds. x is unknown, but we can be confident that it should be # small. So simply trying 3 time should be fine for now. for i in range(1, 4): - self.N_runs = self.number_of_runs() * i - print("Trying run size:", self.N_runs) + self._N_runs = self._number_of_runs() * i + print("Trying run size:", self._N_runs) print("Generating Suitable Orthogonal Array.") arrayclass = oa.arraydata_t( - self.factor_levels, self.N_runs, self.strength, self.n_factors + self._factor_levels, + self._N_runs, + self._strength, + self._n_factors, ) arraylist = [arrayclass.create_root()] @@ -212,7 +248,7 @@ def get_orthogonal_array(self): options = oa.OAextend() options.setAlgorithmAuto(arrayclass) - for _ in range(self.strength + 1, self.n_factors + 1): + for _ in range(self._strength + 1, self._n_factors + 1): arraylist_extensions = oa.extend_arraylist( arraylist, arrayclass, options ) @@ -231,7 +267,7 @@ def get_orthogonal_array(self): else: return np.unique(np.array(arraylist[0]), axis=0) - def number_of_runs(self): + def _number_of_runs(self): """ Calculate the minimum number of runs required for an Orthogonal array. 
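
``_number_of_runs`` (continued in the hunk below) takes the product of the
factor levels for every strength-2 combination of hyperparameters and returns
the least common multiple of those products, which is the minimum run size a
strength-2 orthogonal array must have. A short worked illustration with
hypothetical factor levels:

.. code-block:: python

    import itertools
    import numpy as np

    factor_levels = [4, 3, 2]   # hypothetical number of choices per hyperparameter
    strength = 2

    runs = [
        int(np.prod(tt))
        for tt in itertools.combinations(factor_levels, strength)
    ]
    minimum_runs = int(np.lcm.reduce(runs))
    print(runs, minimum_runs)  # [12, 8, 6] 24

``_get_orthogonal_array`` then tries this run size as well as two and three
times it while searching for a constructible array.
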
@@ -241,29 +277,20 @@ def number_of_runs(self): """ runs = [ np.prod(tt) - for tt in itertools.combinations(self.factor_levels, self.strength) + for tt in itertools.combinations( + self._factor_levels, self._strength + ) ] N = np.lcm.reduce(runs) return int(N) - def get_best_trial_results(self): - """Get the best trial out of the list, including the value.""" - if self.params.hyperparameters.direction == "minimize": - return [np.argmin(self.trial_losses), np.min(self.trial_losses)] - elif self.params.hyperparameters.direction == "maximize": - return [np.argmax(self.trial_losses), np.max(self.trial_losses)] - else: - raise Exception( - "Invalid direction for hyperparameter optimization selected." - ) - def __check_factor_levels(self): """Check that the factors are in a decreasing order.""" - dx = np.diff(self.sorted_num_choices) + dx = np.diff(self._sorted_num_choices) if np.all(dx >= 0): # Factors in increasing order, we have to reverse the order. - self.sorted_num_choices.reverse() + self._sorted_num_choices.reverse() self.params.hyperparameters.hlist.reverse() elif np.all(dx <= 0): # Factors are in decreasing order, we don't have to do anything. @@ -348,31 +375,33 @@ def load_from_file(cls, params, file_path, data): with open(file_path, "rb") as handle: loaded_tracking_data = pickle.load(handle) loaded_hyperopt = HyperOptOAT(params, data) - loaded_hyperopt.sorted_num_choices = loaded_tracking_data[ + loaded_hyperopt._sorted_num_choices = loaded_tracking_data[ "sorted_num_choices" ] - loaded_hyperopt.current_trial = loaded_tracking_data[ + loaded_hyperopt._current_trial = loaded_tracking_data[ "current_trial" ] - loaded_hyperopt.trial_losses = loaded_tracking_data["trial_losses"] - loaded_hyperopt.importance = loaded_tracking_data["importance"] - loaded_hyperopt.n_factors = loaded_tracking_data["n_factors"] - loaded_hyperopt.factor_levels = loaded_tracking_data[ + loaded_hyperopt._trial_losses = loaded_tracking_data[ + "trial_losses" + ] + loaded_hyperopt._importance = loaded_tracking_data["importance"] + loaded_hyperopt._n_factors = loaded_tracking_data["n_factors"] + loaded_hyperopt._factor_levels = loaded_tracking_data[ "factor_levels" ] - loaded_hyperopt.strength = loaded_tracking_data["strength"] - loaded_hyperopt.N_runs = loaded_tracking_data["N_runs"] + loaded_hyperopt._strength = loaded_tracking_data["strength"] + loaded_hyperopt._N_runs = loaded_tracking_data["N_runs"] loaded_hyperopt.__OA = loaded_tracking_data["OA"] return loaded_hyperopt def __create_checkpointing(self, trial): """Create a checkpoint of optuna study, if necessary.""" - self.checkpoint_counter += 1 + self._checkpoint_counter += 1 need_to_checkpoint = False if ( - self.checkpoint_counter + self._checkpoint_counter >= self.params.hyperparameters.checkpoints_each_trial and self.params.hyperparameters.checkpoints_each_trial > 0 ): @@ -386,12 +415,12 @@ def __create_checkpointing(self, trial): ) if ( self.params.hyperparameters.checkpoints_each_trial < 0 - and np.argmin(self.trial_losses) == self.current_trial - 1 + and np.argmin(self._trial_losses) == self._current_trial - 1 ): need_to_checkpoint = True printout( "Best trial is " - + str(self.current_trial - 1) + + str(self._current_trial - 1) + ", creating a " "checkpoint for it.", min_verbosity=1, @@ -399,16 +428,10 @@ def __create_checkpointing(self, trial): if need_to_checkpoint is True: # We need to create a checkpoint! - self.checkpoint_counter = 0 + self._checkpoint_counter = 0 self._save_params_and_scaler() - # Next, we save all the other objects. 
- # Here some horovod stuff would have to go. - # But so far, the optuna implementation is not horovod-ready... - # if self.params.use_horovod: - # if hvd.rank() != 0: - # return # The study only has to be saved if the no RDB storage is used. if self.params.hyperparameters.rdb_storage is None: hyperopt_name = ( @@ -417,14 +440,14 @@ def __create_checkpointing(self, trial): ) study = { - "sorted_num_choices": self.sorted_num_choices, - "current_trial": self.current_trial, - "trial_losses": self.trial_losses, - "importance": self.importance, - "n_factors": self.n_factors, - "factor_levels": self.factor_levels, - "strength": self.strength, - "N_runs": self.N_runs, + "sorted_num_choices": self._sorted_num_choices, + "current_trial": self._current_trial, + "trial_losses": self._trial_losses, + "importance": self._importance, + "n_factors": self._n_factors, + "factor_levels": self._factor_levels, + "strength": self._strength, + "N_runs": self._N_runs, "OA": self.__OA, } with open(hyperopt_name, "wb") as handle: diff --git a/mala/network/hyper_opt_optuna.py b/mala/network/hyper_opt_optuna.py index 173ed4cec..623c7415c 100644 --- a/mala/network/hyper_opt_optuna.py +++ b/mala/network/hyper_opt_optuna.py @@ -25,6 +25,18 @@ class HyperOptOptuna(HyperOpt): use_pkl_checkpoints : bool If true, .pkl checkpoints will be created. + + Attributes + ---------- + params : mala.common.parameters.Parameters + MALA Parameters object. + + objective : mala.network.objective_base.ObjectiveBase + MALA objective to be optimized, i.e., a MALA NN model training. + + study : optuna.study.Study + An Optuna study used to collect the results of the hyperparameter + optimization. """ def __init__(self, params, data, use_pkl_checkpoints=False): @@ -93,7 +105,7 @@ def __init__(self, params, data, use_pkl_checkpoints=False): load_if_exists=True, pruner=pruner, ) - self.checkpoint_counter = 0 + self._checkpoint_counter = 0 def perform_study(self): """ @@ -101,9 +113,14 @@ def perform_study(self): This is done by sampling a certain subset of network architectures. In this case, optuna is used. + + Returns + ------- + best_trial_loss : float + Loss of the best trial. """ # The parameters could have changed. - self.objective = ObjectiveBase(self.params, self.data_handler) + self.objective = ObjectiveBase(self.params, self._data_handler) # Fill callback list based on user checkpoint wishes. callback_list = [self.__check_stopping] @@ -131,6 +148,8 @@ def get_trials_from_study(self): """ Return the trials from the last study. + Only returns completed trials. + Returns ------- last_trials: list @@ -350,11 +369,11 @@ def __check_stopping(self, study, trial): def __create_checkpointing(self, study, trial): """Create a checkpoint of optuna study, if necessary.""" - self.checkpoint_counter += 1 + self._checkpoint_counter += 1 need_to_checkpoint = False if ( - self.checkpoint_counter + self._checkpoint_counter >= self.params.hyperparameters.checkpoints_each_trial and self.params.hyperparameters.checkpoints_each_trial > 0 ): @@ -380,16 +399,10 @@ def __create_checkpointing(self, study, trial): if need_to_checkpoint is True: # We need to create a checkpoint! - self.checkpoint_counter = 0 + self._checkpoint_counter = 0 self._save_params_and_scaler() - # Next, we save all the other objects. - # Here some horovod stuff would have to go. - # But so far, the optuna implementation is not horovod-ready... - # if self.params.use_horovod: - # if hvd.rank() != 0: - # return # The study only has to be saved if the no RDB storage is used. 
if self.params.hyperparameters.rdb_storage is None: hyperopt_name = ( diff --git a/mala/network/hyperparameter.py b/mala/network/hyperparameter.py index b951c85a5..17f0111ab 100644 --- a/mala/network/hyperparameter.py +++ b/mala/network/hyperparameter.py @@ -44,10 +44,33 @@ class Hyperparameter(JSONSerializable): choices : list List of possible choices (for categorical parameter). - Returns - ------- - hyperparameter : HyperparameterOptuna or HyperparameterOAT or HyperparameterNASWOT or HyperparameterACSD - Hyperparameter in desired format. + Attributes + ---------- + opttype : string + Datatype of the hyperparameter. Follows optunas naming convetions. + In principle supported are: + + - float + - int + - categorical (list) + + Float and int are not available for OA based approaches at the + moment. + + name : string + Name of the hyperparameter. Please note that these names always + have to be distinct; if you e.g. want to investigate multiple + layer sizes use e.g. ff_neurons_layer_001, ff_neurons_layer_002, + etc. as names. + + low : float or int + Lower bound for numerical parameter. + + high : float or int + Higher bound for numerical parameter. + + choices : list + List of possible choices (for categorical parameter). """ def __new__( @@ -135,12 +158,12 @@ def __new__( hparam = HyperparameterOAT( hotype=hotype, opttype=opttype, name=name, choices=choices ) - if hotype == "acsd": - from mala.network.hyperparameter_acsd import ( - HyperparameterACSD, + if hotype == "descriptor_scoring": + from mala.network.hyperparameter_descriptor_scoring import ( + HyperparameterDescriptorScoring, ) - hparam = HyperparameterACSD( + hparam = HyperparameterDescriptorScoring( hotype=hotype, opttype=opttype, name=name, diff --git a/mala/network/hyperparameter_acsd.py b/mala/network/hyperparameter_descriptor_scoring.py similarity index 92% rename from mala/network/hyperparameter_acsd.py rename to mala/network/hyperparameter_descriptor_scoring.py index 6ecee0e76..34428f1d6 100644 --- a/mala/network/hyperparameter_acsd.py +++ b/mala/network/hyperparameter_descriptor_scoring.py @@ -3,7 +3,7 @@ from mala.network.hyperparameter import Hyperparameter -class HyperparameterACSD(Hyperparameter): +class HyperparameterDescriptorScoring(Hyperparameter): """Represents an optuna parameter. Parameters @@ -44,7 +44,7 @@ def __init__( high=0, choices=None, ): - super(HyperparameterACSD, self).__init__( + super(HyperparameterDescriptorScoring, self).__init__( opttype=opttype, name=name, low=low, high=high, choices=choices ) diff --git a/mala/network/mutual_information_analyzer.py b/mala/network/mutual_information_analyzer.py new file mode 100644 index 000000000..f563bd648 --- /dev/null +++ b/mala/network/mutual_information_analyzer.py @@ -0,0 +1,213 @@ +"""Class for performing a full mutual information analysis.""" + +import numpy as np + + +from mala.common.parallelizer import parallel_warn +from mala.network.descriptor_scoring_optimizer import ( + DescriptorScoringOptimizer, +) + + +class MutualInformationAnalyzer(DescriptorScoringOptimizer): + """ + Analyzer based on mutual information analysis. + + Parameters + ---------- + params : mala.common.parametes.Parameters + Parameters used to create this hyperparameter optimizer. + + descriptor_calculator : mala.descriptors.descriptor.Descriptor + The descriptor calculator used for parsing/converting fingerprint + data. If None, the descriptor calculator will be created by this + object using the parameters provided. 
Default: None + + target_calculator : mala.targets.target.Target + Target calculator used for parsing/converting target data. If None, + the target calculator will be created by this object using the + parameters provided. Default: None + """ + + def __init__( + self, params, target_calculator=None, descriptor_calculator=None + ): + parallel_warn( + "The MutualInformationAnalyzer is still in its " + "experimental stage. The API is consistent with " + "MALA hyperparameter optimization and will likely not " + "change, but the internal algorithm may be subject " + "to changes in the near-future." + ) + super(MutualInformationAnalyzer, self).__init__( + params, + target_calculator=target_calculator, + descriptor_calculator=descriptor_calculator, + ) + + def _get_best_trial(self): + """Determine the best trial as given by this study.""" + return self._study[np.argmax(self._study[:, -1])] + + def _update_logging(self, score, index): + if self.best_score is None: + self.best_score = score + self.best_trial_index = index + elif score > self.best_score: + self.best_score = score + self.best_trial_index = index + + def _calculate_score(self, descriptor, target): + return self._calculate_mutual_information( + descriptor, + target, + self.params.hyperparameters.mutual_information_points, + descriptor_vectors_contain_xyz=self.params.descriptors.descriptors_contain_xyz, + ) + + @staticmethod + def _calculate_mutual_information( + descriptor_data, + ldos_data, + n_samples, + descriptor_vectors_contain_xyz=True, + ): + """ + Calculate the Mutual Information for given descriptor and LDOS data. + + Mutual Information measures how well the descriptors capture the + relevant information that is needed to predict the LDOS. + The unit of MI is bits. + + Parameters + ---------- + descriptor_data : numpy.ndarray + Array containing the descriptors. + + ldos_data : numpy.ndarray + Array containing the LDOS. + + n_samples : int + The number of points for which to calculate the mutual information. + + Returns + ------- + mi : float + The mutual information between the descriptor and the LDOS in bits. + + """ + descriptor_dim = np.shape(descriptor_data) + ldos_dim = np.shape(ldos_data) + if len(descriptor_dim) == 4: + descriptor_data = np.reshape( + descriptor_data, + ( + descriptor_dim[0] * descriptor_dim[1] * descriptor_dim[2], + descriptor_dim[3], + ), + ) + if descriptor_vectors_contain_xyz: + descriptor_data = descriptor_data[:, 3:] + elif len(descriptor_dim) != 2: + raise Exception("Cannot work with this descriptor data.") + + if len(ldos_dim) == 4: + ldos_data = np.reshape( + ldos_data, + (ldos_dim[0] * ldos_dim[1] * ldos_dim[2], ldos_dim[3]), + ) + elif len(ldos_dim) != 2: + raise Exception("Cannot work with this LDOS data.") + + # The hyperparameters could be put potentially into the params. 
+ mi = MutualInformationAnalyzer._mutual_information( + descriptor_data, + ldos_data, + n_components=None, + n_samples=n_samples, + covariance_type="diag", + normalize_data=True, + ) + return mi + + @staticmethod + def _normalize(data): + mean = np.mean(data, axis=0) + std = np.std(data, axis=0) + std_nonzero = std > 1e-6 + data = data[:, std_nonzero] + mean = mean[std_nonzero] + std = std[std_nonzero] + data = (data - mean) / std + return data + + @staticmethod + def _mutual_information( + X, + Y, + n_components=None, + max_iter=1000, + n_samples=100000, + covariance_type="diag", + normalize_data=False, + ): + import sklearn.mixture + import sklearn.covariance + + assert ( + covariance_type == "diag" + ), "Only support covariance_type='diag' for now" + n = X.shape[0] + dim_X = X.shape[-1] + rand_subset = np.random.permutation(n)[:n_samples] + if normalize_data: + X = MutualInformationAnalyzer._normalize(X) + Y = MutualInformationAnalyzer._normalize(Y) + X = X[rand_subset] + Y = Y[rand_subset] + XY = np.concatenate([X, Y], axis=1) + d = XY.shape[-1] + if n_components is None: + n_components = d // 2 + gmm_XY = sklearn.mixture.GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + max_iter=max_iter, + ) + gmm_XY.fit(XY) + + gmm_X = sklearn.mixture.GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + max_iter=max_iter, + ) + gmm_X.weights_ = gmm_XY.weights_ + gmm_X.means_ = gmm_XY.means_[:, :dim_X] + gmm_X.covariances_ = gmm_XY.covariances_[:, :dim_X] + gmm_X.precisions_ = gmm_XY.precisions_[:, :dim_X] + gmm_X.precisions_cholesky_ = gmm_XY.precisions_cholesky_[:, :dim_X] + + gmm_Y = sklearn.mixture.GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + max_iter=max_iter, + ) + gmm_Y.weights_ = gmm_XY.weights_ + gmm_Y.means_ = gmm_XY.means_[:, dim_X:] + gmm_Y.covariances_ = gmm_XY.covariances_[:, dim_X:] + gmm_Y.precisions_ = gmm_XY.precisions_[:, dim_X:] + gmm_Y.precisions_cholesky_ = gmm_XY.precisions_cholesky_[:, dim_X:] + + rand_perm = np.random.permutation(Y.shape[0]) + Y_perm = Y[rand_perm] + XY_perm = np.concatenate([X, Y_perm], axis=1) + temp = ( + gmm_XY.score_samples(XY_perm) + - gmm_X.score_samples(X) + - gmm_Y.score_samples(Y_perm) + ) + temp_exp = np.exp(temp) + mi = np.mean(temp_exp * temp) + # change log base e to log base 2 + mi = mi / np.log(2) + return mi diff --git a/mala/network/network.py b/mala/network/network.py index 3835702b9..6a9a5dbe1 100644 --- a/mala/network/network.py +++ b/mala/network/network.py @@ -8,7 +8,7 @@ import torch.nn.functional as functional from mala.common.parameters import Parameters -from mala.common.parallelizer import printout +from mala.common.parallelizer import printout, parallel_warn class Network(nn.Module): @@ -23,6 +23,23 @@ class Network(nn.Module): ---------- params : mala.common.parametes.Parameters Parameters used to create this neural network. + + Attributes + ---------- + loss_func : function + Loss function. + + mini_batch_size : int + Size of mini batches propagated through network. + + number_of_layers : int + Number of NN layers. + + params : mala.common.parametes.ParametersNetwork + MALA neural network parameters. + + use_ddp : bool + If True, the torch distributed data parallel formalism will be used. """ def __new__(cls, params: Parameters): @@ -78,7 +95,7 @@ def __init__(self, params: Parameters): super(Network, self).__init__() # Mappings for parsing of the activation layers. 
- self.activation_mappings = { + self._activation_mappings = { "Sigmoid": nn.Sigmoid, "ReLU": nn.ReLU, "LeakyReLU": nn.LeakyReLU, @@ -96,12 +113,19 @@ def __init__(self, params: Parameters): @abstractmethod def forward(self, inputs): - """Abstract method. To be implemented by the derived class.""" + """ + Abstract method. To be implemented by the derived class. + + Parameters + ---------- + inputs : torch.Tensor + Torch tensor to be propagated. + """ pass def do_prediction(self, array): """ - Predict the output values for an input array.. + Predict the output values for an input array. Interface to do predictions. The data put in here is assumed to be a scaled torch.Tensor and in the right units. Be aware that this will @@ -143,8 +167,6 @@ def calculate_loss(self, output, target): """ return self.loss_func(output, target) - # FIXME: This guarentees downwards compatibility, but it is ugly. - # Rather enforce the right package versions in the repo. def save_network(self, path_to_file): """ Save the network. @@ -238,13 +260,13 @@ def __init__(self, params): try: if use_only_one_activation_type: self.layers.append( - self.activation_mappings[ + self._activation_mappings[ self.params.layer_activations[0] ]() ) else: self.layers.append( - self.activation_mappings[ + self._activation_mappings[ self.params.layer_activations[i] ]() ) @@ -281,6 +303,11 @@ class LSTM(Network): # was passed to be used in the entire network. def __init__(self, params): super(LSTM, self).__init__(params) + parallel_warn( + "The LSTM class will be deprecated in MALA v1.4.0.", + 0, + category=FutureWarning, + ) self.hidden_dim = self.params.layer_sizes[-1] @@ -312,7 +339,7 @@ def __init__(self, params): self.params.num_hidden_layers, batch_first=True, ) - self.activation = self.activation_mappings[ + self.activation = self._activation_mappings[ self.params.layer_activations[0] ]() @@ -417,6 +444,11 @@ class GRU(LSTM): # layer as GRU. def __init__(self, params): Network.__init__(self, params) + parallel_warn( + "The GRU class will be deprecated in MALA v1.4.0.", + 0, + category=FutureWarning, + ) self.hidden_dim = self.params.layer_sizes[-1] @@ -445,7 +477,7 @@ def __init__(self, params): self.params.num_hidden_layers, batch_first=True, ) - self.activation = self.activation_mappings[ + self.activation = self._activation_mappings[ self.params.layer_activations[0] ]() @@ -536,6 +568,11 @@ class TransformerNet(Network): def __init__(self, params): super(TransformerNet, self).__init__(params) + parallel_warn( + "The TransformerNet class will be deprecated in MALA v1.4.0.", + 0, + category=FutureWarning, + ) # Adjust number of heads. if self.params.layer_sizes[0] % self.params.num_heads != 0: @@ -637,6 +674,11 @@ class PositionalEncoding(nn.Module): """ def __init__(self, d_model, dropout=0.1, max_len=400): + parallel_warn( + "The PositionalEncoding class will be deprecated in MALA v1.4.0.", + 0, + category=FutureWarning, + ) super(PositionalEncoding, self).__init__() self.dropout = nn.Dropout(p=dropout) diff --git a/mala/network/objective_base.py b/mala/network/objective_base.py index 2fbf29503..539820f8e 100644 --- a/mala/network/objective_base.py +++ b/mala/network/objective_base.py @@ -7,6 +7,7 @@ from mala.network.hyperparameter_oat import HyperparameterOAT from mala.network.network import Network from mala.network.trainer import Trainer +from mala.common.parameters import Parameters from mala import printout @@ -15,22 +16,19 @@ class ObjectiveBase: Represents the objective function of a training process. 
This is usually the result of a training of a network. - """ - def __init__(self, params, data_handler): - """ - Create an ObjectiveBase object. + Parameters + ---------- + params : mala.common.parametes.Parameters + Parameters used to create this objective. - Parameters - ---------- - params : mala.common.parametes.Parameters - Parameters used to create this objective. + data_handler : mala.datahandling.data_handler.DataHandler + datahandler to be used during the hyperparameter optimization. + """ - data_handler : mala.datahandling.data_handler.DataHandler - datahandler to be used during the hyperparameter optimization. - """ - self.params = params - self.data_handler = data_handler + def __init__(self, params, data_handler): + self.params: Parameters = params + self._data_handler = data_handler # We need to find out if we have to reparametrize the lists with the # layers and the activations. @@ -58,17 +56,17 @@ def __init__(self, params, data_handler): "the range of neurons or number of layers is missing. " "This input will be ignored." ) - self.optimize_layer_list = contains_single_layer or ( + self._optimize_layer_list = contains_single_layer or ( contains_multiple_layer_neurons and contains_multiple_layers_count ) - self.optimize_activation_list = list( + self._optimize_activation_list = list( map( lambda p: "layer_activation" in p.name, self.params.hyperparameters.hlist, ) ).count(True) - self.trial_type = self.params.hyperparameters.hyper_opt_method + self._trial_type = self.params.hyperparameters.hyper_opt_method def __call__(self, trial): """ @@ -83,7 +81,7 @@ def __call__(self, trial): # Parse the parameters included in the trial. self.parse_trial(trial) if ( - self.trial_type == "optuna" + self._trial_type == "optuna" and self.params.hyperparameters.pruner == "naswot" ): if trial.should_prune(): @@ -96,12 +94,12 @@ def __call__(self, trial): ): test_network = Network(self.params) test_trainer = Trainer( - self.params, test_network, self.data_handler + self.params, test_network, self._data_handler ) test_trainer.train_network() final_validation_loss.append(test_trainer.final_validation_loss) if ( - self.trial_type == "optuna" + self._trial_type == "optuna" and self.params.hyperparameters.pruner == "multi_training" ): @@ -148,9 +146,9 @@ def parse_trial(self, trial): A trial is a set of hyperparameters; can be an optuna based trial or simply a OAT compatible list. """ - if self.trial_type == "optuna": + if self._trial_type == "optuna": self.parse_trial_optuna(trial) - elif self.trial_type == "oat": + elif self._trial_type == "oat": self.parse_trial_oat(trial) else: raise Exception( @@ -167,11 +165,11 @@ def parse_trial_optuna(self, trial: Trial): trial : optuna.trial.Trial. A set of hyperparameters encoded by optuna. """ - if self.optimize_layer_list: + if self._optimize_layer_list: self.params.network.layer_sizes = [ - self.data_handler.input_dimension + self._data_handler.input_dimension ] - if self.optimize_activation_list > 0: + if self._optimize_activation_list > 0: self.params.network.layer_activations = [] # Some layers may have been turned off by optuna. 
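
Both ``parse_trial_optuna`` and ``parse_trial_oat`` rebuild the network
architecture from a trial in the same way: ``layer_sizes`` is re-initialized
with the input dimension, filled with the neuron counts the trial selected,
and finally closed with the output dimension, as the hunk below shows. The
source comments note that some layers may have been switched off by the
optimizer; the following schematic sketch models that as a neuron count of
zero, with hypothetical dimensions and trial values, independent of the MALA
API:

.. code-block:: python

    input_dimension, output_dimension = 91, 11
    # Hypothetical values drawn from a trial, e.g. for hyperparameters
    # named ff_neurons_layer_001/002 and layer_activation_001.
    chosen_neurons = [128, 0]   # second layer switched off
    chosen_activation = "ReLU"

    layer_sizes = [input_dimension]
    layer_sizes += [n for n in chosen_neurons if n > 0]
    layer_sizes.append(output_dimension)
    layer_activations = [chosen_activation]

    print(layer_sizes, layer_activations)  # [91, 128, 11] ['ReLU']
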
@@ -274,9 +272,9 @@ def parse_trial_optuna(self, trial: Trial): ) layer_counter += 1 - if self.optimize_layer_list: + if self._optimize_layer_list: self.params.network.layer_sizes.append( - self.data_handler.output_dimension + self._data_handler.output_dimension ) def parse_trial_oat(self, trial): @@ -288,12 +286,12 @@ def parse_trial_oat(self, trial): trial : numpy.array Row in an orthogonal array which respresents current trial. """ - if self.optimize_layer_list: + if self._optimize_layer_list: self.params.network.layer_sizes = [ - self.data_handler.input_dimension + self._data_handler.input_dimension ] - if self.optimize_activation_list: + if self._optimize_activation_list: self.params.network.layer_activations = [] # Some layers may have been turned off by optuna. @@ -404,7 +402,7 @@ def parse_trial_oat(self, trial): ) layer_counter += 1 - if self.optimize_layer_list: + if self._optimize_layer_list: self.params.network.layer_sizes.append( - self.data_handler.output_dimension + self._data_handler.output_dimension ) diff --git a/mala/network/objective_naswot.py b/mala/network/objective_naswot.py index 96377e527..7f2c117de 100644 --- a/mala/network/objective_naswot.py +++ b/mala/network/objective_naswot.py @@ -46,10 +46,10 @@ def __init__( batch_size=None, ): super(ObjectiveNASWOT, self).__init__(search_parameters, data_handler) - self.trial_type = trial_type - self.batch_size = batch_size - if self.batch_size is None: - self.batch_size = search_parameters.running.mini_batch_size + self._trial_type = trial_type + self._batch_size = batch_size + if self._batch_size is None: + self._batch_size = search_parameters.running.mini_batch_size def __call__(self, trial): """ @@ -75,15 +75,15 @@ def __call__(self, trial): # Load the batchesand get the jacobian. do_shuffle = self.params.running.use_shuffling_for_samplers if ( - self.data_handler.parameters.use_lazy_loading + self._data_handler.parameters.use_lazy_loading or self.params.use_ddp ): do_shuffle = False if self.params.running.use_shuffling_for_samplers: - self.data_handler.mix_datasets() + self._data_handler.mix_datasets() loader = DataLoader( - self.data_handler.training_data_sets[0], - batch_size=self.batch_size, + self._data_handler.training_data_sets[0], + batch_size=self._batch_size, shuffle=do_shuffle, ) jac = ObjectiveNASWOT.__get_batch_jacobian(net, loader, device) diff --git a/mala/network/predictor.py b/mala/network/predictor.py index 785671dc0..3dfc99177 100644 --- a/mala/network/predictor.py +++ b/mala/network/predictor.py @@ -26,6 +26,12 @@ class Predictor(Runner): data : mala.datahandling.data_handler.DataHandler DataHandler, in this case not directly holding data, but serving as an interface to Target and Descriptor objects. + + Attributes + ---------- + target_calculator : mala.targets.target.Target + Target calculator used for predictions. Can be used for further + processing. 
""" def __init__(self, params, network, data): @@ -37,8 +43,8 @@ def __init__(self, params, network, data): * self.data.grid_dimension[1] * self.data.grid_dimension[2] ) - self.test_data_loader = None - self.number_of_batches_per_snapshot = 0 + self._test_data_loader = None + self._number_of_batches_per_snapshot = 0 self.target_calculator = data.target_calculator def predict_from_qeout(self, path_to_file, gather_ldos=False): @@ -139,7 +145,7 @@ def predict_for_atoms(self, atoms, gather_ldos=False, temperature=None): self.data.target_calculator.read_additional_calculation_data( [atoms, self.data.grid_dimension], "atoms+grid" ) - feature_length = self.data.descriptor_calculator.fingerprint_length + feature_length = self.data.descriptor_calculator.feature_size # The actual calculation of the LDOS from the descriptors depends # on whether we run in parallel or serial. In the former case, @@ -228,11 +234,11 @@ def _forward_snap_descriptors( ) self.parameters.mini_batch_size = optimal_batch_size - self.number_of_batches_per_snapshot = int( + self._number_of_batches_per_snapshot = int( local_data_size / self.parameters.mini_batch_size ) - for i in range(0, self.number_of_batches_per_snapshot): + for i in range(0, self._number_of_batches_per_snapshot): sl = slice( i * self.parameters.mini_batch_size, (i + 1) * self.parameters.mini_batch_size, diff --git a/mala/network/runner.py b/mala/network/runner.py index 7d3d2ffa8..c4bfa6f0d 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -38,6 +38,20 @@ class Runner: network : mala.network.network.Network Network which is being run. + data : mala.datahandling.data_handler.DataHandler + DataHandler holding the data for the run. + + Attributes + ---------- + parameters : mala.common.parametes.ParametersRunning + MALA neural network training/inference parameters. + + parameters_full : mala.common.parametes.Parameters + Full MALA Parameters object. + + network : mala.network.network.Network + Network which is being run. + data : mala.datahandling.data_handler.DataHandler DataHandler holding the data for the run. """ @@ -46,7 +60,7 @@ def __init__(self, params, network, data, runner_dict=None): self.parameters_full: Parameters = params self.parameters: ParametersRunning = params.running self.network = network - self.data = data + self.data: DataHandler = data self.__prepare_to_run() def _calculate_errors( diff --git a/mala/network/tester.py b/mala/network/tester.py index fa1a27190..d7c07761a 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -41,6 +41,32 @@ class Tester(Runner): - "density": MAPE of the density prediction - "dos": MAPE of the DOS prediction + output_format : string + Can be "list" or "mae". If "list", then a list of results across all + snapshots is returned. If "mae", then the MAE across all snapshots + will be calculated and returned. + + Attributes + ---------- + target_calculator : mala.targets.target.Target + Target calculator used for predictions. Can be used for further + processing. + + observables_to_test : list + List of observables to test. Supported are: + + - "ldos": Calculate the MSE loss of the LDOS. 
+ - "band_energy": Band energy error + - "band_energy_full": Band energy absolute values (only works with + list, as both actual and predicted are returned) + - "total_energy": Total energy error + - "total_energy_full": Total energy absolute values (only works + with list, as both actual and predicted are returned) + - "number_of_electrons": Number of electrons (Fermi energy is not + determined dynamically for this quantity. + - "density": MAPE of the density prediction + - "dos": MAPE of the DOS prediction + output_format : string Can be "list" or "mae". If "list", then a list of results across all snapshots is returned. If "mae", then the MAE across all snapshots @@ -57,8 +83,8 @@ def __init__( ): # copy the parameters into the class. super(Tester, self).__init__(params, network, data) - self.test_data_loader = None - self.number_of_batches_per_snapshot = 0 + self._test_data_loader = None + self._number_of_batches_per_snapshot = 0 self.observables_to_test = observables_to_test self.output_format = output_format if self.output_format != "list" and self.output_format != "mae": @@ -205,7 +231,7 @@ def predict_targets(self, snapshot_number, data_type="te"): offset_snapshots + snapshot_number, data_set, data_type, - self.number_of_batches_per_snapshot, + self._number_of_batches_per_snapshot, self.parameters.mini_batch_size, ) @@ -235,6 +261,6 @@ def __prepare_to_test(self, snapshot_number): min_verbosity=0, ) self.parameters.mini_batch_size = optimal_batch_size - self.number_of_batches_per_snapshot = int( + self._number_of_batches_per_snapshot = int( grid_size / self.parameters.mini_batch_size ) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 85fc52044..b5eb0892a 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -38,11 +38,22 @@ class Trainer(Runner): data : mala.datahandling.data_handler.DataHandler DataHandler holding the training data. - use_pkl_checkpoints : bool - If true, .pkl checkpoints will be created. + _optimizer_dict : dict + For internal use by the Trainer class during loading procecdures only. + + Attributes + ---------- + final_validation_loss : float + Validation loss after training + + network : mala.network.network.Network + Network which is being trained. + + full_logging_path : str + Full path to training logs. """ - def __init__(self, params, network, data, optimizer_dict=None): + def __init__(self, params, network, data, _optimizer_dict=None): # copy the parameters into the class. super(Trainer, self).__init__(params, network, data) @@ -56,22 +67,21 @@ def __init__(self, params, network, data, optimizer_dict=None): torch.cuda.current_stream().wait_stream(s) self.final_validation_loss = float("inf") - self.initial_validation_loss = float("inf") - self.optimizer = None - self.scheduler = None - self.patience_counter = 0 - self.last_epoch = 0 - self.last_loss = None - self.training_data_loaders = [] - self.validation_data_loaders = [] + self._optimizer = None + self._scheduler = None + self._patience_counter = 0 + self._last_epoch = 0 + self._last_loss = None + self._training_data_loaders = [] + self._validation_data_loaders = [] # Samplers for the ddp case. 
- self.train_sampler = None - self.validation_sampler = None + self._train_sampler = None + self._validation_sampler = None - self.__prepare_to_train(optimizer_dict) + self.__prepare_to_train(_optimizer_dict) - self.logger = None + self._logger = None self.full_logging_path = None if self.parameters.logger is not None: os.makedirs(self.parameters.logging_dir, exist_ok=True) @@ -92,9 +102,9 @@ def __init__(self, params, network, data, optimizer_dict=None): if self.parameters.logger == "wandb": import wandb - self.logger = wandb + self._logger = wandb elif self.parameters.logger == "tensorboard": - self.logger = SummaryWriter(self.full_logging_path) + self._logger = SummaryWriter(self.full_logging_path) else: raise Exception( f"Unsupported logger {self.parameters.logger}." @@ -105,13 +115,13 @@ def __init__(self, params, network, data, optimizer_dict=None): min_verbosity=1, ) - self.gradscaler = None + self._gradscaler = None if self.parameters.use_mixed_precision: printout("Using mixed precision via AMP.", min_verbosity=1) - self.gradscaler = torch.cuda.amp.GradScaler() + self._gradscaler = torch.cuda.amp.GradScaler() - self.train_graph = None - self.validation_graph = None + self._train_graph = None + self._validation_graph = None @classmethod def run_exists(cls, run_name, params_format="json", zip_run=True): @@ -253,7 +263,7 @@ def _load_from_run(cls, params, network, data, file=None): # Now, create the Trainer class with it. loaded_trainer = Trainer( - params, network, data, optimizer_dict=checkpoint + params, network, data, _optimizer_dict=checkpoint ) return loaded_trainer @@ -265,18 +275,15 @@ def train_network(self): vloss = float("inf") - # Save losses for later use. - self.initial_validation_loss = vloss - # Initialize all the counters. checkpoint_counter = 0 # If we restarted from a checkpoint, we have to differently initialize # the loss. - if self.last_loss is None: + if self._last_loss is None: vloss_old = vloss else: - vloss_old = self.last_loss + vloss_old = self._last_loss ############################ # PERFORM TRAINING @@ -284,7 +291,9 @@ def train_network(self): total_batch_id = 0 - for epoch in range(self.last_epoch, self.parameters.max_number_epochs): + for epoch in range( + self._last_epoch, self.parameters.max_number_epochs + ): start_time = time.time() # Prepare model for training. 
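
The epoch loop above starts at ``self._last_epoch`` and seeds ``vloss_old``
from ``self._last_loss``, so a run resumed from a checkpoint continues where
it stopped. Further down in ``train_network``, early stopping only resets its
patience counter when the validation loss improves on the last accepted value
by more than a relative threshold. A self-contained sketch of that criterion
with a hypothetical loss trace and settings:

.. code-block:: python

    early_stopping_threshold = 0.01   # require more than 1 % relative improvement
    early_stopping_epochs = 3         # give up after 3 epochs without it

    losses = [0.50, 0.48, 0.478, 0.477, 0.4765, 0.476]
    vloss_old, patience = float("inf"), 0
    for epoch, vloss in enumerate(losses):
        if vloss < vloss_old * (1.0 - early_stopping_threshold):
            patience = 0
            vloss_old = vloss
        else:
            patience += 1
            if patience >= early_stopping_epochs:
                print("stopping after epoch", epoch)  # epoch 4
                break
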
@@ -298,8 +307,8 @@ def train_network(self): ) # train sampler - if self.train_sampler: - self.train_sampler.set_epoch(epoch) + if self._train_sampler: + self._train_sampler.set_epoch(epoch) # shuffle dataset if necessary if isinstance(self.data.training_data_sets[0], FastTensorDataset): @@ -312,7 +321,7 @@ def train_network(self): tsample = time.time() t0 = time.time() batchid = 0 - for loader in self.training_data_loaders: + for loader in self._training_data_loaders: t = time.time() for inputs, outputs in tqdm( loader, @@ -387,19 +396,19 @@ def train_network(self): training_loss_sum_logging / self.parameters.training_log_interval ) - self.logger.add_scalars( + self._logger.add_scalars( "ldos", {"during_training": training_loss_mean}, total_batch_id, ) - self.logger.close() + self._logger.close() training_loss_sum_logging = 0.0 if self.parameters.logger == "wandb": training_loss_mean = ( training_loss_sum_logging / self.parameters.training_log_interval ) - self.logger.log( + self._logger.log( { "ldos_during_training": training_loss_mean }, @@ -422,7 +431,7 @@ def train_network(self): ) else: batchid = 0 - for loader in self.training_data_loaders: + for loader in self._training_data_loaders: for inputs, outputs in loader: inputs = inputs.to( self.parameters._configuration["device"] @@ -473,7 +482,7 @@ def train_network(self): if self.parameters.logger == "tensorboard": for dataset_fraction in dataset_fractions: for metric in errors[dataset_fraction]: - self.logger.add_scalars( + self._logger.add_scalars( metric, { dataset_fraction: errors[dataset_fraction][ @@ -482,11 +491,11 @@ def train_network(self): }, total_batch_id, ) - self.logger.close() + self._logger.close() if self.parameters.logger == "wandb": for dataset_fraction in dataset_fractions: for metric in errors[dataset_fraction]: - self.logger.log( + self._logger.log( { f"{dataset_fraction}_{metric}": errors[ dataset_fraction @@ -510,38 +519,38 @@ def train_network(self): ) # If a scheduler is used, update it. - if self.scheduler is not None: + if self._scheduler is not None: if ( self.parameters.learning_rate_scheduler == "ReduceLROnPlateau" ): - self.scheduler.step(vloss) + self._scheduler.step(vloss) # If early stopping is used, check if we need to do something. if self.parameters.early_stopping_epochs > 0: if vloss < vloss_old * ( 1.0 - self.parameters.early_stopping_threshold ): - self.patience_counter = 0 + self._patience_counter = 0 vloss_old = vloss else: - self.patience_counter += 1 + self._patience_counter += 1 printout( "Validation accuracy has not improved enough.", min_verbosity=1, ) if ( - self.patience_counter + self._patience_counter >= self.parameters.early_stopping_epochs ): printout( "Stopping the training, validation " "accuracy has not improved for", - self.patience_counter, + self._patience_counter, "epochs.", min_verbosity=1, ) - self.last_epoch = epoch + self._last_epoch = epoch break # If checkpointing is enabled, we need to checkpoint. @@ -552,8 +561,8 @@ def train_network(self): >= self.parameters.checkpoints_each_epoch ): printout("Checkpointing training.", min_verbosity=0) - self.last_epoch = epoch - self.last_loss = vloss_old + self._last_epoch = epoch + self._last_loss = vloss_old self.__create_training_checkpoint() checkpoint_counter = 0 @@ -590,8 +599,8 @@ def train_network(self): # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: - self.training_data_loaders.cleanup() - self.validation_data_loaders.cleanup() + self._training_data_loaders.cleanup() + self._validation_data_loaders.cleanup() def _validate_network(self, data_set_fractions, metrics): # """Validate a network, using train or validation data.""" @@ -599,13 +608,13 @@ def _validate_network(self, data_set_fractions, metrics): errors = {} for data_set_type in data_set_fractions: if data_set_type == "train": - data_loaders = self.training_data_loaders + data_loaders = self._training_data_loaders data_sets = self.data.training_data_sets number_of_snapshots = self.data.nr_training_snapshots offset_snapshots = 0 elif data_set_type == "validation": - data_loaders = self.validation_data_loaders + data_loaders = self._validation_data_loaders data_sets = self.data.validation_data_sets number_of_snapshots = self.data.nr_validation_snapshots offset_snapshots = self.data.nr_training_snapshots @@ -720,11 +729,11 @@ def __prepare_to_train(self, optimizer_dict): # Read last epoch if optimizer_dict is not None: - self.last_epoch = optimizer_dict["epoch"] + 1 + self._last_epoch = optimizer_dict["epoch"] + 1 # Scale the learning rate according to ddp. if self.parameters_full.use_ddp: - if dist.get_world_size() > 1 and self.last_epoch == 0: + if dist.get_world_size() > 1 and self._last_epoch == 0: printout( "Rescaling learning rate because multiple workers are" " used for training.", @@ -736,20 +745,20 @@ def __prepare_to_train(self, optimizer_dict): # Choose an optimizer to use. if self.parameters.optimizer == "SGD": - self.optimizer = optim.SGD( + self._optimizer = optim.SGD( self.network.parameters(), lr=self.parameters.learning_rate, weight_decay=self.parameters.l2_regularization, ) elif self.parameters.optimizer == "Adam": - self.optimizer = optim.Adam( + self._optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, weight_decay=self.parameters.l2_regularization, ) elif self.parameters.optimizer == "FusedAdam": if version.parse(torch.__version__) >= version.parse("1.13.0"): - self.optimizer = optim.Adam( + self._optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, weight_decay=self.parameters.l2_regularization, @@ -762,11 +771,11 @@ def __prepare_to_train(self, optimizer_dict): # Load data from pytorch file. if optimizer_dict is not None: - self.optimizer.load_state_dict( + self._optimizer.load_state_dict( optimizer_dict["optimizer_state_dict"] ) - self.patience_counter = optimizer_dict["early_stopping_counter"] - self.last_loss = optimizer_dict["early_stopping_last_loss"] + self._patience_counter = optimizer_dict["early_stopping_counter"] + self._last_loss = optimizer_dict["early_stopping_last_loss"] if self.parameters_full.use_ddp: # scaling the batch size for multiGPU per node @@ -781,7 +790,7 @@ def __prepare_to_train(self, optimizer_dict): if self.data.parameters.use_lazy_loading: do_shuffle = False - self.train_sampler = ( + self._train_sampler = ( torch.utils.data.distributed.DistributedSampler( self.data.training_data_sets[0], num_replicas=dist.get_world_size(), @@ -789,7 +798,7 @@ def __prepare_to_train(self, optimizer_dict): shuffle=do_shuffle, ) ) - self.validation_sampler = ( + self._validation_sampler = ( torch.utils.data.distributed.DistributedSampler( self.data.validation_data_sets[0], num_replicas=dist.get_world_size(), @@ -800,8 +809,8 @@ def __prepare_to_train(self, optimizer_dict): # Instantiate the learning rate scheduler, if necessary. 
if self.parameters.learning_rate_scheduler == "ReduceLROnPlateau": - self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( - self.optimizer, + self._scheduler = optim.lr_scheduler.ReduceLROnPlateau( + self._optimizer, patience=self.parameters.learning_rate_patience, mode="min", factor=self.parameters.learning_rate_decay, @@ -811,8 +820,8 @@ def __prepare_to_train(self, optimizer_dict): pass else: raise Exception("Unsupported learning rate schedule.") - if self.scheduler is not None and optimizer_dict is not None: - self.scheduler.load_state_dict( + if self._scheduler is not None and optimizer_dict is not None: + self._scheduler.load_state_dict( optimizer_dict["lr_scheduler_state_dict"] ) @@ -848,11 +857,11 @@ def __prepare_to_train(self, optimizer_dict): if isinstance(self.data.training_data_sets[0], FastTensorDataset): # Not shuffling in loader. # I manually shuffle the data set each epoch. - self.training_data_loaders.append( + self._training_data_loaders.append( DataLoader( self.data.training_data_sets[0], batch_size=None, - sampler=self.train_sampler, + sampler=self._train_sampler, **kwargs, shuffle=False, ) @@ -861,26 +870,26 @@ def __prepare_to_train(self, optimizer_dict): if isinstance( self.data.training_data_sets[0], LazyLoadDatasetSingle ): - self.training_data_loaders = MultiLazyLoadDataLoader( + self._training_data_loaders = MultiLazyLoadDataLoader( self.data.training_data_sets, **kwargs ) else: - self.training_data_loaders.append( + self._training_data_loaders.append( DataLoader( self.data.training_data_sets[0], batch_size=self.parameters.mini_batch_size, - sampler=self.train_sampler, + sampler=self._train_sampler, **kwargs, shuffle=do_shuffle, ) ) if isinstance(self.data.validation_data_sets[0], FastTensorDataset): - self.validation_data_loaders.append( + self._validation_data_loaders.append( DataLoader( self.data.validation_data_sets[0], batch_size=None, - sampler=self.validation_sampler, + sampler=self._validation_sampler, **kwargs, ) ) @@ -888,15 +897,15 @@ def __prepare_to_train(self, optimizer_dict): if isinstance( self.data.validation_data_sets[0], LazyLoadDatasetSingle ): - self.validation_data_loaders = MultiLazyLoadDataLoader( + self._validation_data_loaders = MultiLazyLoadDataLoader( self.data.validation_data_sets, **kwargs ) else: - self.validation_data_loaders.append( + self._validation_data_loaders.append( DataLoader( self.data.validation_data_sets[0], batch_size=self.parameters.mini_batch_size * 1, - sampler=self.validation_sampler, + sampler=self._validation_sampler, **kwargs, ) ) @@ -904,7 +913,7 @@ def __prepare_to_train(self, optimizer_dict): def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" if self.parameters._configuration["gpu"]: - if self.parameters.use_graphs and self.train_graph is None: + if self.parameters.use_graphs and self._train_graph is None: printout("Capturing CUDA graph for training.", min_verbosity=2) s = torch.cuda.Stream(self.parameters._configuration["device"]) s.wait_stream( @@ -931,8 +940,8 @@ def __process_mini_batch(self, network, input_data, target_data): prediction, target_data ) - if self.gradscaler: - self.gradscaler.scale(loss).backward() + if self._gradscaler: + self._gradscaler.scale(loss).backward() else: loss.backward() torch.cuda.current_stream( @@ -940,38 +949,40 @@ def __process_mini_batch(self, network, input_data, target_data): ).wait_stream(s) # Create static entry point tensors to graph - self.static_input_data = torch.empty_like(input_data) - self.static_target_data = 
torch.empty_like(target_data) + self._static_input_data = torch.empty_like(input_data) + self._static_target_data = torch.empty_like(target_data) # Capture graph - self.train_graph = torch.cuda.CUDAGraph() + self._train_graph = torch.cuda.CUDAGraph() network.zero_grad(set_to_none=True) - with torch.cuda.graph(self.train_graph): + with torch.cuda.graph(self._train_graph): with torch.cuda.amp.autocast( enabled=self.parameters.use_mixed_precision ): - self.static_prediction = network( - self.static_input_data + self._static_prediction = network( + self._static_input_data ) if self.parameters_full.use_ddp: - self.static_loss = network.module.calculate_loss( - self.static_prediction, self.static_target_data + self._static_loss = network.module.calculate_loss( + self._static_prediction, + self._static_target_data, ) else: - self.static_loss = network.calculate_loss( - self.static_prediction, self.static_target_data + self._static_loss = network.calculate_loss( + self._static_prediction, + self._static_target_data, ) - if self.gradscaler: - self.gradscaler.scale(self.static_loss).backward() + if self._gradscaler: + self._gradscaler.scale(self._static_loss).backward() else: - self.static_loss.backward() + self._static_loss.backward() - if self.train_graph: - self.static_input_data.copy_(input_data) - self.static_target_data.copy_(target_data) - self.train_graph.replay() + if self._train_graph: + self._static_input_data.copy_(input_data) + self._static_target_data.copy_(target_data) + self._train_graph.replay() else: torch.cuda.nvtx.range_push("zero_grad") self.network.zero_grad(set_to_none=True) @@ -1001,24 +1012,24 @@ def __process_mini_batch(self, network, input_data, target_data): # loss torch.cuda.nvtx.range_pop() - if self.gradscaler: - self.gradscaler.scale(loss).backward() + if self._gradscaler: + self._gradscaler.scale(loss).backward() else: loss.backward() t = time.time() torch.cuda.nvtx.range_push("optimizer") - if self.gradscaler: - self.gradscaler.step(self.optimizer) - self.gradscaler.update() + if self._gradscaler: + self._gradscaler.step(self._optimizer) + self._gradscaler.update() else: - self.optimizer.step() + self._optimizer.step() dt = time.time() - t printout(f"optimizer time: {dt}", min_verbosity=3) torch.cuda.nvtx.range_pop() # optimizer - if self.train_graph: - return self.static_loss + if self._train_graph: + return self._static_loss else: return loss else: @@ -1028,8 +1039,8 @@ def __process_mini_batch(self, network, input_data, target_data): else: loss = network.calculate_loss(prediction, target_data) loss.backward() - self.optimizer.step() - self.optimizer.zero_grad() + self._optimizer.step() + self._optimizer.zero_grad() return loss def __create_training_checkpoint(self): @@ -1046,20 +1057,20 @@ def __create_training_checkpoint(self): if self.parameters_full.use_ddp: if dist.get_rank() != 0: return - if self.scheduler is None: + if self._scheduler is None: save_dict = { - "epoch": self.last_epoch, - "optimizer_state_dict": self.optimizer.state_dict(), - "early_stopping_counter": self.patience_counter, - "early_stopping_last_loss": self.last_loss, + "epoch": self._last_epoch, + "optimizer_state_dict": self._optimizer.state_dict(), + "early_stopping_counter": self._patience_counter, + "early_stopping_last_loss": self._last_loss, } else: save_dict = { - "epoch": self.last_epoch, - "optimizer_state_dict": self.optimizer.state_dict(), - "lr_scheduler_state_dict": self.scheduler.state_dict(), - "early_stopping_counter": self.patience_counter, - "early_stopping_last_loss": 
self.last_loss, + "epoch": self._last_epoch, + "optimizer_state_dict": self._optimizer.state_dict(), + "lr_scheduler_state_dict": self._scheduler.state_dict(), + "early_stopping_counter": self._patience_counter, + "early_stopping_last_loss": self._last_loss, } torch.save( save_dict, optimizer_name, _use_new_zipfile_serialization=False diff --git a/mala/targets/atomic_force.py b/mala/targets/atomic_force.py index d5e81e4cd..a830b806c 100644 --- a/mala/targets/atomic_force.py +++ b/mala/targets/atomic_force.py @@ -3,6 +3,7 @@ from ase.units import Rydberg, Bohr from .target import Target +from mala.common.parallelizer import parallel_warn class AtomicForce(Target): @@ -24,6 +25,10 @@ def __init__(self, params): Parameters used to create this TargetBase object. """ + parallel_warn( + "The AtomicForce class is currently being developed and" + " not feature-complete." + ) super(AtomicForce, self).__init__(params) def get_feature_size(self): diff --git a/mala/targets/density.py b/mala/targets/density.py index d5fdfe27c..61623fe24 100644 --- a/mala/targets/density.py +++ b/mala/targets/density.py @@ -29,7 +29,8 @@ class Density(Target): - """Postprocessing / parsing functions for the electronic density. + """ + Postprocessing / parsing functions for the electronic density. Parameters ---------- @@ -40,7 +41,10 @@ class Density(Target): ############################## # Class attributes ############################## - + """ + Total energy module mutual exclusion token used to make sure + the total energy module is not initialized twice. + """ te_mutex = False ############################## @@ -278,6 +282,12 @@ def get_target(self): This is the generic interface for cached target quantities. It should work for all implemented targets. + + Returns + ------- + density : numpy.ndarray + Electronic charge density as a volumetric array. May be 4D or 2D + depending on workflow. """ return self.density @@ -407,7 +417,8 @@ def read_from_cube(self, path, units="1/Bohr^3", **kwargs): Units the density is saved in. Usually none. """ printout("Reading density from .cube file ", path, min_verbosity=0) - # automatically convert units if they are None since cube files take atomic units + # automatically convert units if they are None since cube files take + # atomic units if units is None: units = "1/Bohr^3" if units != "1/Bohr^3": @@ -582,7 +593,7 @@ def get_number_of_electrons( voxel : ase.cell.Cell Voxel to be used for grid intergation. Needs to reflect the - symmetry of the simulation cell. In Bohr. + symmetry of the simulation cell. integration_method : str Integration method used to integrate density on the grid. @@ -1118,7 +1129,7 @@ def __setup_total_energy_module( # instantiate the process with the file. positions_for_qe = self.get_scaled_positions_for_qe(atoms_Angstrom) - if self._parameters_full.descriptors.use_atomic_density_energy_formula: + if self.parameters._configuration["atomic_density_formula"]: # Calculate the Gaussian descriptors for the calculation of the # structure factors.
barrier() @@ -1187,8 +1198,8 @@ def __setup_total_energy_module( te.set_positions( np.transpose(positions_for_qe), number_of_atoms, - self._parameters_full.descriptors.use_atomic_density_energy_formula, - self._parameters_full.descriptors.use_atomic_density_energy_formula, + self.parameters._configuration["atomic_density_formula"], + self.parameters._configuration["atomic_density_formula"], ) barrier() printout( @@ -1199,7 +1210,7 @@ def __setup_total_energy_module( ) barrier() - if self._parameters_full.descriptors.use_atomic_density_energy_formula: + if self.parameters._configuration["atomic_density_formula"]: t0 = time.perf_counter() gaussian_descriptors = np.reshape( gaussian_descriptors, diff --git a/mala/targets/dos.py b/mala/targets/dos.py index faac8dfa4..b1f6f103b 100644 --- a/mala/targets/dos.py +++ b/mala/targets/dos.py @@ -248,6 +248,12 @@ def get_target(self): This is the generic interface for cached target quantities. It should work for all implemented targets. + + Returns + ------- + density_of_states : numpy.ndarray + Electronic density of states. + """ return self.density_of_states diff --git a/mala/targets/ldos.py b/mala/targets/ldos.py index 4b2f4bbae..c53245003 100644 --- a/mala/targets/ldos.py +++ b/mala/targets/ldos.py @@ -239,6 +239,12 @@ def get_target(self): This is the generic interface for cached target quantities. It should work for all implemented targets. + + Returns + ------- + local_density_of_states : numpy.ndarray + Electronic local density of states as a volumetric array. + May be 4D- or 2D depending on workflow. """ return self.local_density_of_states @@ -598,8 +604,6 @@ def get_total_energy( If neither LDOS nor DOS+Density data is provided, the cached LDOS will be attempted to be used for the calculation. - - Parameters ---------- ldos_data : numpy.array diff --git a/mala/targets/target.py b/mala/targets/target.py index 52664353b..10a414c6c 100644 --- a/mala/targets/target.py +++ b/mala/targets/target.py @@ -27,18 +27,88 @@ class Target(PhysicalData): - """ + r""" Base class for all target quantity parser. Target parsers read the target quantity (i.e. the quantity the NN will learn to predict) from a specified file format and performs postprocessing calculations on the quantity. + Target parsers often read DFT reference information. + Parameters ---------- params : mala.common.parameters.Parameters or mala.common.parameters.ParametersTargets Parameters used to create this Target object. + + Attributes + ---------- + atomic_forces_dft : numpy.ndarray + Atomic forces as per DFT reference file. + + atoms : ase.Atoms + ASE atoms object used for calculations. + + band_energy_dft_calculation + Band energy as per DFT reference file. + + electrons_per_atom : int + Electrons per atom, usually determined by DFT reference file. + + entropy_contribution_dft_calculation : float + Electronic entropy contribution as per DFT reference file. + + fermi_energy_dft : float + Fermi energy as per DFT reference file. + + kpoints : list + k-grid used for MALA calculations. Managed internally. + + local_grid : list + Size of local grid (in MPI mode). + + number_of_electrons_exact + Exact number of electrons, usually given via DFT reference file. + + number_of_electrons_from_eigenvals : float + Number of electrons as calculated from DFT reference eigenvalues. + + parameters : mala.common.parameters.ParametersTarget + MALA target calculation parameters. 
+ + qe_pseudopotentials : list + List of Quantum ESPRESSO pseudopotentials, read from DFT reference file + and used for the total energy module. + + save_target_data : bool + Control whether target data will be saved. Can be important for I/O + applications. Managed internally, default is True. + + temperature : float + Temperature used for all computations. By default read from DFT + reference file, but can freely be changed from the outside. + + total_energy_contributions_dft_calculation : dict + Dictionary holding contributions to total free energy not given + as individual properties, as read from the DFT reference file. + Contains: + + - "one_electron_contribution", :math:`n\,V_\mathrm{xc}` plus band + energy + - "hartree_contribution", :math:`E_\mathrm{H}` + - "xc_contribution", :math:`E_\mathrm{xc}` + - "ewald_contribution", :math:`E_\mathrm{Ewald}` + + total_energy_dft_calculation : float + Total free energy as read from DFT reference file. + voxel : ase.cell.Cell + Voxel to be used for grid integration. Reflects the + symmetry of the simulation cell. Calculated from DFT reference data. + + y_planes : int + Number of y_planes used for Quantum ESPRESSO parallelization. Handled + internally. """ ############################## @@ -99,7 +169,6 @@ def __getnewargs__(self): Used for pickling. - Returns ------- params : mala.Parameters @@ -792,7 +861,14 @@ def get_energy_grid(self): raise Exception("No method implement to calculate an energy grid.") def get_real_space_grid(self): - """Get the real space grid.""" + """ + Get the real space grid. + + Returns + ------- + grid3D : numpy.ndarray + Numpy array holding the entire grid. + """ grid3D = np.zeros( ( self.grid_dimensions[0], @@ -1374,8 +1450,7 @@ def write_tem_input_file( None. mpi_rank : int - Rank within MPI - + Rank within MPI. """ # Specify grid dimensions, if any are given.
if ( @@ -1608,7 +1683,7 @@ def _process_openpmd_attributes(self, series, iteration, mesh): "electrons_per_atom", default_value=self.electrons_per_atom, ) - self.number_of_electrons_from_eigenval = ( + self.number_of_electrons_from_eigenvals = ( self._get_attribute_if_attribute_exists( iteration, "number_of_electrons_from_eigenvals", diff --git a/setup.py b/setup.py index b34c3fef2..7ce1509ff 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ extras = { "dev": ["bump2version"], - "opt": ["oapackage"], + "opt": ["oapackage", "scikit-learn"], "test": ["pytest", "pytest-cov"], "doc": open("docs/requirements.txt").read().splitlines(), "experimental": ["asap3", "dftpy", "minterpy"], diff --git a/test/all_lazy_loading_test.py b/test/all_lazy_loading_test.py index 351c98292..e51501fae 100644 --- a/test/all_lazy_loading_test.py +++ b/test/all_lazy_loading_test.py @@ -30,7 +30,7 @@ def test_scaling(self): #################### test_parameters = Parameters() test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.descriptors.bispectrum_twojmax = 11 test_parameters.targets.ldos_gridsize = 10 @@ -53,9 +53,9 @@ def test_scaling(self): training_tester = [] for scalingtype in [ "standard", - "normal", + "minmax", "feature-wise-standard", - "feature-wise-normal", + "feature-wise-minmax", ]: comparison = [scalingtype] for ll_type in [True, False]: @@ -125,7 +125,7 @@ def test_scaling(self): data_handler.output_data_scaler.total_std / data_handler.nr_training_data ) - elif scalingtype == "normal": + elif scalingtype == "minmax": torch.manual_seed(2002) this_result.append( data_handler.input_data_scaler.total_max @@ -188,7 +188,7 @@ def test_scaling(self): 0 ].grid_size ) - elif scalingtype == "feature-wise-normal": + elif scalingtype == "feature-wise-minmax": this_result.append( torch.mean(data_handler.input_data_scaler.maxs) ) @@ -250,148 +250,12 @@ def test_prefetching(self): ) assert with_prefetching < without_prefetching - @pytest.mark.skipif( - importlib.util.find_spec("horovod") is None, - reason="Horovod is currently not part of the pipeline", - ) - def test_performance_horovod(self): - - #################### - # PARAMETERS - #################### - test_parameters = Parameters() - test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" - test_parameters.data.data_splitting_type = "by_snapshot" - test_parameters.network.layer_activations = ["LeakyReLU"] - test_parameters.running.max_number_epochs = 20 - test_parameters.running.mini_batch_size = 500 - test_parameters.running.optimizer = "Adam" - test_parameters.comment = "Horovod / lazy loading benchmark." 
- test_parameters.network.nn_type = "feed-forward" - test_parameters.manual_seed = 2021 - - #################### - # DATA - #################### - results = [] - for hvduse in [False, True]: - for ll in [True, False]: - start_time = time.time() - test_parameters.running.learning_rate = 0.00001 - test_parameters.data.use_lazy_loading = ll - test_parameters.use_horovod = hvduse - data_handler = DataHandler(test_parameters) - data_handler.add_snapshot( - "Al_debug_2k_nr0.in.npy", - data_path, - "Al_debug_2k_nr0.out.npy", - data_path, - add_snapshot_as="tr", - output_units="1/(Ry*Bohr^3)", - ) - data_handler.add_snapshot( - "Al_debug_2k_nr1.in.npy", - data_path, - "Al_debug_2k_nr1.out.npy", - data_path, - add_snapshot_as="tr", - output_units="1/(Ry*Bohr^3)", - ) - data_handler.add_snapshot( - "Al_debug_2k_nr2.in.npy", - data_path, - "Al_debug_2k_nr2.out.npy", - data_path, - add_snapshot_as="tr", - output_units="1/(Ry*Bohr^3)", - ) - data_handler.add_snapshot( - "Al_debug_2k_nr1.in.npy", - data_path, - "Al_debug_2k_nr1.out.npy", - data_path, - add_snapshot_as="va", - output_units="1/(Ry*Bohr^3)", - ) - data_handler.add_snapshot( - "Al_debug_2k_nr2.in.npy", - data_path, - "Al_debug_2k_nr2.out.npy", - data_path, - add_snapshot_as="te", - output_units="1/(Ry*Bohr^3)", - ) - - data_handler.prepare_data() - test_parameters.network.layer_sizes = [ - data_handler.input_dimension, - 100, - data_handler.output_dimension, - ] - - # Setup network and trainer. - test_network = Network(test_parameters) - test_trainer = Trainer( - test_parameters, test_network, data_handler - ) - test_trainer.train_network() - - hvdstring = "no horovod" - if hvduse: - hvdstring = "horovod" - - llstring = "data in RAM" - if ll: - llstring = "using lazy loading" - - results.append( - [ - hvdstring, - llstring, - test_trainer.initial_validation_loss, - test_trainer.final_validation_loss, - time.time() - start_time, - ] - ) - - diff = [] - # For 4 local processes I get: - # Test: no horovod , using lazy loading - # Initial loss: 0.1342976689338684 - # Final loss: 0.10587086156010628 - # Time: 3.743736743927002 - # Test: no horovod , data in RAM - # Initial loss: 0.13430887088179588 - # Final loss: 0.10572846792638302 - # Time: 1.825883388519287 - # Test: horovod , using lazy loading - # Initial loss: 0.1342976726591587 - # Final loss: 0.10554153844714165 - # Time: 4.513132572174072 - # Test: horovod , data in RAM - # Initial loss: 0.13430887088179588 - # Final loss: 0.1053303349763155 - # Time: 3.2193074226379395 - - for r in results: - printout("Test: ", r[0], ", ", r[1], min_verbosity=0) - printout("Initial loss: ", r[2], min_verbosity=0) - printout("Final loss: ", r[3], min_verbosity=0) - printout("Time: ", r[4], min_verbosity=0) - diff.append(r[3] - r[2]) - - diff = np.array(diff) - - # The loss improvements should be comparable. 
- assert np.std(diff) < accuracy_coarse - @staticmethod def _train_lazy_loading(prefetching): test_parameters = Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.manual_seed = 1234 test_parameters.running.max_number_epochs = 100 diff --git a/test/basic_gpu_test.py b/test/basic_gpu_test.py index 514a70f21..46a44803f 100644 --- a/test/basic_gpu_test.py +++ b/test/basic_gpu_test.py @@ -82,7 +82,7 @@ def __run(use_gpu): # Specify the data scaling. test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" # Specify the used activation function. test_parameters.network.layer_activations = ["ReLU"] diff --git a/test/checkpoint_hyperopt_test.py b/test/checkpoint_hyperopt_test.py index a1909f21b..3c64ffa71 100644 --- a/test/checkpoint_hyperopt_test.py +++ b/test/checkpoint_hyperopt_test.py @@ -61,7 +61,7 @@ def __original_setup(n_trials): # Specify the data scaling. test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" # Specify the training parameters. test_parameters.running.max_number_epochs = 10 diff --git a/test/checkpoint_training_test.py b/test/checkpoint_training_test.py index 3bc5e83e3..abb3f1d93 100644 --- a/test/checkpoint_training_test.py +++ b/test/checkpoint_training_test.py @@ -45,7 +45,7 @@ def test_learning_rate(self): learning_rate=0.1, ) trainer.train_network() - original_learning_rate = trainer.optimizer.param_groups[0]["lr"] + original_learning_rate = trainer._optimizer.param_groups[0]["lr"] # Now do the same, but cut at epoch 22 and see if it recovers the # correct result. @@ -58,7 +58,7 @@ def test_learning_rate(self): trainer.train_network() trainer = self.__resume_checkpoint(test_checkpoint_name, 40) trainer.train_network() - new_learning_rate = trainer.optimizer.param_groups[0]["lr"] + new_learning_rate = trainer._optimizer.param_groups[0]["lr"] assert np.isclose( original_learning_rate, new_learning_rate, atol=accuracy ) @@ -73,7 +73,7 @@ def test_early_stopping(self): learning_rate=0.1, ) trainer.train_network() - original_nr_epochs = trainer.last_epoch + original_nr_epochs = trainer._last_epoch # Now do the same, but cut at epoch 22 and see if it recovers the # correct result. @@ -86,7 +86,7 @@ def test_early_stopping(self): trainer.train_network() trainer = self.__resume_checkpoint(test_checkpoint_name, 40) trainer.train_network() - last_nr_epochs = trainer.last_epoch + last_nr_epochs = trainer._last_epoch # integer comparison! assert original_nr_epochs == last_nr_epochs @@ -137,7 +137,7 @@ def __original_setup( # Specify the data scaling. test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" # Specify the used activation function. 
test_parameters.network.layer_activations = ["ReLU"] diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py index 4ceb691d8..6a7956656 100644 --- a/test/complete_interfaces_test.py +++ b/test/complete_interfaces_test.py @@ -117,7 +117,7 @@ def test_convert_numpy_openpmd(self): descriptor_save_path="./", target_save_path="./", additional_info_save_path="./", - naming_scheme="converted_from_numpy_*.bp5", + naming_scheme="converted_from_numpy_*.h5", descriptor_calculation_kwargs={"working_directory": "./"}, ) @@ -128,11 +128,13 @@ def test_convert_numpy_openpmd(self): for snapshot in range(2): data_converter.add_snapshot( descriptor_input_type="openpmd", - descriptor_input_path="converted_from_numpy_{}.in.bp5".format( + descriptor_input_path="converted_from_numpy_{}.in.h5".format( snapshot ), target_input_type="openpmd", - target_input_path="converted_from_numpy_{}.out.bp5".format(snapshot), + target_input_path="converted_from_numpy_{}.out.h5".format( + snapshot + ), additional_info_input_type=None, additional_info_input_path=None, target_units=None, @@ -151,8 +153,10 @@ def test_convert_numpy_openpmd(self): original = os.path.join( data_path, "Be_snapshot{}.{}.npy".format(snapshot, i_o) ) - roundtrip = "verify_against_original_numpy_data_{}.{}.npy".format( - snapshot, i_o + roundtrip = ( + "verify_against_original_numpy_data_{}.{}.npy".format( + snapshot, i_o + ) ) import numpy as np @@ -180,7 +184,7 @@ def test_ase_calculator(self): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 @@ -247,7 +251,7 @@ def test_ase_calculator(self): reference_data=os.path.join(data_path, "Be_snapshot1.out"), ) total_energy_dft_calculation = ( - calculator.data_handler.target_calculator.total_energy_dft_calculation + calculator._data_handler.target_calculator.total_energy_dft_calculation ) calculator.calculate(atoms, properties=["energy"]) assert np.isclose( diff --git a/test/hyperopt_test.py b/test/hyperopt_test.py index 77b0b9896..51fb5d199 100644 --- a/test/hyperopt_test.py +++ b/test/hyperopt_test.py @@ -12,7 +12,7 @@ # Control how much the loss should be better after hyperopt compared to # before. This value is fairly high, but we're training on absolutely # minimal amounts of data. -desired_loss_improvement_factor = 2 +desired_loss_improvement_factor = 1.5 # Different HO methods will lead to different results, but they should be # approximately the same. 
@@ -38,7 +38,7 @@ def test_hyperopt(self): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 @@ -129,7 +129,7 @@ def test_distributed_hyperopt(self): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.running.max_number_epochs = 5 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 @@ -238,7 +238,7 @@ def test_naswot_eigenvalues(self): test_parameters.manual_seed = 1234 test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 @@ -294,7 +294,7 @@ def test_naswot_eigenvalues(self): for idx, trial in enumerate(correct_trial_list): assert np.isclose( trial, - test_hp_optimizer.trial_losses[idx], + test_hp_optimizer._trial_losses[idx], rtol=naswot_accuracy, ) @@ -306,7 +306,7 @@ def __optimize_hyperparameters(hyper_optimizer): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 @@ -387,7 +387,7 @@ def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.running.max_number_epochs = 2 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 diff --git a/test/scaling_test.py b/test/scaling_test.py index b7925cd9f..eed0c201f 100644 --- a/test/scaling_test.py +++ b/test/scaling_test.py @@ -19,8 +19,8 @@ def test_errors_and_accuracy(self): "feature-wise-standard", "standard", "None", - "normal", - "feature-wise-normal", + "minmax", + "feature-wise-minmax", ]: data = np.load(os.path.join(data_path, "Be_snapshot2.out.npy")) data = data.astype(np.float32) @@ -43,3 +43,37 @@ def test_errors_and_accuracy(self): transformed = scaler.inverse_transform(transformed) relative_error = torch.sum(np.abs((data2 - transformed) / data2)) assert relative_error < desired_accuracy + + def test_array_referencing(self): + # Asserts that even with the new in-place scaling, data is referenced + # and not copied (unless that is explicitly asked) + + for scaling in [ + "feature-wise-standard", + "standard", + "None", + "minmax", + "feature-wise-minmax", + ]: + data = np.load(os.path.join(data_path, 
"Be_snapshot2.in.npy")) + data = data.astype(np.float32) + data = data.reshape( + [np.prod(np.shape(data)[0:3]), np.shape(data)[3]] + ) + data = torch.from_numpy(data).float() + + scaler = mala.DataScaler(scaling) + scaler.fit(data) + + numpy_array = np.expand_dims(np.random.random(94), axis=0) + test_data = torch.from_numpy(numpy_array) + scaler.transform(test_data) + scaler.inverse_transform(test_data) + numpy_array *= 2 + assert np.isclose( + np.sum( + test_data.detach().numpy().astype(np.float64) - numpy_array + ), + 0.0, + rtol=1e-16, + ) diff --git a/test/shuffling_test.py b/test/shuffling_test.py index 72d28d6ef..2ac098012 100644 --- a/test/shuffling_test.py +++ b/test/shuffling_test.py @@ -119,7 +119,7 @@ def test_training(self): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 @@ -163,7 +163,7 @@ def test_training(self): test_parameters.data.shuffling_seed = 1234 test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 @@ -215,7 +215,7 @@ def test_training_openpmd(self): test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 @@ -261,7 +261,7 @@ def test_training_openpmd(self): test_parameters.data.shuffling_seed = 1234 test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 @@ -326,3 +326,31 @@ def test_training_openpmd(self): test_trainer.train_network() new_loss = test_trainer.final_validation_loss assert old_loss > new_loss + + def test_arbitrary_number_snapshots(self): + parameters = mala.Parameters() + + # This ensures reproducibility of the created data sets. 
+ parameters.data.shuffling_seed = 1234 + + data_shuffler = mala.DataShuffler(parameters) + + for i in range(5): + data_shuffler.add_snapshot( + "Be_snapshot0.in.npy", + data_path, + "Be_snapshot0.out.npy", + data_path, + ) + data_shuffler.shuffle_snapshots( + complete_save_path=".", + save_name="Be_shuffled*", + number_of_shuffled_snapshots=5, + ) + for i in range(4): + bispectrum = np.load("Be_shuffled" + str(i) + ".in.npy") + ldos = np.load("Be_shuffled" + str(i) + ".out.npy") + assert not np.any(np.where(np.all(ldos == 0, axis=-1).squeeze())) + assert not np.any( + np.where(np.all(bispectrum == 0, axis=-1).squeeze()) + ) diff --git a/test/workflow_test.py b/test/workflow_test.py index 8cc33faf6..6ec94b842 100644 --- a/test/workflow_test.py +++ b/test/workflow_test.py @@ -500,7 +500,7 @@ def test_total_energy_predictions(self): ) ldos_calculator.read_from_array(predicted_ldos) total_energy_traditional = ldos_calculator.total_energy - parameters.descriptors.use_atomic_density_energy_formula = True + parameters.use_atomic_density_formula = True ldos_calculator.read_from_array(predicted_ldos) total_energy_atomic_density = ldos_calculator.total_energy assert np.isclose( @@ -523,7 +523,7 @@ def __simple_training( test_parameters = mala.Parameters() test_parameters.data.data_splitting_type = "by_snapshot" test_parameters.data.input_rescaling_type = "feature-wise-standard" - test_parameters.data.output_rescaling_type = "normal" + test_parameters.data.output_rescaling_type = "minmax" test_parameters.network.layer_activations = ["ReLU"] test_parameters.running.max_number_epochs = 400 test_parameters.running.mini_batch_size = 40
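The renames applied throughout this patch (``"normal"``/``"feature-wise-normal"`` to ``"minmax"``/``"feature-wise-minmax"``, and ``parameters.descriptors.use_atomic_density_energy_formula`` to ``parameters.use_atomic_density_formula``) fit together as in the short sketch below. The sketch is illustrative only and not part of the patch; it assumes nothing beyond a plain ``mala.Parameters`` object.

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # Scaling identifiers after this patch: "minmax" replaces "normal",
    # "feature-wise-minmax" replaces "feature-wise-normal".
    parameters.data.input_rescaling_type = "feature-wise-standard"
    parameters.data.output_rescaling_type = "minmax"

    # The atomic-density total energy formula is now toggled on the top-level
    # Parameters object rather than on parameters.descriptors.
    parameters.use_atomic_density_formula = True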
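The new ``test_array_referencing`` case in ``test/scaling_test.py`` relies on the scaler operating on tensors in place, so a tensor created with ``torch.from_numpy`` keeps sharing memory with its source array through ``transform``/``inverse_transform``. A minimal sketch of that behavior follows; it mirrors the test and is illustrative, not part of the patch.

.. code-block:: python

    import numpy as np
    import torch
    import mala

    # Fit a feature-wise min-max scaler on float data with 94 features.
    scaler = mala.DataScaler("feature-wise-minmax")
    scaler.fit(torch.rand((1000, 94)))

    # torch.from_numpy shares memory with the numpy array.
    numpy_array = np.expand_dims(np.random.random(94), axis=0)
    tensor = torch.from_numpy(numpy_array)

    scaler.transform(tensor)          # scales the tensor in place
    scaler.inverse_transform(tensor)  # restores the original values in place

    # The tensor still references the numpy buffer, so changes propagate.
    numpy_array *= 2
    assert np.allclose(tensor.numpy(), numpy_array)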