diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 48f0a456c..48dc91a34 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -35,6 +35,8 @@ jobs: steps: - name: Check out repository uses: actions/checkout@v4 + with: + fetch-depth: '1' - name: Set environment variables run: | @@ -62,7 +64,7 @@ jobs: fi - name: Pull latest image from container registry - run: docker pull $IMAGE_REPO/$IMAGE_NAME || true + run: docker pull $IMAGE_REPO/$IMAGE_NAME --quiet || true - name: Build temporary Docker image run: | @@ -131,12 +133,12 @@ jobs: - name: "Prepare environment: Load Docker image from cache" if: env.DOCKER_TAG != 'latest' - run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz + run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet - name: "Prepare environment: Pull latest image from container registry" if: env.DOCKER_TAG == 'latest' run: | - docker pull $IMAGE_REPO/$IMAGE_NAME:latest + docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet docker image tag $IMAGE_REPO/$IMAGE_NAME:latest $IMAGE_NAME:latest - name: "Prepare environment: Run Docker container" @@ -155,6 +157,8 @@ jobs: - name: Check out repository (mala) uses: actions/checkout@v4 + with: + fetch-depth: '1' - name: Install mala package # Exec all commands inside the mala-cpu container @@ -174,7 +178,13 @@ jobs: # if comparison fails, `install/mala_cpu_[base]_environment.yml` needs to be aligned with # `requirements.txt` and/or extra dependencies are missing in the Docker Conda environment - diff --side-by-side --color=always env_before.yml env_after.yml + + if diff --brief env_before.yml env_after.yml + then + echo "Files env_before.yml and env_after.yml do not differ." + else + diff --side-by-side --color=always env_before.yml env_after.yml + fi - name: Download test data repository from RODARE shell: 'bash -c "docker exec -i mala-cpu python < {0}"' @@ -229,9 +239,6 @@ jobs: ((contains(github.ref_name, 'develop') || contains(github.ref_name, 'master')) && needs.build-docker-image-cpu.outputs.docker-tag != 'latest') || startsWith(github.ref, 'refs/tags/') steps: - - name: Check out repository - uses: actions/checkout@v4 - - name: "Prepare environment: Restore cache" if: env.DOCKER_TAG != 'latest' uses: actions/cache@v4 @@ -242,21 +249,19 @@ jobs: - name: "Prepare environment: Load Docker image from cache" if: env.DOCKER_TAG != 'latest' - run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz + run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet - name: "Prepare environment: Pull latest image from container registry" if: env.DOCKER_TAG == 'latest' - run: docker pull $IMAGE_REPO/$IMAGE_NAME:latest + run: docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet - name: Tag Docker image run: | # Execute on change of Docker image if [[ "$DOCKER_TAG" != 'latest' ]]; then - GIT_SHA=${GITHUB_REF_NAME}-$(git rev-parse --short "$GITHUB_SHA") - echo "GIT_SHA=$GIT_SHA" docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:latest - docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:$GIT_SHA + docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:${GITHUB_REF_NAME}-${GITHUB_SHA:0:7} fi # Execute on push of git tag @@ -272,4 +277,4 @@ jobs: run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - name: Push Docker image - run: docker push $IMAGE_REPO/$IMAGE_NAME --all-tags + run: docker push $IMAGE_REPO/$IMAGE_NAME --all-tags | grep -v -E 'Waiting|Layer already|Preparing|Pushed'
diff --git a/docs/source/advanced_usage/hyperparameters.rst b/docs/source/advanced_usage/hyperparameters.rst index 4240250e7..5c0665b44 100644 --- a/docs/source/advanced_usage/hyperparameters.rst +++ b/docs/source/advanced_usage/hyperparameters.rst @@ -114,7 +114,7 @@ a physical validation metric such as .. code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" Advanced optimization algorithms ******************************** diff --git a/docs/source/advanced_usage/predictions.rst b/docs/source/advanced_usage/predictions.rst index 7058f17de..20e82494b 100644 --- a/docs/source/advanced_usage/predictions.rst +++ b/docs/source/advanced_usage/predictions.rst @@ -40,6 +40,8 @@ Likewise, you can adjust the inference temperature via calculator.data_handler.target_calculator.temperature = ... +.. _production_gpu: + Predictions on GPU ******************* @@ -137,4 +139,3 @@ With the exception of the electronic density, which is saved into the ``.cube`` format for visualization with regular electronic structure visualization software, all of these observables can be plotted with Python based visualization libraries such as ``matplotlib``. - diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst index 52e50ec50..290aa15f3 100644 --- a/docs/source/advanced_usage/trainingmodel.rst +++ b/docs/source/advanced_usage/trainingmodel.rst @@ -77,7 +77,7 @@ Specifically, when setting .. code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" the error in the band energy between actual and predicted LDOS will be calculated and printed before and after network training (in meV/atom). @@ -205,21 +205,21 @@ visualization prior to training via # 0: No visualizatuon, 1: loss and learning rate, 2: like 1, # but additionally weights and biases are saved - parameters.running.visualisation = 1 - parameters.running.visualisation_dir = "mala_vis" + parameters.running.logging = 1 + parameters.running.logging_dir = "mala_vis" -where ``visualisation_dir`` specifies some directory in which to save the -MALA visualization data. Afterwards, you can run the training without any +where ``logging_dir`` specifies some directory in which to save the +MALA logging data. Afterwards, you can run the training without any other modifications. Once training is finished (or during training, in case you want to use tensorboard to monitor progress), you can launch tensorboard via .. code-block:: bash - tensorboard --logdir path_to_visualization + tensorboard --logdir path_to_log_directory -The full path for ``path_to_visualization`` can be accessed via -``trainer.full_visualization_path``. +The full path for ``path_to_log_directory`` can be accessed via +``trainer.full_logging_path``. Training in parallel diff --git a/docs/source/basic_usage/hyperparameters.rst b/docs/source/basic_usage/hyperparameters.rst index 11742932d..d10bb440e 100644 --- a/docs/source/basic_usage/hyperparameters.rst +++ b/docs/source/basic_usage/hyperparameters.rst @@ -118,9 +118,9 @@ properties of the ``Parameters`` class: during the optimization. - ``network.layer_sizes`` - ``"int"``, ``"categorical"`` - * - ``"trainingtype"`` + * - ``"optimizer"`` - Optimization algorithm used during the NN optimization. 
- - ``running.trainingtype`` + - ``running.optimizer`` - ``"categorical"`` * - ``"mini_batch_size"`` - Size of the mini batches used to calculate the gradient during diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst index 3995865e6..e6bc8c967 100644 --- a/docs/source/basic_usage/trainingmodel.rst +++ b/docs/source/basic_usage/trainingmodel.rst @@ -35,7 +35,7 @@ options to train a simple network with example data, namely parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.verbosity = 1 # level of output; 1 is standard, 0 is low, 2 is debug. Here, we can see that the ``Parameters`` object contains multiple diff --git a/docs/source/install/installing_lammps.rst b/docs/source/install/installing_lammps.rst index 50fb41cef..28affb950 100644 --- a/docs/source/install/installing_lammps.rst +++ b/docs/source/install/installing_lammps.rst @@ -41,18 +41,24 @@ The MALA team recommends to build LAMMPS with ``cmake``. To do so * ``Kokkos_ARCH_GPUARCH=???``: Your GPU architecture (see see `Kokkos instructions `_) * ``CMAKE_CXX_COMPILER=???``: Path to the ``nvcc_wrapper`` executable shipped with the LAMMPS code, should be at ``/your/path/to/lammps/lib/kokkos/bin/nvcc_wrapper`` -* For example, this configures the LAMMPS cmake build with Kokkos support - for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: + + For example, this configures the LAMMPS cmake build with Kokkos support + for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: .. code-block:: bash cmake ../cmake -D PKG_KOKKOS=yes -D BUILD_MPI=yes -D PKG_ML-SNAP=yes -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ARCH_HSW=yes -D Kokkos_ARCH_VOLTA70=yes -D CMAKE_CXX_COMPILER=/path/to/lammps/lib/kokkos/bin/nvcc_wrapper -D BUILD_SHARED_LIBS=yes + .. note:: + When using a GPU by setting ``parameters.use_gpu = True``, you *need* to + have a GPU version of ``LAMMPS`` installed. See :ref:`production_gpu` for + details. * Build the library and executable with ``cmake --build .`` (Add ``--parallel=8`` for a faster build) + Installing the Python extension ******************************** diff --git a/docs/source/install/installing_qe.rst b/docs/source/install/installing_qe.rst index 3b426ba48..9ff514c7a 100644 --- a/docs/source/install/installing_qe.rst +++ b/docs/source/install/installing_qe.rst @@ -4,24 +4,25 @@ Installing Quantum ESPRESSO (total energy module) Prerequisites ************* -To run the total energy module, you need a full Quantum ESPRESSO installation, -for which to install the Python bindings. This module has been tested with -version ``7.2.``, the most recent version at the time of this release of MALA. -Newer versions may work (untested), but installation instructions may vary. +To build and run the total energy module, you need a full Quantum ESPRESSO +installation, for which to install the Python bindings. This module has been +tested with version ``7.2.``, the most recent version at the time of this +release of MALA. Newer versions may work (untested), but installation +instructions may vary. Make sure you have an (MPI-aware) F90 compiler such as ``mpif90`` (e.g. Debian-ish machine: ``apt install openmpi-bin``, on an HPC cluster something like ``module load openmpi gcc``). Make sure to use the same compiler for QE and the extension. 
This should be the default case, but if problems arise you can manually select the compiler via -``--f90exec=`` in ``build_total_energy_energy_module.sh`` +``--f90exec=`` in ``build_total_energy_module.sh`` We assume that QE's ``configure`` script will find your system libs, e.g. use ``-lblas``, ``-llapack`` and ``-lfftw3``. We use those by default in -``build_total_energy_energy_module.sh``. If you have, say, the MKL library, +``build_total_energy_module.sh``. If you have, say, the MKL library, you may see ``configure`` use something like ``-lmkl_intel_lp64 -lmkl_sequential -lmkl_core`` when building QE. In this case you have to modify -``build_total_energy_energy_module.sh`` to use the same libraries! +``build_total_energy_module.sh`` to use the same libraries! Build Quantum ESPRESSO ********************** @@ -35,10 +36,16 @@ Build Quantum ESPRESSO * Change to the ``external_modules/total_energy_module`` directory of the MALA repository +.. note:: + At the moment, building QE using ``cmake`` `doesn't work together with the + build_total_energy_module.sh script + `_. Please use the + ``configure`` + ``make`` build workflow. + Installing the Python extension ******************************** -* Run ``build_total_energy_energy_module.sh /path/to/your/q-e``. +* Run ``build_total_energy_module.sh /path/to/your/q-e``. * If the build is successful, a file named something like ``total_energy.cpython-39m-x86_64-linux-gnu.so`` will be generated. This is diff --git a/examples/advanced/ex01_checkpoint_training.py b/examples/advanced/ex01_checkpoint_training.py index 01bb9b486..5222a5232 100644 --- a/examples/advanced/ex01_checkpoint_training.py +++ b/examples/advanced/ex01_checkpoint_training.py @@ -26,7 +26,7 @@ def initial_setup(): parameters.running.max_number_epochs = 9 parameters.running.mini_batch_size = 8 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" # We checkpoint the training every 5 epochs and save the results # as "ex07". diff --git a/examples/advanced/ex03_tensor_board.py b/examples/advanced/ex03_tensor_board.py index b15239495..43a066aaf 100644 --- a/examples/advanced/ex03_tensor_board.py +++ b/examples/advanced/ex03_tensor_board.py @@ -18,7 +18,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # Turn the visualization on and select a folder to save the visualization # files into. 
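The renames running through this diff (``trainingtype`` → ``optimizer``, ``visualisation``/``visualisation_dir`` → ``logging``/``logging_dir``, ``full_visualization_path`` → ``full_logging_path``) touch many files. As a quick reference for reviewers, here is a minimal sketch of how the post-rename options fit together; it is assembled from the renamed settings shown in this diff and is not copied from any single example script:

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # Formerly parameters.running.trainingtype.
    parameters.running.optimizer = "Adam"

    # Formerly visualisation / visualisation_dir.
    # 0: no logging, 1: loss and learning rate, 2: like 1, but weights
    # and biases are additionally saved.
    parameters.running.logging = 1
    parameters.running.logging_dir = "mala_vis"

    # ... set up data_handler, network and trainer as in the basic examples ...
    # After training, the tensorboard log directory is available as
    # trainer.full_logging_path (formerly trainer.full_visualization_path):
    #     tensorboard --logdir <trainer.full_logging_path>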
@@ -45,6 +45,6 @@ trainer.train_network() printout( 'Run finished, launch tensorboard with "tensorboard --logdir ' - + trainer.full_visualization_path + + trainer.full_logging_path + '"' ) diff --git a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py index cef7c8f4f..99a92fa35 100644 --- a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py +++ b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def initial_setup(): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 9 parameters.hyperparameters.checkpoints_each_trial = 5 parameters.hyperparameters.checkpoint_name = "ex05_checkpoint" diff --git a/examples/advanced/ex06_distributed_hyperparameter_optimization.py b/examples/advanced/ex06_distributed_hyperparameter_optimization.py index b34f9bb8b..215dd1ab2 100644 --- a/examples/advanced/ex06_distributed_hyperparameter_optimization.py +++ b/examples/advanced/ex06_distributed_hyperparameter_optimization.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 10 parameters.hyperparameters.checkpoints_each_trial = -1 parameters.hyperparameters.checkpoint_name = "ex06" @@ -44,7 +44,7 @@ parameters.targets.ldos_gridspacing_ev = 2.5 parameters.targets.ldos_gridoffset_ev = -5 parameters.hyperparameters.number_training_per_trial = 3 -parameters.running.after_before_training_metric = "band_energy" +parameters.running.after_training_metric = "band_energy" data_handler = mala.DataHandler(parameters) diff --git a/examples/advanced/ex07_advanced_hyperparameter_optimization.py b/examples/advanced/ex07_advanced_hyperparameter_optimization.py index 8165ef01e..242ffd7dd 100644 --- a/examples/advanced/ex07_advanced_hyperparameter_optimization.py +++ b/examples/advanced/ex07_advanced_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def optimize_hyperparameters(hyper_optimizer): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 8 parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -64,7 +64,7 @@ def optimize_hyperparameters(hyper_optimizer): data_handler.output_dimension, ] hyperoptimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) hyperoptimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index 95eb2d51b..1eca8c6b7 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # These parameters characterize how the LDOS and bispectrum descriptors # were calculated. 
They are _technically_ not needed to train a simple # network. However, it is useful to define them prior to training. Then, diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 2e4b8953c..0d90dfe7f 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -21,15 +21,15 @@ # It is recommended to enable the "lazy-loading" feature, so that # data is loaded into memory one snapshot at a time during testing - this # helps keep RAM requirement down. Furthermore, you have to decide which -# observables to test (usual choices are "band_energy", "total_energy" and -# "number_of_electrons") and whether you want the results per snapshot +# observables to test (usual choices are "band_energy", "total_energy") +# and whether you want the results per snapshot # (output_format="list") or as an averaged value (output_format="mae") #################### parameters, network, data_handler, tester = mala.Tester.load_run( run_name=model_name, path=model_path ) -tester.observables_to_test = ["band_energy", "number_of_electrons"] +tester.observables_to_test = ["band_energy", "density"] tester.output_format = "list" parameters.data.use_lazy_loading = True diff --git a/examples/basic/ex04_hyperparameter_optimization.py b/examples/basic/ex04_hyperparameter_optimization.py index 4c68179c2..cebb4c42e 100644 --- a/examples/basic/ex04_hyperparameter_optimization.py +++ b/examples/basic/ex04_hyperparameter_optimization.py @@ -22,7 +22,7 @@ parameters.data.output_rescaling_type = "normal" parameters.running.max_number_epochs = 20 parameters.running.mini_batch_size = 40 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 20 #################### diff --git a/mala/common/parameters.py b/mala/common/parameters.py index 51e5ac937..c9b1b826c 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -733,7 +733,7 @@ def __init__(self): self.learning_rate_decay = 0.1 self.learning_rate_patience = 0 self._during_training_metric = "ldos" - self._after_before_training_metric = "ldos" + self._after_training_metric = "ldos" self.use_compression = False self.num_workers = 0 self.use_shuffling_for_samplers = True @@ -755,7 +755,7 @@ def __init__(self): def _update_ddp(self, new_ddp): super(ParametersRunning, self)._update_ddp(new_ddp) self.during_training_metric = self.during_training_metric - self.after_before_training_metric = self.after_before_training_metric + self.after_training_metric = self.after_training_metric @property def during_training_metric(self): @@ -783,7 +783,7 @@ def during_training_metric(self, value): self._during_training_metric = value @property - def after_before_training_metric(self): + def after_training_metric(self): """ Get the metric used during training. @@ -795,17 +795,17 @@ def after_before_training_metric(self): DFT results. Of these, the mean average error in eV/atom will be calculated. """ - return self._after_before_training_metric + return self._after_training_metric - @after_before_training_metric.setter - def after_before_training_metric(self, value): + @after_training_metric.setter + def after_training_metric(self, value): if value != "ldos": if self._configuration["ddp"]: raise Exception( "Currently, MALA can only operate with the " '"ldos" metric for ddp runs.' 
) - self._after_before_training_metric = value + self._after_training_metric = value @during_training_metric.setter def during_training_metric(self, value): diff --git a/mala/network/hyper_opt_naswot.py b/mala/network/hyper_opt_naswot.py index ae27f7d13..9a11e1ca0 100644 --- a/mala/network/hyper_opt_naswot.py +++ b/mala/network/hyper_opt_naswot.py @@ -39,7 +39,7 @@ def __init__(self, params, data): self.trial_list = None self.ignored_hyperparameters = [ "learning_rate", - "trainingtype", + "optimizer", "mini_batch_size", "early_stopping_epochs", "learning_rate_patience", diff --git a/mala/network/objective_base.py b/mala/network/objective_base.py index 52d0d9464..2fbf29503 100644 --- a/mala/network/objective_base.py +++ b/mala/network/objective_base.py @@ -231,8 +231,8 @@ def parse_trial_optuna(self, trial: Trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter(trial) + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter(trial) elif "mini_batch_size" == par.name: self.params.running.mini_batch_size = par.get_parameter(trial) @@ -358,8 +358,8 @@ def parse_trial_oat(self, trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter( + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter( trial, factor_idx ) elif "mini_batch_size" == par.name: diff --git a/mala/network/runner.py b/mala/network/runner.py index f62bd2b9c..beb7c6c17 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -12,6 +12,7 @@ import mala from mala.common.parallelizer import get_rank from mala.common.parameters import ParametersRunning +from mala.datahandling.fast_tensor_dataset import FastTensorDataset from mala.network.network import Network from mala.datahandling.data_scaler import DataScaler from mala.datahandling.data_handler import DataHandler @@ -78,38 +79,21 @@ def _calculate_errors( non_energy_metrics = [ metric for metric in metrics if "energy" not in metric ] - errors = self._calculate_energy_errors( - actual_outputs, predicted_outputs, energy_metrics, snapshot_number - ) + if len(energy_metrics) > 0: + errors = self._calculate_energy_errors( + actual_outputs, + predicted_outputs, + energy_metrics, + snapshot_number, + ) + else: + errors = {} for metric in non_energy_metrics: try: if metric == "ldos": error = np.mean((predicted_outputs - actual_outputs) ** 2) errors[metric] = error - elif metric == "number_of_electrons": - target_calculator = self.data.target_calculator - if ( - not isinstance(target_calculator, LDOS) - and not isinstance(target_calculator, DOS) - and not isinstance(target_calculator, Density) - ): - raise Exception( - "Cannot calculate the band energy from this observable." 
- ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output( - snapshot_number - ) - ) - actual = target_calculator.get_number_of_electrons( - actual_outputs - ) - predicted = target_calculator.get_number_of_electrons( - predicted_outputs - ) - errors[metric] = actual - predicted - elif metric == "density": target_calculator = self.data.target_calculator if not isinstance( @@ -179,7 +163,7 @@ def _calculate_errors( errors[metric] = np.abs(actual - predicted).mean() - elif metric == "dos_realtive": + elif metric == "dos_relative": target_calculator = self.data.target_calculator if not isinstance( target_calculator, LDOS @@ -211,9 +195,11 @@ def _calculate_errors( ).mean() * 100 ) + else: + raise Exception(f"Invalid metric ({metric}) requested.") except ValueError as e: printout( - f"Error calculating observable: {observable} for snapshot {snapshot_number}", + f"Error calculating observable: {metric} for snapshot {snapshot_number}", min_verbosity=0, ) printout(e, min_verbosity=2) @@ -241,9 +227,14 @@ def _calculate_energy_errors( Snapshot number for which the errors are calculated. """ target_calculator = self.data.target_calculator - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) + output_file = self.data.get_snapshot_calculation_output( + snapshot_number ) + if not output_file: + raise Exception( + "Output file needed for energy error calculations." + ) + target_calculator.read_additional_calculation_data(output_file) errors = {} fe_dft = target_calculator.fermi_energy_dft @@ -737,28 +728,51 @@ def _forward_entire_snapshot( from_index += snapshot.grid_size grid_size = to_index - from_index - if self.data.parameters.use_lazy_loading: - data_set.return_outputs_directly = True - actual_outputs = (data_set[from_index:to_index])[1] - else: - actual_outputs = self.data.output_data_scaler.inverse_transform( - (data_set[from_index:to_index])[1], as_numpy=True + if isinstance(data_set, FastTensorDataset): + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) - - predicted_outputs = np.zeros((grid_size, self.data.output_dimension)) - - for i in range(0, number_of_batches_per_snapshot): - inputs, outputs = data_set[ - from_index - + (i * batch_size) : from_index - + ((i + 1) * batch_size) - ] - inputs = inputs.to(self.parameters._configuration["device"]) - predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( - self.data.output_data_scaler.inverse_transform( + actual_outputs = np.zeros((grid_size, self.data.output_dimension)) + + for i in range(len(data_set)): + inputs, outputs = data_set[from_index + i] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( self.network(inputs).to("cpu"), as_numpy=True ) + actual_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( + torch.tensor(outputs), as_numpy=True + ) + else: + if self.data.parameters.use_lazy_loading: + data_set.return_outputs_directly = True + actual_outputs = (data_set[from_index:to_index])[1] + else: + actual_outputs = ( + self.data.output_data_scaler.inverse_transform( + (data_set[from_index:to_index])[1], as_numpy=True + ) + ) + + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) + for i in range(0, number_of_batches_per_snapshot): + inputs, outputs = data_set[ + 
from_index + + (i * batch_size) : from_index + + ((i + 1) * batch_size) + ] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( + self.data.output_data_scaler.inverse_transform( + self.network(inputs).to("cpu"), as_numpy=True + ) + ) # Restricting the actual quantities to physical meaningful values, # i.e. restricting the (L)DOS to positive values. diff --git a/mala/network/trainer.py b/mala/network/trainer.py index a30820ea0..c37add951 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -55,8 +55,6 @@ def __init__(self, params, network, data, optimizer_dict=None): self.network = DDP(self.network) torch.cuda.current_stream().wait_stream(s) - self.final_test_loss = float("inf") - self.initial_test_loss = float("inf") self.final_validation_loss = float("inf") self.initial_validation_loss = float("inf") self.optimizer = None @@ -66,11 +64,9 @@ def __init__(self, params, network, data, optimizer_dict=None): self.last_loss = None self.training_data_loaders = [] self.validation_data_loaders = [] - self.test_data_loaders = [] # Samplers for the ddp case. self.train_sampler = None - self.test_sampler = None self.validation_sampler = None self.__prepare_to_train(optimizer_dict) @@ -267,12 +263,10 @@ def train_network(self): # CALCULATE INITIAL METRICS ############################ - tloss = float("inf") vloss = float("inf") # Save losses for later use. self.initial_validation_loss = vloss - self.initial_test_loss = tloss # Initialize all the counters. checkpoint_counter = 0 @@ -422,8 +416,6 @@ def train_network(self): t1 = time.time() printout(f"training time: {t1 - t0}", min_verbosity=2) - training_loss = training_loss_sum.item() / batchid - # Calculate the validation loss. and output it. torch.cuda.synchronize( self.parameters._configuration["device"] @@ -442,7 +434,6 @@ def train_network(self): self.network, inputs, outputs ) batchid += 1 - training_loss = training_loss_sum.item() / batchid dataset_fractions = ["validation"] if self.parameters.validate_on_training_data: dataset_fractions.append("train") @@ -457,6 +448,12 @@ def train_network(self): vloss = errors["validation"][ self.parameters.during_training_metric ] + if self.parameters_full.use_ddp: + vloss = self.__average_validation( + vloss, + "average_loss", + self.parameters._configuration["device"], + ) if self.parameters_full.verbosity > 1: printout("Errors:", errors, min_verbosity=2) else: @@ -558,12 +555,144 @@ def train_network(self): min_verbosity=2, ) + ############################ + # CALCULATE FINAL METRICS + ############################ + if self.parameters.after_training_metric in errors["validation"]: + self.final_validation_loss = errors["validation"][ + self.parameters.after_training_metric + ] + else: + final_errors = self._validate_network( + ["validation"], [self.parameters.after_training_metric] + ) + vloss = np.mean( + final_errors["validation"][ + self.parameters.after_training_metric + ] + ) + + if self.parameters_full.use_ddp: + vloss = self.__average_validation( + vloss, + "average_loss", + self.parameters._configuration["device"], + ) + self.final_validation_loss = vloss + # Clean-up for pre-fetching lazy loading. 
if self.data.parameters.use_lazy_loading_prefetch: self.training_data_loaders.cleanup() self.validation_data_loaders.cleanup() - if len(self.data.test_data_sets) > 0: - self.test_data_loaders.cleanup() + + def _validate_network(self, data_set_fractions, metrics): + # """Validate a network, using train or validation data.""" + self.network.eval() + errors = {} + for data_set_type in data_set_fractions: + if data_set_type == "train": + data_loaders = self.training_data_loaders + data_sets = self.data.training_data_sets + number_of_snapshots = self.data.nr_training_snapshots + offset_snapshots = 0 + + elif data_set_type == "validation": + data_loaders = self.validation_data_loaders + data_sets = self.data.validation_data_sets + number_of_snapshots = self.data.nr_validation_snapshots + offset_snapshots = self.data.nr_training_snapshots + + elif data_set_type == "test": + raise Exception( + "You should not look at test set results during training" + ) + else: + raise Exception( + f"Dataset type ({data_set_type}) not recognized." + ) + + errors[data_set_type] = {} + for metric in metrics: + errors[data_set_type][metric] = [] + + if isinstance(data_loaders, MultiLazyLoadDataLoader): + loader_id = 0 + for loader in data_loaders: + grid_size = self.data.parameters.snapshot_directories_list[ + loader_id + offset_snapshots + ].grid_size + + actual_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + last_start = 0 + + for x, y in loader: + + x = x.to(self.parameters._configuration["device"]) + length = int(x.size()[0]) + predicted_outputs[ + last_start : last_start + length, : + ] = self.data.output_data_scaler.inverse_transform( + self.network(x).to("cpu"), as_numpy=True + ) + actual_outputs[last_start : last_start + length, :] = ( + self.data.output_data_scaler.inverse_transform( + y, as_numpy=True + ) + ) + + last_start += length + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + loader_id + offset_snapshots, + ) + loader_id += 1 + else: + with torch.no_grad(): + for snapshot_number in trange( + offset_snapshots, + number_of_snapshots + offset_snapshots, + desc="Validation", + disable=self.parameters_full.verbosity < 2, + ): + # Get optimal batch size and number of batches per snapshotss + grid_size = ( + self.data.parameters.snapshot_directories_list[ + snapshot_number + ].grid_size + ) + + optimal_batch_size = ( + self._correct_batch_size_for_testing( + grid_size, self.parameters.mini_batch_size + ) + ) + number_of_batches_per_snapshot = int( + grid_size / optimal_batch_size + ) + + actual_outputs, predicted_outputs = ( + self._forward_entire_snapshot( + snapshot_number, + data_sets[0], + data_set_type[0:2], + number_of_batches_per_snapshot, + optimal_batch_size, + ) + ) + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + snapshot_number, + ) + return errors def _validate_network(self, data_set_fractions, metrics): # """Validate a network, using train, test or validation data.""" @@ -732,16 +861,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - self.test_sampler = ( - torch.utils.data.distributed.DistributedSampler( - self.data.test_data_sets[0], - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - shuffle=False, - ) - ) - # Instantiate the learning rate scheduler, if necessary. 
if self.parameters.learning_rate_scheduler == "ReduceLROnPlateau": self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( @@ -829,21 +948,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - if isinstance(self.data.test_data_sets[0], LazyLoadDatasetSingle): - self.test_data_loaders = MultiLazyLoadDataLoader( - self.data.test_data_sets, **kwargs - ) - else: - self.test_data_loaders.append( - DataLoader( - self.data.test_data_sets[0], - batch_size=self.parameters.mini_batch_size * 1, - sampler=self.test_sampler, - **kwargs, - ) - ) - def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" if self.parameters._configuration["gpu"]: @@ -1007,17 +1111,14 @@ def __create_training_checkpoint(self): torch.save( save_dict, optimizer_name, _use_new_zipfile_serialization=False ) - if self.parameters.run_name != '': + if self.parameters.run_name != "": self.save_run( self.parameters.checkpoint_name, save_runner=True, save_path=self.parameters.run_name, ) else: - self.save_run( - self.parameters.checkpoint_name, - save_runner=True - ) + self.save_run(self.parameters.checkpoint_name, save_runner=True) @staticmethod def __average_validation(val, name, device="cpu"): diff --git a/test/all_lazy_loading_test.py b/test/all_lazy_loading_test.py index 065cbb86e..351c98292 100644 --- a/test/all_lazy_loading_test.py +++ b/test/all_lazy_loading_test.py @@ -38,7 +38,7 @@ def test_scaling(self): test_parameters.running.max_number_epochs = 3 test_parameters.running.mini_batch_size = 512 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Lazy loading test." test_parameters.network.nn_type = "feed-forward" test_parameters.running.use_gpu = True @@ -157,10 +157,7 @@ def test_scaling(self): test_parameters, test_network, data_handler ) test_trainer.train_network() - training_tester.append( - test_trainer.final_test_loss - - test_trainer.initial_test_loss - ) + training_tester.append(test_trainer.final_validation_loss) elif scalingtype == "feature-wise-standard": # The lazy-loading STD equation (and to a smaller amount the @@ -269,7 +266,7 @@ def test_performance_horovod(self): test_parameters.network.layer_activations = ["LeakyReLU"] test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 500 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Horovod / lazy loading benchmark." 
test_parameters.network.nn_type = "feed-forward" test_parameters.manual_seed = 2021 @@ -352,8 +349,8 @@ def test_performance_horovod(self): [ hvdstring, llstring, - test_trainer.initial_test_loss, - test_trainer.final_test_loss, + test_trainer.initial_validation_loss, + test_trainer.final_validation_loss, time.time() - start_time, ] ) @@ -400,8 +397,8 @@ def _train_lazy_loading(prefetching): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" - test_parameters.verbosity = 2 + test_parameters.running.optimizer = "Adam" + test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True test_parameters.data.use_lazy_loading_prefetch = prefetching diff --git a/test/basic_gpu_test.py b/test/basic_gpu_test.py index dcd588ad1..514a70f21 100644 --- a/test/basic_gpu_test.py +++ b/test/basic_gpu_test.py @@ -91,7 +91,7 @@ def __run(use_gpu): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.manual_seed = 1002 test_parameters.running.use_shuffling_for_samplers = False test_parameters.use_gpu = use_gpu @@ -150,4 +150,4 @@ def __run(use_gpu): starttime = time.time() test_trainer.train_network() - return test_trainer.final_test_loss, time.time() - starttime + return test_trainer.final_validation_loss, time.time() - starttime diff --git a/test/checkpoint_hyperopt_test.py b/test/checkpoint_hyperopt_test.py index 28889c2df..a1909f21b 100644 --- a/test/checkpoint_hyperopt_test.py +++ b/test/checkpoint_hyperopt_test.py @@ -67,7 +67,7 @@ def __original_setup(n_trials): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" # Specify the number of trials, the hyperparameter optimizer should run # and the type of hyperparameter. diff --git a/test/checkpoint_training_test.py b/test/checkpoint_training_test.py index 4c56ed8eb..3bc5e83e3 100644 --- a/test/checkpoint_training_test.py +++ b/test/checkpoint_training_test.py @@ -20,7 +20,7 @@ def test_general(self): # First run the entire test. trainer = self.__original_setup(test_checkpoint_name, 40) trainer.train_network() - original_final_test_loss = trainer.final_test_loss + original_final_validation_loss = trainer.final_validation_loss # Now do the same, but cut at epoch 22 and see if it recovers the # correct result. 
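The test changes above and below replace the removed ``initial_test_loss`` / ``final_test_loss`` attributes with their validation counterparts. A minimal sketch of the post-change pattern, with names taken from this diff; ``parameters``, ``network`` and ``data_handler`` are assumed to be set up as in the basic training example:

.. code-block:: python

    # The after-training metric (default "ldos") now also determines what
    # trainer.final_validation_loss reports once training has finished.
    parameters.running.after_training_metric = "band_energy"

    trainer = mala.Trainer(parameters, network, data_handler)
    trainer.train_network()

    # initial_test_loss / final_test_loss no longer exist on the Trainer;
    # the validation losses are used instead.
    print(trainer.initial_validation_loss, trainer.final_validation_loss)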
@@ -28,9 +28,11 @@ def test_general(self): trainer.train_network() trainer = self.__resume_checkpoint(test_checkpoint_name, 40) trainer.train_network() - new_final_test_loss = trainer.final_test_loss + new_final_validation_loss = trainer.final_validation_loss assert np.isclose( - original_final_test_loss, new_final_test_loss, atol=accuracy + original_final_validation_loss, + new_final_validation_loss, + atol=accuracy, ) def test_learning_rate(self): @@ -144,7 +146,7 @@ def __original_setup( test_parameters.running.max_number_epochs = maxepochs test_parameters.running.mini_batch_size = 38 test_parameters.running.learning_rate = learning_rate - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.running.learning_rate_scheduler = ( learning_rate_scheduler ) diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py index 65a26c26b..8aa7da85d 100644 --- a/test/complete_interfaces_test.py +++ b/test/complete_interfaces_test.py @@ -114,7 +114,7 @@ def test_ase_calculator(self): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.targets.target_type = "LDOS" test_parameters.targets.ldos_gridsize = 11 test_parameters.targets.ldos_gridspacing_ev = 2.5 diff --git a/test/examples_test.py b/test/examples_test.py index b5aa9143a..4a83dd538 100644 --- a/test/examples_test.py +++ b/test/examples_test.py @@ -6,6 +6,7 @@ import pytest + @pytest.mark.examples class TestExamples: dir_path = os.path.dirname(__file__) @@ -13,96 +14,85 @@ class TestExamples: def test_basic_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex01_train_network.py" + self.dir_path + "/../examples/basic/ex01_train_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex02(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex02_test_network.py" + self.dir_path + "/../examples/basic/ex02_test_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex03_preprocess_data.py" + self.dir_path + "/../examples/basic/ex03_preprocess_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex04(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex04_hyperparameter_optimization.py" + self.dir_path + + "/../examples/basic/ex04_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex05_run_predictions.py" + self.dir_path + "/../examples/basic/ex05_run_predictions.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex06_ase_calculator.py" + self.dir_path + "/../examples/basic/ex06_ase_calculator.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex01_checkpoint_training.py" + self.dir_path + "/../examples/advanced/ex01_checkpoint_training.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex02(self, tmp_path): os.chdir(tmp_path) 
runpy.run_path( - self.dir_path + - "/../examples/advanced/ex02_shuffle_data.py" + self.dir_path + "/../examples/advanced/ex02_shuffle_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex03_tensor_board.py" + self.dir_path + "/../examples/advanced/ex03_tensor_board.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex04(self, tmp_path): os.chdir(tmp_path) - runpy.run_path( - self.dir_path + - "/../examples/advanced/ex04_acsd.py" - ) + runpy.run_path(self.dir_path + "/../examples/advanced/ex04_acsd.py") @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" ) @pytest.mark.skipif( @@ -113,14 +103,14 @@ def test_advanced_ex06(self, tmp_path): def test_advanced_ex07(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex08(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex08_visualize_observables.py" + self.dir_path + + "/../examples/advanced/ex08_visualize_observables.py" ) diff --git a/test/hyperopt_test.py b/test/hyperopt_test.py index bb003082a..77b0b9896 100644 --- a/test/hyperopt_test.py +++ b/test/hyperopt_test.py @@ -42,7 +42,7 @@ def test_hyperopt(self): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" @@ -133,7 +133,7 @@ def test_distributed_hyperopt(self): test_parameters.running.max_number_epochs = 5 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" @@ -242,7 +242,7 @@ def test_naswot_eigenvalues(self): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = "naswot" @@ -310,7 +310,7 @@ def __optimize_hyperparameters(hyper_optimizer): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" 
test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -352,7 +352,7 @@ def __optimize_hyperparameters(hyper_optimizer): # If we do a NASWOT run currently we can provide an input # array of trials. test_hp_optimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) test_hp_optimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] @@ -375,7 +375,7 @@ def __optimize_hyperparameters(hyper_optimizer): ) test_trainer.train_network() test_parameters.show() - return test_trainer.final_test_loss + return test_trainer.final_validation_loss def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): @@ -391,7 +391,7 @@ def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): test_parameters.running.max_number_epochs = 2 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 2 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" diff --git a/test/shuffling_test.py b/test/shuffling_test.py index e637c7d2b..72d28d6ef 100644 --- a/test/shuffling_test.py +++ b/test/shuffling_test.py @@ -124,7 +124,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -168,7 +168,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True data_shuffler = mala.DataShuffler(test_parameters) @@ -220,7 +220,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -266,7 +266,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True diff --git a/test/workflow_test.py b/test/workflow_test.py index fa7dee018..8cc33faf6 100644 --- a/test/workflow_test.py +++ b/test/workflow_test.py @@ -29,28 +29,19 @@ def test_network_training(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training() - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_openpmd(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_openpmd_data=True) - assert ( - desired_loss_improvement_factor * 
test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_fast_dataset(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_fast_tensor_dataset=True) - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_preprocessing(self): """ @@ -191,16 +182,8 @@ def test_postprocessing_from_dos(self): self_consistent_fermi_energy = dos.get_self_consistent_fermi_energy( dos_data ) - number_of_electrons = dos.get_number_of_electrons( - dos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = dos.get_band_energy(dos_data) - assert np.isclose( - number_of_electrons, - dos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, dos.band_energy_dft_calculation, @@ -232,18 +215,10 @@ def test_postprocessing(self): self_consistent_fermi_energy = ldos.get_self_consistent_fermi_energy( ldos_data ) - number_of_electrons = ldos.get_number_of_electrons( - ldos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = ldos.get_band_energy( ldos_data, fermi_energy=self_consistent_fermi_energy ) - assert np.isclose( - number_of_electrons, - ldos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, ldos.band_energy_dft_calculation, @@ -403,13 +378,12 @@ def test_training_with_postprocessing_data_repo(self): data_handler.prepare_data(reparametrize_scaler=False) # Instantiate and use a Tester object. - tester.observables_to_test = ["band_energy", "number_of_electrons"] + tester.observables_to_test = ["band_energy"] errors = tester.test_snapshot(0) # Check whether the prediction is accurate enough. - assert np.isclose(errors["band_energy"], 0, atol=accuracy_predictions) assert np.isclose( - errors["number_of_electrons"], 0, atol=accuracy_predictions + errors["band_energy"], 0, atol=accuracy_predictions * 1000 ) @pytest.mark.skipif( @@ -460,9 +434,6 @@ def test_predictions(self): band_energy_tester_class = ldos_calculator.get_band_energy( predicted_ldos ) - nr_electrons_tester_class = ldos_calculator.get_number_of_electrons( - predicted_ldos - ) #################### # Now, use the predictor class to make the same prediction. @@ -478,12 +449,6 @@ def test_predictions(self): ldos_calculator.read_additional_calculation_data( os.path.join(data_path, "Be_snapshot3.out"), "espresso-out" ) - - nr_electrons_predictor_class = ( - data_handler.target_calculator.get_number_of_electrons( - predicted_ldos - ) - ) band_energy_predictor_class = ( data_handler.target_calculator.get_band_energy(predicted_ldos) ) @@ -493,11 +458,6 @@ def test_predictions(self): band_energy_tester_class, atol=accuracy_strict, ) - assert np.isclose( - nr_electrons_predictor_class, - nr_electrons_tester_class, - atol=accuracy_strict, - ) @pytest.mark.skipif( importlib.util.find_spec("total_energy") is None @@ -568,7 +528,7 @@ def __simple_training( test_parameters.running.max_number_epochs = 400 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.data.use_fast_tensor_data_set = use_fast_tensor_dataset # Load data.