From c4c587f9984f52acc2c15756955a9210f0e175b1 Mon Sep 17 00:00:00 2001 From: Steve Schmerler Date: Thu, 20 Jun 2024 23:54:04 +0200 Subject: [PATCH 1/9] doc: link to GPU usage docs from lammps install section --- docs/source/advanced_usage/predictions.rst | 3 ++- docs/source/install/installing_lammps.rst | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/advanced_usage/predictions.rst b/docs/source/advanced_usage/predictions.rst index 7058f17de..20e82494b 100644 --- a/docs/source/advanced_usage/predictions.rst +++ b/docs/source/advanced_usage/predictions.rst @@ -40,6 +40,8 @@ Likewise, you can adjust the inference temperature via calculator.data_handler.target_calculator.temperature = ... +.. _production_gpu: + Predictions on GPU ******************* @@ -137,4 +139,3 @@ With the exception of the electronic density, which is saved into the ``.cube`` format for visualization with regular electronic structure visualization software, all of these observables can be plotted with Python based visualization libraries such as ``matplotlib``. - diff --git a/docs/source/install/installing_lammps.rst b/docs/source/install/installing_lammps.rst index 50fb41cef..28affb950 100644 --- a/docs/source/install/installing_lammps.rst +++ b/docs/source/install/installing_lammps.rst @@ -41,18 +41,24 @@ The MALA team recommends to build LAMMPS with ``cmake``. To do so * ``Kokkos_ARCH_GPUARCH=???``: Your GPU architecture (see see `Kokkos instructions `_) * ``CMAKE_CXX_COMPILER=???``: Path to the ``nvcc_wrapper`` executable shipped with the LAMMPS code, should be at ``/your/path/to/lammps/lib/kokkos/bin/nvcc_wrapper`` -* For example, this configures the LAMMPS cmake build with Kokkos support - for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: + + For example, this configures the LAMMPS cmake build with Kokkos support + for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: .. code-block:: bash cmake ../cmake -D PKG_KOKKOS=yes -D BUILD_MPI=yes -D PKG_ML-SNAP=yes -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ARCH_HSW=yes -D Kokkos_ARCH_VOLTA70=yes -D CMAKE_CXX_COMPILER=/path/to/lammps/lib/kokkos/bin/nvcc_wrapper -D BUILD_SHARED_LIBS=yes + .. note:: + When using a GPU by setting ``parameters.use_gpu = True``, you *need* to + have a GPU version of ``LAMMPS`` installed. See :ref:`production_gpu` for + details. * Build the library and executable with ``cmake --build .`` (Add ``--parallel=8`` for a faster build) + Installing the Python extension ******************************** From bf10ea059ded2484c6e79a458d111442651d3c7d Mon Sep 17 00:00:00 2001 From: Steve Schmerler Date: Fri, 21 Jun 2024 11:00:37 +0200 Subject: [PATCH 2/9] doc: QE install: fix typos, add cmake note build_total_energy_energy_module.sh -> build_total_energy_module.sh Link to github issue documenting issues when building QE with cmake. --- docs/source/install/installing_qe.rst | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/source/install/installing_qe.rst b/docs/source/install/installing_qe.rst index 3b426ba48..9ff514c7a 100644 --- a/docs/source/install/installing_qe.rst +++ b/docs/source/install/installing_qe.rst @@ -4,24 +4,25 @@ Installing Quantum ESPRESSO (total energy module) Prerequisites ************* -To run the total energy module, you need a full Quantum ESPRESSO installation, -for which to install the Python bindings. This module has been tested with -version ``7.2.``, the most recent version at the time of this release of MALA. 
-Newer versions may work (untested), but installation instructions may vary. +To build and run the total energy module, you need a full Quantum ESPRESSO +installation, for which to install the Python bindings. This module has been +tested with version ``7.2.``, the most recent version at the time of this +release of MALA. Newer versions may work (untested), but installation +instructions may vary. Make sure you have an (MPI-aware) F90 compiler such as ``mpif90`` (e.g. Debian-ish machine: ``apt install openmpi-bin``, on an HPC cluster something like ``module load openmpi gcc``). Make sure to use the same compiler for QE and the extension. This should be the default case, but if problems arise you can manually select the compiler via -``--f90exec=`` in ``build_total_energy_energy_module.sh`` +``--f90exec=`` in ``build_total_energy_module.sh`` We assume that QE's ``configure`` script will find your system libs, e.g. use ``-lblas``, ``-llapack`` and ``-lfftw3``. We use those by default in -``build_total_energy_energy_module.sh``. If you have, say, the MKL library, +``build_total_energy_module.sh``. If you have, say, the MKL library, you may see ``configure`` use something like ``-lmkl_intel_lp64 -lmkl_sequential -lmkl_core`` when building QE. In this case you have to modify -``build_total_energy_energy_module.sh`` to use the same libraries! +``build_total_energy_module.sh`` to use the same libraries! Build Quantum ESPRESSO ********************** @@ -35,10 +36,16 @@ Build Quantum ESPRESSO * Change to the ``external_modules/total_energy_module`` directory of the MALA repository +.. note:: + At the moment, building QE using ``cmake`` `doesn't work together with the + build_total_energy_module.sh script + `_. Please use the + ``configure`` + ``make`` build workflow. + Installing the Python extension ******************************** -* Run ``build_total_energy_energy_module.sh /path/to/your/q-e``. +* Run ``build_total_energy_module.sh /path/to/your/q-e``. * If the build is successful, a file named something like ``total_energy.cpython-39m-x86_64-linux-gnu.so`` will be generated. This is From 01e46bec302dd5e1a089ff51e9970b79e15e6586 Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Sat, 29 Jun 2024 23:51:41 +0200 Subject: [PATCH 3/9] Be explicit about the fetch depth --- .github/workflows/cpu-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 48f0a456c..8c12200ec 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -35,6 +35,8 @@ jobs: steps: - name: Check out repository uses: actions/checkout@v4 + with: + fetch-depth: '1' - name: Set environment variables run: | @@ -155,6 +157,8 @@ jobs: - name: Check out repository (mala) uses: actions/checkout@v4 + with: + fetch-depth: '1' - name: Install mala package # Exec all commands inside the mala-cpu container @@ -231,6 +235,8 @@ jobs: steps: - name: Check out repository uses: actions/checkout@v4 + with: + fetch-depth: '1' - name: "Prepare environment: Restore cache" if: env.DOCKER_TAG != 'latest' From b579fbf0f1d2377939b76f019d6006e9a5a5769b Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Wed, 3 Jul 2024 18:03:51 +0200 Subject: [PATCH 4/9] Calculate short commit SHA via parameter expansion: It is not necessary to clone the source code just to calculate the short commit SHA. We can fall back on the GitHub default environment variable GITHUB_SHA and calculate the short form via bash parameter expansion. 
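As an illustration (the example value below is this patch's own commit hash,
standing in for whatever GITHUB_SHA holds at runtime), the parameter
expansion works as follows:

    # GITHUB_SHA is provided automatically by GitHub Actions for every run.
    GITHUB_SHA="b579fbf0f1d2377939b76f019d6006e9a5a5769b"
    # ${VAR:offset:length} keeps the first 7 characters, i.e. the short SHA.
    echo "${GITHUB_SHA:0:7}"   # prints: b579fbf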
--- .github/workflows/cpu-tests.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 8c12200ec..5022c1dc6 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -233,11 +233,6 @@ jobs: ((contains(github.ref_name, 'develop') || contains(github.ref_name, 'master')) && needs.build-docker-image-cpu.outputs.docker-tag != 'latest') || startsWith(github.ref, 'refs/tags/') steps: - - name: Check out repository - uses: actions/checkout@v4 - with: - fetch-depth: '1' - - name: "Prepare environment: Restore cache" if: env.DOCKER_TAG != 'latest' uses: actions/cache@v4 @@ -258,7 +253,8 @@ jobs: run: | # Execute on change of Docker image if [[ "$DOCKER_TAG" != 'latest' ]]; then - GIT_SHA=${GITHUB_REF_NAME}-$(git rev-parse --short "$GITHUB_SHA") + GITHUB_SHORT_SHA=${GITHUB_SHA:0:7} + GIT_SHA=${GITHUB_REF_NAME}-${GITHUB_SHORT_SHA} echo "GIT_SHA=$GIT_SHA" docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:latest From 20a91769ab85b3744618775d1ac32156229aa265 Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Wed, 3 Jul 2024 21:25:43 +0200 Subject: [PATCH 5/9] Suppress verbose output from docker pull/load --- .github/workflows/cpu-tests.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 5022c1dc6..6dd715a7a 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -64,7 +64,7 @@ jobs: fi - name: Pull latest image from container registry - run: docker pull $IMAGE_REPO/$IMAGE_NAME || true + run: docker pull $IMAGE_REPO/$IMAGE_NAME --quiet || true - name: Build temporary Docker image run: | @@ -133,12 +133,12 @@ jobs: - name: "Prepare environment: Load Docker image from cache" if: env.DOCKER_TAG != 'latest' - run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz + run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet - name: "Prepare environment: Pull latest image from container registry" if: env.DOCKER_TAG == 'latest' run: | - docker pull $IMAGE_REPO/$IMAGE_NAME:latest + docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet docker image tag $IMAGE_REPO/$IMAGE_NAME:latest $IMAGE_NAME:latest - name: "Prepare environment: Run Docker container" @@ -243,11 +243,11 @@ jobs: - name: "Prepare environment: Load Docker image from cache" if: env.DOCKER_TAG != 'latest' - run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz + run: docker load -i $DOCKER_CACHE_PATH/docker-image.tar.gz --quiet - name: "Prepare environment: Pull latest image from container registry" if: env.DOCKER_TAG == 'latest' - run: docker pull $IMAGE_REPO/$IMAGE_NAME:latest + run: docker pull $IMAGE_REPO/$IMAGE_NAME:latest --quiet - name: Tag Docker image run: | From b64fb14891503c9e59fa9a73ff2d7c1492b793c1 Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Wed, 3 Jul 2024 21:26:16 +0200 Subject: [PATCH 6/9] Suppress detailed layer status while pushing images: The `--quiet` is too quiet, we still want to see the tags/digests pushed. 
--- .github/workflows/cpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 6dd715a7a..a3436e27e 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -274,4 +274,4 @@ jobs: run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - name: Push Docker image - run: docker push $IMAGE_REPO/$IMAGE_NAME --all-tags + run: docker push $IMAGE_REPO/$IMAGE_NAME --all-tags | grep -v -E 'Waiting|Layer already|Preparing|Pushed' From 92eb513a9d4a0918e7a83bb6e16216dd4e931982 Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Wed, 3 Jul 2024 22:49:51 +0200 Subject: [PATCH 7/9] Refactor a bit --- .github/workflows/cpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index a3436e27e..780ed9a6a 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -253,12 +253,9 @@ jobs: run: | # Execute on change of Docker image if [[ "$DOCKER_TAG" != 'latest' ]]; then - GITHUB_SHORT_SHA=${GITHUB_SHA:0:7} - GIT_SHA=${GITHUB_REF_NAME}-${GITHUB_SHORT_SHA} - echo "GIT_SHA=$GIT_SHA" docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:latest - docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:$GIT_SHA + docker tag $IMAGE_NAME:$GITHUB_RUN_ID $IMAGE_REPO/$IMAGE_NAME:${GITHUB_REF_NAME}-${GITHUB_SHA:0:7} fi # Execute on push of git tag From 4ba5dbe2fbe523a867c5adcf44b12511828e912d Mon Sep 17 00:00:00 2001 From: Daniel Kotik Date: Thu, 4 Jul 2024 11:23:26 +0200 Subject: [PATCH 8/9] Condition-based display of Conda environment diffs: Report full diff of Conda environment.yml files before and after installation of MALA only when they differ. --- .github/workflows/cpu-tests.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 780ed9a6a..48dc91a34 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -178,7 +178,13 @@ jobs: # if comparison fails, `install/mala_cpu_[base]_environment.yml` needs to be aligned with # `requirements.txt` and/or extra dependencies are missing in the Docker Conda environment - diff --side-by-side --color=always env_before.yml env_after.yml + + if diff --brief env_before.yml env_after.yml + then + echo "Files env_before.yml and env_after.yml do not differ." 
+ else + diff --side-by-side --color-always env_before.yml env_after.yml + fi - name: Download test data repository from RODARE shell: 'bash -c "docker exec -i mala-cpu python < {0}"' From 4139713d316cecd65879870c5d4b3aad758d8b50 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Mon, 24 Jun 2024 17:13:56 +0200 Subject: [PATCH 9/9] Unified error calculation --- .../source/advanced_usage/hyperparameters.rst | 2 +- docs/source/advanced_usage/predictions.rst | 3 +- docs/source/advanced_usage/trainingmodel.rst | 16 +- docs/source/basic_usage/hyperparameters.rst | 4 +- docs/source/basic_usage/trainingmodel.rst | 2 +- docs/source/install/installing_lammps.rst | 8 +- docs/source/install/installing_qe.rst | 23 +- examples/advanced/ex01_checkpoint_training.py | 2 +- examples/advanced/ex03_tensor_board.py | 4 +- ..._checkpoint_hyperparameter_optimization.py | 2 +- ...distributed_hyperparameter_optimization.py | 4 +- ...07_advanced_hyperparameter_optimization.py | 4 +- examples/basic/ex01_train_network.py | 2 +- examples/basic/ex02_test_network.py | 6 +- .../basic/ex04_hyperparameter_optimization.py | 2 +- mala/common/parameters.py | 83 +- mala/datahandling/data_shuffler.py | 14 +- mala/network/hyper_opt_naswot.py | 2 +- mala/network/objective_base.py | 8 +- mala/network/runner.py | 398 +++++++++- mala/network/tester.py | 184 +---- mala/network/trainer.py | 749 ++++++------------ test/all_lazy_loading_test.py | 17 +- test/basic_gpu_test.py | 4 +- test/checkpoint_hyperopt_test.py | 2 +- test/checkpoint_training_test.py | 10 +- test/complete_interfaces_test.py | 6 +- test/examples_test.py | 50 +- test/hyperopt_test.py | 14 +- test/shuffling_test.py | 8 +- test/workflow_test.py | 52 +- 31 files changed, 804 insertions(+), 881 deletions(-) diff --git a/docs/source/advanced_usage/hyperparameters.rst b/docs/source/advanced_usage/hyperparameters.rst index 4240250e7..5c0665b44 100644 --- a/docs/source/advanced_usage/hyperparameters.rst +++ b/docs/source/advanced_usage/hyperparameters.rst @@ -114,7 +114,7 @@ a physical validation metric such as .. code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" Advanced optimization algorithms ******************************** diff --git a/docs/source/advanced_usage/predictions.rst b/docs/source/advanced_usage/predictions.rst index 7058f17de..20e82494b 100644 --- a/docs/source/advanced_usage/predictions.rst +++ b/docs/source/advanced_usage/predictions.rst @@ -40,6 +40,8 @@ Likewise, you can adjust the inference temperature via calculator.data_handler.target_calculator.temperature = ... +.. _production_gpu: + Predictions on GPU ******************* @@ -137,4 +139,3 @@ With the exception of the electronic density, which is saved into the ``.cube`` format for visualization with regular electronic structure visualization software, all of these observables can be plotted with Python based visualization libraries such as ``matplotlib``. - diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst index 52e50ec50..290aa15f3 100644 --- a/docs/source/advanced_usage/trainingmodel.rst +++ b/docs/source/advanced_usage/trainingmodel.rst @@ -77,7 +77,7 @@ Specifically, when setting .. 
code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" the error in the band energy between actual and predicted LDOS will be calculated and printed before and after network training (in meV/atom). @@ -205,21 +205,21 @@ visualization prior to training via # 0: No visualizatuon, 1: loss and learning rate, 2: like 1, # but additionally weights and biases are saved - parameters.running.visualisation = 1 - parameters.running.visualisation_dir = "mala_vis" + parameters.running.logging = 1 + parameters.running.logging_dir = "mala_vis" -where ``visualisation_dir`` specifies some directory in which to save the -MALA visualization data. Afterwards, you can run the training without any +where ``logging_dir`` specifies some directory in which to save the +MALA logging data. Afterwards, you can run the training without any other modifications. Once training is finished (or during training, in case you want to use tensorboard to monitor progress), you can launch tensorboard via .. code-block:: bash - tensorboard --logdir path_to_visualization + tensorboard --logdir path_to_log_directory -The full path for ``path_to_visualization`` can be accessed via -``trainer.full_visualization_path``. +The full path for ``path_to_log_directory`` can be accessed via +``trainer.full_logging_path``. Training in parallel diff --git a/docs/source/basic_usage/hyperparameters.rst b/docs/source/basic_usage/hyperparameters.rst index 11742932d..d10bb440e 100644 --- a/docs/source/basic_usage/hyperparameters.rst +++ b/docs/source/basic_usage/hyperparameters.rst @@ -118,9 +118,9 @@ properties of the ``Parameters`` class: during the optimization. - ``network.layer_sizes`` - ``"int"``, ``"categorical"`` - * - ``"trainingtype"`` + * - ``"optimizer"`` - Optimization algorithm used during the NN optimization. - - ``running.trainingtype`` + - ``running.optimizer`` - ``"categorical"`` * - ``"mini_batch_size"`` - Size of the mini batches used to calculate the gradient during diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst index 3995865e6..e6bc8c967 100644 --- a/docs/source/basic_usage/trainingmodel.rst +++ b/docs/source/basic_usage/trainingmodel.rst @@ -35,7 +35,7 @@ options to train a simple network with example data, namely parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.verbosity = 1 # level of output; 1 is standard, 0 is low, 2 is debug. Here, we can see that the ``Parameters`` object contains multiple diff --git a/docs/source/install/installing_lammps.rst b/docs/source/install/installing_lammps.rst index 50fb41cef..ae3933783 100644 --- a/docs/source/install/installing_lammps.rst +++ b/docs/source/install/installing_lammps.rst @@ -41,18 +41,24 @@ The MALA team recommends to build LAMMPS with ``cmake``. To do so * ``Kokkos_ARCH_GPUARCH=???``: Your GPU architecture (see see `Kokkos instructions `_) * ``CMAKE_CXX_COMPILER=???``: Path to the ``nvcc_wrapper`` executable shipped with the LAMMPS code, should be at ``/your/path/to/lammps/lib/kokkos/bin/nvcc_wrapper`` -* For example, this configures the LAMMPS cmake build with Kokkos support + + For example, this configures the LAMMPS cmake build with Kokkos support for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: .. 
code-block:: bash cmake ../cmake -D PKG_KOKKOS=yes -D BUILD_MPI=yes -D PKG_ML-SNAP=yes -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ARCH_HSW=yes -D Kokkos_ARCH_VOLTA70=yes -D CMAKE_CXX_COMPILER=/path/to/lammps/lib/kokkos/bin/nvcc_wrapper -D BUILD_SHARED_LIBS=yes +.. note:: + When using a GPU by setting ``parameters.use_gpu = True``, you *need* to + have a GPU version of ``LAMMPS`` installed. See :ref:`production_gpu` for + details. * Build the library and executable with ``cmake --build .`` (Add ``--parallel=8`` for a faster build) + Installing the Python extension ******************************** diff --git a/docs/source/install/installing_qe.rst b/docs/source/install/installing_qe.rst index 3b426ba48..9ff514c7a 100644 --- a/docs/source/install/installing_qe.rst +++ b/docs/source/install/installing_qe.rst @@ -4,24 +4,25 @@ Installing Quantum ESPRESSO (total energy module) Prerequisites ************* -To run the total energy module, you need a full Quantum ESPRESSO installation, -for which to install the Python bindings. This module has been tested with -version ``7.2.``, the most recent version at the time of this release of MALA. -Newer versions may work (untested), but installation instructions may vary. +To build and run the total energy module, you need a full Quantum ESPRESSO +installation, for which to install the Python bindings. This module has been +tested with version ``7.2.``, the most recent version at the time of this +release of MALA. Newer versions may work (untested), but installation +instructions may vary. Make sure you have an (MPI-aware) F90 compiler such as ``mpif90`` (e.g. Debian-ish machine: ``apt install openmpi-bin``, on an HPC cluster something like ``module load openmpi gcc``). Make sure to use the same compiler for QE and the extension. This should be the default case, but if problems arise you can manually select the compiler via -``--f90exec=`` in ``build_total_energy_energy_module.sh`` +``--f90exec=`` in ``build_total_energy_module.sh`` We assume that QE's ``configure`` script will find your system libs, e.g. use ``-lblas``, ``-llapack`` and ``-lfftw3``. We use those by default in -``build_total_energy_energy_module.sh``. If you have, say, the MKL library, +``build_total_energy_module.sh``. If you have, say, the MKL library, you may see ``configure`` use something like ``-lmkl_intel_lp64 -lmkl_sequential -lmkl_core`` when building QE. In this case you have to modify -``build_total_energy_energy_module.sh`` to use the same libraries! +``build_total_energy_module.sh`` to use the same libraries! Build Quantum ESPRESSO ********************** @@ -35,10 +36,16 @@ Build Quantum ESPRESSO * Change to the ``external_modules/total_energy_module`` directory of the MALA repository +.. note:: + At the moment, building QE using ``cmake`` `doesn't work together with the + build_total_energy_module.sh script + `_. Please use the + ``configure`` + ``make`` build workflow. + Installing the Python extension ******************************** -* Run ``build_total_energy_energy_module.sh /path/to/your/q-e``. +* Run ``build_total_energy_module.sh /path/to/your/q-e``. * If the build is successful, a file named something like ``total_energy.cpython-39m-x86_64-linux-gnu.so`` will be generated. 
This is diff --git a/examples/advanced/ex01_checkpoint_training.py b/examples/advanced/ex01_checkpoint_training.py index 01bb9b486..5222a5232 100644 --- a/examples/advanced/ex01_checkpoint_training.py +++ b/examples/advanced/ex01_checkpoint_training.py @@ -26,7 +26,7 @@ def initial_setup(): parameters.running.max_number_epochs = 9 parameters.running.mini_batch_size = 8 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" # We checkpoint the training every 5 epochs and save the results # as "ex07". diff --git a/examples/advanced/ex03_tensor_board.py b/examples/advanced/ex03_tensor_board.py index b15239495..43a066aaf 100644 --- a/examples/advanced/ex03_tensor_board.py +++ b/examples/advanced/ex03_tensor_board.py @@ -18,7 +18,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # Turn the visualization on and select a folder to save the visualization # files into. @@ -45,6 +45,6 @@ trainer.train_network() printout( 'Run finished, launch tensorboard with "tensorboard --logdir ' - + trainer.full_visualization_path + + trainer.full_logging_path + '"' ) diff --git a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py index cef7c8f4f..99a92fa35 100644 --- a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py +++ b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def initial_setup(): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 9 parameters.hyperparameters.checkpoints_each_trial = 5 parameters.hyperparameters.checkpoint_name = "ex05_checkpoint" diff --git a/examples/advanced/ex06_distributed_hyperparameter_optimization.py b/examples/advanced/ex06_distributed_hyperparameter_optimization.py index b34f9bb8b..215dd1ab2 100644 --- a/examples/advanced/ex06_distributed_hyperparameter_optimization.py +++ b/examples/advanced/ex06_distributed_hyperparameter_optimization.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 10 parameters.hyperparameters.checkpoints_each_trial = -1 parameters.hyperparameters.checkpoint_name = "ex06" @@ -44,7 +44,7 @@ parameters.targets.ldos_gridspacing_ev = 2.5 parameters.targets.ldos_gridoffset_ev = -5 parameters.hyperparameters.number_training_per_trial = 3 -parameters.running.after_before_training_metric = "band_energy" +parameters.running.after_training_metric = "band_energy" data_handler = mala.DataHandler(parameters) diff --git a/examples/advanced/ex07_advanced_hyperparameter_optimization.py b/examples/advanced/ex07_advanced_hyperparameter_optimization.py index 8165ef01e..242ffd7dd 100644 --- a/examples/advanced/ex07_advanced_hyperparameter_optimization.py +++ b/examples/advanced/ex07_advanced_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def optimize_hyperparameters(hyper_optimizer): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - 
parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 8 parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -64,7 +64,7 @@ def optimize_hyperparameters(hyper_optimizer): data_handler.output_dimension, ] hyperoptimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) hyperoptimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index 95eb2d51b..1eca8c6b7 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # These parameters characterize how the LDOS and bispectrum descriptors # were calculated. They are _technically_ not needed to train a simple # network. However, it is useful to define them prior to training. Then, diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 2e4b8953c..0d90dfe7f 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -21,15 +21,15 @@ # It is recommended to enable the "lazy-loading" feature, so that # data is loaded into memory one snapshot at a time during testing - this # helps keep RAM requirement down. Furthermore, you have to decide which -# observables to test (usual choices are "band_energy", "total_energy" and -# "number_of_electrons") and whether you want the results per snapshot +# observables to test (usual choices are "band_energy", "total_energy") +# and whether you want the results per snapshot # (output_format="list") or as an averaged value (output_format="mae") #################### parameters, network, data_handler, tester = mala.Tester.load_run( run_name=model_name, path=model_path ) -tester.observables_to_test = ["band_energy", "number_of_electrons"] +tester.observables_to_test = ["band_energy", "density"] tester.output_format = "list" parameters.data.use_lazy_loading = True diff --git a/examples/basic/ex04_hyperparameter_optimization.py b/examples/basic/ex04_hyperparameter_optimization.py index 4c68179c2..cebb4c42e 100644 --- a/examples/basic/ex04_hyperparameter_optimization.py +++ b/examples/basic/ex04_hyperparameter_optimization.py @@ -22,7 +22,7 @@ parameters.data.output_rescaling_type = "normal" parameters.running.max_number_epochs = 20 parameters.running.mini_batch_size = 40 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 20 #################### diff --git a/mala/common/parameters.py b/mala/common/parameters.py index 3627bd40f..c9b1b826c 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -265,11 +265,6 @@ class ParametersNetwork(ParametersBase): Number of hidden layers to be used in lstm or gru or transformer nets Default: None - dropout: float - Dropout rate for transformer net - 0.0 ≤ dropout ≤ 1.0 - Default: 0.0 - num_heads: int Number of heads to be used in Multi head attention network This should be a divisor of input dimension @@ -452,7 +447,7 @@ class ParametersTargets(ParametersBase): Number of points in the energy grid that is used to calculate the (L)DOS. 
- ldos_gridsize : float + ldos_gridsize : int Gridsize of the LDOS. ldos_gridspacing_ev: float @@ -625,9 +620,8 @@ class ParametersRunning(ParametersBase): Attributes ---------- - trainingtype : string - Training type to be used. Supported options at the moment: - + optimizer : string + Optimizer to be used. Supported options at the moment: - SGD: Stochastic gradient descent. - Adam: Adam Optimization Algorithm @@ -640,10 +634,6 @@ class ParametersRunning(ParametersBase): mini_batch_size : int Size of the mini batch for the optimization algorihm. Default: 10. - weight_decay : float - Weight decay for regularization. Always refers to L2 regularization. - Default: 0. - early_stopping_epochs : int Number of epochs the validation accuracy is allowed to not improve by at leastearly_stopping_threshold, before we terminate. If 0, no @@ -696,19 +686,13 @@ class ParametersRunning(ParametersBase): Name used for the checkpoints. Using this, multiple runs can be performed in the same directory. - visualisation : int - If True then Tensorboard is activated for visualisation - case 0: No tensorboard activated - case 1: tensorboard activated with Loss and learning rate - case 2; additonally weights and biases and gradient + logging_dir : string + Name of the folder that logging files will be saved to. - visualisation_dir : string - Name of the folder that visualization files will be saved to. - - visualisation_dir_append_date : bool - If True, then upon creating visualization files, these will be saved - in a subfolder of visualisation_dir labelled with the starting date - of the visualization, to avoid having to change input scripts often. + logging_dir_append_date : bool + If True, then upon creating logging files, these will be saved + in a subfolder of logging_dir labelled with the starting date + of the logging, to avoid having to change input scripts often. inference_data_grid : list List holding the grid to be used for inference in the form of @@ -717,7 +701,7 @@ class ParametersRunning(ParametersBase): use_mixed_precision : bool If True, mixed precision computation (via AMP) will be used. - training_report_frequency : int + training_log_interval : int Determines how often detailed performance info is printed during training (only has an effect if the verbosity is high enough). 
@@ -729,36 +713,49 @@ class ParametersRunning(ParametersBase): def __init__(self): super(ParametersRunning, self).__init__() - self.trainingtype = "SGD" - self.learning_rate = 0.5 + self.optimizer = "Adam" + self.learning_rate = 10 ** (-5) + self.learning_rate_embedding = 10 ** (-4) self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 - self.weight_decay = 0 + self.snapshots_per_epoch = -1 + + self.l1_regularization = 0.0 + self.l2_regularization = 0.0 + self.dropout = 0.0 + self.batch_norm = False + self.input_noise = 0.0 + self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 self.learning_rate_scheduler = None self.learning_rate_decay = 0.1 self.learning_rate_patience = 0 + self._during_training_metric = "ldos" + self._after_training_metric = "ldos" + self.use_compression = False self.num_workers = 0 self.use_shuffling_for_samplers = True self.checkpoints_each_epoch = 0 + self.checkpoint_best_so_far = False self.checkpoint_name = "checkpoint_mala" - self.visualisation = 0 - self.visualisation_dir = os.path.join(".", "mala_logging") - self.visualisation_dir_append_date = True - self.during_training_metric = "ldos" - self.after_before_training_metric = "ldos" + self.run_name = "" + self.logging_dir = "./mala_logging" + self.logging_dir_append_date = True + self.logger = "tensorboard" + self.validation_metrics = ["ldos"] + self.validate_on_training_data = False self.inference_data_grid = [0, 0, 0] self.use_mixed_precision = False self.use_graphs = False - self.training_report_frequency = 1000 - self.profiler_range = None # [1000, 2000] + self.training_log_interval = 1000 + self.profiler_range = [1000, 2000] def _update_ddp(self, new_ddp): super(ParametersRunning, self)._update_ddp(new_ddp) self.during_training_metric = self.during_training_metric - self.after_before_training_metric = self.after_before_training_metric + self.after_training_metric = self.after_training_metric @property def during_training_metric(self): @@ -786,7 +783,7 @@ def during_training_metric(self, value): self._during_training_metric = value @property - def after_before_training_metric(self): + def after_training_metric(self): """ Get the metric used during training. @@ -798,17 +795,17 @@ def after_before_training_metric(self): DFT results. Of these, the mean average error in eV/atom will be calculated. """ - return self._after_before_training_metric + return self._after_training_metric - @after_before_training_metric.setter - def after_before_training_metric(self, value): + @after_training_metric.setter + def after_training_metric(self, value): if value != "ldos": if self._configuration["ddp"]: raise Exception( "Currently, MALA can only operate with the " '"ldos" metric for ddp runs.' ) - self._after_before_training_metric = value + self._after_training_metric = value @during_training_metric.setter def during_training_metric(self, value): @@ -1474,7 +1471,7 @@ def save(self, filename, save_format="json"): if member[0][0] != "_": if isinstance(member[1], ParametersBase): # All the subclasses have to provide this function. 
- member[1]: ParametersBase + member[1]: ParametersBase # type: ignore json_dict[member[0]] = member[1].to_json() with open(filename, "w", encoding="utf-8") as f: json.dump(json_dict, f, ensure_ascii=False, indent=4) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 62d6e11a3..e7d7a07cb 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -131,10 +131,12 @@ def __shuffle_numpy( ) # Do the actual shuffling. - target_name_openpmd = os.path.join(target_save_path, - save_name.replace("*", "%T")) - descriptor_name_openpmd = os.path.join(descriptor_save_path, - save_name.replace("*", "%T")) + target_name_openpmd = os.path.join( + target_save_path, save_name.replace("*", "%T") + ) + descriptor_name_openpmd = os.path.join( + descriptor_save_path, save_name.replace("*", "%T") + ) for i in range(0, number_of_new_snapshots): new_descriptors = np.zeros( (int(np.prod(shuffle_dimensions)), self.input_dimension), @@ -363,9 +365,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0): import json # Do the actual shuffling. - name_prefix = os.path.join( - dot.save_path, save_name.replace("*", "%T") - ) + name_prefix = os.path.join(dot.save_path, save_name.replace("*", "%T")) for i in range(my_items_start, my_items_end): # We check above that in the non-numpy case, OpenPMD will work. dot.calculator.grid_dimensions = list(shuffle_dimensions) diff --git a/mala/network/hyper_opt_naswot.py b/mala/network/hyper_opt_naswot.py index ae27f7d13..9a11e1ca0 100644 --- a/mala/network/hyper_opt_naswot.py +++ b/mala/network/hyper_opt_naswot.py @@ -39,7 +39,7 @@ def __init__(self, params, data): self.trial_list = None self.ignored_hyperparameters = [ "learning_rate", - "trainingtype", + "optimizer", "mini_batch_size", "early_stopping_epochs", "learning_rate_patience", diff --git a/mala/network/objective_base.py b/mala/network/objective_base.py index 52d0d9464..2fbf29503 100644 --- a/mala/network/objective_base.py +++ b/mala/network/objective_base.py @@ -231,8 +231,8 @@ def parse_trial_optuna(self, trial: Trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter(trial) + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter(trial) elif "mini_batch_size" == par.name: self.params.running.mini_batch_size = par.get_parameter(trial) @@ -358,8 +358,8 @@ def parse_trial_oat(self, trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter( + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter( trial, factor_idx ) elif "mini_batch_size" == par.name: diff --git a/mala/network/runner.py b/mala/network/runner.py index a5f620071..17ce572b6 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -3,6 +3,8 @@ import os from zipfile import ZipFile, ZIP_STORED +from mala.common.parallelizer import printout + import numpy as np import torch import torch.distributed as dist @@ -10,10 +12,16 @@ import mala from mala.common.parallelizer import get_rank from mala.common.parameters import ParametersRunning +from mala.datahandling.fast_tensor_dataset import FastTensorDataset from mala.network.network import Network from mala.datahandling.data_scaler import DataScaler from mala.datahandling.data_handler import DataHandler from mala import Parameters +from mala.targets.ldos import LDOS +from mala.targets.dos 
import DOS +from mala.targets.density import Density + +from tqdm.auto import tqdm, trange class Runner: @@ -41,6 +49,335 @@ def __init__(self, params, network, data, runner_dict=None): self.data = data self.__prepare_to_run() + def _calculate_errors( + self, actual_outputs, predicted_outputs, metrics, snapshot_number + ): + """ + Calculate the errors between the actual and predicted outputs. + + Parameters + ---------- + actual_outputs : numpy.ndarray + Actual outputs. + + predicted_outputs : numpy.ndarray + Predicted outputs. + + metrics : list + List of metrics to calculate. + + snapshot_number : int + Snapshot number for which the errors are calculated. + + Returns + ------- + errors : dict + Dictionary containing the errors. + """ + + energy_metrics = [metric for metric in metrics if "energy" in metric] + non_energy_metrics = [ + metric for metric in metrics if "energy" not in metric + ] + if len(energy_metrics) > 0: + errors = self._calculate_energy_errors( + actual_outputs, + predicted_outputs, + energy_metrics, + snapshot_number, + ) + else: + errors = {} + for metric in non_energy_metrics: + try: + if metric == "ldos": + error = np.mean((predicted_outputs - actual_outputs) ** 2) + errors[metric] = error + + elif metric == "density": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, Density): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density + errors[metric] = np.mean(np.abs(actual - predicted)) + + elif metric == "density_relative": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, Density): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density + errors[metric] = ( + np.mean(np.abs((actual - predicted) / actual)) * 100 + ) + + elif metric == "dos": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, DOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density_of_states + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density_of_states + + errors[metric] = np.abs(actual - predicted).mean() + + elif metric == "dos_relative": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, DOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." 
+ ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + # We shift both the actual and predicted DOS by 1.0 to overcome + # numerical issues with the DOS having values equal to zero. + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density_of_states + 1.0 + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density_of_states + 1.0 + + errors[metric] = ( + np.ma.masked_invalid( + np.abs( + (actual - predicted) + / (np.abs(actual) + np.abs(predicted)) + ) + ).mean() + * 100 + ) + else: + raise Exception(f"Invalid metric ({metric}) requested.") + except ValueError as e: + printout( + f"Error calculating observable: {metric} for snapshot {snapshot_number}", + min_verbosity=0, + ) + printout(e, min_verbosity=2) + errors[metric] = float("inf") + return errors + + def _calculate_energy_errors( + self, actual_outputs, predicted_outputs, energy_types, snapshot_number + ): + """ + Calculate the errors between the actual and predicted outputs. + + Parameters + ---------- + actual_outputs : numpy.ndarray + Actual outputs. + + predicted_outputs : numpy.ndarray + Predicted outputs. + + energy_types : list + List of energy types to calculate errors. + + snapshot_number : int + Snapshot number for which the errors are calculated. + """ + target_calculator = self.data.target_calculator + output_file = self.data.get_snapshot_calculation_output( + snapshot_number + ) + if not output_file: + raise Exception( + "Output file needed for energy error calculations." + ) + target_calculator.read_additional_calculation_data(output_file) + + errors = {} + fe_dft = target_calculator.fermi_energy_dft + fe_actual = None + fe_predicted = None + try: + fe_actual = target_calculator.get_self_consistent_fermi_energy( + actual_outputs + ) + except ValueError: + errors = { + energy_type: float("inf") for energy_type in energy_types + } + printout( + "CAUTION! LDOS ground truth is so wrong that the " + "estimation of the self consistent Fermi energy fails." + ) + return errors + try: + fe_predicted = target_calculator.get_self_consistent_fermi_energy( + predicted_outputs + ) + except ValueError: + errors = { + energy_type: float("inf") for energy_type in energy_types + } + printout( + "CAUTION! LDOS prediction is so wrong that the " + "estimation of the self consistent Fermi energy fails." + ) + return errors + for energy_type in energy_types: + if energy_type == "fermi_energy": + fe_error = fe_predicted - fe_actual + errors[energy_type] = fe_error + elif energy_type == "fermi_energy_dft": + fe_error_dft = fe_predicted - fe_dft + errors[energy_type] = fe_error_dft + elif energy_type == "band_energy": + if not isinstance(target_calculator, LDOS) and not isinstance( + target_calculator, DOS + ): + raise Exception( + "Cannot calculate the band energy from this observable." 
+ ) + try: + target_calculator.read_from_array(actual_outputs) + be_actual = target_calculator.get_band_energy( + fermi_energy=fe_actual + ) + target_calculator.read_from_array(predicted_outputs) + be_predicted = target_calculator.get_band_energy( + fermi_energy=fe_predicted + ) + be_error = (be_predicted - be_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = be_error + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "band_energy_dft_fe": + try: + target_calculator.read_from_array(predicted_outputs) + be_predicted_dft_fe = target_calculator.get_band_energy( + fermi_energy=fe_dft + ) + be_error_dft_fe = (be_predicted_dft_fe - be_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = be_error_dft_fe + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "band_energy_actual_fe": + try: + target_calculator.read_from_array(predicted_outputs) + be_predicted_actual_fe = target_calculator.get_band_energy( + fermi_energy=fe_actual + ) + be_error_actual_fe = ( + be_predicted_actual_fe - be_actual + ) * (1000 / len(target_calculator.atoms)) + errors[energy_type] = be_error_actual_fe + except ValueError: + errors[energy_type] = float("inf") + + elif energy_type == "total_energy": + if not isinstance(target_calculator, LDOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + try: + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + target_calculator.read_from_array(actual_outputs) + te_actual = target_calculator.get_total_energy( + fermi_energy=fe_actual + ) + target_calculator.read_from_array(predicted_outputs) + te_predicted = target_calculator.get_total_energy( + fermi_energy=fe_predicted + ) + te_error = (te_predicted - te_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = te_error + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "total_energy_dft_fe": + try: + target_calculator.read_from_array(predicted_outputs) + te_predicted_dft_fe = target_calculator.get_total_energy( + fermi_energy=fe_dft + ) + te_error_dft_fe = (te_predicted_dft_fe - te_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = te_error_dft_fe + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "total_energy_actual_fe": + try: + target_calculator.read_from_array(predicted_outputs) + te_predicted_actual_fe = ( + target_calculator.get_total_energy( + fermi_energy=fe_actual + ) + ) + te_error_actual_fe = ( + te_predicted_actual_fe - te_actual + ) * (1000 / len(target_calculator.atoms)) + errors[energy_type] = te_error_actual_fe + except ValueError: + errors[energy_type] = float("inf") + else: + raise Exception( + f"Invalid energy type ({energy_type}) requested." 
+ ) + return errors + def save_run( self, run_name, @@ -87,7 +424,7 @@ def save_run( params_file = run_name + ".params.json" if save_runner: optimizer_file = run_name + ".optimizer.pth" - + os.makedirs(save_path, exist_ok=True) self.parameters_full.save(os.path.join(save_path, params_file)) if self.parameters_full.use_ddp: self.network.module.save_network( @@ -391,28 +728,51 @@ def _forward_entire_snapshot( from_index += snapshot.grid_size grid_size = to_index - from_index - if self.data.parameters.use_lazy_loading: - data_set.return_outputs_directly = True - actual_outputs = (data_set[from_index:to_index])[1] - else: - actual_outputs = self.data.output_data_scaler.inverse_transform( - (data_set[from_index:to_index])[1], as_numpy=True + if isinstance(data_set, FastTensorDataset): + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) - - predicted_outputs = np.zeros((grid_size, self.data.output_dimension)) - - for i in range(0, number_of_batches_per_snapshot): - inputs, outputs = data_set[ - from_index - + (i * batch_size) : from_index - + ((i + 1) * batch_size) - ] - inputs = inputs.to(self.parameters._configuration["device"]) - predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( - self.data.output_data_scaler.inverse_transform( + actual_outputs = np.zeros((grid_size, self.data.output_dimension)) + + for i in range(len(data_set)): + inputs, outputs = data_set[from_index + i] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( self.network(inputs).to("cpu"), as_numpy=True ) + actual_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( + torch.tensor(outputs), as_numpy=True + ) + else: + if self.data.parameters.use_lazy_loading: + data_set.return_outputs_directly = True + actual_outputs = (data_set[from_index:to_index])[1] + else: + actual_outputs = ( + self.data.output_data_scaler.inverse_transform( + (data_set[from_index:to_index])[1], as_numpy=True + ) + ) + + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) + for i in range(0, number_of_batches_per_snapshot): + inputs, outputs = data_set[ + from_index + + (i * batch_size) : from_index + + ((i + 1) * batch_size) + ] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( + self.data.output_data_scaler.inverse_transform( + self.network(inputs).to("cpu"), as_numpy=True + ) + ) # Restricting the actual quantities to physical meaningful values, # i.e. restricting the (L)DOS to positive values. 
diff --git a/mala/network/tester.py b/mala/network/tester.py index 93e67b935..9a7831f57 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -61,7 +61,7 @@ def __init__( self.number_of_batches_per_snapshot = 0 self.observables_to_test = observables_to_test self.output_format = output_format - if self.output_format != "list" and self.output_format == "mae": + if self.output_format != "list" and self.output_format != "mae": raise Exception("Wrong output format for testing selected.") self.target_calculator = data.target_calculator @@ -117,22 +117,12 @@ def test_snapshot(self, snapshot_number, data_type="te"): snapshot_number, data_type=data_type ) - results = {} - for observable in self.observables_to_test: - try: - results[observable] = self.__calculate_observable_error( - snapshot_number, - observable, - predicted_outputs, - actual_outputs, - ) - except ValueError as e: - printout( - f"Error calculating observable: {observable} for snapshot {snapshot_number}", - min_verbosity=0, - ) - printout(e, min_verbosity=2) - results[observable] = np.inf + results = self._calculate_errors( + actual_outputs, + predicted_outputs, + self.observables_to_test, + snapshot_number, + ) return results def predict_targets(self, snapshot_number, data_type="te"): @@ -185,166 +175,6 @@ def predict_targets(self, snapshot_number, data_type="te"): self.parameters.mini_batch_size, ) - def __calculate_observable_error( - self, snapshot_number, observable, predicted_target, actual_target - ): - if observable == "ldos": - return np.mean((predicted_target - actual_target) ** 2) - - elif observable == "band_energy": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the band energy from this observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.band_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.band_energy - return actual - predicted - - elif observable == "band_energy_full": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the band energy from this observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.band_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.band_energy - return [ - actual, - predicted, - target_calculator.band_energy_dft_calculation, - ] - - elif observable == "number_of_electrons": - target_calculator = self.data.target_calculator - if ( - not isinstance(target_calculator, LDOS) - and not isinstance(target_calculator, DOS) - and not isinstance(target_calculator, Density) - ): - raise Exception( - "Cannot calculate the band energy from this observable." 
- ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - actual = target_calculator.get_number_of_electrons(actual_target) - - predicted = target_calculator.get_number_of_electrons( - predicted_target - ) - return actual - predicted - - elif observable == "total_energy": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.total_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.total_energy - return actual - predicted - - elif observable == "total_energy_full": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.total_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.total_energy - return [ - actual, - predicted, - target_calculator.total_energy_dft_calculation, - ] - - elif observable == "density": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, Density - ): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.density - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.density - return np.mean(np.abs((actual - predicted) / actual)) * 100 - - elif observable == "dos": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - # We shift both the actual and predicted DOS by 1.0 to overcome - # numerical issues with the DOS having values equal to zero. - target_calculator.read_from_array(actual_target) - actual = target_calculator.density_of_states + 1.0 - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.density_of_states + 1.0 - - return ( - np.ma.masked_invalid( - np.abs( - (actual - predicted) - / (np.abs(actual) + np.abs(predicted)) - ) - ).mean() - * 100 - ) - def __prepare_to_test(self, snapshot_number): """Prepare the tester class to for test run.""" # We will use the DataSet iterator to iterate over the test data. 
diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 81977c40e..3cbf7cfad 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -21,6 +21,7 @@ from mala.datahandling.multi_lazy_load_data_loader import ( MultiLazyLoadDataLoader, ) +from tqdm.auto import trange, tqdm class Trainer(Runner): @@ -54,8 +55,6 @@ def __init__(self, params, network, data, optimizer_dict=None): self.network = DDP(self.network) torch.cuda.current_stream().wait_stream(s) - self.final_test_loss = float("inf") - self.initial_test_loss = float("inf") self.final_validation_loss = float("inf") self.initial_validation_loss = float("inf") self.optimizer = None @@ -65,36 +64,44 @@ def __init__(self, params, network, data, optimizer_dict=None): self.last_loss = None self.training_data_loaders = [] self.validation_data_loaders = [] - self.test_data_loaders = [] # Samplers for the ddp case. self.train_sampler = None - self.test_sampler = None self.validation_sampler = None self.__prepare_to_train(optimizer_dict) - self.tensor_board = None - self.full_visualization_path = None - if self.parameters.visualisation: - if not os.path.exists(self.parameters.visualisation_dir): - os.makedirs(self.parameters.visualisation_dir) - if self.parameters.visualisation_dir_append_date: + self.logger = None + self.full_logging_path = None + if self.parameters.logger is not None: + os.makedirs(self.parameters.logging_dir, exist_ok=True) + if self.parameters.logging_dir_append_date: date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - self.full_visualization_path = os.path.join( - self.parameters.visualisation_dir, date_time + if len(self.parameters.run_name) > 0: + name = self.parameters.run_name + "_" + date_time + else: + name = date_time + self.full_logging_path = os.path.join( + self.parameters.logging_dir, name ) - os.makedirs(self.full_visualization_path) + os.makedirs(self.full_logging_path, exist_ok=True) else: - self.full_visualization_path = ( - self.parameters.visualisation_dir - ) + self.full_logging_path = self.parameters.logging_dir # Set the path to log files - self.tensor_board = SummaryWriter(self.full_visualization_path) + if self.parameters.logger == "wandb": + import wandb + + self.logger = wandb + elif self.parameters.logger == "tensorboard": + self.logger = SummaryWriter(self.full_logging_path) + else: + raise Exception( + f"Unsupported logger {self.parameters.logger}." 
+ ) printout( - "Writing visualization output to", - self.full_visualization_path, + "Writing logging output to", + self.full_logging_path, min_verbosity=1, ) @@ -256,45 +263,10 @@ def train_network(self): # CALCULATE INITIAL METRICS ############################ - tloss = float("inf") - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.after_before_training_metric, - ) - - if self.data.test_data_sets: - tloss = self.__validate_network( - self.network, - "test", - self.parameters.after_before_training_metric, - ) - - # Collect and average all the losses from all the devices - if self.parameters_full.use_ddp: - vloss = self.__average_validation( - vloss, "average_loss", self.parameters._configuration["device"] - ) - self.initial_validation_loss = vloss - if self.data.test_data_sets: - tloss = self.__average_validation( - tloss, - "average_loss", - self.parameters._configuration["device"], - ) - self.initial_test_loss = tloss - - printout( - "Initial Guess - validation data loss: ", vloss, min_verbosity=1 - ) - if self.data.test_data_sets: - printout( - "Initial Guess - test data loss: ", tloss, min_verbosity=1 - ) + vloss = float("inf") # Save losses for later use. self.initial_validation_loss = vloss - self.initial_test_loss = tloss # Initialize all the counters. checkpoint_counter = 0 @@ -310,12 +282,16 @@ def train_network(self): # PERFORM TRAINING ############################ + total_batch_id = 0 + for epoch in range(self.last_epoch, self.parameters.max_number_epochs): start_time = time.time() # Prepare model for training. self.network.train() + training_loss_sum_logging = 0.0 + # Process each mini batch and save the training loss. training_loss_sum = torch.zeros( 1, device=self.parameters._configuration["device"] @@ -337,7 +313,15 @@ def train_network(self): t0 = time.time() batchid = 0 for loader in self.training_data_loaders: - for inputs, outputs in loader: + t = time.time() + for inputs, outputs in tqdm( + loader, + desc="training", + disable=self.parameters_full.verbosity < 2, + total=len(loader), + ): + dt = time.time() - t + printout(f"load time: {dt}", min_verbosity=3) if self.parameters.profiler_range is not None: if batchid == self.parameters.profiler_range[0]: @@ -348,6 +332,7 @@ def train_network(self): torch.cuda.nvtx.range_push(f"step {batchid}") torch.cuda.nvtx.range_push("data copy in") + t = time.time() inputs = inputs.to( self.parameters._configuration["device"], non_blocking=True, @@ -356,6 +341,8 @@ def train_network(self): self.parameters._configuration["device"], non_blocking=True, ) + dt = time.time() - t + printout(f"data copy in time: {dt}", min_verbosity=3) # data copy in torch.cuda.nvtx.range_pop() @@ -365,11 +352,12 @@ def train_network(self): # step torch.cuda.nvtx.range_pop() training_loss_sum += loss + training_loss_sum_logging += loss.item() if ( batchid != 0 and (batchid + 1) - % self.parameters.training_report_frequency + % self.parameters.training_log_interval == 0 ): torch.cuda.synchronize( @@ -378,10 +366,10 @@ def train_network(self): sample_time = time.time() - tsample avg_sample_time = ( sample_time - / self.parameters.training_report_frequency + / self.parameters.training_log_interval ) avg_sample_tput = ( - self.parameters.training_report_frequency + self.parameters.training_log_interval * inputs.shape[0] / sample_time ) @@ -389,18 +377,45 @@ def train_network(self): f"batch {batchid + 1}, " # /{total_samples}, " f"train avg time: {avg_sample_time} " f"train avg throughput: {avg_sample_tput}", - min_verbosity=2, 
+ min_verbosity=3, ) tsample = time.time() + + # summary_writer tensor board + if self.parameters.logger == "tensorboard": + training_loss_mean = ( + training_loss_sum_logging + / self.parameters.training_log_interval + ) + self.logger.add_scalars( + "ldos", + {"during_training": training_loss_mean}, + total_batch_id, + ) + self.logger.close() + training_loss_sum_logging = 0.0 + if self.parameters.logger == "wandb": + training_loss_mean = ( + training_loss_sum_logging + / self.parameters.training_log_interval + ) + self.logger.log( + { + "ldos_during_training": training_loss_mean + }, + step=total_batch_id, + ) + training_loss_sum_logging = 0.0 + batchid += 1 + total_batch_id += 1 + t = time.time() torch.cuda.synchronize( self.parameters._configuration["device"] ) t1 = time.time() printout(f"training time: {t1 - t0}", min_verbosity=2) - training_loss = training_loss_sum.item() / batchid - # Calculate the validation loss. and output it. torch.cuda.synchronize( self.parameters._configuration["device"] @@ -419,14 +434,20 @@ def train_network(self): self.network, inputs, outputs ) batchid += 1 - training_loss = training_loss_sum.item() / batchid - - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.during_training_metric, + dataset_fractions = ["validation"] + if self.parameters.validate_on_training_data: + dataset_fractions.append("train") + errors = self._validate_network( + dataset_fractions, self.parameters.validation_metrics ) - + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + errors[dataset_fraction][metric] = np.mean( + errors[dataset_fraction][metric] + ) + vloss = errors["validation"][ + self.parameters.during_training_metric + ] if self.parameters_full.use_ddp: vloss = self.__average_validation( vloss, @@ -434,41 +455,37 @@ def train_network(self): self.parameters._configuration["device"], ) if self.parameters_full.verbosity > 1: - printout( - "Epoch {0}: validation data loss: {1}, " - "training data loss: {2}".format( - epoch, vloss, training_loss - ), - min_verbosity=2, - ) + printout("Errors:", errors, min_verbosity=2) else: printout( - "Epoch {0}: validation data loss: {1}".format( - epoch, vloss - ), + f"Epoch {epoch}: validation data loss: {vloss:.3e}", min_verbosity=1, ) - # summary_writer tensor board - if self.parameters.visualisation: - self.tensor_board.add_scalars( - "Loss", - {"validation": vloss, "training": training_loss}, - epoch, - ) - self.tensor_board.add_scalar( - "Learning rate", self.parameters.learning_rate, epoch - ) - if self.parameters.visualisation == 2: - for name, param in self.network.named_parameters(): - self.tensor_board.add_histogram(name, param, epoch) - self.tensor_board.add_histogram( - f"{name}.grad", param.grad, epoch + if self.parameters.logger == "tensorboard": + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + self.logger.add_scalars( + metric, + { + dataset_fraction: errors[dataset_fraction][ + metric + ] + }, + total_batch_id, + ) + self.logger.close() + if self.parameters.logger == "wandb": + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + self.logger.log( + { + f"{dataset_fraction}_{metric}": errors[ + dataset_fraction + ][metric] + }, + step=total_batch_id, ) - - # method to make sure that all pending events have been written - # to disk - self.tensor_board.close() if self.parameters._configuration["gpu"]: torch.cuda.synchronize( @@ -541,49 +558,141 @@ def train_network(self): 
############################ # CALCULATE FINAL METRICS ############################ - - if ( - self.parameters.after_before_training_metric - != self.parameters.during_training_metric - ): - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.after_before_training_metric, + if self.parameters.after_training_metric in errors["validation"]: + self.final_validation_loss = errors["validation"][ + self.parameters.after_training_metric + ] + else: + final_errors = self._validate_network( + ["validation"], [self.parameters.after_training_metric] ) + vloss = np.mean( + final_errors["validation"][ + self.parameters.after_training_metric + ] + ) + if self.parameters_full.use_ddp: vloss = self.__average_validation( vloss, "average_loss", self.parameters._configuration["device"], ) - - # Calculate final loss. - self.final_validation_loss = vloss - printout("Final validation data loss: ", vloss, min_verbosity=0) - - tloss = float("inf") - if len(self.data.test_data_sets) > 0: - tloss = self.__validate_network( - self.network, - "test", - self.parameters.after_before_training_metric, - ) - if self.parameters_full.use_ddp: - tloss = self.__average_validation( - tloss, - "average_loss", - self.parameters._configuration["device"], - ) - printout("Final test data loss: ", tloss, min_verbosity=0) - self.final_test_loss = tloss + self.final_validation_loss = vloss # Clean-up for pre-fetching lazy loading. if self.data.parameters.use_lazy_loading_prefetch: self.training_data_loaders.cleanup() self.validation_data_loaders.cleanup() - if len(self.data.test_data_sets) > 0: - self.test_data_loaders.cleanup() + + def _validate_network(self, data_set_fractions, metrics): + # """Validate a network, using train or validation data.""" + self.network.eval() + errors = {} + for data_set_type in data_set_fractions: + if data_set_type == "train": + data_loaders = self.training_data_loaders + data_sets = self.data.training_data_sets + number_of_snapshots = self.data.nr_training_snapshots + offset_snapshots = 0 + + elif data_set_type == "validation": + data_loaders = self.validation_data_loaders + data_sets = self.data.validation_data_sets + number_of_snapshots = self.data.nr_validation_snapshots + offset_snapshots = self.data.nr_training_snapshots + + elif data_set_type == "test": + raise Exception( + "You should not look at test set results during training" + ) + else: + raise Exception( + f"Dataset type ({data_set_type}) not recognized." 
+ ) + + errors[data_set_type] = {} + for metric in metrics: + errors[data_set_type][metric] = [] + + if isinstance(data_loaders, MultiLazyLoadDataLoader): + loader_id = 0 + for loader in data_loaders: + grid_size = self.data.parameters.snapshot_directories_list[ + loader_id + offset_snapshots + ].grid_size + + actual_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + last_start = 0 + + for x, y in loader: + + x = x.to(self.parameters._configuration["device"]) + length = int(x.size()[0]) + predicted_outputs[ + last_start : last_start + length, : + ] = self.data.output_data_scaler.inverse_transform( + self.network(x).to("cpu"), as_numpy=True + ) + actual_outputs[last_start : last_start + length, :] = ( + self.data.output_data_scaler.inverse_transform( + y, as_numpy=True + ) + ) + + last_start += length + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + loader_id + offset_snapshots, + ) + loader_id += 1 + else: + with torch.no_grad(): + for snapshot_number in trange( + offset_snapshots, + number_of_snapshots + offset_snapshots, + desc="Validation", + disable=self.parameters_full.verbosity < 2, + ): + # Get optimal batch size and number of batches per snapshotss + grid_size = ( + self.data.parameters.snapshot_directories_list[ + snapshot_number + ].grid_size + ) + + optimal_batch_size = ( + self._correct_batch_size_for_testing( + grid_size, self.parameters.mini_batch_size + ) + ) + number_of_batches_per_snapshot = int( + grid_size / optimal_batch_size + ) + + actual_outputs, predicted_outputs = ( + self._forward_entire_snapshot( + snapshot_number, + data_sets[0], + data_set_type[0:2], + number_of_batches_per_snapshot, + optimal_batch_size, + ) + ) + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + snapshot_number, + ) + return errors def __prepare_to_train(self, optimizer_dict): """Prepare everything for training.""" @@ -612,32 +721,30 @@ def __prepare_to_train(self, optimizer_dict): ) # Choose an optimizer to use. - if self.parameters.trainingtype == "SGD": + if self.parameters.optimizer == "SGD": self.optimizer = optim.SGD( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, ) - elif self.parameters.trainingtype == "Adam": + elif self.parameters.optimizer == "Adam": self.optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, ) - elif self.parameters.trainingtype == "FusedAdam": + elif self.parameters.optimizer == "FusedAdam": if version.parse(torch.__version__) >= version.parse("1.13.0"): self.optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, fused=True, ) else: - raise Exception( - "Training method requires at least torch 1.13.0." - ) + raise Exception("Optimizer requires " "at least torch 1.13.0.") else: - raise Exception("Unsupported training method.") + raise Exception("Unsupported optimizer.") # Load data from pytorch file. 
if optimizer_dict is not None: @@ -677,16 +784,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - self.test_sampler = ( - torch.utils.data.distributed.DistributedSampler( - self.data.test_data_sets[0], - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - shuffle=False, - ) - ) - # Instantiate the learning rate scheduler, if necessary. if self.parameters.learning_rate_scheduler == "ReduceLROnPlateau": self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( @@ -774,21 +871,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - if isinstance(self.data.test_data_sets[0], LazyLoadDatasetSingle): - self.test_data_loaders = MultiLazyLoadDataLoader( - self.data.test_data_sets, **kwargs - ) - else: - self.test_data_loaders.append( - DataLoader( - self.data.test_data_sets[0], - batch_size=self.parameters.mini_batch_size * 1, - sampler=self.test_sampler, - **kwargs, - ) - ) - def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" if self.parameters._configuration["gpu"]: @@ -870,7 +952,10 @@ def __process_mini_batch(self, network, input_data, target_data): enabled=self.parameters.use_mixed_precision ): torch.cuda.nvtx.range_push("forward") + t = time.time() prediction = network(input_data) + dt = time.time() - t + printout(f"forward time: {dt}", min_verbosity=3) # forward torch.cuda.nvtx.range_pop() @@ -881,6 +966,8 @@ def __process_mini_batch(self, network, input_data, target_data): ) else: loss = network.calculate_loss(prediction, target_data) + dt = time.time() - t + printout(f"loss time: {dt}", min_verbosity=3) # loss torch.cuda.nvtx.range_pop() @@ -889,12 +976,15 @@ def __process_mini_batch(self, network, input_data, target_data): else: loss.backward() + t = time.time() torch.cuda.nvtx.range_push("optimizer") if self.gradscaler: self.gradscaler.step(self.optimizer) self.gradscaler.update() else: self.optimizer.step() + dt = time.time() - t + printout(f"optimizer time: {dt}", min_verbosity=3) torch.cuda.nvtx.range_pop() # optimizer if self.train_graph: @@ -912,327 +1002,6 @@ def __process_mini_batch(self, network, input_data, target_data): self.optimizer.zero_grad() return loss - def __validate_network(self, network, data_set_type, validation_type): - """Validate a network, using test or validation data.""" - if data_set_type == "test": - data_loaders = self.test_data_loaders - data_sets = self.data.test_data_sets - number_of_snapshots = self.data.nr_test_snapshots - offset_snapshots = ( - self.data.nr_validation_snapshots - + self.data.nr_training_snapshots - ) - - elif data_set_type == "validation": - data_loaders = self.validation_data_loaders - data_sets = self.data.validation_data_sets - number_of_snapshots = self.data.nr_validation_snapshots - offset_snapshots = self.data.nr_training_snapshots - - else: - raise Exception( - "Please select test or validation when using this function." 
- ) - network.eval() - if validation_type == "ldos": - validation_loss_sum = torch.zeros( - 1, device=self.parameters._configuration["device"] - ) - with torch.no_grad(): - if self.parameters._configuration["gpu"]: - report_freq = self.parameters.training_report_frequency - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - tsample = time.time() - batchid = 0 - for loader in data_loaders: - for x, y in loader: - x = x.to( - self.parameters._configuration["device"], - non_blocking=True, - ) - y = y.to( - self.parameters._configuration["device"], - non_blocking=True, - ) - - if ( - self.parameters.use_graphs - and self.validation_graph is None - ): - printout( - "Capturing CUDA graph for validation.", - min_verbosity=2, - ) - s = torch.cuda.Stream( - self.parameters._configuration["device"] - ) - s.wait_stream( - torch.cuda.current_stream( - self.parameters._configuration[ - "device" - ] - ) - ) - # Warmup for graphs - with torch.cuda.stream(s): - for _ in range(20): - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - prediction = network(x) - if self.parameters_full.use_ddp: - loss = network.module.calculate_loss( - prediction, y - ) - else: - loss = network.calculate_loss( - prediction, y - ) - torch.cuda.current_stream( - self.parameters._configuration["device"] - ).wait_stream(s) - - # Create static entry point tensors to graph - self.static_input_validation = ( - torch.empty_like(x) - ) - self.static_target_validation = ( - torch.empty_like(y) - ) - - # Capture graph - self.validation_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self.validation_graph): - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - self.static_prediction_validation = ( - network( - self.static_input_validation - ) - ) - if self.parameters_full.use_ddp: - self.static_loss_validation = network.module.calculate_loss( - self.static_prediction_validation, - self.static_target_validation, - ) - else: - self.static_loss_validation = network.calculate_loss( - self.static_prediction_validation, - self.static_target_validation, - ) - - if self.validation_graph: - self.static_input_validation.copy_(x) - self.static_target_validation.copy_(y) - self.validation_graph.replay() - validation_loss_sum += ( - self.static_loss_validation - ) - else: - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - prediction = network(x) - if self.parameters_full.use_ddp: - loss = network.module.calculate_loss( - prediction, y - ) - else: - loss = network.calculate_loss( - prediction, y - ) - validation_loss_sum += loss - if ( - batchid != 0 - and (batchid + 1) % report_freq == 0 - ): - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - sample_time = time.time() - tsample - avg_sample_time = sample_time / report_freq - avg_sample_tput = ( - report_freq * x.shape[0] / sample_time - ) - printout( - f"batch {batchid + 1}, " # /{total_samples}, " - f"validation avg time: {avg_sample_time} " - f"validation avg throughput: {avg_sample_tput}", - min_verbosity=2, - ) - tsample = time.time() - batchid += 1 - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - else: - batchid = 0 - for loader in data_loaders: - for x, y in loader: - x = x.to(self.parameters._configuration["device"]) - y = y.to(self.parameters._configuration["device"]) - prediction = network(x) - if self.parameters_full.use_ddp: - validation_loss_sum += ( - network.module.calculate_loss( - prediction, y - ).item() - ) - 
else: - validation_loss_sum += network.calculate_loss( - prediction, y - ).item() - batchid += 1 - - validation_loss = validation_loss_sum.item() / batchid - return validation_loss - elif ( - validation_type == "band_energy" - or validation_type == "total_energy" - ): - errors = [] - if isinstance( - self.validation_data_loaders, MultiLazyLoadDataLoader - ): - loader_id = 0 - for loader in data_loaders: - grid_size = self.data.parameters.snapshot_directories_list[ - loader_id + offset_snapshots - ].grid_size - - actual_outputs = np.zeros( - (grid_size, self.data.output_dimension) - ) - predicted_outputs = np.zeros( - (grid_size, self.data.output_dimension) - ) - last_start = 0 - - for x, y in loader: - - x = x.to(self.parameters._configuration["device"]) - length = int(x.size()[0]) - predicted_outputs[ - last_start : last_start + length, : - ] = self.data.output_data_scaler.inverse_transform( - self.network(x).to("cpu"), as_numpy=True - ) - actual_outputs[last_start : last_start + length, :] = ( - self.data.output_data_scaler.inverse_transform( - y, as_numpy=True - ) - ) - - last_start += length - errors.append( - self._calculate_energy_errors( - actual_outputs, - predicted_outputs, - validation_type, - loader_id + offset_snapshots, - ) - ) - loader_id += 1 - - else: - for snapshot_number in range( - offset_snapshots, number_of_snapshots + offset_snapshots - ): - # Get optimal batch size and number of batches per snapshotss - grid_size = self.data.parameters.snapshot_directories_list[ - snapshot_number - ].grid_size - - optimal_batch_size = self._correct_batch_size_for_testing( - grid_size, self.parameters.mini_batch_size - ) - number_of_batches_per_snapshot = int( - grid_size / optimal_batch_size - ) - - actual_outputs, predicted_outputs = ( - self._forward_entire_snapshot( - snapshot_number, - data_sets[0], - data_set_type[0:2], - number_of_batches_per_snapshot, - optimal_batch_size, - ) - ) - - errors.append( - self._calculate_energy_errors( - actual_outputs, - predicted_outputs, - validation_type, - snapshot_number, - ) - ) - return np.mean(errors) - else: - raise Exception("Selected validation method not supported.") - - def _calculate_energy_errors( - self, actual_outputs, predicted_outputs, energy_type, snapshot_number - ): - self.data.target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - if energy_type == "band_energy": - try: - fe_actual = self.data.target_calculator.get_self_consistent_fermi_energy( - actual_outputs - ) - be_actual = self.data.target_calculator.get_band_energy( - actual_outputs, fermi_energy=fe_actual - ) - - fe_predicted = self.data.target_calculator.get_self_consistent_fermi_energy( - predicted_outputs - ) - be_predicted = self.data.target_calculator.get_band_energy( - predicted_outputs, fermi_energy=fe_predicted - ) - return np.abs(be_predicted - be_actual) * ( - 1000 / len(self.data.target_calculator.atoms) - ) - except ValueError: - # If the training went badly, it might be that the above - # code results in an error, due to the LDOS being so wrong - # that the estimation of the self consistent Fermi energy - # fails. 
- return float("inf") - elif energy_type == "total_energy": - try: - fe_actual = self.data.target_calculator.get_self_consistent_fermi_energy( - actual_outputs - ) - be_actual = self.data.target_calculator.get_total_energy( - ldos_data=actual_outputs, fermi_energy=fe_actual - ) - - fe_predicted = self.data.target_calculator.get_self_consistent_fermi_energy( - predicted_outputs - ) - be_predicted = self.data.target_calculator.get_total_energy( - ldos_data=predicted_outputs, fermi_energy=fe_predicted - ) - return np.abs(be_predicted - be_actual) * ( - 1000 / len(self.data.target_calculator.atoms) - ) - except ValueError: - # If the training went badly, it might be that the above - # code results in an error, due to the LDOS being so wrong - # that the estimation of the self consistent Fermi energy - # fails. - return float("inf") - - else: - raise Exception("Invalid energy type requested.") - def __create_training_checkpoint(self): """ Create a checkpoint during training. @@ -1265,8 +1034,14 @@ def __create_training_checkpoint(self): torch.save( save_dict, optimizer_name, _use_new_zipfile_serialization=False ) - - self.save_run(self.parameters.checkpoint_name, save_runner=True) + if self.parameters.run_name != "": + self.save_run( + self.parameters.checkpoint_name, + save_runner=True, + save_path=self.parameters.run_name, + ) + else: + self.save_run(self.parameters.checkpoint_name, save_runner=True) @staticmethod def __average_validation(val, name, device="cpu"): diff --git a/test/all_lazy_loading_test.py b/test/all_lazy_loading_test.py index 065cbb86e..351c98292 100644 --- a/test/all_lazy_loading_test.py +++ b/test/all_lazy_loading_test.py @@ -38,7 +38,7 @@ def test_scaling(self): test_parameters.running.max_number_epochs = 3 test_parameters.running.mini_batch_size = 512 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Lazy loading test." test_parameters.network.nn_type = "feed-forward" test_parameters.running.use_gpu = True @@ -157,10 +157,7 @@ def test_scaling(self): test_parameters, test_network, data_handler ) test_trainer.train_network() - training_tester.append( - test_trainer.final_test_loss - - test_trainer.initial_test_loss - ) + training_tester.append(test_trainer.final_validation_loss) elif scalingtype == "feature-wise-standard": # The lazy-loading STD equation (and to a smaller amount the @@ -269,7 +266,7 @@ def test_performance_horovod(self): test_parameters.network.layer_activations = ["LeakyReLU"] test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 500 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Horovod / lazy loading benchmark." 
test_parameters.network.nn_type = "feed-forward" test_parameters.manual_seed = 2021 @@ -352,8 +349,8 @@ def test_performance_horovod(self): [ hvdstring, llstring, - test_trainer.initial_test_loss, - test_trainer.final_test_loss, + test_trainer.initial_validation_loss, + test_trainer.final_validation_loss, time.time() - start_time, ] ) @@ -400,8 +397,8 @@ def _train_lazy_loading(prefetching): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" - test_parameters.verbosity = 2 + test_parameters.running.optimizer = "Adam" + test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True test_parameters.data.use_lazy_loading_prefetch = prefetching diff --git a/test/basic_gpu_test.py b/test/basic_gpu_test.py index dcd588ad1..514a70f21 100644 --- a/test/basic_gpu_test.py +++ b/test/basic_gpu_test.py @@ -91,7 +91,7 @@ def __run(use_gpu): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.manual_seed = 1002 test_parameters.running.use_shuffling_for_samplers = False test_parameters.use_gpu = use_gpu @@ -150,4 +150,4 @@ def __run(use_gpu): starttime = time.time() test_trainer.train_network() - return test_trainer.final_test_loss, time.time() - starttime + return test_trainer.final_validation_loss, time.time() - starttime diff --git a/test/checkpoint_hyperopt_test.py b/test/checkpoint_hyperopt_test.py index 28889c2df..a1909f21b 100644 --- a/test/checkpoint_hyperopt_test.py +++ b/test/checkpoint_hyperopt_test.py @@ -67,7 +67,7 @@ def __original_setup(n_trials): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" # Specify the number of trials, the hyperparameter optimizer should run # and the type of hyperparameter. diff --git a/test/checkpoint_training_test.py b/test/checkpoint_training_test.py index 4c56ed8eb..3bc5e83e3 100644 --- a/test/checkpoint_training_test.py +++ b/test/checkpoint_training_test.py @@ -20,7 +20,7 @@ def test_general(self): # First run the entire test. trainer = self.__original_setup(test_checkpoint_name, 40) trainer.train_network() - original_final_test_loss = trainer.final_test_loss + original_final_validation_loss = trainer.final_validation_loss # Now do the same, but cut at epoch 22 and see if it recovers the # correct result. 
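Note on usage: the test updates in this part of the series, together with the __prepare_to_train hunk earlier, reflect two renamed running parameters: trainingtype becomes optimizer, and weight_decay becomes l2_regularization; the categorical hyperparameter is registered under the new name "optimizer" as well. A short, hedged sketch of the updated spelling follows; the attribute names are taken from the diffs, while the concrete values are illustrative only.

    import mala

    parameters = mala.Parameters()
    parameters.running.max_number_epochs = 10
    parameters.running.mini_batch_size = 40
    parameters.running.learning_rate = 1e-5
    parameters.running.optimizer = "Adam"        # was: parameters.running.trainingtype
    parameters.running.l2_regularization = 0.0   # was: parameters.running.weight_decay

    # Hyperparameter searches use the renamed key too, as in test/hyperopt_test.py:
    # hyperoptimizer.add_hyperparameter("categorical", "optimizer", choices=["Adam", "SGD"])
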
@@ -28,9 +28,11 @@ def test_general(self): trainer.train_network() trainer = self.__resume_checkpoint(test_checkpoint_name, 40) trainer.train_network() - new_final_test_loss = trainer.final_test_loss + new_final_validation_loss = trainer.final_validation_loss assert np.isclose( - original_final_test_loss, new_final_test_loss, atol=accuracy + original_final_validation_loss, + new_final_validation_loss, + atol=accuracy, ) def test_learning_rate(self): @@ -144,7 +146,7 @@ def __original_setup( test_parameters.running.max_number_epochs = maxepochs test_parameters.running.mini_batch_size = 38 test_parameters.running.learning_rate = learning_rate - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.running.learning_rate_scheduler = ( learning_rate_scheduler ) diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py index d793da77f..8aa7da85d 100644 --- a/test/complete_interfaces_test.py +++ b/test/complete_interfaces_test.py @@ -114,7 +114,7 @@ def test_ase_calculator(self): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.targets.target_type = "LDOS" test_parameters.targets.ldos_gridsize = 11 test_parameters.targets.ldos_gridspacing_ev = 2.5 @@ -123,9 +123,7 @@ def test_ase_calculator(self): test_parameters.descriptors.descriptor_type = "Bispectrum" test_parameters.descriptors.bispectrum_twojmax = 10 test_parameters.descriptors.bispectrum_cutoff = 4.67637 - test_parameters.targets.pseudopotential_path = os.path.join( - data_repo_path, "Be2" - ) + test_parameters.targets.pseudopotential_path = data_path #################### # DATA diff --git a/test/examples_test.py b/test/examples_test.py index b5aa9143a..4a83dd538 100644 --- a/test/examples_test.py +++ b/test/examples_test.py @@ -6,6 +6,7 @@ import pytest + @pytest.mark.examples class TestExamples: dir_path = os.path.dirname(__file__) @@ -13,96 +14,85 @@ class TestExamples: def test_basic_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex01_train_network.py" + self.dir_path + "/../examples/basic/ex01_train_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex02(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex02_test_network.py" + self.dir_path + "/../examples/basic/ex02_test_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex03_preprocess_data.py" + self.dir_path + "/../examples/basic/ex03_preprocess_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex04(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex04_hyperparameter_optimization.py" + self.dir_path + + "/../examples/basic/ex04_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex05_run_predictions.py" + self.dir_path + "/../examples/basic/ex05_run_predictions.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex06_ase_calculator.py" + self.dir_path + 
"/../examples/basic/ex06_ase_calculator.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex01_checkpoint_training.py" + self.dir_path + "/../examples/advanced/ex01_checkpoint_training.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex02(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex02_shuffle_data.py" + self.dir_path + "/../examples/advanced/ex02_shuffle_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex03_tensor_board.py" + self.dir_path + "/../examples/advanced/ex03_tensor_board.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex04(self, tmp_path): os.chdir(tmp_path) - runpy.run_path( - self.dir_path + - "/../examples/advanced/ex04_acsd.py" - ) + runpy.run_path(self.dir_path + "/../examples/advanced/ex04_acsd.py") @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" ) @pytest.mark.skipif( @@ -113,14 +103,14 @@ def test_advanced_ex06(self, tmp_path): def test_advanced_ex07(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex08(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex08_visualize_observables.py" + self.dir_path + + "/../examples/advanced/ex08_visualize_observables.py" ) diff --git a/test/hyperopt_test.py b/test/hyperopt_test.py index bb003082a..77b0b9896 100644 --- a/test/hyperopt_test.py +++ b/test/hyperopt_test.py @@ -42,7 +42,7 @@ def test_hyperopt(self): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" @@ -133,7 +133,7 @@ def test_distributed_hyperopt(self): test_parameters.running.max_number_epochs = 5 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" @@ -242,7 +242,7 @@ def test_naswot_eigenvalues(self): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" 
test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = "naswot" @@ -310,7 +310,7 @@ def __optimize_hyperparameters(hyper_optimizer): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -352,7 +352,7 @@ def __optimize_hyperparameters(hyper_optimizer): # If we do a NASWOT run currently we can provide an input # array of trials. test_hp_optimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) test_hp_optimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] @@ -375,7 +375,7 @@ def __optimize_hyperparameters(hyper_optimizer): ) test_trainer.train_network() test_parameters.show() - return test_trainer.final_test_loss + return test_trainer.final_validation_loss def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): @@ -391,7 +391,7 @@ def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): test_parameters.running.max_number_epochs = 2 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 2 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" diff --git a/test/shuffling_test.py b/test/shuffling_test.py index e637c7d2b..72d28d6ef 100644 --- a/test/shuffling_test.py +++ b/test/shuffling_test.py @@ -124,7 +124,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -168,7 +168,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True data_shuffler = mala.DataShuffler(test_parameters) @@ -220,7 +220,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -266,7 +266,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True diff --git a/test/workflow_test.py b/test/workflow_test.py index fa7dee018..8cc33faf6 100644 --- a/test/workflow_test.py +++ b/test/workflow_test.py @@ -29,28 +29,19 @@ def test_network_training(self): """Test whether MALA can train a NN.""" test_trainer = 
self.__simple_training() - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_openpmd(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_openpmd_data=True) - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_fast_dataset(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_fast_tensor_dataset=True) - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_preprocessing(self): """ @@ -191,16 +182,8 @@ def test_postprocessing_from_dos(self): self_consistent_fermi_energy = dos.get_self_consistent_fermi_energy( dos_data ) - number_of_electrons = dos.get_number_of_electrons( - dos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = dos.get_band_energy(dos_data) - assert np.isclose( - number_of_electrons, - dos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, dos.band_energy_dft_calculation, @@ -232,18 +215,10 @@ def test_postprocessing(self): self_consistent_fermi_energy = ldos.get_self_consistent_fermi_energy( ldos_data ) - number_of_electrons = ldos.get_number_of_electrons( - ldos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = ldos.get_band_energy( ldos_data, fermi_energy=self_consistent_fermi_energy ) - assert np.isclose( - number_of_electrons, - ldos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, ldos.band_energy_dft_calculation, @@ -403,13 +378,12 @@ def test_training_with_postprocessing_data_repo(self): data_handler.prepare_data(reparametrize_scaler=False) # Instantiate and use a Tester object. - tester.observables_to_test = ["band_energy", "number_of_electrons"] + tester.observables_to_test = ["band_energy"] errors = tester.test_snapshot(0) # Check whether the prediction is accurate enough. - assert np.isclose(errors["band_energy"], 0, atol=accuracy_predictions) assert np.isclose( - errors["number_of_electrons"], 0, atol=accuracy_predictions + errors["band_energy"], 0, atol=accuracy_predictions * 1000 ) @pytest.mark.skipif( @@ -460,9 +434,6 @@ def test_predictions(self): band_energy_tester_class = ldos_calculator.get_band_energy( predicted_ldos ) - nr_electrons_tester_class = ldos_calculator.get_number_of_electrons( - predicted_ldos - ) #################### # Now, use the predictor class to make the same prediction. 
@@ -478,12 +449,6 @@ def test_predictions(self): ldos_calculator.read_additional_calculation_data( os.path.join(data_path, "Be_snapshot3.out"), "espresso-out" ) - - nr_electrons_predictor_class = ( - data_handler.target_calculator.get_number_of_electrons( - predicted_ldos - ) - ) band_energy_predictor_class = ( data_handler.target_calculator.get_band_energy(predicted_ldos) ) @@ -493,11 +458,6 @@ def test_predictions(self): band_energy_tester_class, atol=accuracy_strict, ) - assert np.isclose( - nr_electrons_predictor_class, - nr_electrons_tester_class, - atol=accuracy_strict, - ) @pytest.mark.skipif( importlib.util.find_spec("total_energy") is None @@ -568,7 +528,7 @@ def __simple_training( test_parameters.running.max_number_epochs = 400 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.data.use_fast_tensor_data_set = use_fast_tensor_dataset # Load data.