From 97df67951639a72f6c589dfb8c7bb982856059e4 Mon Sep 17 00:00:00 2001 From: Sander Vandenhaute Date: Sun, 21 Jul 2024 12:49:52 -0400 Subject: [PATCH] tweak cp2k setup --- Dockerfile.cp2k | 8 ++++---- build_containers.sh | 1 + configs/hortense.yaml | 6 +++--- configs/lumi.yaml | 2 +- examples/hortense.yaml | 2 +- examples/lumi.yaml | 2 +- psiflow/models/_mace.py | 2 ++ 7 files changed, 13 insertions(+), 10 deletions(-) diff --git a/Dockerfile.cp2k b/Dockerfile.cp2k index afb5c29..219bc0e 100644 --- a/Dockerfile.cp2k +++ b/Dockerfile.cp2k @@ -18,21 +18,20 @@ RUN git clone --recursive -b support/v2024.1 https://github.com/cp2k/cp2k.git /o # Build CP2K toolchain for target CPU generic WORKDIR /opt/cp2k/tools/toolchain RUN /bin/bash -c -o pipefail \ - "./install_cp2k_toolchain.sh -j 12 \ + "./install_cp2k_toolchain.sh -j 8 \ --install-all \ --enable-cuda=no \ --target-cpu=generic \ --with-cusolvermp=no \ --with-gcc=system \ - --with-mpich=system \ - --with-libtorch=no" + --with-mpich=system" # Build CP2K for target CPU generic WORKDIR /opt/cp2k RUN /bin/bash -c -o pipefail \ "cp ./tools/toolchain/install/arch/local.psmp ./arch/; \ source ./tools/toolchain/install/setup; \ - make -j 12 ARCH=local VERSION=psmp" + make -j 8 ARCH=local VERSION=psmp" # Collect components for installation and remove symbolic links RUN /bin/bash -c -o pipefail \ @@ -84,6 +83,7 @@ RUN printf "#!/bin/bash\n\ ulimit -c 0 -s unlimited\n\ \ export OMP_STACKSIZE=16M\n\ +export OMP_NUM_THREADS=1\n\ export PATH=/opt/cp2k/exe/local:\${PATH}\n\ source /opt/cp2k/tools/toolchain/install/setup\n\ \"\$@\"" \ diff --git a/build_containers.sh b/build_containers.sh index d36ca0a..1c053be 100644 --- a/build_containers.sh +++ b/build_containers.sh @@ -12,6 +12,7 @@ psiflow=false gpaw=false cp2k=false build_sif=false +#mpi=mpich # Parse command line options while [[ $# -gt 0 ]]; do diff --git a/configs/hortense.yaml b/configs/hortense.yaml index 241deac..c2a4088 100644 --- a/configs/hortense.yaml +++ 
b/configs/hortense.yaml @@ -28,14 +28,14 @@ ModelTraining: walltime: "12:00:00" scheduler_options: "#SBATCH --clusters=dodrio\n#SBATCH --gpus=1\n" CP2K: - cores_per_worker: 128 + cores_per_worker: 64 max_evaluation_time: 30 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 32 --bind-to core --map-by core -x OMP_NUM_THREADS=1 cp2k.psmp -i cp2k.inp' + launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 -bind-to core cp2k.psmp -i cp2k.inp' slurm: partition: "cpu_rome" account: "2024_079" nodes_per_block: 1 - cores_per_node: 32 + cores_per_node: 64 max_blocks: 25 walltime: "06:00:00" scheduler_options: "#SBATCH --clusters=dodrio\n" diff --git a/configs/lumi.yaml b/configs/lumi.yaml index d1c5166..2cb326b 100644 --- a/configs/lumi.yaml +++ b/configs/lumi.yaml @@ -6,7 +6,7 @@ default_threads: 8 CP2K: cores_per_worker: 32 max_evaluation_time: 20 - launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 32 -x OMP_NUM_THREADS=1 cp2k.psmp -i cp2k.inp' + launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 cp2k.psmp -i cp2k.inp' slurm: partition: "standard" account: "project_465001125" diff --git a/examples/hortense.yaml b/examples/hortense.yaml index f845b73..9533085 100644 --- a/examples/hortense.yaml +++ b/examples/hortense.yaml @@ -30,7 +30,7 @@ ModelTraining: CP2K: cores_per_worker: 64 max_evaluation_time: 30 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 32 --bind-to core --map-by core -x OMP_NUM_THREADS=1 cp2k.psmp -i cp2k.inp' + launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 -bind-to core cp2k.psmp -i cp2k.inp' slurm: partition: "cpu_rome" account: "2024_079" diff --git a/examples/lumi.yaml b/examples/lumi.yaml index 622c134..4b107d4 
100644 --- a/examples/lumi.yaml +++ b/examples/lumi.yaml @@ -6,7 +6,7 @@ default_threads: 8 CP2K: cores_per_worker: 32 max_evaluation_time: 20 - launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 32 -x OMP_NUM_THREADS=1 cp2k.psmp -i cp2k.inp' + launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 cp2k.psmp -i cp2k.inp' slurm: partition: "standard" account: "project_465001125" diff --git a/psiflow/models/_mace.py b/psiflow/models/_mace.py index 4dac9e7..c761f6a 100644 --- a/psiflow/models/_mace.py +++ b/psiflow/models/_mace.py @@ -208,6 +208,8 @@ def __init__(self, **config) -> None: config = MACEConfig(**config) # validate input config.save_cpu = True # assert model is saved to CPU after training config.device = "cpu" + if not config.swa: + config.start_swa = int(1e10) # otherwise it fails to read self._config = asdict(config) self.model_future = None