Skip to content

Commit

Permalink
added dlc-arceus to conf + created a partition specific for high VRAM…
Browse files Browse the repository at this point in the history
… machines + added lua submission plugin
  • Loading branch information
stephandooper committed Aug 20, 2024
1 parent a830ad8 commit 20b1550
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
7 changes: 6 additions & 1 deletion playbooks/slurm-cluster/templates/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,20 @@ NodeName=dlc-lugia Gres=gpu:8 CPUs=88 Sockets=2 CoresPerSocket=22 ThreadsPerCore
# Each NodeName entry declares one host: its GPU count (Gres=gpu:N), its CPU
# topology (Sockets, CoresPerSocket, ThreadsPerCore) and its RealMemory.
# On every entry below, CPUs equals Sockets * CoresPerSocket * ThreadsPerCore,
# while Procs is exactly half of CPUs.
# NOTE(review): Procs looks like a legacy alias of CPUs in slurm.conf, so the
# two values may conflict -- confirm which one the controller honours.
NodeName=dlc-moltres Gres=gpu:8 CPUs=88 Sockets=2 CoresPerSocket=22 ThreadsPerCore=2 Procs=44 RealMemory=256836 State=UNKNOWN
NodeName=dlc-zapdos Gres=gpu:8 CPUs=88 Sockets=2 CoresPerSocket=22 ThreadsPerCore=2 Procs=44 RealMemory=248000 State=UNKNOWN
NodeName=dlc-meowth Gres=gpu:4 CPUs=96 Sockets=2 CoresPerSocket=24 ThreadsPerCore=2 Procs=48 RealMemory=248000 State=UNKNOWN
# dlc-arceus: 8 GPUs, 256 logical CPUs (2 sockets x 64 cores x 2 threads).
# NOTE(review): RealMemory=1000000 is a round figure -- presumably an estimate;
# verify it does not exceed what the node actually reports.
NodeName=dlc-arceus Gres=gpu:8 CPUs=256 Sockets=2 CoresPerSocket=64 ThreadsPerCore=2 Procs=128 RealMemory=1000000 State=UNKNOWN

# The partitions and their default memory per CPU (DefMemPerCPU) are hardcoded below.
# TODO: define the partitions automatically from each node's resources
# TODO: set DefMemPerCPU = TotalMemory / LogicalCPUs
# Partition definitions. Every entry uses DefMemPerCPU=0, OverSubscribe=NO,
# MaxTime=7-0:00 and DefaultTime=0-4:00 (days-hours:minutes, i.e. a 7-day cap
# and a 4-hour default).
# batch and normal both span Nodes=ALL and are both marked Default=YES.
# NOTE(review): in this diff view the two lines look like the removed/added
# halves of a rename (batch -> normal); confirm that only one default
# partition remains in the rendered slurm.conf.
PartitionName=batch Nodes=ALL Default=YES DefMemPerCPU=0 State=UP OverSubscribe=NO MaxTime=7-0:00 DefaultTime=0-4:00
PartitionName=normal Nodes=ALL Default=YES DefMemPerCPU=0 State=UP OverSubscribe=NO MaxTime=7-0:00 DefaultTime=0-4:00
# high_vram is opt-in (Default=NO) and restricted to dlc-groudon, dlc-arceus
# and dlc-meowth; its limits otherwise match the partitions above.
PartitionName=high_vram Nodes=dlc-groudon,dlc-arceus,dlc-meowth Default=NO DefMemPerCPU=0 State=UP OverSubscribe=NO MaxTime=7-0:00 DefaultTime=0-4:00

# QoS settings
# Preemption is driven by QOS (PreemptType=preempt/qos), and preempted jobs are
# requeued rather than cancelled (PreemptMode=REQUEUE).
PreemptMode=REQUEUE
PreemptType=preempt/qos
# JobRequeue=1 makes batch jobs requeueable by default, which REQUEUE relies on.
JobRequeue=1
# NOTE(review): 0 presumably means jobs get no guaranteed run time before they
# become preemptible -- confirm against the slurm.conf documentation.
PreemptExemptTime=0
# NOTE(review): youngest_first presumably favours preempting the most recently
# started jobs; check the exact semantics of strict_order before relying on it.
PreemptParameters=strict_order,youngest_first

# SUBMISSION FILTERS
# Enables the lua job-submit plugin so that submissions pass through a Lua
# filter script.
# NOTE(review): the plugin presumably expects a job_submit.lua script next to
# slurm.conf -- confirm the playbook deploys it, otherwise submissions may fail.
JobSubmitPlugins=lua
2 changes: 1 addition & 1 deletion roles/slurm/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ slurm_install_prefix: /usr/local
# CUDA install location; passed to configure via --with-nvml in
# slurm_configure_nvml below.
slurm_cuda_prefix: /usr/local/cuda
# Configure command for a Slurm build without NVML support.
slurm_configure: './configure --prefix={{ slurm_install_prefix }} --disable-dependency-tracking --disable-debug --disable-x11 --enable-really-no-cray --enable-salloc-kill-cmd --with-hdf5=no --sysconfdir={{ slurm_config_dir }} --enable-pam --with-pam_dir={{ slurm_pam_lib_dir }} --with-shared-libslurm --without-rpath --with-pmix={{ pmix_install_prefix }} --with-hwloc={{ hwloc_install_prefix }}'
# Same command as slurm_configure with --with-nvml={{ slurm_cuda_prefix }} appended.
slurm_configure_nvml: './configure --prefix={{ slurm_install_prefix }} --disable-dependency-tracking --disable-debug --disable-x11 --enable-really-no-cray --enable-salloc-kill-cmd --with-hdf5=no --sysconfdir={{ slurm_config_dir }} --enable-pam --with-pam_dir={{ slurm_pam_lib_dir }} --with-shared-libslurm --without-rpath --with-pmix={{ pmix_install_prefix }} --with-hwloc={{ hwloc_install_prefix }} --with-nvml={{ slurm_cuda_prefix }}'
# NOTE(review): slurm_force_rebuild appears twice because this is a diff view
# (old value `no`, new value `yes`); the real file should contain only one.
# Presumably `yes` forces a rebuild on every run -- confirm this was meant to
# be committed as the default rather than set per invocation.
slurm_force_rebuild: no
slurm_force_rebuild: yes
slurm_contain_ssh: yes

# Cluster name. NOTE(review): presumably rendered into ClusterName in
# slurm.conf -- verify against the template.
slurm_cluster_name: deepops
Expand Down

0 comments on commit 20b1550

Please sign in to comment.