Skip to content

Commit

Permalink
Use official SchedMD Debian packages to install Slurm (#8)
Browse files Browse the repository at this point in the history
* Use official SchedMD Debian packages to install Slurm

Update config files to accommodate newer version of Slurm.

* Make the Dockerfile architecture independent

* Add step to build frontend on ARM

* Add debug step to CI

* Update to ubuntu 22.04

* Fix inconsistent cores per socket

* Remove duplicate clustername from slurm.conf

* Remove unused env from CI workflow config

* Use native arm runners for the arm builds and merge digests

* Fix frontend digest names

* Add shared munge key for slurm communication across containers

Update image to ghcr.io.

* Actually add the key, not just update the Dockerfiles

* Allow up to 10 compute nodes

* Remove testing volume mount on frontend

* Remove leftover comment from CI workflow

* Remove unnecessary action and update checkout action version
  • Loading branch information
christopherwharrop-noaa authored Jun 17, 2024
1 parent a00fae4 commit 5e0ceee
Show file tree
Hide file tree
Showing 17 changed files with 856 additions and 125 deletions.
660 changes: 627 additions & 33 deletions .github/workflows/docker.yml

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ services:
build:
context: ./frontend
dockerfile: Dockerfile
image: noaagsl/slurm-frontend:latest
image: ghcr.io/noaa-gsl/dockerslurmcluster/slurm-frontend:latest
container_name: slurm-frontend
shm_size: '4g'
hostname: slurmfrontend
Expand All @@ -16,7 +16,7 @@ services:
build:
context: ./master
dockerfile: Dockerfile
image: noaagsl/slurm-master:latest
image: ghcr.io/noaa-gsl/dockerslurmcluster/slurm-master:latest
container_name: slurm-master
shm_size: '4g'
hostname: slurmmaster
Expand All @@ -33,7 +33,7 @@ services:
build:
context: ./node
dockerfile: Dockerfile
image: noaagsl/slurm-node:latest
image: ghcr.io/noaa-gsl/dockerslurmcluster/slurm-node:latest
container_name: slurm-node1
shm_size: '4g'
hostname: slurmnode1
Expand All @@ -49,7 +49,7 @@ services:
build:
context: ./node
dockerfile: Dockerfile
image: noaagsl/slurm-node:latest
image: ghcr.io/noaa-gsl/dockerslurmcluster/slurm-node:latest
container_name: slurm-node2
shm_size: '4g'
hostname: slurmnode2
Expand All @@ -65,7 +65,7 @@ services:
build:
context: ./node
dockerfile: Dockerfile
image: noaagsl/slurm-node:latest
image: ghcr.io/noaa-gsl/dockerslurmcluster/slurm-node:latest
container_name: slurm-node3
shm_size: '4g'
hostname: slurmnode3
Expand Down
52 changes: 44 additions & 8 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04

RUN apt-get update -y && apt-get install -y \
build-essential \
Expand All @@ -10,34 +10,70 @@ RUN apt-get update -y && apt-get install -y \

# Install the OS packages needed by the later steps of this image.
# DEBIAN_FRONTEND is declared as a build ARG so apt runs without prompts
# during the build; an ARG is build-time only and is not part of the
# container's runtime environment.
# This step relies on the package lists fetched by the earlier
# `apt-get update` layer; it does not refresh them itself.
# NOTE(review): slurm-client is listed here while slurm-smd-client is also
# installed from the locally built packages in a later step — confirm that
# both are intended.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get install -y \
devscripts \
equivs \
fakeroot \
libbpf-dev \
libdbus-1-dev \
libhwloc-dev \
openssh-server \
slurm-client \
sudo


# Build Slurm from the SchedMD source tarball into Debian packages and
# install the client-side set on the frontend.
#
# * SLURM_VER / PKG_VER are plain shell variables local to this RUN, so the
#   release only has to be bumped in one place.
# * mk-build-deps installs the build dependencies declared in
#   debian/control; debuild then produces unsigned binary packages in the
#   parent directory (/tmp).
# * ARCH comes from dpkg so the same Dockerfile works on any architecture
#   the build runs on.
# * The final rm runs in the same layer as the build, so the source tree,
#   tarball and generated package files are not kept in the image.
# `cd` is used instead of WORKDIR on purpose: changing WORKDIR here would
# alter the working directory seen by every later instruction.
RUN cd /tmp \
    && SLURM_VER=23.11.7 \
    && PKG_VER=${SLURM_VER}-1 \
    && wget https://download.schedmd.com/slurm/slurm-${SLURM_VER}.tar.bz2 \
    && tar -xaf slurm-${SLURM_VER}.tar.bz2 \
    && cd slurm-${SLURM_VER} \
    && mk-build-deps -t "apt-get -y" -i debian/control \
    && debuild -b -uc -us \
    && cd .. \
    && ARCH=$(dpkg --print-architecture) \
    && dpkg --install slurm-smd_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-client_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-dev_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-doc_${PKG_VER}_all.deb \
    && dpkg --install slurm-smd-libnss-slurm_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpam-slurm-adopt_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpmi0_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpmi2-0_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libslurm-perl_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-sackd_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-sview_${PKG_VER}_${ARCH}.deb \
    && rm -rf /tmp/slurm-${SLURM_VER} /tmp/slurm-${SLURM_VER}.tar.bz2 /tmp/slurm-smd*

# Create the interactive "admin" account: home directory /home/admin, bash
# login shell, password "admin", member of the sudo group, and a sudoers
# entry allowing passwordless sudo for every command.
# NOTE(review): the password is hard-coded in the image; acceptable only
# for a local test cluster — confirm this image is never exposed publicly.
RUN useradd -m admin -s /usr/bin/bash -d /home/admin \
&& echo "admin:admin" | chpasswd \
&& adduser admin sudo \
&& echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Create the "slurm" account (home /home/slurm, password "slurm"); the
# Slurm configuration directory is chown'ed to this user later in the file.
RUN useradd -m slurm -s /usr/bin/bash -d /home/slurm \
&& echo "slurm:slurm" | chpasswd

# Prepare the SSH server:
#  - create /var/run/sshd;
#  - rewrite the pam_loginuid.so session rule in /etc/pam.d/sshd from
#    "required" to "optional" (presumably so SSH logins are not rejected
#    inside a container — verify);
#  - clear the execute bit on the /etc/update-motd.d scripts and remove
#    /etc/legal (presumably to suppress the login banner/MOTD — verify).
RUN mkdir /var/run/sshd \
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd \
&& chmod -x /etc/update-motd.d/* \
&& rm -f /etc/legal

# Copy the munge key, the Slurm configuration files and the container
# entrypoint script from the build context into the image.
# NOTE(review): munge.key is a credential and is baked into an image layer
# here; the commit message describes it as a shared key for communication
# across containers, so this appears deliberate for a test cluster — confirm
# it is never reused outside of it.
COPY munge.key /etc/munge/
COPY slurm.conf /etc/slurm/
COPY cgroup.conf /etc/slurm/
COPY docker-entrypoint.sh /etc/slurm/

# Set ownership and permissions on the files copied into /etc/munge and
# /etc/slurm:
#  - the munge key is owned by munge and readable/writable by its owner only;
#  - /etc/slurm and its *.conf files are owned by slurm with mode 775.
# The munge key path is absolute: the previous relative form
# (etc/munge/munge.key) only resolved while the build's working directory
# was "/", and would break as soon as a WORKDIR was introduced above.
RUN chown munge:munge /etc/munge/munge.key \
    && chown slurm:slurm /etc/slurm \
    && chown slurm:slurm /etc/slurm/*.conf \
    && chmod 600 /etc/munge/munge.key \
    && chmod 775 /etc/slurm \
    && chmod 775 /etc/slurm/*.conf

# Install the build context's ssh/ directory as admin's ~/.ssh, then hand
# the whole tree to admin and restrict it to owner-only access (mode 700
# applied recursively to the directory and its files).
COPY ssh /home/admin/.ssh

RUN chown -R admin:admin /home/admin/.ssh \
&& chmod -R 700 /home/admin/.ssh

COPY slurm.conf /etc/slurm-llnl/
COPY cgroup.conf /etc/slurm-llnl/
COPY docker-entrypoint.sh /etc/slurm-llnl/

WORKDIR /home/admin

EXPOSE 22 8888

# Runtime environment for the container. Written in key=value form; the
# legacy space-separated `ENV key value` syntax is deprecated.
ENV USER=admin
ENV SHELL=bash

ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
ENTRYPOINT ["/etc/slurm/docker-entrypoint.sh"]
12 changes: 9 additions & 3 deletions frontend/cgroup.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupPlugin=cgroup/v1
ConstrainSwapSpace=no
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes

2 changes: 1 addition & 1 deletion frontend/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

export SLURM_CPUS_ON_NODE=$(cat /proc/cpuinfo | grep processor | wc -l)
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf
sudo sed -i "s/REPLACE_IT/${SLURM_CPUS_ON_NODE}/g" /etc/slurm/slurm.conf

sudo service munge start
sudo service ssh start
Expand Down
3 changes: 3 additions & 0 deletions frontend/munge.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
�q���W/g{nq;Żɝj�mS�-|�0Ia-֫Ŭ�A Z�%�z�b96}@�����hol�<�w���[g��d�J��ૅH�=���%�JI�A]ҕ�S�Է"�m�,�,SY�Mt}
��2F��
�Fw�
30 changes: 14 additions & 16 deletions frontend/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=cluster
SlurmctldHost=slurmmaster
#
#DisableRootJobs=NO
Expand Down Expand Up @@ -39,20 +40,21 @@ ProctrackType=proctrack/linuxproc
#RebootProgram=
ReturnToService=1
#SallocDefaultCommand=
SlurmdParameters=config_overrides
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
TaskPluginParam=Sched
#TaskPlugin=task/affinity
TaskPlugin=task/none
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
Expand Down Expand Up @@ -87,8 +89,7 @@ Waittime=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
SelectType=select/cons_tres
#
#
# JOB PRIORITY
Expand All @@ -114,8 +115,6 @@ SelectTypeParameters=CR_Core
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
AccountingStoreJobComment=YES
ClusterName=cluster
#DebugFlags=
#JobCompHost=
#JobCompLoc=
Expand All @@ -126,10 +125,10 @@ JobCompType=jobcomp/none
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=error
SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdDebug=error
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
SlurmctldDebug=debug2
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
Expand All @@ -147,7 +146,6 @@ SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
#
#
# COMPUTE NODES
#
NodeName=slurmnode[1-10] REPLACE_IT State=UNKNOWN
PartitionName=slurmpar Nodes=slurmnode[1-10] Default=YES MaxTime=INFINITE State=UP

NodeName=DEFAULT State=UNKNOWN Sockets=1 ThreadsPerCore=1 CoresPerSocket=REPLACE_IT
NodeName=slurmnode[1-10] CPUs=REPLACE_IT
PartitionName=slurmpar Nodes=ALL Default=YES MaxTime=INFINITE State=UP
59 changes: 49 additions & 10 deletions master/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:20.04
FROM ubuntu:22.04

RUN apt-get update -y && apt-get install -y \
build-essential \
Expand All @@ -10,34 +10,73 @@ RUN apt-get update -y && apt-get install -y \

# Install the OS packages needed by the later steps of this image.
# DEBIAN_FRONTEND is declared as a build ARG so apt runs without prompts
# during the build; an ARG is build-time only and is not part of the
# container's runtime environment.
# This step relies on the package lists fetched by the earlier
# `apt-get update` layer; it does not refresh them itself.
# NOTE(review): libpmi2-0-dev, slurm-client, slurmctld and slurmd are listed
# here while the slurm-smd packages built from source are installed in a
# later step — confirm that both sets are intended.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get install -y \
libpmi2-0-dev \
devscripts \
equivs \
fakeroot \
libbpf-dev \
libdbus-1-dev \
libhwloc-dev \
openssh-server \
slurm-client \
slurmctld \
slurmd \
sudo

# Build Slurm from the SchedMD source tarball into Debian packages and
# install the controller-side set (including slurm-smd-slurmctld) on the
# master.
#
# * SLURM_VER / PKG_VER are plain shell variables local to this RUN, so the
#   release only has to be bumped in one place.
# * mk-build-deps installs the build dependencies declared in
#   debian/control; debuild then produces unsigned binary packages in the
#   parent directory (/tmp).
# * ARCH comes from dpkg so the same Dockerfile works on any architecture
#   the build runs on.
# * The final rm runs in the same layer as the build, so the source tree,
#   tarball and generated package files are not kept in the image.
# `cd` is used instead of WORKDIR on purpose: changing WORKDIR here would
# alter the working directory seen by every later instruction.
RUN cd /tmp \
    && SLURM_VER=23.11.7 \
    && PKG_VER=${SLURM_VER}-1 \
    && wget https://download.schedmd.com/slurm/slurm-${SLURM_VER}.tar.bz2 \
    && tar -xaf slurm-${SLURM_VER}.tar.bz2 \
    && cd slurm-${SLURM_VER} \
    && mk-build-deps -t "apt-get -y" -i debian/control \
    && debuild -b -uc -us \
    && cd .. \
    && ARCH=$(dpkg --print-architecture) \
    && dpkg --install slurm-smd_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-client_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-slurmctld_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-dev_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-doc_${PKG_VER}_all.deb \
    && dpkg --install slurm-smd-libnss-slurm_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpam-slurm-adopt_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpmi0_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libpmi2-0_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-libslurm-perl_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-sackd_${PKG_VER}_${ARCH}.deb \
    && dpkg --install slurm-smd-sview_${PKG_VER}_${ARCH}.deb \
    && rm -rf /tmp/slurm-${SLURM_VER} /tmp/slurm-${SLURM_VER}.tar.bz2 /tmp/slurm-smd*

# Create the interactive "admin" account: home directory /home/admin, bash
# login shell, password "admin", member of the sudo group, and a sudoers
# entry allowing passwordless sudo for every command.
# NOTE(review): the password is hard-coded in the image; acceptable only
# for a local test cluster — confirm this image is never exposed publicly.
RUN useradd -m admin -s /usr/bin/bash -d /home/admin \
&& echo "admin:admin" | chpasswd \
&& adduser admin sudo \
&& echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Create the "slurm" account (home /home/slurm, password "slurm"); the
# Slurm configuration and state directories are chown'ed to this user later
# in the file.
RUN useradd -m slurm -s /usr/bin/bash -d /home/slurm \
&& echo "slurm:slurm" | chpasswd

# Prepare the SSH server:
#  - create /var/run/sshd;
#  - rewrite the pam_loginuid.so session rule in /etc/pam.d/sshd from
#    "required" to "optional" (presumably so SSH logins are not rejected
#    inside a container — verify);
#  - clear the execute bit on the /etc/update-motd.d scripts and remove
#    /etc/legal (presumably to suppress the login banner/MOTD — verify).
RUN mkdir /var/run/sshd \
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd \
&& chmod -x /etc/update-motd.d/* \
&& rm -f /etc/legal

# Copy the munge key, the Slurm configuration files and the container
# entrypoint script from the build context into the image.
# NOTE(review): munge.key is a credential and is baked into an image layer
# here; the commit message describes it as a shared key for communication
# across containers, so this appears deliberate for a test cluster — confirm
# it is never reused outside of it.
COPY munge.key /etc/munge/
COPY slurm.conf /etc/slurm/
COPY cgroup.conf /etc/slurm/
COPY docker-entrypoint.sh /etc/slurm/

# Set ownership and permissions on the munge key, the Slurm configuration
# and the controller state directory:
#  - the munge key is owned by munge and readable/writable by its owner only;
#  - /var/spool/slurmctld is created (mkdir -p, so an already existing
#    directory is not an error), owned by slurm, mode 755;
#  - /etc/slurm and its *.conf files are owned by slurm with mode 775.
# The munge key path is absolute: the previous relative form
# (etc/munge/munge.key) only resolved while the build's working directory
# was "/", and would break as soon as a WORKDIR was introduced above.
RUN chown munge:munge /etc/munge/munge.key \
    && mkdir -p /var/spool/slurmctld \
    && chown slurm:slurm /var/spool/slurmctld \
    && chmod 755 /var/spool/slurmctld \
    && chown slurm:slurm /etc/slurm \
    && chown slurm:slurm /etc/slurm/*.conf \
    && chmod 600 /etc/munge/munge.key \
    && chmod 775 /etc/slurm \
    && chmod 775 /etc/slurm/*.conf

# Register the slurmctld unit as enabled.
# NOTE(review): the master entrypoint script launches slurmctld itself, so
# it is unclear whether anything in the container acts on this enablement —
# confirm it is still needed.
RUN systemctl enable slurmctld

# Install the build context's ssh/ directory as admin's ~/.ssh, then hand
# the whole tree to admin and restrict it to owner-only access (mode 700
# applied recursively to the directory and its files).
COPY ssh /home/admin/.ssh

RUN chown -R admin:admin /home/admin/.ssh \
&& chmod -R 700 /home/admin/.ssh

COPY slurm.conf /etc/slurm-llnl/
COPY cgroup.conf /etc/slurm-llnl/
COPY docker-entrypoint.sh /etc/slurm-llnl/

EXPOSE 22 6817 6818 6819 3306

WORKDIR /home/admin

ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
ENTRYPOINT ["/etc/slurm/docker-entrypoint.sh"]
12 changes: 9 additions & 3 deletions master/cgroup.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
CgroupAutomount=yes
CgroupReleaseAgentDir="/etc/slurm/cgroup"
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupPlugin=cgroup/v1
ConstrainSwapSpace=no
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes

4 changes: 2 additions & 2 deletions master/docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/bin/bash

export SLURM_CPUS_ON_NODE=$(cat /proc/cpuinfo | grep processor | wc -l)
sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf
sudo sed -i "s/REPLACE_IT/${SLURM_CPUS_ON_NODE}/g" /etc/slurm/slurm.conf

sudo service munge start
sudo service slurmctld start
sudo slurmctld
sudo service ssh start

tail -f /dev/null
3 changes: 3 additions & 0 deletions master/munge.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
�q���W/g{nq;Żɝj�mS�-|�0Ia-֫Ŭ�A Z�%�z�b96}@�����hol�<�w���[g��d�J��ૅH�=���%�JI�A]ҕ�S�Է"�m�,�,SY�Mt}
��2F��
�Fw�
Loading

0 comments on commit 5e0ceee

Please sign in to comment.