Add scripts for running jobs on SLURM
binkjakub committed Dec 12, 2024
1 parent bc870c0 commit 08ee974
Showing 5 changed files with 163 additions and 0 deletions.
28 changes: 28 additions & 0 deletions slurm/.example.env
@@ -0,0 +1,28 @@
WANDB_API_KEY=<your_api_key>
HF_TOKEN=<your_huggingface_token>
SIF_IMAGE_PATH=<path_to_your_sif_image>

# Set according to your needs
# HF_HOME="$TMPDIR/.cache/huggingface"
# WANDB_DIR="$TMPDIR/.cache/wandb"
# WANDB_CACHE_DIR="$TMPDIR/.cache/wandb"
# WANDB_CONFIG_DIR="$TMPDIR/.cache/wandb"
# TRITON_CACHE_DIR="$TMPDIR/.cache/triton"

# Uncomment depending on your cluster configuration
# NCCL
# NCCL_SOCKET_NTHREADS=8
# NCCL_NSOCKS_PERTHREAD=8
# NCCL_CROSS_NIC=0
# NCCL_MIN_NCHANNELS=8
# NCCL_CHECKS_DISABLE=1
# NCCL_NTHREADS=512
# NCCL_IGNORE_CPU_AFFINITY=1
# NCCL_COLLNET_ENABLE=1
# NCCL_WORK_FIFO_DEPTH=4194304
# NCCL_VERSION=2.21.5
# NCCL_NVLS_ENABLE=0
# OMP_NUM_THREADS=16
# TORCH_NCCL_AVOID_RECORD_STREAMS=1
# TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# TORCH_EXTENSIONS_DIR="$TMPDIR/torch_extensions"
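
For local testing, the variables in such a file can be loaded into the current shell; a minimal sketch, assuming the file contains only `KEY=VALUE` lines as above (`run_on_cluster.sh` below instead sources the file and exports selected variables explicitly):

```bash
# Auto-export every variable assigned while sourcing the file
set -a
source .env
set +a

# Sanity check: confirm the key is set without printing its value
[ -n "$WANDB_API_KEY" ] && echo "WANDB_API_KEY is set"
```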
18 changes: 18 additions & 0 deletions slurm/README.md
@@ -0,0 +1,18 @@
# Running fine-tuning on a SLURM cluster
Runs `scripts/sft/fine_tune_deepspeed.py` on a SLURM cluster with a given model and dataset.

> [!NOTE]
> We currently support only single-node, multi-GPU training.

## Instructions

1. Create a `.env` file with the variables exported at the top of the `run_on_cluster.sh` script (see `slurm/.example.env` for a template).
2. Build a `.sif` image from `slurm/fine_tuning_env.dockerfile` (e.g., with `slurm/build_apptainer.sh`) and make sure it is available on the cluster when the job starts.
3. Submit the job with `sbatch`, overriding allocation parameters as needed via CLI arguments placed right after the `sbatch` call (as with `--job-name` below):
```bash
sbatch \
    --job-name sft \
    run_on_cluster.sh \
    --model llama_3.1_8b_instruct \
    --dataset pl-frankowe-instruct
```
18 changes: 18 additions & 0 deletions slurm/build_apptainer.sh
@@ -0,0 +1,18 @@
#!/bin/sh

# NOTE: Run this script from the root of the repository, so the docker build context contains the whole repo.

set -e

# Creates a docker image containing the fine-tuning environment.
docker build \
--tag juddges_sft:latest \
--file ./slurm/fine_tuning_env.dockerfile \
.

# Converts the docker image into an Apptainer image.
# --workdir makes the container write juddges_sft.sif into the mounted repo directory.
docker run \
--rm \
--volume /var/run/docker.sock:/var/run/docker.sock \
--volume "$(pwd)":/juddges \
--workdir /juddges \
kaczmarj/apptainer build juddges_sft.sif docker-daemon://juddges_sft:latest
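
Before shipping the image to the cluster, it can be sanity-checked locally; a minimal sketch, assuming apptainer is installed on the host (`--nv` additionally requires a working NVIDIA driver):

```bash
# Verify the image starts and CUDA is visible inside it
apptainer exec --nv juddges_sft.sif \
    python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```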
15 changes: 15 additions & 0 deletions slurm/fine_tuning_env.dockerfile
@@ -0,0 +1,15 @@
FROM nvcr.io/nvidia/pytorch:24.11-py3

WORKDIR /juddges

RUN apt-get update -qq && apt-get install --yes -q make git

# Install requirements
RUN pip install --upgrade pip
COPY requirements.txt requirements.txt
COPY Makefile .
RUN make install

# Remove torchvision and transformer-engine, as they are not needed
# and cause import issues in this environment.
RUN pip uninstall torchvision transformer-engine --yes
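
A quick smoke test of the docker image before converting it to `.sif` might look as follows; a sketch, assuming `make install` pulls in `accelerate` via `requirements.txt`:

```bash
# Confirm the environment still imports cleanly after the uninstall step
docker run --rm juddges_sft:latest \
    python -c "import torch, accelerate; print(torch.__version__, accelerate.__version__)"
```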
84 changes: 84 additions & 0 deletions slurm/run_on_cluster.sh
@@ -0,0 +1,84 @@
#!/bin/bash

#SBATCH --job-name=juddges_sft
#SBATCH --output=logs/%j-%x.log
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH --gpus=4
#SBATCH --cpus-per-gpu=8
#SBATCH --mem=64G
# NOTE: You can override the above parameters as needed in the sbatch call.
# NOTE: All env variables must be exported to be available after calling srun.
# NOTE: You may need to set some NCCL variables in the .env file, depending on your cluster configuration.

# =====Provide these user-specific env variables through .env file=====

if [ -f .env ]; then
source .env
else
echo "Error: .env file not found" >&2
exit 1
fi

export WANDB_API_KEY
export HF_TOKEN
export SIF_IMAGE_PATH

# NOTE: bash arrays cannot be exported, so NODES stays local to this script.
NODES=($(scontrol show hostnames "$SLURM_JOB_NODELIST"))
# Total number of processes for accelerate (GPUs per node x nodes).
export WORLD_SIZE=$((SLURM_GPUS_PER_NODE * SLURM_NNODES))

# =====Parse command line arguments=====
while [ $# -gt 0 ]; do
case "$1" in
-m|--model)
model="$2"
shift 2
;;
-d|--dataset)
dataset="$2"
shift 2
;;
*)
echo "Invalid option: $1" >&2
echo "Usage: $0 --model <model> --dataset <dataset>" >&2
echo " or: $0 -m <model> -d <dataset>" >&2
echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
exit 1
;;
esac
done

# check if both parameters are provided
if [ -z "$model" ] || [ -z "$dataset" ]; then
echo "Both model (--model) and dataset (--dataset) parameters are required" >&2
echo "Usage: $0 --model <model> --dataset <dataset>" >&2
echo " or: $0 -m <model> -d <dataset>" >&2
echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
exit 1
fi

# =====Run the script using apptainer image=====
export NUM_PROC=$SLURM_CPUS_PER_GPU
export PYTHONPATH=$PYTHONPATH:.
export model
export dataset

export SFT_COMMAND="accelerate launch \
--num_processes=$WORLD_SIZE \
--num_machines=1 \
--use_deepspeed \
scripts/sft/fine_tune_deepspeed.py \
model=${model} \
dataset=${dataset}"
srun --kill-on-bad-exit=1 \
--jobid $SLURM_JOB_ID \
apptainer run \
--fakeroot \
--bind "$TMPDIR:$TMPDIR" \
--nv \
"$SIF_IMAGE_PATH" \
bash -c "$SFT_COMMAND"

EXIT_CODE=$?
exit $EXIT_CODE
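
As the header notes, the `#SBATCH` defaults above can be overridden at submission time without editing the script, for example (allocation values are illustrative):

```bash
sbatch \
    --gpus=2 \
    --cpus-per-gpu=16 \
    --mem=32G \
    run_on_cluster.sh \
    --model llama_3.1_8b_instruct \
    --dataset pl-frankowe-instruct
```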
