Add scripts for running jobs on SLURM
binkjakub committed Dec 12, 2024
1 parent bc870c0 commit 08ee974
Showing 5 changed files with 163 additions and 0 deletions.
28 changes: 28 additions & 0 deletions slurm/.example.env
@@ -0,0 +1,28 @@
WANDB_API_KEY=<your_api_key>
HF_TOKEN=<your_huggingface_token>
SIF_IMAGE_PATH=<path_to_your_sif_image>

# Set according to your needs
# HF_HOME="$TMPDIR/.cache/huggingface"
# WANDB_DIR="$TMPDIR/.cache/wandb"
# WANDB_CACHE_DIR="$TMPDIR/.cache/wandb"
# WANDB_CONFIG_DIR="$TMPDIR/.cache/wandb"
# TRITON_CACHE_DIR="$TMPDIR/.cache/triton"

# Uncomment depending on your cluster configuration
# NCCL
# NCCL_SOCKET_NTHREADS=8
# NCCL_NSOCKS_PERTHREAD=8
# NCCL_CROSS_NIC=0
# NCCL_MIN_NCHANNELS=8
# NCCL_CHECKS_DISABLE=1
# NCCL_NTHREADS=512
# NCCL_IGNORE_CPU_AFFINITY=1
# NCCL_COLLNET_ENABLE=1
# NCCL_WORK_FIFO_DEPTH=4194304
# NCCL_VERSION=2.21.5
# NCCL_NVLS_ENABLE=0
# OMP_NUM_THREADS=16
# TORCH_NCCL_AVOID_RECORD_STREAMS=1
# TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# TORCH_EXTENSIONS_DIR="$TMPDIR/torch_extensions"
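
For local testing, the variables in such a file can be loaded into the current shell; a minimal sketch, assuming the file contains only `KEY=VALUE` lines as above (`run_on_cluster.sh` below instead sources the file and exports selected variables explicitly):

```bash
# Auto-export every variable assigned while sourcing the file
set -a
source .env
set +a

# Sanity check: confirm the key is set without printing its value
[ -n "$WANDB_API_KEY" ] && echo "WANDB_API_KEY is set"
```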
18 changes: 18 additions & 0 deletions slurm/README.md
@@ -0,0 +1,18 @@
# Running fine-tuning on a SLURM cluster
Runs `scripts/sft/fine_tune_deepspeed.py` on a SLURM cluster with a given model and dataset.

> [!NOTE]
> We currently support only single-node, multi-GPU training.

## Instructions

1. Create a `.env` file with the variables exported at the top of the `run_on_cluster.sh` script (see `slurm/.example.env` for a template).
2. Build a `.sif` image from `slurm/fine_tuning_env.dockerfile` (e.g., with `slurm/build_apptainer.sh`) and make sure it is available on the cluster when the job starts.
3. Submit the job with `sbatch`, overriding allocation parameters as needed via CLI arguments placed right after the `sbatch` call (as with `--job-name` below):
```bash
sbatch \
    --job-name sft \
    run_on_cluster.sh \
    --model llama_3.1_8b_instruct \
    --dataset pl-frankowe-instruct
```
18 changes: 18 additions & 0 deletions slurm/build_apptainer.sh
@@ -0,0 +1,18 @@
#!/bin/sh

# NOTE: Run this script from the root of the repository, so the docker build context contains the whole repo.

set -e

# Creates a docker image containing the fine-tuning environment.
docker build \
--tag juddges_sft:latest \
--file ./slurm/fine_tuning_env.dockerfile \
.

# Converts the docker image into an Apptainer image.
# --workdir makes the container write juddges_sft.sif into the mounted repo directory.
docker run \
--rm \
--volume /var/run/docker.sock:/var/run/docker.sock \
--volume "$(pwd)":/juddges \
--workdir /juddges \
kaczmarj/apptainer build juddges_sft.sif docker-daemon://juddges_sft:latest
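
Before shipping the image to the cluster, it can be sanity-checked locally; a minimal sketch, assuming apptainer is installed on the host (`--nv` additionally requires a working NVIDIA driver):

```bash
# Verify the image starts and CUDA is visible inside it
apptainer exec --nv juddges_sft.sif \
    python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```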
15 changes: 15 additions & 0 deletions slurm/fine_tuning_env.dockerfile
@@ -0,0 +1,15 @@
FROM nvcr.io/nvidia/pytorch:24.11-py3

WORKDIR /juddges

RUN apt-get update -qq && apt-get install --yes -q make git

# Install requirements
RUN pip install --upgrade pip
COPY requirements.txt requirements.txt
COPY Makefile .
RUN make install

# Remove torchvision and transformer-engine, as they are not needed
# and cause import issues in this environment.
RUN pip uninstall torchvision transformer-engine --yes
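
A quick smoke test of the docker image before converting it to `.sif` might look as follows; a sketch, assuming `make install` pulls in `accelerate` via `requirements.txt`:

```bash
# Confirm the environment still imports cleanly after the uninstall step
docker run --rm juddges_sft:latest \
    python -c "import torch, accelerate; print(torch.__version__, accelerate.__version__)"
```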
84 changes: 84 additions & 0 deletions slurm/run_on_cluster.sh
@@ -0,0 +1,84 @@
#!/bin/bash

#SBATCH --job-name=juddges_sft
#SBATCH --output=logs/%j-%x.log
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH --gpus=4
#SBATCH --cpus-per-gpu=8
#SBATCH --mem=64G
# NOTE: You can override the above parameters as needed in the sbatch call.
# NOTE: All env variables must be exported to be available after calling srun.
# NOTE: You may need to set some NCCL variables in the .env file, depending on your cluster configuration.

# =====Provide these user-specific env variables through .env file=====

if [ -f .env ]; then
source .env
else
echo "Error: .env file not found" >&2
exit 1
fi

export WANDB_API_KEY
export HF_TOKEN
export SIF_IMAGE_PATH

# NOTE: bash arrays cannot be exported, so NODES stays local to this script.
NODES=($(scontrol show hostnames "$SLURM_JOB_NODELIST"))
# Total number of processes for accelerate (GPUs per node x nodes).
export WORLD_SIZE=$((SLURM_GPUS_PER_NODE * SLURM_NNODES))

# =====Parse command line arguments=====
while [ $# -gt 0 ]; do
case "$1" in
-m|--model)
model="$2"
shift 2
;;
-d|--dataset)
dataset="$2"
shift 2
;;
*)
echo "Invalid option: $1" >&2
echo "Usage: $0 --model <model> --dataset <dataset>" >&2
echo " or: $0 -m <model> -d <dataset>" >&2
echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
exit 1
;;
esac
done

# check if both parameters are provided
if [ -z "$model" ] || [ -z "$dataset" ]; then
echo "Both model (--model) and dataset (--dataset) parameters are required" >&2
echo "Usage: $0 --model <model> --dataset <dataset>" >&2
echo " or: $0 -m <model> -d <dataset>" >&2
echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
exit 1
fi

# =====Run the script using apptainer image=====
export NUM_PROC=$SLURM_CPUS_PER_GPU
export PYTHONPATH=$PYTHONPATH:.
export model
export dataset

export SFT_COMMAND="accelerate launch \
--num_processes=$WORLD_SIZE \
--num_machines=1 \
--use_deepspeed \
scripts/sft/fine_tune_deepspeed.py \
model=${model} \
dataset=${dataset}"
srun --kill-on-bad-exit=1 \
--jobid $SLURM_JOB_ID \
apptainer run \
--fakeroot \
--bind "$TMPDIR:$TMPDIR" \
--nv \
"$SIF_IMAGE_PATH" \
bash -c "$SFT_COMMAND"

EXIT_CODE=$?
exit $EXIT_CODE
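
As the header notes, the `#SBATCH` defaults above can be overridden at submission time without editing the script, for example (allocation values are illustrative):

```bash
sbatch \
    --gpus=2 \
    --cpus-per-gpu=16 \
    --mem=32G \
    run_on_cluster.sh \
    --model llama_3.1_8b_instruct \
    --dataset pl-frankowe-instruct
```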
