-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add scripts for running jobs on SLURM
- Loading branch information
Showing
5 changed files
with
163 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
WANDB_API_KEY=<your_api_key> | ||
HF_TOKEN=<your_huggingface_token> | ||
SIF_IMAGE_PATH=<path_to_your_sif_image> | ||
|
||
# Set according to your needs | ||
# HF_HOME="$TMPDIR/.cache/huggingface" | ||
# WANDB_DIR="$TMPDIR/.cache/wandb" | ||
# WANDB_CACHE_DIR="$TMPDIR/.cache/wandb" | ||
# WANDB_CONFIG_DIR="$TMPDIR/.cache/wandb" | ||
# TRITON_CACHE_DIR="$TMPDIR/.cache/triton" | ||
|
||
# uncomment depending on your cluster configuration | ||
# NCCL | ||
# NCCL_SOCKET_NTHREADS=8 | ||
# NCCL_NSOCKS_PERTHREAD=8 | ||
# NCCL_CROSS_NIC=0 | ||
# NCCL_MIN_NCHANNELS=8 | ||
# NCCL_CHECKS_DISABLE=1 | ||
# NCCL_NTHREADS=512 | ||
# NCCL_IGNORE_CPU_AFFINITY=1 | ||
# NCCL_COLLNET_ENABLE=1 | ||
# NCCL_WORK_FIFO_DEPTH=4194304 | ||
# NCCL_VERSION=2.21.5 | ||
# NCCL_NVLS_ENABLE=0 | ||
# OMP_NUM_THREADS=16 | ||
# TORCH_NCCL_AVOID_RECORD_STREAMS=1 | ||
# TORCH_NCCL_ASYNC_ERROR_HANDLING=1 | ||
# TORCH_EXTENSIONS_DIR="$TMPDIR/torch_extensions" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Running fine-tuning on SLURM cluster | ||
Runs `scripts/sft/fine_tune_deepspeed.py` on SLURM cluster with a given model and dataset. | ||
|
||
> [!NOTE] | ||
> We currently support only single-node, multi-gpu training. | ||
## Instructions | ||
|
||
1. Create a .env file with the variables exported at the top of the `run_on_cluster.sh` script. | ||
2. Build .sif image using dockerfile in `slurm/fine_tuning_env.dockerfile` and make sure it'll be available on the cluster (after running sbatch) | ||
3. Submit a job to the cluster using `sbatch` (adjust allocation parameters as needed by overriding them in CLI args right after `sbatch` call, as `--job-name` below) | ||
```bash | ||
sbatch \ | ||
--job-name sft \ | ||
run_on_cluster.sh \ | ||
--model llama_3.1_8b_instruct \ | ||
--dataset pl-frankowe-instruct | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/sh

# Builds the fine-tuning Docker image and converts it into an Apptainer (.sif)
# image via the kaczmarj/apptainer helper container.
# NOTE: Run this script from the root of the repository (to properly deliver
# context to the docker image builder).

set -e

# Creates docker image containing the environment for fine-tuning.
docker build \
    --tag juddges_sft:latest \
    --file ./slurm/fine_tuning_env.dockerfile \
    .

# Converts the docker image into an Apptainer image.
# "$(pwd)" is quoted so a working directory containing spaces does not break
# the bind-mount argument (ShellCheck SC2046).
docker run \
    --rm \
    --volume /var/run/docker.sock:/var/run/docker.sock \
    --volume "$(pwd)":/juddges \
    kaczmarj/apptainer build juddges_sft.sif docker-daemon://juddges_sft:latest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM nvcr.io/nvidia/pytorch:24.11-py3

# Fixed typo: was "/judddges" (triple d); the project path used everywhere
# else (e.g. the Apptainer bind mount in the build script) is /juddges.
WORKDIR /juddges

RUN apt-get update -qq && apt-get install --yes -q make git

# Install requirements via the repository Makefile.
RUN pip install --upgrade pip
COPY requirements.txt requirements.txt
COPY Makefile .
RUN make install

# Fix issues occurring when importing torchvision and transformers-engine
# as these are not necessary.
RUN pip uninstall torchvision transformers-engine --yes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/bin/bash

# Submits single-node, multi-GPU fine-tuning (scripts/sft/fine_tune_deepspeed.py)
# to SLURM, running inside an Apptainer image.
# Required env vars (via .env): WANDB_API_KEY, HF_TOKEN, SIF_IMAGE_PATH.

#SBATCH --job-name=juddges_sft
#SBATCH --output=logs/%j-%x.log
#SBATCH --time=72:00:00
#SBATCH --nodes=1
#SBATCH --gpus=4
#SBATCH --cpus-per-gpu=8
#SBATCH --mem=64G
# NOTE: You can reconfigure the above parameters to your needs in the sbatch call.
# NOTE: All env variables must be exported to be available after calling srun.
# NOTE: You may need to specify some NCCL args in .env file depending on your cluster configuration

# =====Provide these user-specific env variables through .env file=====

if [ -f .env ]; then
    source .env
else
    echo "Error: .env file not found" >&2
    exit 1
fi

export WANDB_API_KEY
export HF_TOKEN
export SIF_IMAGE_PATH

# Hostnames of allocated nodes. The original piped through `tr '\n' '\n'`,
# which is a no-op and has been removed; word-splitting on newlines already
# populates the array. NOTE(review): bash cannot export arrays, so NODES is
# only visible in this shell, not in srun children — confirm it is needed.
NODES=($(scontrol show hostnames "$SLURM_JOB_NODELIST"))
# NOTE(review): assumes SLURM_GPUS_PER_NODE is set by the --gpus allocation —
# confirm on your cluster (some configurations only set SLURM_GPUS).
export WORLD_SIZE=$((SLURM_GPUS_PER_NODE * SLURM_NNODES))

# =====Parse command line arguments=====
while [ $# -gt 0 ]; do
    case "$1" in
        -m|--model)
            model="$2"
            shift 2
            ;;
        -d|--dataset)
            dataset="$2"
            shift 2
            ;;
        *)
            echo "Invalid option: $1" >&2
            echo "Usage: $0 --model <model> --dataset <dataset>" >&2
            echo " or: $0 -m <model> -d <dataset>" >&2
            echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
            exit 1
            ;;
    esac
done

# check if both parameters are provided
if [ -z "$model" ] || [ -z "$dataset" ]; then
    echo "Both model (--model) and dataset (--dataset) parameters are required" >&2
    echo "Usage: $0 --model <model> --dataset <dataset>" >&2
    echo " or: $0 -m <model> -d <dataset>" >&2
    echo "Example: $0 --model Unsloth-Llama-3-8B-Instruct --dataset pl-court-instruct" >&2
    exit 1
fi

# =====Run the script using apptainer image=====
export NUM_PROC=$SLURM_CPUS_PER_GPU
export PYTHONPATH=$PYTHONPATH:.
export model
export dataset

# BUGFIX: the script path and the model/dataset overrides must be joined with
# line continuations. Without them, `bash -c "$SFT_COMMAND"` treated
# `model=...` and `dataset=...` as separate shell variable assignments on
# their own lines, so the overrides were never passed to the training script.
export SFT_COMMAND="accelerate launch \
    --num_processes=$WORLD_SIZE \
    --num_machines=1 \
    --use-deepspeed \
    scripts/sft/fine_tune_deepspeed.py \
    model=${model} \
    dataset=${dataset}"

srun --kill-on-bad-exit=1 \
    --jobid "$SLURM_JOB_ID" \
    apptainer run \
    --fakeroot \
    --bind "$TMPDIR:$TMPDIR" \
    --nv \
    "$SIF_IMAGE_PATH" \
    bash -c "$SFT_COMMAND"

# Propagate srun's exit status as the job's exit status.
EXIT_CODE=$?
exit $EXIT_CODE