-
Notifications
You must be signed in to change notification settings - Fork 0
/
paralle.sh
executable file
·19 lines (18 loc) · 1010 Bytes
/
paralle.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/zsh
# Launch distributed_generate.py across multiple nodes with torchrun.
#
# Usage: paralle.sh N_NODE NPROC_PER_NODE CONTEXT_SIZE CONTINUATION_SIZE BATCH_SIZE MODEL
#
# Expects environment provided by the PJM job scheduler / Open MPI runtime:
#   PJM_O_NODEINF        - file listing the allocated nodes, one per line
#   OMPI_COMM_WORLD_RANK - this process's MPI rank (used as --node_rank)
#   RUN_ON_REMOTE        - if set, skip sourcing the personal .zshrc

# Personal shell setup is only needed when not already provisioned remotely.
if [ -z "$RUN_ON_REMOTE" ]; then
  source /work/gk77/k77025/.zshrc
fi

# The first node in the allocation acts as the rendezvous master.
first_node=$(awk 'NR==1{print $1}' "$PJM_O_NODEINF")
export MASTER_ADDR=$first_node

N_NODE=$1             # total number of nodes in the job
RANK=$2               # processes per node (passed to --nproc_per_node)
CONTEXT_SIZE=$3
CONTINUATION_SIZE=$4
BATCH_SIZE=$5
MODEL=$6

echo "$MASTER_ADDR"

# Log this node's rank and the chosen master address for debugging.
# (was `cat $VAR >> log`, which wrongly treated the value as a filename
#  and logged nothing)
echo "$OMPI_COMM_WORLD_RANK" >> /work/gk77/share/log
echo "$MASTER_ADDR" >> /work/gk77/share/log

torchrun \
  --nproc_per_node="$RANK" \
  --nnodes="$N_NODE" \
  --node_rank="$OMPI_COMM_WORLD_RANK" \
  --master_addr="$MASTER_ADDR" \
  --master_port=29504 \
  distributed_generate.py \
  --batch_size="$BATCH_SIZE" \
  --context_size="$CONTEXT_SIZE" \
  --continuation_size="$CONTINUATION_SIZE" \
  --model="$MODEL" \
  --checkpoint=143000