-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.sh
51 lines (47 loc) · 2.13 KB
/
train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# The following arguments have to be specified for training:
# - MAX_NUMBER_OF_TRAINING_STEPS - maximum number of training steps,
# omit this flag or set it to -1 to have unlimited number of training steps.
# - MODEL_NAME - name of the model, now only "resnet_v2_50" is supported.
# - MOVING_AVG_DECAY - decay rate for exponential moving average of the
# trainable variables. Training with exponential moving average usually
# leads to better accuracy. The default is 0.9999; -1 disables the exponential
# moving average. The default works well, so typically you set it only if you
# want to disable this feature.
# - HYPERPARAMETERS - string with hyperparameters,
# see model_lib.py for full list of hyperparameters.
# - DATASET - dataset, either "imagenet" or "tiny_imagenet".
# - IMAGE_SIZE - size of the image (single number).
# - OUTPUT_DIRECTORY - directory where to write results.
# - IMAGENET_DIR - directory with ImageNet dataset in TFRecord format.
# - TINY_IMAGENET_DIR - directory with Tiny ImageNet dataset in TFRecord format.
#
# Note that only one of IMAGENET_DIR or TINY_IMAGENET_DIR has to be provided
# depending on which dataset you use.
#
# Usage:
# ./train.sh JOBNAME PARTITION GPU_NUM
#
# Example:
# ./train.sh D3 AD1 32
# Launch train.py through srun and mirror its output into a timestamped log.
#
# Positional arguments:
#   $1 - JOBNAME:   value passed to srun --job-name
#   $2 - PARTITION: value passed to srun --partition
#   $3 - GPU_NUM:   value passed to srun -n (number of tasks)
#
# Output: the job's combined stdout/stderr is shown on the terminal and copied
# by tee to log_<timestamp> in the current working directory.
set -x  # trace every command as it is executed
set -e  # abort on the first failing foreground command

# Fail early with a usage message instead of handing srun empty option values
# (e.g. a bare "-n") when arguments are missing.
if [[ $# -lt 3 ]]; then
  printf 'Usage: %s JOBNAME PARTITION GPU_NUM\n' "$0" >&2
  exit 2
fi

JOBNAME=$1
PARTITION=$2
GPU_NUM=$3
TIMESTAMP="$(date +%Y-%m-%d-%H-%M-%S)"

# The MV2_* assignments are placed only in srun's environment
# (presumably MVAPICH2 settings - confirm against the cluster documentation).
# NOTE: the pipeline is put in the background, so this script returns right
# after starting it and `set -e` does not react to a later failure of the job.
# NOTE(review): --output_dir ends in "-"; possibly a suffix such as TIMESTAMP
# was meant to be appended - confirm before changing.
MV2_USE_CUDA=1 MV2_ENABLE_AFFINITY=0 MV2_SMP_USE_CMA=0 srun \
  --job-name="${JOBNAME}" \
  --partition="${PARTITION}" \
  --mpi=pmi2 --gres=gpu:1 -n"${GPU_NUM}" --ntasks-per-node=1 \
  --kill-on-bad-exit=1 \
  python /mnt/lustre/liukunlin/caad/alp/train.py \
  --model_name="resnet_v2_50" \
  --hparams="train_adv_method=pgdll_32_10_30,train_lp_weight=0.45" \
  --dataset="imagenet" \
  --dataset_image_size=299 \
  --output_dir="/mnt/lustre/liukunlin/caad/alp/pgd-image-299-newtry_normal-lvbo-" \
  --imagenet_data_dir="/mnt/lustre/liukunlin/caad/alp/imagenet-tfrecord" \
  --finetune_exclude_pretrained_scopes="resnet_v2_50/logits" \
  --finetune_trainable_scopes="resnet_v2_50/logits,resnet_v2_50/postnorm" \
  --finetune_checkpoint_path="/mnt/lustre/liukunlin/caad/alp/pretrained" 2>&1 \
  | tee "log_${TIMESTAMP}" &