diff --git a/scripts/classification/imagenet/train_imagenet.py b/scripts/classification/imagenet/train_imagenet.py
index 7bd70f0023..53f6a32044 100644
--- a/scripts/classification/imagenet/train_imagenet.py
+++ b/scripts/classification/imagenet/train_imagenet.py
@@ -6,6 +6,7 @@
 from mxnet import gluon, nd
 from mxnet import autograd as ag
 from mxnet.gluon.data.vision import transforms
+from mxnet.contrib import amp

 import gluoncv as gcv
 gcv.utils.check_version('0.6.0')
@@ -104,6 +105,10 @@ def parse_args():
                         help='name of training log file')
     parser.add_argument('--use-gn', action='store_true',
                         help='whether to use group norm.')
+    parser.add_argument('--amp', action='store_true',
+                        help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')
     opt = parser.parse_args()
     return opt

@@ -121,6 +126,11 @@ def main():
     logger.info(opt)

+    assert not opt.auto_layout or opt.amp, "--auto-layout needs to be used with --amp"
+
+    if opt.amp:
+        amp.init(layout_optimization=opt.auto_layout)
+
     batch_size = opt.batch_size
     classes = 1000
     num_training_samples = 1281167
@@ -347,10 +357,13 @@ def train(ctx):
         for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
             v.wd_mult = 0.0

-    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
+    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=(False if opt.amp else None))
     if opt.resume_states != '':
         trainer.load_states(opt.resume_states)

+    if opt.amp:
+        amp.init_trainer(trainer)
+
     if opt.label_smoothing or opt.mixup:
         sparse_label_loss = False
     else:
@@ -402,8 +415,13 @@ def train(ctx):
                                     p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                 else:
                     loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
-                for l in loss:
-                    l.backward()
+                if opt.amp:
+                    with amp.scale_loss(loss, trainer) as scaled_loss:
+                        ag.backward(scaled_loss)
+                else:
+                    for l in loss:
+                        l.backward()
+
             trainer.step(batch_size)

             if opt.mixup:
diff --git a/scripts/detection/faster_rcnn/train_faster_rcnn.py b/scripts/detection/faster_rcnn/train_faster_rcnn.py
index e5395d8f3b..5e30299b36 100644
--- a/scripts/detection/faster_rcnn/train_faster_rcnn.py
+++ b/scripts/detection/faster_rcnn/train_faster_rcnn.py
@@ -121,6 +121,8 @@ def parse_args():
                         help='Whether to use static memory allocation. Memory usage will increase.')
     parser.add_argument('--amp', action='store_true',
                         help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')
     parser.add_argument('--horovod', action='store_true',
                         help='Use MXNet Horovod for distributed training. Must be run with OpenMPI. '
                              '--gpus is ignored when using --horovod.')
@@ -571,8 +573,10 @@ def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
     # fix seed for mxnet, numpy and python builtin random generator.
     gutils.random.seed(args.seed)

+    assert not args.auto_layout or args.amp, "--auto-layout needs to be used with --amp"
+
     if args.amp:
-        amp.init()
+        amp.init(layout_optimization=args.auto_layout)

     # training contexts
     if args.horovod:
diff --git a/scripts/detection/ssd/train_ssd.py b/scripts/detection/ssd/train_ssd.py
index 849d0ee47e..c017d86f21 100644
--- a/scripts/detection/ssd/train_ssd.py
+++ b/scripts/detection/ssd/train_ssd.py
@@ -89,6 +89,8 @@ def parse_args():
                         'Currently supports only COCO.')
     parser.add_argument('--amp', action='store_true',
                         help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')
     parser.add_argument('--horovod', action='store_true',
                         help='Use MXNet Horovod for distributed training. Must be run with OpenMPI. '
                              '--gpus is ignored when using --horovod.')
@@ -360,8 +362,10 @@ def train(net, train_data, val_data, eval_metric, ctx, args):

 if __name__ == '__main__':
     args = parse_args()

+    assert not args.auto_layout or args.amp, "--auto-layout needs to be used with --amp"
+
     if args.amp:
-        amp.init()
+        amp.init(layout_optimization=args.auto_layout)
     if args.horovod:
         hvd.init()
diff --git a/scripts/detection/yolo/train_yolo3.py b/scripts/detection/yolo/train_yolo3.py
index 95684eaa83..fae863bd8e 100644
--- a/scripts/detection/yolo/train_yolo3.py
+++ b/scripts/detection/yolo/train_yolo3.py
@@ -97,6 +97,8 @@ def parse_args():
     parser.add_argument('--label-smooth', action='store_true', help='Use label smoothing.')
     parser.add_argument('--amp', action='store_true',
                         help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')
     parser.add_argument('--horovod', action='store_true',
                         help='Use MXNet Horovod for distributed training. Must be run with OpenMPI. '
                              '--gpus is ignored when using --horovod.')
@@ -325,8 +327,10 @@ def train(net, train_data, val_data, eval_metric, ctx, args):

 if __name__ == '__main__':
     args = parse_args()

+    assert not args.auto_layout or args.amp, "--auto-layout needs to be used with --amp"
+
     if args.amp:
-        amp.init()
+        amp.init(layout_optimization=args.auto_layout)
     if args.horovod:
         if hvd is None:
diff --git a/scripts/instance/mask_rcnn/train_mask_rcnn.py b/scripts/instance/mask_rcnn/train_mask_rcnn.py
index e1ca35c4ba..ef8f0b0010 100644
--- a/scripts/instance/mask_rcnn/train_mask_rcnn.py
+++ b/scripts/instance/mask_rcnn/train_mask_rcnn.py
@@ -124,6 +124,8 @@ def parse_args():
                         help='Whether to use static memory allocation. Memory usage will increase.')
     parser.add_argument('--amp', action='store_true',
                         help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')
     parser.add_argument('--horovod', action='store_true',
                         help='Use MXNet Horovod for distributed training. Must be run with OpenMPI. '
                              '--gpus is ignored when using --horovod.')
@@ -621,8 +623,10 @@ def train(net, train_data, val_data, eval_metric, batch_size, ctx, logger, args):
     # fix seed for mxnet, numpy and python builtin random generator.
     gutils.random.seed(args.seed)

+    assert not args.auto_layout or args.amp, "--auto-layout needs to be used with --amp"
+
     if args.amp:
-        amp.init()
+        amp.init(layout_optimization=args.auto_layout)

     # training contexts
     if args.horovod:
diff --git a/scripts/segmentation/train.py b/scripts/segmentation/train.py
index b2b8e47872..46680d8ba3 100644
--- a/scripts/segmentation/train.py
+++ b/scripts/segmentation/train.py
@@ -8,6 +8,7 @@
 import mxnet as mx
 from mxnet import gluon, autograd
 from mxnet.gluon.data.vision import transforms
+from mxnet.contrib import amp

 import gluoncv
 gluoncv.utils.check_version('0.6.0')
@@ -99,6 +100,11 @@ def parse_args():
     # synchronized Batch Normalization
     parser.add_argument('--syncbn', action='store_true', default=False,
                         help='using Synchronized Cross-GPU BatchNorm')
+    # performance related
+    parser.add_argument('--amp', action='store_true',
+                        help='Use MXNet AMP for mixed precision training.')
+    parser.add_argument('--auto-layout', action='store_true',
+                        help='Add layout optimization to AMP. Must be used in addition to `--amp`.')

     # the parser
     args = parser.parse_args()
@@ -229,7 +235,12 @@ def __init__(self, args, logger):
                 v.wd_mult = 0.0

         self.optimizer = gluon.Trainer(self.net.module.collect_params(), args.optimizer,
-                                       optimizer_params, kvstore=kv)
+                                       optimizer_params, update_on_kvstore=(False if args.amp else None))
+
+
+        if args.amp:
+            amp.init_trainer(self.optimizer)
+
         # evaluation metrics
         self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)

@@ -241,7 +252,11 @@ def training(self, epoch):
                 outputs = self.net(data.astype(args.dtype, copy=False))
                 losses = self.criterion(outputs, target)
                 mx.nd.waitall()
-                autograd.backward(losses)
+                if args.amp:
+                    with amp.scale_loss(losses, self.optimizer) as scaled_losses:
+                        autograd.backward(scaled_losses)
+                else:
+                    autograd.backward(losses)
             self.optimizer.step(self.args.batch_size)
             for loss in losses:
                 train_loss += np.mean(loss.asnumpy()) / len(losses)
@@ -281,7 +296,10 @@ def save_checkpoint(net, args, epoch, mIoU, is_best=False):

 if __name__ == "__main__":
     args = parse_args()

+    assert not args.auto_layout or args.amp, "--auto-layout needs to be used with --amp"
+    if args.amp:
+        amp.init(layout_optimization=args.auto_layout)
     # build logger
     filehandler = logging.FileHandler(os.path.join(args.save_dir, args.logging_file))
     streamhandler = logging.StreamHandler()
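
For reference, the AMP wiring this patch repeats in every script follows one pattern: call amp.init() before the network is built, create the gluon.Trainer with update_on_kvstore=False so the loss scaler can adjust gradients before the update, register the trainer via amp.init_trainer(), and run backward through amp.scale_loss(). The sketch below condenses that pattern into a self-contained toy example. It is a minimal sketch, not code from this patch: the Dense network, shapes, and SGD settings are placeholders, and the layout_optimization keyword (passed here exactly as the patch does) is assumed to exist only in MXNet builds whose amp.init accepts it; a GPU build with float16 kernels is assumed in practice.

# Minimal sketch of the AMP pattern added above. Toy network and shapes;
# only the amp.* calls, update_on_kvstore=False, and the scale_loss/backward
# structure mirror the patch itself.
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.contrib import amp

amp.init()  # the patch calls amp.init(layout_optimization=args.auto_layout)

ctx = mx.gpu(0) if mx.context.num_gpus() else mx.cpu()
net = gluon.nn.Dense(10)  # placeholder network
net.initialize(ctx=ctx)

# AMP's loss scaler rescales gradients between backward and update, so the
# update must happen locally rather than on the kvstore.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01},
                        update_on_kvstore=False)
amp.init_trainer(trainer)  # attach dynamic loss scaling to this trainer

data = mx.nd.random.uniform(shape=(8, 32), ctx=ctx)   # toy batch
label = mx.nd.random.uniform(shape=(8, 10), ctx=ctx)
loss_fn = gluon.loss.L2Loss()

with autograd.record():
    loss = loss_fn(net(data), label)
    # Scale the loss so float16 gradients do not underflow; the scaler
    # unwinds the factor (and skips the update on overflow) on its own.
    with amp.scale_loss(loss, trainer) as scaled_loss:
        autograd.backward(scaled_loss)
trainer.step(8)  # gradients are already unscaled by the time step() runs

Note that the scripts pass update_on_kvstore=(False if args.amp else None) so the non-AMP path keeps MXNet's default kvstore behavior, and each script asserts that --auto-layout is only given together with --amp, e.g. python scripts/detection/ssd/train_ssd.py --amp --auto-layout (remaining arguments elided).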