From c4a07275e83e5ed0f26b10ddd3929c4c2d9e3356 Mon Sep 17 00:00:00 2001 From: NASEEM A P Date: Sun, 24 Sep 2023 10:35:58 +0530 Subject: [PATCH 1/3] fixed issue with resume - added train_params[resume] = True --- train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/train.py b/train.py index 896d093..bce2fe1 100644 --- a/train.py +++ b/train.py @@ -184,6 +184,10 @@ "metric_to_watch": 'mAP@0.50' } + # to Resume Training + if args['resume']: + train_params['resume'] = True + trainer.train( model=model, training_params=train_params, From e8c2f1aea4c629fdb64e49b802e434f827d827da Mon Sep 17 00:00:00 2001 From: NASEEM A P Date: Sun, 24 Sep 2023 11:28:20 +0530 Subject: [PATCH 2/3] fixed issue with resume training, now it will resume from the last, eg:- (50/100) to (100/100) --- train.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/train.py b/train.py index bce2fe1..27f43b0 100644 --- a/train.py +++ b/train.py @@ -60,20 +60,25 @@ s_time = time.time() + if args['name'] is None: name = 'train' else: name = args['name'] - n = 0 - while True: - if not os.path.exists(os.path.join('runs', f'{name}{n}')): - name = f'{name}{n}' - os.makedirs(os.path.join('runs', name)) - print(f"[INFO] Checkpoints saved in \033[1m{os.path.join('runs', name)}\033[0m") - break - else: - n += 1 - + + if args['resume']: + name = os.path.split(args['weight'])[0].split('/')[-1] + else: + n = 0 + while True: + if not os.path.exists(os.path.join('runs', f'{name}{n}')): + name = f'{name}{n}' + os.makedirs(os.path.join('runs', name)) + break + else: + n += 1 + + print(f"[INFO] Checkpoints saved in \033[1m{os.path.join('runs', name)}\033[0m") # Training on GPU or CPU if args['cpu']: print('[INFO] Training on \033[1mCPU\033[0m') @@ -188,6 +193,9 @@ if args['resume']: train_params['resume'] = True + # Print Training Params + print('[INFO] Training Params:\n', train_params) + trainer.train( model=model, training_params=train_params, From 
73c0e399a1b7fe31bafeb973e57ca30aa8b2a9d1 Mon Sep 17 00:00:00 2001 From: NASEEM A P Date: Sun, 24 Sep 2023 11:33:57 +0530 Subject: [PATCH 3/3] added training resume to README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 10c4638..8a014a1 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,12 @@ You can train your **YOLO-NAS** model with **Single Command Line** ``` python3 train.py --data /dir/dataset/data.yaml --batch 6 --epoch 100 --model yolo_nas_m --size 640 ``` +### If your training stops at the 65th epoch (of 100 total epochs), you can resume from the 65th epoch and complete the full 100-epoch training. +**Example:** +``` +python3 train.py --data /dir/dataset/data.yaml --batch 6 --epoch 100 --model yolo_nas_m --size 640 \ + --weight runs/train2/ckpt_latest.pth --resume +``` ## 📺 Inference You can Inference your **YOLO-NAS** model with **Single Command Line**