From 779476d74f6e8372396c6aa106fc2af2a1d981a4 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Fri, 30 Jul 2021 19:21:07 +0300 Subject: [PATCH 01/11] add basic functionality for multi-gpu support --- mmdet/apis/ote/apis/detection/config_utils.py | 22 ++- .../apis/ote/apis/detection/configuration.py | 3 +- .../apis/ote/apis/detection/openvino_task.py | 5 + mmdet/apis/ote/apis/detection/task.py | 165 +++++++++++------- mmdet/apis/ote/sample/sample.py | 11 +- 5 files changed, 133 insertions(+), 73 deletions(-) diff --git a/mmdet/apis/ote/apis/detection/config_utils.py b/mmdet/apis/ote/apis/detection/config_utils.py index f4240c9a710..0cffd588016 100644 --- a/mmdet/apis/ote/apis/detection/config_utils.py +++ b/mmdet/apis/ote/apis/detection/config_utils.py @@ -17,9 +17,10 @@ import glob import os import tempfile -from typing import Optional, List +from typing import Any, Optional, List from mmcv import Config, ConfigDict +from mmcv.runner import master_only from sc_sdk.entities.datasets import Dataset, Subset from sc_sdk.entities.label import Label from sc_sdk.logging import logger_factory @@ -28,7 +29,7 @@ from .configuration import OTEDetectionConfig -logger = logger_factory.get_logger("OTEDetectionTask") +logger = logger_factory.get_logger("OTEDetectionTask.config_utils") def apply_template_configurable_parameters(params: OTEDetectionConfig, template: dict): @@ -76,6 +77,8 @@ def patch_config(config: Config, work_dir: str, labels: List[Label], random_seed if evaluation_metric is not None: config.evaluation.save_best = evaluation_metric config.evaluation.rule = 'greater' + # FIXME. Does if have to be explicitly set for CPU-only mode? + # config.evaluation.gpu_collect = False label_names = [lab.name for lab in labels] set_data_classes(config, label_names) @@ -108,10 +111,10 @@ def prepare_for_testing(config: Config, dataset: Dataset) -> Config: def prepare_for_training(config: Config, train_dataset: Dataset, val_dataset: Dataset, - time_monitor: TimeMonitorCallback, learning_curves: defaultdict) -> Config: + round_id: Any, time_monitor: TimeMonitorCallback, learning_curves: defaultdict) -> Config: config = copy.deepcopy(config) - prepare_work_dir(config) + prepare_work_dir(config, round_id) # config.data.test.ote_dataset = dataset.get_subset(Subset.TESTING) config.data.val.ote_dataset = val_dataset @@ -160,6 +163,7 @@ def config_from_string(config_string: str) -> Config: return Config.fromfile(temp_file.name) +@master_only def save_config_to_file(config: Config): """ Dump the full config to a file. Filename is 'config.py', it is saved in the current work_dir. 
""" filepath = os.path.join(config.work_dir, 'config.py') @@ -168,16 +172,16 @@ def save_config_to_file(config: Config): f.write(config_string) -def prepare_work_dir(config: Config) -> str: +def prepare_work_dir(config: Config, round_id: Any = 0) -> str: base_work_dir = config.work_dir - checkpoint_dirs = glob.glob(os.path.join(base_work_dir, "checkpoints_round_*")) - train_round_checkpoint_dir = os.path.join(base_work_dir, f"checkpoints_round_{len(checkpoint_dirs)}") - os.makedirs(train_round_checkpoint_dir) + + train_round_checkpoint_dir = os.path.join(base_work_dir, f"checkpoints_round_{round_id}") + os.makedirs(train_round_checkpoint_dir, exist_ok=True) logger.info(f"Checkpoints and logs for this training run are stored in {train_round_checkpoint_dir}") config.work_dir = train_round_checkpoint_dir if 'meta' not in config.runner: config.runner.meta = ConfigDict() - config.runner.meta.exp_name = f"train_round_{len(checkpoint_dirs)}" + config.runner.meta.exp_name = f"train_round_{round_id}" # Save training config for debugging. It is saved in the checkpoint dir for this training round save_config_to_file(config) return train_round_checkpoint_dir diff --git a/mmdet/apis/ote/apis/detection/configuration.py b/mmdet/apis/ote/apis/detection/configuration.py index d5e4dbbd5d9..4d877c614d3 100644 --- a/mmdet/apis/ote/apis/detection/configuration.py +++ b/mmdet/apis/ote/apis/detection/configuration.py @@ -144,7 +144,8 @@ class __AlgoBackend(ParameterGroup): template = configurable_str("template.yaml", "", editable=False, visible_in_ui=False) model = configurable_str("model.py", "", editable=False, visible_in_ui=False) model_name = configurable_str("object detection model", "", editable=False, visible_in_ui=False) - data_pipeline = configurable_str("ote_data_pipeline.py", "", editable=False, visible_in_ui=False) + scratch_space = configurable_str("/tmp/ote-det-scratch", "", editable=False, visible_in_ui=False) + learning_parameters = add_parameter_group(__LearningParameters) algo_backend = add_parameter_group(__AlgoBackend) diff --git a/mmdet/apis/ote/apis/detection/openvino_task.py b/mmdet/apis/ote/apis/detection/openvino_task.py index c42da2b6816..7777f2ee60e 100644 --- a/mmdet/apis/ote/apis/detection/openvino_task.py +++ b/mmdet/apis/ote/apis/detection/openvino_task.py @@ -31,6 +31,8 @@ from sc_sdk.usecases.tasks.interfaces.evaluate_interface import IEvaluationTask from sc_sdk.usecases.tasks.interfaces.inference_interface import IInferenceTask +from mmcv.runner import master_only + from .configuration import OTEDetectionConfig @@ -163,6 +165,7 @@ def __init__(self, task_environment: TaskEnvironment): self.model = self.task_environment.model self.inferencer = self.load_inferencer() + @master_only def load_inferencer(self) -> OpenVINODetectionInferencer: labels = self.task_environment.label_schema.get_labels(include_empty=False) return OpenVINODetectionInferencer(self.hparams, @@ -170,12 +173,14 @@ def load_inferencer(self) -> OpenVINODetectionInferencer: self.model.get_data("openvino.xml"), self.model.get_data("openvino.bin")) + @master_only def infer(self, dataset: Dataset, inference_parameters: Optional[InferenceParameters] = None) -> Dataset: from tqdm import tqdm for dataset_item in tqdm(dataset): dataset_item.annotation_scene = self.inferencer.predict(dataset_item.numpy) return dataset + @master_only def evaluate(self, output_result_set: ResultSet, evaluation_metric: Optional[str] = None): diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 
11c464ff604..ed9a6ddda83 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -14,10 +14,13 @@ import copy import io +import logging import os import shutil import tempfile import torch +import torch.distributed as dist +import torch.multiprocessing as mp import warnings from collections import defaultdict from typing import Optional, List, Tuple @@ -47,21 +50,31 @@ from sc_sdk.usecases.tasks.interfaces.unload_interface import IUnload from sc_sdk.logging import logger_factory -from mmcv.parallel import MMDataParallel -from mmcv.runner import load_checkpoint +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import load_checkpoint, get_dist_info, init_dist, master_only from mmcv.utils import Config -from mmdet.apis import train_detector, single_gpu_test, export_model +from mmdet.apis import train_detector, single_gpu_test, multi_gpu_test, export_model from mmdet.apis.ote.apis.detection.configuration import OTEDetectionConfig from mmdet.apis.ote.apis.detection.config_utils import (patch_config, set_hyperparams, prepare_for_training, prepare_for_testing) from mmdet.apis.ote.extension.utils.hooks import OTELoggerHook from mmdet.datasets import build_dataset, build_dataloader from mmdet.models import build_detector +from mmdet.parallel import MMDataCPU logger = logger_factory.get_logger("OTEDetectionTask") +def init_dist_cpu(launcher, backend, **kwargs): + if mp.get_start_method(allow_none=True) is None: + mp.set_start_method('spawn') + if launcher == 'pytorch': + dist.init_process_group(backend=backend, **kwargs) + else: + raise ValueError(f'Invalid launcher type: {launcher}') + + class OTEDetectionTask(ITrainingTask, IInferenceTask, IExportTask, IEvaluationTask, IUnload): task_environment: TaskEnvironment @@ -72,15 +85,24 @@ def __init__(self, task_environment: TaskEnvironment): """ logger.info(f"Loading OTEDetectionTask.") - self.scratch_space = tempfile.mkdtemp(prefix="ote-det-scratch-") - logger.info(f"Scratch space created at {self.scratch_space}") self.task_environment = task_environment self.hyperparams = hyperparams = task_environment.get_configurable_parameters(OTEDetectionConfig) + self.scratch_space = self.hyperparams.algo_backend.scratch_space + logger.info(f"Scratch space for the task: {self.scratch_space}") self.model_name = hyperparams.algo_backend.model_name self.labels = task_environment.get_labels(False) + if not torch.distributed.is_initialized(): + if torch.cuda.is_available(): + init_dist(launcher='pytorch') + else: + init_dist_cpu(backend="gloo") + self.rank, self.world_size = get_dist_info() + self.gpu_ids = range(self.world_size) + logger.warning(f'World size {self.world_size}, rank {self.rank}') + # Get and prepare mmdet config. template_file_path = hyperparams.algo_backend.template base_dir = os.path.abspath(os.path.dirname(template_file_path)) @@ -88,11 +110,13 @@ def __init__(self, task_environment: TaskEnvironment): self.config = Config.fromfile(config_file_path) patch_config(self.config, self.scratch_space, self.labels, random_seed=42) set_hyperparams(self.config, hyperparams) + self.config.gpu_ids = self.gpu_ids # Create and initialize PyTorch model. self.model = self._load_model(task_environment.model) # Extra control variables. 
+ self.training_round_id = 0 self.is_training = False self.should_stop = False self.time_monitor = None @@ -159,38 +183,37 @@ def infer(self, dataset: Dataset, inference_parameters: Optional[InferenceParame prediction_results, _ = self._infer_detector(self.model, self.config, dataset, False) - # Loop over dataset again to assign predictions. Convert from MMDetection format to OTE format - for dataset_item, output in zip(dataset, prediction_results): - width = dataset_item.width - height = dataset_item.height + if self.rank == 0: + # Loop over dataset again to assign predictions. Convert from MMDetection format to OTE format + for dataset_item, output in zip(dataset, prediction_results): + width = dataset_item.width + height = dataset_item.height - shapes = [] - for label_idx, detections in enumerate(output): - for i in range(detections.shape[0]): - probability = float(detections[i, 4]) - coords = detections[i, :4].astype(float).copy() - coords /= np.array([width, height, width, height], dtype=float) - coords = np.clip(coords, 0, 1) + shapes = [] + for label_idx, detections in enumerate(output): + for i in range(detections.shape[0]): + probability = float(detections[i, 4]) + coords = detections[i, :4].astype(float).copy() + coords /= np.array([width, height, width, height], dtype=float) + coords = np.clip(coords, 0, 1) - if probability < confidence_threshold: - continue + if probability < confidence_threshold: + continue - assigned_label = [ScoredLabel(self.labels[label_idx], - probability=probability)] - if coords[3] - coords[1] <= 0 or coords[2] - coords[0] <= 0: - continue + assigned_label = [ScoredLabel(self.labels[label_idx], probability=probability)] + if coords[3] - coords[1] <= 0 or coords[2] - coords[0] <= 0: + continue - shapes.append(Annotation( - Box(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3]), - labels=assigned_label)) + shapes.append(Annotation( + Box(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3]), + labels=assigned_label)) - dataset_item.append_annotations(shapes) + dataset_item.append_annotations(shapes) return dataset - @staticmethod - def _infer_detector(model: torch.nn.Module, config: Config, dataset: Dataset, + def _infer_detector(self, model: torch.nn.Module, config: Config, dataset: Dataset, eval: Optional[bool] = False, metric_name: Optional[str] = 'mAP') -> Tuple[List, float]: model.eval() test_config = prepare_for_testing(config, dataset) @@ -200,19 +223,26 @@ def _infer_detector(model: torch.nn.Module, config: Config, dataset: Dataset, samples_per_gpu=batch_size, workers_per_gpu=test_config.data.workers_per_gpu, num_gpus=1, - dist=False, + dist=True, shuffle=False) - eval_model = MMDataParallel(model.cuda(test_config.gpu_ids[0]), - device_ids=test_config.gpu_ids) - # Use a single gpu for testing. 
Set in both mm_val_dataloader and eval_model - eval_predictions = single_gpu_test(eval_model, mm_val_dataloader, show=False) + + if torch.cuda.is_available(): + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + eval_predictions = multi_gpu_test(model, mm_val_dataloader, config.work_dir, False) + else: + model = MMDataCPU(model) + eval_predictions = single_gpu_test(model, mm_val_dataloader, show=False) metric = None - if eval: + if eval and self.rank == 0: metric = mm_val_dataset.evaluate(eval_predictions, metric=metric_name)[metric_name] return eval_predictions, metric + @master_only def evaluate(self, output_result_set: ResultSet, evaluation_metric: Optional[str] = None): @@ -246,6 +276,7 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona """ Trains a model on a dataset """ set_hyperparams(self.config, self.hyperparams) + self.training_round_id += 1 train_dataset = dataset.get_subset(Subset.TRAINING) val_dataset = dataset.get_subset(Subset.VALIDATION) @@ -274,11 +305,12 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona # Run training. self.time_monitor = TimeMonitorCallback(0, 0, 0, 0, update_progress_callback=lambda _: None) learning_curves = defaultdict(OTELoggerHook.Curve) - training_config = prepare_for_training(config, train_dataset, val_dataset, self.time_monitor, learning_curves) + training_config = prepare_for_training(config, train_dataset, val_dataset, + self.training_round_id, self.time_monitor, learning_curves) mm_train_dataset = build_dataset(training_config.data.train) self.is_training = True self.model.train() - train_detector(model=self.model, dataset=mm_train_dataset, cfg=training_config, validate=True) + train_detector(model=self.model, dataset=mm_train_dataset, cfg=training_config, distributed=True, validate=True) # Check for stop signal when training has stopped. If should_stop is true, training was cancelled and no new # model should be returned. Old train model is restored. @@ -291,37 +323,40 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona return # Load the best weights and check if model has improved. - training_metrics = self._generate_training_metrics_group(learning_curves) best_checkpoint_path = os.path.join(training_config.work_dir, 'latest.pth') best_checkpoint = torch.load(best_checkpoint_path) self.model.load_state_dict(best_checkpoint['state_dict']) # Evaluate model performance after training. _, final_performance = self._infer_detector(self.model, config, val_dataset, True) - improved = final_performance > initial_performance - # Return a new model if model has improved, or there is no model yet. - if improved or isinstance(self.task_environment.model, NullModel): - if improved: - logger.info("Training finished, and it has an improved model") + if self.rank == 0: + improved = final_performance > initial_performance + + # Return a new model if model has improved, or there is no model yet. 
+ if improved or isinstance(self.task_environment.model, NullModel): + if improved: + logger.info("Training finished, and it has an improved model") + else: + logger.info("First training round, saving the model.") + # Add mAP metric and loss curves + training_metrics = self._generate_training_metrics_group(learning_curves) + performance = Performance(score=ScoreMetric(value=final_performance, name="mAP"), + dashboard_metrics=training_metrics) + logger.info('FINAL MODEL PERFORMANCE\n' + str(performance)) + self.save_model(output_model) + output_model.performance = performance + output_model.model_status = ModelStatus.SUCCESS else: - logger.info("First training round, saving the model.") - # Add mAP metric and loss curves - performance = Performance(score=ScoreMetric(value=final_performance, name="mAP"), - dashboard_metrics=training_metrics) - logger.info('FINAL MODEL PERFORMANCE\n' + str(performance)) - self.save_model(output_model) - output_model.performance = performance - output_model.model_status = ModelStatus.SUCCESS - else: - logger.info("Model performance has not improved while training. No new model has been saved.") - # Restore old training model if training from scratch and not improved - self.model = old_model + logger.info("Model performance has not improved while training. No new model has been saved.") + # Restore old training model if training from scratch and not improved + self.model = old_model self.is_training = False self.time_monitor = None + @master_only def save_model(self, output_model: Model): buffer = io.BytesIO() hyperparams = self.task_environment.get_configurable_parameters(OTEDetectionConfig) @@ -343,6 +378,7 @@ def get_training_progress(self) -> float: return -1.0 + @master_only def cancel_training(self): """ Sends a cancel training signal to gracefully stop the optimizer. The signal consists of creating a @@ -427,24 +463,28 @@ def unload(self): ctypes.string_at(0) else: logger.warning("Got unload request, but not on Docker. Only clearing CUDA cache") - torch.cuda.empty_cache() - logger.warning(f"Done unloading. " - f"Torch is still occupying {torch.cuda.memory_allocated()} bytes of GPU memory") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.warning(f"CUDA cache is cleared. 
" + "Torch is still occupying {torch.cuda.memory_allocated()} bytes of GPU memory") + logger.warning("Done unloading.") + @master_only def export(self, export_type: ExportType, output_model: OptimizedModel): assert export_type == ExportType.OPENVINO optimized_model_precision = ModelPrecision.FP32 - with tempfile.TemporaryDirectory() as tempdir: - optimized_model_dir = os.path.join(tempdir, "export") - logger.info(f'Optimized model will be temporarily saved to "{optimized_model_dir}"') - os.makedirs(optimized_model_dir, exist_ok=True) + with tempfile.TemporaryDirectory(prefix="export-", dir=self.config.work_dir) as tempdir: + logger.info(f'Optimized model will be temporarily saved to "{tempdir}"') try: from torch.jit._trace import TracerWarning warnings.filterwarnings("ignore", category=TracerWarning) - model = self.model.cuda(self.config.gpu_ids[0]) + if torch.cuda.is_available(): + model = self.model.cuda(self.config.gpu_ids[0]) + else: + model = self.model.cpu() export_model(model, self.config, tempdir, target='openvino', precision=optimized_model_precision.name) bin_file = [f for f in os.listdir(tempdir) if f.endswith('.bin')][0] @@ -458,6 +498,7 @@ def export(self, raise RuntimeError("Optimization was unsuccessful.") from ex + @master_only def _delete_scratch_space(self): """ Remove model checkpoints and mmdet logs diff --git a/mmdet/apis/ote/sample/sample.py b/mmdet/apis/ote/sample/sample.py index 4e4eb8776ca..b67ce6b14ae 100644 --- a/mmdet/apis/ote/sample/sample.py +++ b/mmdet/apis/ote/sample/sample.py @@ -15,6 +15,11 @@ import argparse import os.path as osp import sys +import warnings + +warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*cElementTree is deprecated.*') +warnings.filterwarnings('ignore', category=UserWarning, message='.*Nevergrad package could not be imported.*') +warnings.filterwarnings('ignore', category=UserWarning, message='.*This overload of nonzero is deprecated.*') from sc_sdk.entities.dataset_storage import NullDatasetStorage from sc_sdk.entities.datasets import Subset @@ -35,13 +40,16 @@ from mmdet.apis.ote.apis.detection.ote_utils import generate_label_schema, load_template, get_task_class -logger = logger_factory.get_logger('Sample') +logger = logger_factory.get_logger('OTEDetectionSample') +import logging +logger.setLevel(logging.INFO) def parse_args(): parser = argparse.ArgumentParser(description='Sample showcasing the new API') parser.add_argument('template_file_path', help='path to template file') parser.add_argument('--data-dir', default='data') + parser.add_argument('--work-dir', default='/tmp/ote-det-scratch') parser.add_argument('--export', action='store_true') args = parser.parse_args() return args @@ -71,6 +79,7 @@ def main(args): logger.info('Setup environment') params = OTEDetectionConfig(workspace_id=ID(), project_id=ID(), task_id=ID()) apply_template_configurable_parameters(params, template) + params.algo_backend.scratch_space = args.work_dir environment = TaskEnvironment(model=NullModel(), configurable_parameters=params, label_schema=labels_schema) logger.info('Create base Task') From 9812c119787579a044b191976e2e93839defd4e7 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Sat, 31 Jul 2021 02:02:38 +0300 Subject: [PATCH 02/11] temporary change export dir location --- mmdet/apis/ote/apis/detection/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index ed9a6ddda83..06a99c26857 100644 --- 
a/mmdet/apis/ote/apis/detection/task.py
+++ b/mmdet/apis/ote/apis/detection/task.py
@@ -476,7 +476,7 @@ def export(self,
                output_model: OptimizedModel):
         assert export_type == ExportType.OPENVINO
         optimized_model_precision = ModelPrecision.FP32
-        with tempfile.TemporaryDirectory(prefix="export-", dir=self.config.work_dir) as tempdir:
+        with tempfile.TemporaryDirectory(prefix="ote-det-export-") as tempdir:
             logger.info(f'Optimized model will be temporarily saved to "{tempdir}"')
             try:
                 from torch.jit._trace import TracerWarning

From 29180b24dbf18ec0a2d92d96d9176aa481e7e66d Mon Sep 17 00:00:00 2001
From: "Druzhkov, Pavel"
Date: Sat, 31 Jul 2021 02:03:03 +0300
Subject: [PATCH 03/11] fix training cancellation

---
 mmdet/apis/ote/apis/detection/task.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py
index 06a99c26857..b08bf1712a0 100644
--- a/mmdet/apis/ote/apis/detection/task.py
+++ b/mmdet/apis/ote/apis/detection/task.py
@@ -118,6 +118,7 @@ def __init__(self, task_environment: TaskEnvironment):
         # Extra control variables.
         self.training_round_id = 0
         self.is_training = False
+        self.training_work_dir = None
         self.should_stop = False
         self.time_monitor = None
@@ -299,6 +300,7 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona
             self.model = old_model
             self.should_stop = False
             self.is_training = False
+            self.training_work_dir = None
             self.time_monitor = None
             return
@@ -308,6 +310,7 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona
         training_config = prepare_for_training(config, train_dataset, val_dataset,
                                                self.training_round_id, self.time_monitor, learning_curves)
         mm_train_dataset = build_dataset(training_config.data.train)
+        self.training_work_dir = training_config.work_dir
         self.is_training = True
         self.model.train()
         train_detector(model=self.model, dataset=mm_train_dataset, cfg=training_config, distributed=True, validate=True)
@@ -319,6 +322,7 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona
             self.model = old_model
             self.should_stop = False
             self.is_training = False
+            self.training_work_dir = None
             self.time_monitor = None
             return
@@ -353,6 +357,7 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona
             self.model = old_model
 
         self.is_training = False
+        self.training_work_dir = None
         self.time_monitor = None
@@ -388,7 +393,7 @@ def cancel_training(self):
         """
         logger.info("Cancel training requested.")
         self.should_stop = True
-        stop_training_filepath = os.path.join(self.config.work_dir, '.stop_training')
+        stop_training_filepath = os.path.join(self.training_work_dir, '.stop_training')
         open(stop_training_filepath, 'a').close()

From b12269a05446901f54b54cc8ab5783ac4ae4ab8a Mon Sep 17 00:00:00 2001
From: "Druzhkov, Pavel"
Date: Wed, 18 Aug 2021 12:12:13 +0300
Subject: [PATCH 04/11] bring back local scratch space

---
 mmdet/apis/ote/apis/detection/task.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py
index 96a6395e868..784169752e3 100644
--- a/mmdet/apis/ote/apis/detection/task.py
+++ b/mmdet/apis/ote/apis/detection/task.py
@@ -100,8 +100,10 @@ def __init__(self, task_environment: TaskEnvironment):
         self.task_environment = task_environment
         self.hyperparams = hyperparams = task_environment.get_hyper_parameters(OTEDetectionConfig)
 
-        self.scratch_space = self.hyperparams.algo_backend.scratch_space
-        
logger.info(f"Scratch space for the task: {self.scratch_space}") + # self.scratch_space = self.hyperparams.algo_backend.scratch_space + self.scratch_space = tempfile.mkdtemp(prefix="ote-det-scratch-") + logger.info(f"Scratch space created at {self.scratch_space}") + # logger.info(f"Scratch space for the task: {self.scratch_space}") self.model_name = hyperparams.algo_backend.model_name self.labels = task_environment.get_labels(False) @@ -244,7 +246,7 @@ def _infer_detector(self, model: torch.nn.Module, config: Config, dataset: Datas model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) - eval_predictions = multi_gpu_test(model, mm_val_dataloader, config.work_dir, False) + eval_predictions = multi_gpu_test(model, mm_val_dataloader) else: model = MMDataCPU(model) eval_predictions = single_gpu_test(model, mm_val_dataloader, show=False) From 90e6f05d0d10227d00cfdfde3d44907e391ab7b6 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 11:32:05 +0300 Subject: [PATCH 05/11] fix distributed environment initialization --- mmdet/apis/ote/apis/detection/task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 784169752e3..4c2cd5e4c63 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -108,10 +108,14 @@ def __init__(self, task_environment: TaskEnvironment): self.labels = task_environment.get_labels(False) if not torch.distributed.is_initialized(): + os.environ.setdefault("MASTER_ADDR", "127.0.0.1") + os.environ.setdefault("MASTER_PORT", "29500") + os.environ.setdefault("WORLD_SIZE", "1") + os.environ.setdefault("RANK", "0") if torch.cuda.is_available(): init_dist(launcher='pytorch') else: - init_dist_cpu(backend="gloo") + init_dist_cpu(launcher='pytorch', backend="gloo") self.rank, self.world_size = get_dist_info() self.gpu_ids = range(self.world_size) logger.warning(f'World size {self.world_size}, rank {self.rank}') From c7d25bb8c816d6e3898b6032c6bc8750dc845676 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 11:34:16 +0300 Subject: [PATCH 06/11] use local scratch space --- mmdet/apis/ote/apis/detection/task.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 4c2cd5e4c63..4a49e1d5132 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -100,10 +100,8 @@ def __init__(self, task_environment: TaskEnvironment): self.task_environment = task_environment self.hyperparams = hyperparams = task_environment.get_hyper_parameters(OTEDetectionConfig) - # self.scratch_space = self.hyperparams.algo_backend.scratch_space self.scratch_space = tempfile.mkdtemp(prefix="ote-det-scratch-") logger.info(f"Scratch space created at {self.scratch_space}") - # logger.info(f"Scratch space for the task: {self.scratch_space}") self.model_name = hyperparams.algo_backend.model_name self.labels = task_environment.get_labels(False) @@ -345,11 +343,6 @@ def train(self, dataset: Dataset, output_model: Model, train_parameters: Optiona self.time_monitor = None return - # Load the best weights and check if model has improved. - best_checkpoint_path = os.path.join(training_config.work_dir, 'latest.pth') - best_checkpoint = torch.load(best_checkpoint_path) - self.model.load_state_dict(best_checkpoint['state_dict']) - # Evaluate model performance after training. 
_, final_performance = self._infer_detector(self.model, config, val_dataset, True) From 9f0445999a5a94b5b4722b073f9075a02da80ea9 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 11:34:39 +0300 Subject: [PATCH 07/11] remove model name --- mmdet/apis/ote/apis/detection/task.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 4a49e1d5132..585b06a9f49 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -102,7 +102,6 @@ def __init__(self, task_environment: TaskEnvironment): self.scratch_space = tempfile.mkdtemp(prefix="ote-det-scratch-") logger.info(f"Scratch space created at {self.scratch_space}") - self.model_name = hyperparams.algo_backend.model_name self.labels = task_environment.get_labels(False) if not torch.distributed.is_initialized(): @@ -150,7 +149,6 @@ def _load_model(self, model: Model): try: model.load_state_dict(model_data['model']) logger.info(f"Loaded model weights from Task Environment") - logger.info(f"Model architecture: {self.model_name}") except BaseException as ex: raise ValueError("Could not load the saved model. The model file structure is invalid.") \ from ex @@ -158,8 +156,7 @@ def _load_model(self, model: Model): # If there is no trained model yet, create model with pretrained weights as defined in the model config # file. model = self._create_model(self.config, from_scratch=False) - logger.info(f"No trained model in project yet. Created new model with '{self.model_name}' " - f"architecture and general-purpose pretrained weights.") + logger.info(f"No trained model in project yet. Created new model with general-purpose pretrained weights.") return model @staticmethod @@ -417,13 +414,6 @@ def _generate_training_metrics_group(self, learning_curves) -> Optional[List[Met """ output: List[MetricsGroup] = [] - # Model architecture - architecture = InfoMetric(name='Model architecture', value=self.model_name) - visualization_info_architecture = VisualizationInfo(name="Model architecture", - visualisation_type=VisualizationType.TEXT) - output.append(MetricsGroup(metrics=[architecture], - visualization_info=visualization_info_architecture)) - # Learning curves for key, curve in learning_curves.items(): metric_curve = CurveMetric(xs=curve.x, ys=curve.y, name=key) From 4fdc135f094fc9e0e839c95fb2813ebd5793507c Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 15:37:29 +0300 Subject: [PATCH 08/11] clean up --- mmdet/apis/ote/apis/detection/config_utils.py | 5 ----- mmdet/apis/ote/apis/detection/task.py | 8 ++------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/mmdet/apis/ote/apis/detection/config_utils.py b/mmdet/apis/ote/apis/detection/config_utils.py index 503240a8897..8f5b88431b5 100644 --- a/mmdet/apis/ote/apis/detection/config_utils.py +++ b/mmdet/apis/ote/apis/detection/config_utils.py @@ -60,8 +60,6 @@ def patch_config(config: Config, work_dir: str, labels: List[Label], random_seed if evaluation_metric is not None: config.evaluation.save_best = evaluation_metric config.evaluation.rule = 'greater' - # FIXME. Does if have to be explicitly set for CPU-only mode? 
- # config.evaluation.gpu_collect = False label_names = [lab.name for lab in labels] set_data_classes(config, label_names) @@ -129,7 +127,6 @@ def config_to_string(config: Config) -> str: config_copy.data.train.ote_dataset = None else: config_copy.data.train.dataset.ote_dataset = None - # config_copy.labels = [label.name for label in config.labels] return Config(config_copy).pretty_text @@ -190,8 +187,6 @@ def set_data_classes(config: Config, label_names: List[str]): config.model.roi_head.bbox_head.num_classes = num_classes elif 'bbox_head' in config.model: config.model.bbox_head.num_classes = num_classes - # FIXME. ? - # self.config.model.CLASSES = label_names def patch_datasets(config: Config): diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 585b06a9f49..a27c70a9656 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -14,7 +14,6 @@ import copy import io -import logging import os import shutil import tempfile @@ -32,10 +31,7 @@ LineChartInfo, MetricsGroup, Performance, - ScoreMetric, - InfoMetric, - VisualizationType, - VisualizationInfo) + ScoreMetric) from ote_sdk.entities.shapes.box import Box from ote_sdk.entities.train_parameters import TrainParameters from ote_sdk.entities.label import ScoredLabel @@ -61,7 +57,7 @@ from sc_sdk.usecases.tasks.interfaces.unload_interface import IUnload from sc_sdk.logging import logger_factory -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.parallel import MMDistributedDataParallel from mmcv.runner import load_checkpoint, get_dist_info, init_dist, master_only from mmcv.utils import Config from mmdet.apis import train_detector, single_gpu_test, multi_gpu_test, export_model From 8adaa8bcc780f6883367419ca9faef6c22d63303 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 17:01:55 +0300 Subject: [PATCH 09/11] clean up --- mmdet/apis/ote/apis/detection/configuration.py | 12 ------------ mmdet/apis/ote/sample/sample.py | 1 - 2 files changed, 13 deletions(-) diff --git a/mmdet/apis/ote/apis/detection/configuration.py b/mmdet/apis/ote/apis/detection/configuration.py index 1560b812352..962ad647d48 100644 --- a/mmdet/apis/ote/apis/detection/configuration.py +++ b/mmdet/apis/ote/apis/detection/configuration.py @@ -118,17 +118,5 @@ class __Postprocessing(ParameterGroup): affects_outcome_of=ModelLifecycle.INFERENCE ) - @attrs - class __AlgoBackend(ParameterGroup): - header = string_attribute("Internal Algo Backend parameters") - description = header - visible_in_ui = boolean_attribute(False) - - template = string_attribute("template.yaml") - model = string_attribute("model.py") - model_name = string_attribute("object detection model") - scratch_space = string_attribute("/tmp/ote-det-scratch") - learning_parameters = add_parameter_group(__LearningParameters) - algo_backend = add_parameter_group(__AlgoBackend) postprocessing = add_parameter_group(__Postprocessing) diff --git a/mmdet/apis/ote/sample/sample.py b/mmdet/apis/ote/sample/sample.py index a74de8dcc09..ba56e696495 100644 --- a/mmdet/apis/ote/sample/sample.py +++ b/mmdet/apis/ote/sample/sample.py @@ -52,7 +52,6 @@ def parse_args(): parser = argparse.ArgumentParser(description='Sample showcasing the new API') parser.add_argument('template_file_path', help='path to template file') parser.add_argument('--data-dir', default='data') - parser.add_argument('--work-dir', default='/tmp/ote-det-scratch') parser.add_argument('--export', action='store_true') args = 
parser.parse_args() return args From 257499cc6148488d43dfc69fb9b65d0e842cf472 Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 17:20:24 +0300 Subject: [PATCH 10/11] fix config file name --- mmdet/apis/ote/apis/detection/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdet/apis/ote/apis/detection/task.py b/mmdet/apis/ote/apis/detection/task.py index 8dba0aacf81..cea6930c00c 100644 --- a/mmdet/apis/ote/apis/detection/task.py +++ b/mmdet/apis/ote/apis/detection/task.py @@ -109,7 +109,7 @@ def __init__(self, task_environment: TaskEnvironment): # Get and prepare mmdet config. base_dir = os.path.abspath(os.path.dirname(template_file_path)) - config_file_path = os.path.join(base_dir, hyperparams.algo_backend.model) + config_file_path = os.path.join(base_dir, 'model.py') self._config = Config.fromfile(config_file_path) patch_config(self._config, self._scratch_space, self._labels, random_seed=42) set_hyperparams(self._config, hyperparams) From 287a286e8bcdcea9db725b12846f0355b24e996b Mon Sep 17 00:00:00 2001 From: "Druzhkov, Pavel" Date: Thu, 19 Aug 2021 18:07:20 +0300 Subject: [PATCH 11/11] fix configuration file test --- tests/test_ote_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_ote_api.py b/tests/test_ote_api.py index 17787a68e64..7302fa4794e 100644 --- a/tests/test_ote_api.py +++ b/tests/test_ote_api.py @@ -70,7 +70,6 @@ def test_configuration_yaml(): configuration_yaml_converted = yaml.safe_load(configuration_yaml_str) with open(osp.join('mmdet', 'apis', 'ote', 'apis', 'detection', 'configuration.yaml')) as read_file: configuration_yaml_loaded = yaml.safe_load(read_file) - del configuration_yaml_converted['algo_backend'] assert configuration_yaml_converted == configuration_yaml_loaded def test_set_values_as_default():
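
A few implementation notes on the distributed mechanics in this series follow; none of the code below is part of the patches, and the helper names in it are made up for illustration.

PATCH 01 introduces init_dist_cpu so that the task can always run under a process group, and PATCH 05 makes single-process runs work by defaulting the rendezvous environment variables. A minimal, self-contained sketch of the same idea (the helper name and the port are illustrative):

import os

import torch.distributed as dist

def init_dist_cpu_sketch(backend: str = 'gloo') -> None:
    # A launcher such as torchrun exports these variables for each worker;
    # the defaults below make a plain `python script.py` run work too.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    os.environ.setdefault('WORLD_SIZE', '1')
    os.environ.setdefault('RANK', '0')
    # With the default env:// init method, rank and world size are read
    # from the environment set above.
    dist.init_process_group(backend=backend)

if __name__ == '__main__':
    init_dist_cpu_sketch()
    print(f'rank {dist.get_rank()} of {dist.get_world_size()}')
    dist.destroy_process_group()

This is also why the CUDA branch can simply call mmcv's init_dist(launcher='pytorch'): a PyTorch launcher has already exported RANK, WORLD_SIZE and the MASTER_* variables for every worker process.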
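
The @master_only decorators applied to saving, evaluation, export and cancellation come from mmcv.runner. In essence the decorator turns the function into a no-op on every rank except 0, which is why it suits side-effecting steps whose results other ranks never consume. A sketch that is equivalent in spirit:

import functools

import torch.distributed as dist

def master_only_sketch(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Fall back to rank 0 semantics when no process group exists.
        rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
        if rank == 0:
            return func(*args, **kwargs)
    return wrapper

@master_only_sketch
def save_checkpoint_stub(path: str) -> None:
    print(f'rank 0 writes {path}')  # worker ranks skip this entirely

Note the corollary: a decorated method returns None on non-zero ranks, so decorating something like load_inferencer means only the master rank ends up with a usable inferencer.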
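
PATCH 01 routes evaluation through MMDistributedDataParallel plus multi_gpu_test under CUDA and keeps a single-process path otherwise (MMDataCPU is a wrapper specific to this fork; PATCH 04 later drops the explicit tmpdir argument). A condensed sketch of that dispatch, assuming `model` is an mmdet detector and `data_loader` was built with dist=True:

import torch
from mmcv.parallel import MMDistributedDataParallel
from mmdet.apis import multi_gpu_test, single_gpu_test

def run_eval_sketch(model, data_loader):
    if torch.cuda.is_available():
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        # Predictions are gathered on rank 0; other ranks get None back,
        # which is why the task guards post-processing of results with
        # `if self.rank == 0`.
        return multi_gpu_test(model, data_loader)
    from mmdet.parallel import MMDataCPU  # fork-specific CPU wrapper
    return single_gpu_test(MMDataCPU(model), data_loader, show=False)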
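
A plausible reading of the prepare_work_dir change in PATCH 01: a directory name derived from a caller-supplied round_id is stable across ranks, so every process resolves the same path and the makedirs call is idempotent, whereas the old glob('checkpoints_round_*') count could race when several ranks enter the function at once. Reduced to its core:

import os

def prepare_round_dir_sketch(base_work_dir: str, round_id: int) -> str:
    round_dir = os.path.join(base_work_dir, f'checkpoints_round_{round_id}')
    os.makedirs(round_dir, exist_ok=True)  # safe if another rank created it first
    return round_dir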
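
PATCH 03's cancellation fix in one picture: cancel_training used to touch .stop_training under self.config.work_dir, but prepare_for_training rewrites work_dir per round, so the marker could land in a directory the active run no longer watches; tracking the live round's directory in self.training_work_dir closes that gap. A sketch of the handshake (both function names are invented; in the task the writer side lives in cancel_training and the reader side in a training-loop hook):

import os

STOP_FILENAME = '.stop_training'

def request_stop(training_work_dir: str) -> None:
    # Writer side: touch the marker in the round directory actually in use.
    open(os.path.join(training_work_dir, STOP_FILENAME), 'a').close()

def stop_requested(training_work_dir: str) -> bool:
    # Reader side: polled between training iterations.
    return os.path.exists(os.path.join(training_work_dir, STOP_FILENAME))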