diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml
index 0c66fafb..207f2756 100644
--- a/.github/workflows/cicd.yaml
+++ b/.github/workflows/cicd.yaml
@@ -38,8 +38,8 @@ jobs:
       - name: Example inference run via Docker with default config and checkpoint
         run: >
           docker run
-          -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/
-          -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/
+          -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/
+          -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/
           --ipc=host
           --shm-size=2gb
           myria3d
@@ -53,14 +53,14 @@ jobs:
      - name: Example inference run via Docker with inference-time subtiles overlap to smooth-out results.
         run: >
           docker run
-          -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/
-          -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/
+          -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/
+          -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/
           --ipc=host
           --shm-size=2gb
           myria3d
           python run.py
           --config-path /inputs/
-          --config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.6.0
+          --config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0
           predict.ckpt_path=/inputs/proto151_V2.0_epoch_100_Myria3DV3.1.0.ckpt
           datamodule.epsg=2154
           predict.src_las=/inputs/792000_6272000_subset_buildings.las
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 05cde1b7..0507c492 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,13 @@
 # CHANGELOG

+## 3.7.0
+- Update all versions of Pytorch, Pytorch Lightning, and Pytorch Geometric.
+  Changes are backward-compatible for models trained with older versions (with adjustments to the configuration file).
+- Refactor logging of single-class IoUs to go from num_classes+1 torchmetrics instances to a single one.
+
 ### 3.6.1
 - Set urllib3<2 for comet logging to function and add back seaborn for plotting optimal LR graph.
-
+
 ## 3.6.0
 - Remove the "EPSG:2154" by default and use the metadata of the lidar file, unless a parameter is given.
diff --git a/configs/callbacks/default.yaml b/configs/callbacks/default.yaml
index 1f221750..dcab6e52 100755
--- a/configs/callbacks/default.yaml
+++ b/configs/callbacks/default.yaml
@@ -12,12 +12,6 @@ lr_monitor:
   logging_interval: "step"
   log_momentum: true

-# This logs IoU at validation and test time
-# Predictions are aggregated and saved at test time in a way coherent with prediction logic.
-log_iou_by_class:
-  _target_: myria3d.callbacks.logging_callbacks.LogIoUByClass
-  classification_dict: ${dataset_description.classification_dict}
-
 model_checkpoint:
   _target_: pytorch_lightning.callbacks.ModelCheckpoint
   monitor: "val/loss_epoch" # name of the logged metric which determines when model is improving
diff --git a/configs/experiment/DebugFineTune.yaml b/configs/experiment/DebugFineTune.yaml
index 1637344c..d90bc832 100644
--- a/configs/experiment/DebugFineTune.yaml
+++ b/configs/experiment/DebugFineTune.yaml
@@ -18,7 +18,6 @@ trainer:
   limit_test_batches: 1
   max_epochs: 1
   num_sanity_val_steps: 0
-  # gpus: [1]

 callbacks:
   finetune:
diff --git a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml
index f5664212..97e36a0b 100755
--- a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml
+++ b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml
@@ -10,5 +10,4 @@ trainer:
   strategy: ddp_find_unused_parameters_false
   # Replace by cpu to simulate multi-cpus training.
   accelerator: gpu
-  num_processes: 2
-  gpus: 2
+  devices: 2
diff --git a/configs/model/default.yaml b/configs/model/default.yaml
index d6d418d1..2223d6fd 100644
--- a/configs/model/default.yaml
+++ b/configs/model/default.yaml
@@ -3,6 +3,7 @@ _target_: myria3d.models.model.Model
 ## Inputs and outputs
 d_in: ${dataset_description.d_in} # XYZ (3) + Other features (N)
 num_classes: ${dataset_description.num_classes}
+classification_dict: ${dataset_description.classification_dict}

 # Architecture defined in sub-configs
 ckpt_path: null # str, for resuming training and finetuning.
@@ -13,14 +14,6 @@ neural_net_hparams: ???
 interpolation_k: ${predict.interpolator.interpolation_k} # interpolation at eval time
 num_workers: 4 # for knn_interpolate

-## Evaluation metric - partial for triple (train/val/test) init
-iou:
-  _target_: functools.partial
-  _args_:
-    - "${get_method:torchmetrics.JaccardIndex}"
-    - ${model.num_classes}
-  absent_score: 1.0 # do not penalize if a class is absent from labels.
-
 ## Optimization
 momentum: 0.9 # arbitrary
 monitor: "val/loss_epoch"
diff --git a/configs/predict/default.yaml b/configs/predict/default.yaml
index 9a4ca973..4c078533 100644
--- a/configs/predict/default.yaml
+++ b/configs/predict/default.yaml
@@ -1,7 +1,7 @@
 src_las: "/path/to/input.las" # Any glob pattern can be used to predict on multiple files.
 output_dir: "/path/to/output_dir/" # Predictions are saved in a new file which shares src_las basename.
 ckpt_path: "/path/to/lightning_model.ckpt" # Checkpoint of trained model.
-gpus: 0 # 0 for none, 1 for one, [gpu_id] to specify which gpu to use e.g [1]
+gpus: 0 # 0 to predict on CPU; 1 to predict on a single GPU.

 # Probas interpolation parameters
 # subtile_overlap=25 to use a sliding window of inference of which predictions will be merged.
diff --git a/configs/task/default.yaml b/configs/task/default.yaml
index 45716d12..ef0d0e10 100644
--- a/configs/task/default.yaml
+++ b/configs/task/default.yaml
@@ -1,2 +1,3 @@
 # Task at hand. Can be train or predict
-task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune"
\ No newline at end of file
+task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune"
+auto_lr_find: false # override with true to run the LR-range test in train.py.
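The new `task.auto_lr_find` flag (moved here from the trainer config) is what `train.py` now reads before fitting. A minimal sketch of exercising it as a Hydra override — hypothetical, assuming hydra-core 1.1 and that the repo's root config is `configs/config.yaml`:

```python
# Hypothetical check, not repo code: the override should land on cfg.task.auto_lr_find.
from hydra import compose, initialize

with initialize(config_path="configs"):
    cfg = compose(config_name="config", overrides=["task.auto_lr_find=true"])
    assert cfg.task.auto_lr_find is True  # read by train.py before fitting
```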
diff --git a/configs/trainer/all_params.yaml b/configs/trainer/all_params.yaml
deleted file mode 100755
index 067c2953..00000000
--- a/configs/trainer/all_params.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-_target_: pytorch_lightning.Trainer
-
-# default values for all trainer parameters
-checkpoint_callback: True
-default_root_dir: null
-gradient_clip_val: 0.0
-process_position: 0
-num_nodes: 1
-num_processes: 1
-gpus: null
-auto_select_gpus: False
-tpu_cores: null
-log_gpu_memory: null
-progress_bar_refresh_rate: 1
-overfit_batches: 0.0
-track_grad_norm: -1
-check_val_every_n_epoch: 1
-fast_dev_run: False
-accumulate_grad_batches: 1
-max_epochs: 1
-min_epochs: 1
-max_steps: null
-min_steps: null
-limit_train_batches: 1.0
-limit_val_batches: 1.0
-limit_test_batches: 1.0
-val_check_interval: 1.0
-flush_logs_every_n_steps: 100
-log_every_n_steps: 50
-accelerator: null
-sync_batchnorm: False
-precision: 32
-weights_summary: "top"
-weights_save_path: null
-num_sanity_val_steps: 2
-truncated_bptt_steps: null
-resume_from_checkpoint: null
-profiler: null
-benchmark: False
-deterministic: False
-reload_dataloaders_every_epoch: False
-auto_lr_find: False
-replace_sampler_ddp: True
-terminate_on_nan: False
-auto_scale_batch_size: False
-prepare_data_per_node: True
-plugins: null
-amp_backend: "native"
-amp_level: "O2"
-move_metrics_to_cpu: False
diff --git a/configs/trainer/default.yaml b/configs/trainer/default.yaml
index b7968c1e..f8019e86 100755
--- a/configs/trainer/default.yaml
+++ b/configs/trainer/default.yaml
@@ -1,14 +1,10 @@
 _target_: pytorch_lightning.Trainer

-# set `1` to train on GPU, `0` to train on CPU only
-gpus: 0
-
 min_epochs: 1
 max_epochs: 1300
 log_every_n_steps: 1

-weights_summary: null
-progress_bar_refresh_rate: 1
-
-auto_lr_find: false # override with true to run the LR-range test in train.py.
-
+# Set to gpu for GPU training. If devices > 1, also set strategy: ddp_find_unused_parameters_false.
+accelerator: cpu
+devices: 1
+num_nodes: 1
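For reference, a sketch of how the removed PyTorch Lightning 1.x Trainer flags map onto the 2.x arguments now used above (API mapping only, not repo code):

```python
from pytorch_lightning import Trainer

# PL 1.x (removed): Trainer(gpus=0, weights_summary=None, progress_bar_refresh_rate=1, ...)
# PL 2.x: device placement is split between `accelerator` and `devices`.
trainer = Trainer(accelerator="cpu", devices=1, num_nodes=1, min_epochs=1, max_epochs=1300)
```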
diff --git a/docs/source/apidoc/default_config.yml b/docs/source/apidoc/default_config.yml
index ffef6ec8..eb6d004a 100644
--- a/docs/source/apidoc/default_config.yml
+++ b/docs/source/apidoc/default_config.yml
@@ -5,13 +5,11 @@ print_config: true
 ignore_warnings: true
 trainer:
   _target_: pytorch_lightning.Trainer
-  gpus: 0
+  accelerator: cpu
+  devices: 1
   min_epochs: 1
   max_epochs: 1
   log_every_n_steps: 1
-  weights_summary: null
-  progress_bar_refresh_rate: 1
-  auto_lr_find: false
   limit_train_batches: 1
   limit_val_batches: 1
   limit_test_batches: 1
@@ -253,6 +251,7 @@ logger:
     disabled: true
 task:
   task_name: fit
+  auto_lr_find: false
 predict:
   src_las: /path/to/input.las
   output_dir: /path/to/output_dir/
diff --git a/docs/source/guides/train_new_model.md b/docs/source/guides/train_new_model.md
index 64bcead0..ccc22fc5 100644
--- a/docs/source/guides/train_new_model.md
+++ b/docs/source/guides/train_new_model.md
@@ -36,7 +36,7 @@ After training, you model best checkpoints and hydra config will be saved in a `

 ### Optimized learning rate
 Pytorch Lightning support au [automated learning rate finder](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#auto-lr-find), by means of an Learning Rate-range test (see section 3.3 in [this paper](https://arxiv.org/pdf/1506.01186.pdf) for reference).
-You can perfom this automatically before training by setting `trainer.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once.
+You can perform this automatically before training by setting `task.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once.

 ### Multi-GPUs
diff --git a/environment.yml b/environment.yml
index 7e52538f..553bccf0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,35 +2,32 @@
 # mamba env create -f environment.yml
 name: myria3d
 channels:
-  - conda-forge
-  - anaconda
   - pytorch
-  - comet_ml
+  - nvidia
   - pyg
+  - comet_ml
+  - conda-forge
 dependencies:
-  - python==3.9.*
+  - python=3.9.*
   - pip
-  # --------- data formats --------- #
-  - numpy
-  - h5py
   # --------- Deep Learning --------- #
-  # cudatoolkit to specify the cuda driver in the conda env
-  - conda-forge::cudatoolkit=11.3.1 # single equal sign there, not a typo
-  - pytorch::pytorch==1.11.0
-  - pytorch::torchvision==0.12.0
-  - conda-forge::pytorch-lightning==1.5.9
-  - conda-forge::torchmetrics==0.7.*
-  - comet_ml::comet_ml==3.31.*
-  - conda-forge::urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591
+  - pytorch::pytorch=2.1
+  - pytorch::pytorch-cuda=11.8
+  - pytorch::torchvision=0.16
+  - conda-forge::lightning=2.0
+  - conda-forge::torchmetrics=0.11
+  - pyg::pyg=2.4
   - pyg::pytorch-cluster
   - pyg::pytorch-scatter
   - pyg::pytorch-sparse
-  - pyg::pyg==2.1.0
-  # Nota: if libcusparse.so.11. errors occur, run
-  # export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH"
-  # ou
-  # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH"
-  # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625
+  # Troubleshooting: if libcusparse.so.11. errors occur, run
+  # export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH"
+  # or
+  # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH"
+  # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625
+  # --------- data formats --------- #
+  - numpy
+  - h5py
   # --------- geo --------- #
   - pdal
   - python-pdal
@@ -39,6 +36,12 @@ dependencies:
   - pandas
   - matplotlib
   # --------- loggers --------- #
+  - comet_ml::comet_ml=3.35
+  - conda-forge::urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591
+  # --------- Visualization --------- #
+  - pandas
+  - matplotlib
+  - seaborn # used in some callbacks
   # --------- linters --------- #
   - pre-commit # hooks for applying linters on commit
   - black # code formatting
@@ -52,9 +55,6 @@ dependencies:
   - python-dotenv # loading env variables from .env file
   - rich # beautiful text formatting in terminal
   - sh # for running bash commands in some tests
-  # - scikit-learn # used in some callbacks
-  - seaborn # used in some callbacks
-  # - jupyterlab # better jupyter notebooks
   - pudb # debugger
   #
   # --------- Documentation --------- #
   - sphinx==4.5.*
   - sphinx_rtd_theme==1.0.*
   - myst_parser==0.17.*
   - docutils==0.17
   - rstcheck==3.3.* # RST Linter
   - pip:
-    # --------- hydra configs --------- #
     - hydra-core==1.1.*
     - hydra-colorlog==1.1.*
     # --------- Documentation --------- #
diff --git a/myria3d/callbacks/comet_callbacks.py b/myria3d/callbacks/comet_callbacks.py
index c1309ec6..84a82d0c 100755
--- a/myria3d/callbacks/comet_callbacks.py
+++ b/myria3d/callbacks/comet_callbacks.py
@@ -12,7 +12,7 @@
 from typing import Optional

 from pytorch_lightning import Callback, Trainer
-from pytorch_lightning.loggers import CometLogger, LoggerCollection
+from pytorch_lightning.loggers import CometLogger
 from pytorch_lightning.utilities import rank_zero_only

 from myria3d.utils import utils
@@ -27,7 +27,7 @@ def get_comet_logger(trainer: Trainer) -> Optional[CometLogger]:
     if isinstance(trainer.logger, CometLogger):
         return trainer.logger

-    if isinstance(trainer.logger, LoggerCollection):
+    if isinstance(trainer.logger, list):
         for logger in trainer.logger:
             if isinstance(logger, CometLogger):
                 return logger
@@ -65,7 +65,7 @@ class LogLogsPath(Callback):
     """Logs run working directory to comet.ml"""

     @rank_zero_only
-    def on_init_end(self, trainer):
+    def setup(self, trainer, pl_module, stage):
         logger = get_comet_logger(trainer=trainer)
         if logger:
             log_path = os.getcwd()
diff --git a/myria3d/callbacks/logging_callbacks.py b/myria3d/callbacks/logging_callbacks.py
deleted file mode 100644
index aa983a54..00000000
--- a/myria3d/callbacks/logging_callbacks.py
+++ /dev/null
@@ -1,154 +0,0 @@
-from typing import Any, Dict, Optional
-
-import pytorch_lightning as pl
-import torch
-from pytorch_lightning import Callback
-from pytorch_lightning.utilities.types import STEP_OUTPUT
-from torchmetrics import JaccardIndex
-from torchmetrics.functional.classification.jaccard import _jaccard_from_confmat
-
-from myria3d.utils import utils
-
-log = utils.get_logger(__name__)
-
-
-# Training was not lenghtend so we keep "as-is" for now, but this
-# is not optimal at the moment, and a single class JaccardIndex by phase could
-# be used # with specific class of interest specified before each logging.
-
-
-class LogIoUByClass(Callback):
-    """
-    A Callback to log JaccardIndex for each class.
-    """
-
-    def __init__(self, classification_dict: Dict[int, str]):
-        self.classification_names = classification_dict.values()
-        self.num_classes = len(classification_dict)
-        self.metric = SingleClassIoU
-
-    def get_all_iou_by_class_object(self):
-        """Get a dict with schema {class_name:iou_for_class_name, ...}"""
-        iou_dict = {
-            name: self.metric(self.num_classes, idx)
-            for idx, name in enumerate(self.classification_names)
-        }
-        return iou_dict
-
-    def on_fit_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-        """Setup IoU torchmetrics objects for train and val phases."""
-        self.train_iou_by_class_dict = self.get_all_iou_by_class_object()
-        self.val_iou_by_class_dict = self.get_all_iou_by_class_object()
-
-    def on_test_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-        """Setup IoU torchmetrics objects for test phase."""
-        self.test_iou_by_class_dict = self.get_all_iou_by_class_object()
-
-    def on_init_end(self, trainer: pl.Trainer) -> None:
-        """Setup logging functionnalities."""
-        self.experiment = trainer.logger.experiment[0]
-
-    def on_train_batch_end(
-        self,
-        trainer: pl.Trainer,
-        pl_module: pl.LightningModule,
-        outputs: Optional[STEP_OUTPUT],
-        batch: Any,
-        batch_idx: int,
-        dataloader_idx: int,
-    ):
-        """Log IoU for each class."""
-        logits = outputs["logits"]
-        targets = outputs["targets"]
-        self.log_iou(logits, targets, "train", self.train_iou_by_class_dict)
-
-    def on_validation_batch_end(
-        self,
-        trainer: pl.Trainer,
-        pl_module: pl.LightningModule,
-        outputs: Optional[STEP_OUTPUT],
-        batch: Any,
-        batch_idx: int,
-        dataloader_idx: int,
-    ):
-        """Log IoU for each class."""
-        logits = outputs["logits"]
-        targets = outputs["targets"]
-        self.log_iou(logits, targets, "val", self.val_iou_by_class_dict)
-
-    def on_test_batch_end(
-        self,
-        trainer: pl.Trainer,
-        pl_module: pl.LightningModule,
-        outputs: Optional[STEP_OUTPUT],
-        batch: Any,
-        batch_idx: int,
-        dataloader_idx: int,
-    ):
-        """Log IoU for each class."""
-        logits = outputs["logits"]
-        targets = outputs["targets"]
-        self.log_iou(logits, targets, "test", self.test_iou_by_class_dict)
-
-    def log_iou(self, logits, targets, phase: str, iou_dict):
-        device = logits.device
-        preds = torch.argmax(logits, dim=1)
-        for class_name, class_iou in iou_dict.items():
-            class_iou = class_iou.to(device)
-            class_iou(preds, targets)
-            metric_name = f"{phase}/iou_CLASS_{class_name}"
-            self.log(
-                metric_name,
-                class_iou,
-                on_step=False,
-                on_epoch=True,
-                metric_attribute=metric_name,
-            )
-
-
-class SingleClassIoU(JaccardIndex):
-    """
-    Custom JaccardIndex metrics to log single class JaccardIndex using PytorchLighting log system.
-    This enables good computation of epoch-level JaccardIndex.
-    i.e. use the full confusion matrix instead of averaging many step-level JaccardIndex.
-    Default parameters of JaccardIndex are used except for absent_score set to 1.0 and none reduction.
-
-    """
-
-    def __init__(
-        self,
-        num_classes: int,
-        class_of_interest_idx: int,
-        ignore_index: Optional[int] = None,
-        absent_score: float = 1.0,
-        threshold: float = 0.5,
-        reduction: str = "none",
-        compute_on_step: bool = True,
-        dist_sync_on_step: bool = False,
-        process_group: Optional[Any] = None,
-    ) -> None:
-        self.class_of_interest_idx = class_of_interest_idx
-
-        super().__init__(
-            num_classes,
-            ignore_index,
-            absent_score,
-            threshold,
-            reduction,
-            compute_on_step,
-            dist_sync_on_step,
-            process_group,
-        )
-
-    def compute(self):
-        """Computes intersection over union (JaccardIndex)"""
-
-        iou_no_reduction = _jaccard_from_confmat(
-            self.confmat,
-            self.num_classes,
-            self.ignore_index,
-            self.absent_score,
-            self.reduction,
-        )
-        class_of_interest_iou = iou_no_reduction[self.class_of_interest_idx]
-        return class_of_interest_iou
diff --git a/myria3d/metrics/iou.py b/myria3d/metrics/iou.py
new file mode 100644
index 00000000..f9281b37
--- /dev/null
+++ b/myria3d/metrics/iou.py
@@ -0,0 +1,21 @@
+from torch import Tensor
+
+EPSILON = 1e-8
+
+
+def iou(confmat: Tensor):
+    """Computes the Intersection over Union (IoU) of each class
+    from a confusion matrix.
+
+    Return:
+        Tensor of per-class IoU values; classes absent from both
+        targets and predictions get an IoU close to 0.
+    """
+    true_positives_and_false_positives = confmat.sum(dim=0)
+    true_positives_and_false_negatives = confmat.sum(dim=1)
+    true_positives = confmat.diag()
+    union = (
+        true_positives_and_false_negatives + true_positives_and_false_positives - true_positives
+    )
+    iou = EPSILON + true_positives / (union + EPSILON)
+    return iou
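A quick sanity check of the new helper with toy values (rows are targets, columns are predictions, following the torchmetrics confusion-matrix convention):

```python
import torch

from myria3d.metrics.iou import iou

# Class 0: TP=2, union=3 -> 2/3; class 1: TP=1, union=2 -> 1/2.
confmat = torch.tensor([[2.0, 0.0], [1.0, 1.0]])
print(iou(confmat))  # ~tensor([0.6667, 0.5000])
```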
diff --git a/myria3d/models/model.py b/myria3d/models/model.py
index 0f837f19..67c2752d 100755
--- a/myria3d/models/model.py
+++ b/myria3d/models/model.py
@@ -1,11 +1,11 @@
-from typing import Optional
-
 import torch
 from pytorch_lightning import LightningModule
 from torch import nn
 from torch_geometric.data import Batch
 from torch_geometric.nn import knn_interpolate
+from torchmetrics.classification import MulticlassJaccardIndex

+from myria3d.metrics.iou import iou
 from myria3d.models.modules.pyg_randla_net import PyGRandLANet
 from myria3d.utils import utils
@@ -60,21 +60,29 @@ def __init__(self, **kwargs):

         # this line ensures params passed to LightningModule will be saved to ckpt
         # it also allows to access params with 'self.hparams' attribute
-        self.save_hyperparameters()
+        self.save_hyperparameters(ignore=["criterion"])

-        neural_net_class = get_neural_net_class(self.hparams.neural_net_class_name)
-        self.model = neural_net_class(**self.hparams.neural_net_hparams)
+        neural_net_class = get_neural_net_class(kwargs.get("neural_net_class_name"))
+        self.model = neural_net_class(**kwargs.get("neural_net_hparams"))

         self.softmax = nn.Softmax(dim=1)
-        self.criterion = self.hparams.criterion
+        self.criterion = kwargs.get("criterion")
+
+    def on_fit_start(self) -> None:
+        self.criterion = self.criterion.to(self.device)
+        self.train_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device)
+        self.val_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device)
+
+    def on_test_start(self) -> None:
+        self.test_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device)

-    def setup(self, stage: Optional[str]) -> None:
-        """Setup stage: prepare to compute IoU and loss."""
-        if stage == "fit":
-            self.train_iou = self.hparams.iou()
-            self.val_iou = self.hparams.iou()
-        if stage == "test":
-            self.test_iou = self.hparams.iou()
+    def log_all_class_ious(self, confmat, phase: str):
+        ious = iou(confmat)
+        for class_iou, class_name in zip(ious, self.hparams.classification_dict.values()):
+            metric_name = f"{phase}/iou_CLASS_{class_name}"
+            self.log(
+                metric_name, class_iou, on_step=False, on_epoch=True, metric_attribute=metric_name
+            )

     def forward(self, batch: Batch) -> torch.Tensor:
         """Forward pass of neural network.
@@ -114,11 +122,6 @@ def forward(self, batch: Batch) -> torch.Tensor:
         targets = batch.copies["transformed_y_copy"].to(logits.device)
         return targets, logits

-    def on_fit_start(self) -> None:
-        """On fit start: get the experiment for easier access."""
-        self.experiment = self.logger.experiment[0]
-        self.criterion = self.criterion.to(self.device)
-
     def training_step(self, batch: Batch, batch_idx: int) -> dict:
         """Training step.
@@ -140,15 +143,14 @@ def training_step(self, batch: Batch, batch_idx: int) -> dict:
         with torch.no_grad():
             preds = torch.argmax(logits.detach(), dim=1)
         self.train_iou(preds, targets)
-        self.log(
-            "train/iou",
-            self.train_iou,
-            on_step=True,
-            on_epoch=True,
-            prog_bar=True,
-        )
+        self.log("train/iou", self.train_iou, on_step=True, on_epoch=True, prog_bar=True)
         return {"loss": loss, "logits": logits, "targets": targets}

+    def on_train_epoch_end(self) -> None:
+        self.train_iou.compute()
+        self.log_all_class_ious(self.train_iou.confmat, "train")
+        self.train_iou.reset()
+
     def validation_step(self, batch: Batch, batch_idx: int) -> dict:
         """Validation step.
@@ -182,6 +184,8 @@ def on_validation_epoch_end(self) -> None:

         """
         self.val_iou.compute()
+        self.log_all_class_ious(self.val_iou.confmat, "val")
+        self.val_iou.reset()

     def test_step(self, batch: Batch, batch_idx: int):
         """Test step.
@@ -202,16 +206,21 @@ def test_step(self, batch: Batch, batch_idx: int):
         preds = torch.argmax(logits, dim=1)
         self.test_iou = self.test_iou.to(preds.device)
         self.test_iou(preds, targets)
-        self.log(
-            "test/iou",
-            self.test_iou,
-            on_step=False,
-            on_epoch=True,
-            prog_bar=True,
-        )
+        self.log("test/iou", self.test_iou, on_step=False, on_epoch=True, prog_bar=True)
         return {"loss": loss, "logits": logits, "targets": targets}

+    def on_test_epoch_end(self) -> None:
+        """At the end of the test epoch, compute per-class IoUs.
+
+        They are derived from the confusion matrix accumulated
+        by the test IoU metric over the epoch.
+
+        """
+        self.test_iou.compute()
+        self.log_all_class_ious(self.test_iou.confmat, "test")
+        self.test_iou.reset()
+
     def predict_step(self, batch: Batch) -> dict:
         """Prediction step.
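The refactor boils down to the pattern sketched below (assumed torchmetrics 0.11 behavior, not repo code): a single `MulticlassJaccardIndex` per phase accumulates one confusion matrix, from which the mean IoU and every per-class IoU are derived at epoch end.

```python
import torch
from torchmetrics.classification import MulticlassJaccardIndex

from myria3d.metrics.iou import iou

metric = MulticlassJaccardIndex(num_classes=3)
metric.update(preds=torch.tensor([0, 1, 1, 2]), target=torch.tensor([0, 1, 2, 2]))
mean_iou = metric.compute()      # macro-averaged IoU, logged as e.g. "val/iou"
per_class = iou(metric.confmat)  # per-class IoUs, logged as "val/iou_CLASS_<name>"
```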
diff --git a/myria3d/pctl/datamodule/hdf5.py b/myria3d/pctl/datamodule/hdf5.py
index 167744df..eb0e6da4 100644
--- a/myria3d/pctl/datamodule/hdf5.py
+++ b/myria3d/pctl/datamodule/hdf5.py
@@ -42,6 +42,8 @@ def __init__(
         transforms: Optional[Dict[str, TRANSFORMS_LIST]] = None,
         **kwargs,
     ):
+        super().__init__()
+
         self.split_csv_path = split_csv_path
         self.data_dir = data_dir
         self.hdf5_file_path = hdf5_file_path
@@ -86,7 +88,7 @@ def eval_transform(self) -> CustomCompose:
     def predict_transform(self) -> CustomCompose:
         return CustomCompose(self.preparation_predict_transform + self.normalization_transform)

-    def prepare_data(self, stage: Optional[str] = None):
+    def prepare_data_per_node(self, stage: Optional[str] = None):
         """Prepare dataset containing train, val, test data."""

         if stage in ["fit", "test"] or stage is None:
diff --git a/myria3d/predict.py b/myria3d/predict.py
index bc71bfd5..7c50219e 100644
--- a/myria3d/predict.py
+++ b/myria3d/predict.py
@@ -5,9 +5,11 @@
 import hydra
 import torch
 from omegaconf import DictConfig
-from pytorch_lightning import LightningDataModule, LightningModule
+from pytorch_lightning import LightningDataModule
 from tqdm import tqdm

+from myria3d.models.model import Model
+
 sys.path.append(osp.dirname(osp.dirname(__file__)))
 from myria3d.models.interpolation import Interpolator  # noqa
 from myria3d.utils import utils  # noqa
@@ -44,9 +46,7 @@ def predict(config: DictConfig) -> str:

     # Do not require gradient for faster predictions
     torch.set_grad_enabled(False)
-
-    model: LightningModule = hydra.utils.instantiate(config.model)
-    model = model.load_from_checkpoint(config.predict.ckpt_path)
+    model = Model.load_from_checkpoint(config.predict.ckpt_path)
     device = utils.define_device_from_config_param(config.predict.gpus)
     model.to(device)
     model.eval()
@@ -67,5 +67,7 @@ def predict(config: DictConfig) -> str:
         logits = model.predict_step(batch)["logits"]
         itp.store_predictions(logits, batch.idx_in_original_cloud)

-    out_f = itp.reduce_predictions_and_save(config.predict.src_las, config.predict.output_dir, config.datamodule.get("epsg"))
+    out_f = itp.reduce_predictions_and_save(
+        config.predict.src_las, config.predict.output_dir, config.datamodule.get("epsg")
+    )
     return out_f
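predict.py can drop the Hydra instantiation because `save_hyperparameters()` in `Model.__init__` embeds the constructor arguments in the checkpoint, letting Lightning rebuild the module from the file alone. A sketch (placeholder path):

```python
from myria3d.models.model import Model

model = Model.load_from_checkpoint("/path/to/lightning_model.ckpt")
model.eval()
```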
diff --git a/myria3d/train.py b/myria3d/train.py
index e364990e..1bac4dae 100755
--- a/myria3d/train.py
+++ b/myria3d/train.py
@@ -19,7 +19,7 @@
     Trainer,
     seed_everything,
 )
-from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning.loggers.logger import Logger

 from myria3d.models.model import Model
 from myria3d.utils import utils
@@ -87,7 +87,7 @@ def train(config: DictConfig) -> Trainer:
             callbacks.append(hydra.utils.instantiate(cb_conf))

     # Init lightning loggers
-    logger: List[LightningLoggerBase] = []
+    logger: List[Logger] = []
     if "logger" in config:
         for lg_conf in config.logger.values():
             if "_target_" in lg_conf:
@@ -111,7 +111,7 @@ def train(config: DictConfig) -> Trainer:
     task_name = config.task.get("task_name")
     if task_name == TASK_NAMES.FIT.value:
-        if config.trainer.auto_lr_find:
+        if config.task.auto_lr_find:
             log.info("Finding best lr with auto_lr_find!")
             # Run learning rate finder
             lr_finder = trainer.tuner.lr_find(
diff --git a/myria3d/utils/utils.py b/myria3d/utils/utils.py
index 141aa5d4..0364fcc0 100755
--- a/myria3d/utils/utils.py
+++ b/myria3d/utils/utils.py
@@ -113,7 +113,7 @@ def log_hyperparameters(
     datamodule: pl.LightningDataModule,
     trainer: pl.Trainer,
     callbacks: List[pl.Callback],
-    logger: List[pl.loggers.LightningLoggerBase],
+    logger: List[pl.loggers.Logger],
 ) -> None:
     """This method controls which parameters from Hydra config are saved by Lightning loggers.
diff --git a/package_metadata.yaml b/package_metadata.yaml
index c84bde5b..a861d216 100644
--- a/package_metadata.yaml
+++ b/package_metadata.yaml
@@ -1,4 +1,4 @@
-__version__: "3.6.1"
+__version__: "3.7.0"
 __name__: "myria3d"
 __url__: "https://github.com/IGNF/myria3d"
 __description__: "Deep Learning for the Semantic Segmentation of Aerial Lidar Point Clouds"
diff --git a/run.py b/run.py
index e9731a7b..4e47aef8 100755
--- a/run.py
+++ b/run.py
@@ -22,7 +22,7 @@
 TASK_NAME_DETECTION_STRING = "task.task_name="
 DEFAULT_DIRECTORY = "trained_model_assets/"
-DEFAULT_CONFIG_FILE = "proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml"
+DEFAULT_CONFIG_FILE = "proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml"
 DEFAULT_CHECKPOINT = "proto151_V2.0_epoch_100_Myria3DV3.1.0.ckpt"
 DEFAULT_ENV = "placeholder.env"

@@ -48,6 +48,7 @@ def launch_train(
     # Imports should be nested inside @hydra.main to optimize tab completion
     # Read more here: https://github.com/facebookresearch/hydra/issues/934
     from myria3d.train import train
+
     utils.extras(config)

     # Pretty print config using Rich library
diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py
index c0d1a7b0..0ef0c388 100644
--- a/tests/myria3d/test_train_and_predict.py
+++ b/tests/myria3d/test_train_and_predict.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest

+from lightning.pytorch.accelerators import find_usable_cuda_devices
+
 from myria3d.pctl.dataset.toy_dataset import TOY_LAS_DATA
 from myria3d.pctl.dataset.utils import pdal_read_las_array

@@ -59,11 +61,14 @@ def test_FrenchLidar_RandLaNetDebug_with_gpu(toy_dataset_hdf5_path, tmpdir_factory):
     tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths(
         toy_dataset_hdf5_path, tmpdir
     )
-    # We will always use the first GPU id for tests, because it always exists if there are some GPUs.
-    # Attention to concurrency with other processes using the GPU when running tests.
-    gpu_id = 0
+    gpu_id = find_usable_cuda_devices(1)
     cfg_one_epoch = make_default_hydra_cfg(
-        overrides=["experiment=RandLaNetDebug", f"trainer.gpus=[{gpu_id}]"] + tmp_paths_overrides
+        overrides=[
+            "experiment=RandLaNetDebug",
+            "trainer.accelerator=gpu",
+            f"trainer.devices={gpu_id}",  # find_usable_cuda_devices already returns a list, e.g. [0]
+        ]
+        + tmp_paths_overrides
     )
     train(cfg_one_epoch)
@@ -110,7 +115,10 @@ def test_command_without_epsg(one_epoch_trained_RandLaNet_checkpoint, tmpdir):
         "+predict.interpolator.probas_to_save=[building,unclassified]",
         "task.task_name=predict",
     ]
-    assert "No EPSG provided, neither in the lidar file or as parameter" in run_hydra_decorated_command_with_return_error(command)
+    assert (
+        "No EPSG provided, neither in the lidar file or as parameter"
+        in run_hydra_decorated_command_with_return_error(command)
+    )


 def test_predict_on_single_point_cloud(one_epoch_trained_RandLaNet_checkpoint, tmpdir):
@@ -177,10 +185,7 @@ def test_run_test_with_trained_model_on_toy_dataset_on_cpu(
     one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir
 ):
     _run_test_right_after_training(
-        one_epoch_trained_RandLaNet_checkpoint,
-        toy_dataset_hdf5_path,
-        tmpdir,
-        "null",
+        one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, "cpu"
     )


@@ -189,18 +194,12 @@ def test_run_test_with_trained_model_on_toy_dataset_on_gpu(
     one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir
 ):
     _run_test_right_after_training(
-        one_epoch_trained_RandLaNet_checkpoint,
-        toy_dataset_hdf5_path,
-        tmpdir,
-        "[0]",
+        one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, "gpu"
     )


 def _run_test_right_after_training(
-    one_epoch_trained_RandLaNet_checkpoint,
-    toy_dataset_hdf5_path,
-    tmpdir,
-    trainer_gpus,
+    one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, accelerator
 ):
     """Run test using the model that was just trained for one epoch.
@@ -217,11 +216,13 @@
     tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths(
         toy_dataset_hdf5_path, tmpdir
     )
+    devices = find_usable_cuda_devices(1) if accelerator == "gpu" else 1
     cfg_test_using_trained_model = make_default_hydra_cfg(
         overrides=[
             "experiment=test",  # sets task.task_name to "test"
             f"model.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}",
-            f"trainer.gpus={trainer_gpus}",
+            f"trainer.devices={devices}",
+            f"trainer.accelerator={accelerator}",
         ]
         + tmp_paths_overrides
     )
diff --git a/tests/runif.py b/tests/runif.py
index 8f17699e..7a2ac5f6 100644
--- a/tests/runif.py
+++ b/tests/runif.py
@@ -1,5 +1,5 @@
 import pytest
-import torch
+from lightning.pytorch.accelerators import find_usable_cuda_devices

 """
 Simplified from:
@@ -35,8 +35,12 @@ def __new__(
         reasons = []

         if min_gpus:
-            conditions.append(torch.cuda.device_count() < min_gpus)
-            reasons.append(f"GPUs>={min_gpus}")
+            try:
+                find_usable_cuda_devices(min_gpus)
+                conditions.append(False)
+            except (ValueError, RuntimeError):
+                conditions.append(True)
+            reasons.append(f"GPUs>={min_gpus}")

         reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
         return pytest.mark.skipif(
diff --git a/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml b/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml
similarity index 98%
rename from trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml
rename to trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml
index bfbc1e9a..a9970595 100644
--- a/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml
+++ b/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml
@@ -5,20 +5,14 @@ print_config: true
 ignore_warnings: true
 trainer:
   _target_: pytorch_lightning.Trainer
-  gpus:
-  - 0
-  - 1
   min_epochs: 100
   max_epochs: 150
   log_every_n_steps: 1
-  weights_summary: null
-  progress_bar_refresh_rate: 1
-  auto_lr_find: false
   num_sanity_val_steps: 2
   accumulate_grad_batches: 3
   strategy: ddp
   accelerator: gpu
-  num_processes: 2
+  devices: 2
 datamodule:
   transforms:
     preparations:
@@ -264,6 +258,7 @@ logger:
     disabled: false
 task:
   task_name: predict
+  auto_lr_find: false
 predict:
   src_las: /path/to/input.las
   output_dir: /path/to/output_dir/
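A note on the test changes above (a sketch mirroring the exceptions caught in tests/runif.py): `find_usable_cuda_devices` returns a list of device indices and raises when it cannot find the requested number of free GPUs, which is what the updated `RunIf` marker relies on to skip GPU tests.

```python
from lightning.pytorch.accelerators import find_usable_cuda_devices

try:
    devices = find_usable_cuda_devices(1)  # e.g. [0]
except (ValueError, RuntimeError):
    devices = []  # no usable GPU: GPU-only tests get skipped
```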