diff --git a/CHANGELOG.md b/CHANGELOG.md
index 189a25f3..e4441000 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 # main
 
+Save optimized thresholds as YAML instead of pickle to make them easier to read
+
 ### 1.10.2
 - Add support for metadata propagation through compound pdal pipelines:
   - fix epsg propagation
diff --git a/configs/building_validation/optimization/default.yaml b/configs/building_validation/optimization/default.yaml
index 2ac65eec..3bf1ac23 100644
--- a/configs/building_validation/optimization/default.yaml
+++ b/configs/building_validation/optimization/default.yaml
@@ -13,7 +13,7 @@ paths:
   group_info_pickle_path: ${.results_output_dir}/group_info.pickle
   prepared_las_dir: ${.results_output_dir}/prepared/
   updated_las_dir: ${.results_output_dir}/updated/
-  building_validation_thresholds_pickle: ${.results_output_dir}/optimized_thresholds.pickle # Wher
+  building_validation_thresholds: ${.results_output_dir}/optimized_thresholds.yaml # Wher
 
 # CLASSIFICATION CODES of a dataset which was inspected
 # and labeled post TerraSolid macro
diff --git a/docs/source/guides/thresholds_optimization.md b/docs/source/guides/thresholds_optimization.md
index a2efc220..acac645c 100644
--- a/docs/source/guides/thresholds_optimization.md
+++ b/docs/source/guides/thresholds_optimization.md
@@ -37,7 +37,7 @@ building_validation.optimization.paths.results_output_dir=[path/to/save/results]
 
 ### Evaluation of optimized thresholds on a test set
 
-Once an optimal solution was found, you may want to evaluate the decision process on unseen data to evaluate generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision thresholds from the previous step, and specify a different `results_output_dir` so that prepared data of test and val test are not pooled together.
+Once an optimal solution has been found, you may want to evaluate the decision process on unseen data to assess generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the decision thresholds file (a YAML file) from the previous step, and specify a different `results_output_dir` so that prepared data of the test and val sets are not pooled together.
 
 ```bash
@@ -48,7 +48,7 @@ python lidar_prod/run.py \
 building_validation.optimization.todo='prepare+evaluate+update' \
 building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] \
 building_validation.optimization.paths.results_output_dir=[path/to/save/results] \
-building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
+building_validation.optimization.paths.building_validation_thresholds=[path/to/optimized_thresholds.yaml]
 ```
 
 ### Utils
@@ -57,4 +57,4 @@ Debug mode: to run on a single file during development, add a `+building_validat
 
 Reference:
 
-- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017)).
+- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017).
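Note: `thresholds.dump` (added below in `lidar_prod/tasks/building_validation.py`) writes the eight `thresholds` fields as a flat YAML mapping, so the saved file can be inspected and edited by hand. A sketch of what `optimized_thresholds.yaml` might look like; the field names come from the `thresholds` dataclass in this diff, while the values are purely illustrative:

```yaml
# Illustrative optimized_thresholds.yaml
# (field names from the thresholds dataclass; values are examples, not tuned results)
min_confidence_confirmation: 0.6
min_frac_confirmation: 0.5
min_frac_confirmation_factor_if_bd_uni_overlay: 0.8
min_uni_db_overlay_frac: 0.25
min_confidence_refutation: 0.7
min_frac_refutation: 0.5
min_entropy_uncertainty: 0.9
min_frac_entropy_uncertain: 0.4
```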
diff --git a/lidar_prod/tasks/building_validation.py b/lidar_prod/tasks/building_validation.py
index 7952624c..10872b48 100644
--- a/lidar_prod/tasks/building_validation.py
+++ b/lidar_prod/tasks/building_validation.py
@@ -9,6 +9,7 @@
 import geopandas
 import numpy as np
 import pdal
+import yaml
 from tqdm import tqdm
 
 from lidar_prod.tasks.utils import (
@@ -378,3 +379,14 @@ class thresholds:
     min_frac_refutation: float
     min_entropy_uncertainty: float
     min_frac_entropy_uncertain: float
+
+    def dump(self, filename: str):
+        with open(filename, "w") as f:
+            yaml.safe_dump(self.__dict__, f)
+
+    @staticmethod
+    def load(filename: str):
+        with open(filename, "r") as f:
+            data = yaml.safe_load(f)
+
+        return thresholds(**data)
diff --git a/lidar_prod/tasks/building_validation_optimization.py b/lidar_prod/tasks/building_validation_optimization.py
index fcffb625..1f39a42d 100644
--- a/lidar_prod/tasks/building_validation_optimization.py
+++ b/lidar_prod/tasks/building_validation_optimization.py
@@ -185,22 +185,22 @@ def evaluate(self) -> dict:
         """
         clusters = self._load_clusters()
-        self._set_thresholds_from_pickle_if_available()
+        self._set_thresholds_from_file_if_available()
         decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
         mts_gt = np.array([c.target for c in clusters])
         metrics_dict = self.evaluate_decisions(mts_gt, decisions)
         log.info(f"\n Results:\n{self._get_results_logs_str(metrics_dict)}")
         return metrics_dict
 
-    def _set_thresholds_from_pickle_if_available(self):
+    def _set_thresholds_from_file_if_available(self):
         try:
-            with open(self.paths.building_validation_thresholds_pickle, "rb") as f:
-                self.bv.thresholds = pickle.load(f)
+            self.bv.thresholds = thresholds.load(self.paths.building_validation_thresholds)
+
         except FileNotFoundError:
             warnings.warn(
                 "Using default thresholds from hydra config to perform decisions. "
" - "You may want to specify different thresholds via a pickled object by specifying " - "building_validation.optimization.paths.building_validation_thresholds_pickle", + "You may want to specify different thresholds via a yaml file by specifying " + "building_validation.optimization.paths.building_validation_thresholds", UserWarning, ) @@ -213,7 +213,7 @@ def update(self): """ log.info(f"Updated las will be saved in {self.paths.results_output_dir}") - self._set_thresholds_from_pickle_if_available() + self._set_thresholds_from_file_if_available() for prepared_las_path, target_las_path in tqdm( zip(self.prepared_las_filepaths, self.out_las_filepaths), total=len(self.prepared_las_filepaths), @@ -354,11 +354,10 @@ def _select_best_rules(self, study): best_rules = thresholds(**best.params) return best_rules - def _dump_best_rules(self, best_trial_params): - """Serializes best thresholds.""" - with open(self.paths.building_validation_thresholds_pickle, "wb") as f: - pickle.dump(best_trial_params, f) - log.info(f"Pickled best params to {self.paths.building_validation_thresholds_pickle}") + def _dump_best_rules(self, best_trial_params: thresholds): + """Saves best thresholds to a yaml file.""" + best_trial_params.dump(self.paths.building_validation_thresholds) + log.info(f"Saved best params to {self.paths.building_validation_thresholds}") def _dump_clusters(self, clusters): """Serializes the list of cluster-level information objects.""" diff --git a/tests/lidar_prod/tasks/test_building_validation.py b/tests/lidar_prod/tasks/test_building_validation.py index 36fe943b..a489bbc7 100644 --- a/tests/lidar_prod/tasks/test_building_validation.py +++ b/tests/lidar_prod/tasks/test_building_validation.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from lidar_prod.tasks.building_validation import BuildingValidator +from lidar_prod.tasks.building_validation import BuildingValidator, thresholds from lidar_prod.tasks.utils import BDUniConnectionParams, get_las_data_from_las from tests.conftest import ( check_expected_classification, @@ -171,3 +171,24 @@ def test_run(hydra_cfg): dims.candidate_buildings_flag, ], ) + + +def test_thresholds(): + dump_file = str(TMP_DIR / "threshold_dump.yml") + + th = thresholds( + min_confidence_confirmation=0.1, + min_frac_confirmation=0.2, + min_frac_confirmation_factor_if_bd_uni_overlay=0.3, + min_uni_db_overlay_frac=0.4, + min_confidence_refutation=0.5, + min_frac_refutation=0.6, + min_entropy_uncertainty=0.7, + min_frac_entropy_uncertain=0.8, + ) + + th.dump(dump_file) + + th1 = th.load(dump_file) + + assert th1 == th diff --git a/tests/lidar_prod/tasks/test_building_validation_optimization.py b/tests/lidar_prod/tasks/test_building_validation_optimization.py new file mode 100644 index 00000000..daebad20 --- /dev/null +++ b/tests/lidar_prod/tasks/test_building_validation_optimization.py @@ -0,0 +1,194 @@ +import os +import os.path as osp +import shutil +from pathlib import Path + +import hydra +import numpy as np +import pytest + +from lidar_prod.tasks.building_validation import thresholds +from lidar_prod.tasks.building_validation_optimization import ( + BuildingValidationOptimizer, +) +from lidar_prod.tasks.utils import BDUniConnectionParams +from tests.conftest import pdal_read_las_array + +"""We test the building validation optimizer against two LAS: + +These datasets must have the right classification codes, i.e. the ones defined in +buildings_correction_labels. + +WARNING: The large LAS cannot be versionned by git. 
+pytest expects the test to fail.
+This is to enable a shallower run of these tests without the file.
+
+"""
+
+TMP_DIR = Path("tmp/lidar_prod/tasks/building_validation_optimization")
+
+
+# Small LAS, for which we optimize thresholds and reach perfect validation,
+# to quickly check optimization logic.
+LAS_SUBSET_FILE = "tests/files/870000_6618000.subset.postIA.corrected.las"
+SUBSET_EXPECTED_METRICS = {
+    "exact": {
+        "groups_count": 15,
+        "group_no_buildings": 0.4,
+    },
+    "min": {
+        "p_auto": 1.0,
+        "recall": 1.0,
+        "precision": 1.0,
+    },
+}
+# Large LAS, for which we evaluate performance, to control that there was no regression in terms of
+# automation/precision/recall of building validation.
+LAS_LARGE_FILE = "tests/files/large/V0.5_792000_6272000.las"
+LARGE_EXPECTED_METRICS = {
+    "exact": {
+        "groups_count": 1493,
+        "group_no_buildings": 0.149,
+        "group_building": 0.847,
+    },
+    "min": {
+        "p_auto": 0.94,
+        "recall": 0.99,
+        "precision": 0.94,
+    },
+}
+
+# Relative tolerance when comparing metrics to their expected value for large LAS.
+# i.e. resulting metrics are >= (1-tolerance) * expected metrics for performance indicators.
+RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS = 0.05
+
+
+def test_BVOptimization_on_subset(hydra_cfg):
+    out_dir = str(TMP_DIR / "subset")
+    # Optimization output (thresholds and prepared/updated LAS files) saved to out_dir
+    hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir
+
+    # We isolate the input file in a subdir, and prepare it for optimization
+    input_las_dir = osp.join(out_dir, "inputs/")
+    hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
+    os.makedirs(input_las_dir, exist_ok=False)
+    src_las_copy_path = osp.join(input_las_dir, "copy.las")
+    shutil.copy(LAS_SUBSET_FILE, src_las_copy_path)
+
+    # Check that a full optimization run can pass successfully
+    bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
+        hydra_cfg.building_validation.optimization
+    )
+    bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
+        hydra_cfg.bd_uni_connection_params
+    )
+    bvo.bv.bd_uni_connection_params = bd_uni_connection_params
+    bvo.run()
+
+    # Check that the thresholds are saved in a YAML file successfully
+    th_yaml = hydra_cfg.building_validation.optimization.paths.building_validation_thresholds
+    assert os.path.isfile(th_yaml)
+    assert isinstance(thresholds.load(th_yaml), thresholds)
+
+    # Assert that a prepared and an updated file are generated in the temporary dir
+    # in subfolders.
+    assert os.path.isfile(osp.join(out_dir, "prepared", osp.basename(src_las_copy_path)))
+    updated_las_path = osp.join(out_dir, "updated", osp.basename(src_las_copy_path))
+    assert os.path.isfile(updated_las_path)
+
+    # Check the output of the evaluate method. Note that it uses the
+    # prepared data and the thresholds from the previous run
+    metrics_dict = bvo.evaluate()
+    print(metrics_dict)
+    # Assert inclusion
+    assert SUBSET_EXPECTED_METRICS["exact"].items() <= metrics_dict.items()
+    # Assert that each expected minimum is reached
+    for k, v in SUBSET_EXPECTED_METRICS["min"].items():
+        assert v <= metrics_dict[k]
+    # Update classification dimension and check if the codes are the expected ones.
+    bvo.bv.use_final_classification_codes = True
+    bvo.update()
+    assert os.path.isfile(updated_las_path)
+    arr, _ = pdal_read_las_array(updated_las_path, hydra_cfg.data_format.epsg)
+    # Check that we have either 1/2 (ground/unclassified), or one of
+    # the final classification codes of the module.
+    final_codes = hydra_cfg.data_format.codes.building.final
+    expected_codes = {
+        1,
+        2,
+        final_codes.building,
+        final_codes.not_building,
+        final_codes.unsure,
+    }
+    actual_codes = {*np.unique(arr["Classification"])}
+    assert actual_codes.issubset(expected_codes)
+
+
+@pytest.mark.slow()
+def test_BVOptimization_on_large_file(hydra_cfg):
+
+    if not os.path.isfile(LAS_LARGE_FILE):
+        pytest.xfail(reason=f"File {LAS_LARGE_FILE} is not present in environment.")
+
+    out_dir = str(TMP_DIR / "large_file")
+
+    # Optimization output (thresholds and prepared/updated LAS files) saved to out_dir
+    hydra_cfg.building_validation.optimization.paths.results_output_dir = out_dir
+
+    # We isolate the input file in a subdir, and prepare it for optimization
+    input_las_dir = osp.join(out_dir, "inputs/")
+    hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
+    os.makedirs(input_las_dir, exist_ok=False)
+    src_las_copy_path = osp.join(input_las_dir, "copy.las")
+    shutil.copy(LAS_LARGE_FILE, src_las_copy_path)
+
+    # Check that a full optimization run can pass successfully
+    bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
+        hydra_cfg.building_validation.optimization
+    )
+
+    bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
+        hydra_cfg.bd_uni_connection_params
+    )
+    bvo.bv.bd_uni_connection_params = bd_uni_connection_params
+
+    bvo.prepare()
+    metrics_dict = bvo.evaluate()
+    print(metrics_dict)
+
+    exact_expected_val = LARGE_EXPECTED_METRICS["exact"]
+    for k in exact_expected_val:
+        assert (
+            pytest.approx(exact_expected_val[k], RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS)
+            == metrics_dict[k]
+        )
+    min_expected_val = LARGE_EXPECTED_METRICS["min"]
+    for k in min_expected_val:
+        assert (
+            (1 - RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS) * min_expected_val[k]
+        ) <= metrics_dict[k]
+
+
+# All expected metrics for reference:
+"""
+    groups_count=1493
+    group_unsure=0.00402
+    group_no_buildings=0.149
+    group_building=0.847
+    p_auto=0.889
+    p_unsure=0.111
+    p_refute=0.0924
+    p_confirm=0.797
+    a_refute=0.899
+    a_confirm=0.976
+    precision=0.98
+    recall=0.99
+    Confusion Matrix
+    [[   2    1    3]
+     [  74  124   25]
+     [  89   13 1162]]
+    Confusion Matrix (normalized)
+    [[0.333 0.167 0.5  ]
+     [0.332 0.556 0.112]
+     [0.07  0.01  0.919]]
+"""
diff --git a/tests/lidar_prod/test_optimization.py b/tests/lidar_prod/test_optimization.py
deleted file mode 100644
index b93d2b53..00000000
--- a/tests/lidar_prod/test_optimization.py
+++ /dev/null
@@ -1,184 +0,0 @@
-import os
-import os.path as osp
-import shutil
-import tempfile
-
-import hydra
-import numpy as np
-import pytest
-
-from lidar_prod.tasks.building_validation_optimization import (
-    BuildingValidationOptimizer,
-)
-from lidar_prod.tasks.utils import BDUniConnectionParams
-from tests.conftest import pdal_read_las_array
-
-"""We test the building validation optimizer against two LAS:
-
-These datasets must have the right classification codes, i.e. the ones defined in
-buildings_correction_labels.
-
-WARNING: The large LAS cannot be versionned by git. If it is absent from environment,
-pytest expects the test to fail.
-This is to enable a shallower run of these tests without the file.
-
-"""
-
-# Small LAS, for which we optimize thresholds and reach perfect validation,
-# to quickly check optimization logic.
-LAS_SUBSET_FILE = "tests/files/870000_6618000.subset.postIA.corrected.las"
-SUBSET_EXPECTED_METRICS = {
-    "exact": {
-        "groups_count": 15,
-        "group_no_buildings": 0.4,
-    },
-    "min": {
-        "p_auto": 1.0,
-        "recall": 1.0,
-        "precision": 1.0,
-    },
-}
-# Large LAS, for which we evaluate performance, to control that there was no regression in terms of
-# automation/precision/recall of building validation.
-LAS_LARGE_FILE = "tests/files/large/V0.5_792000_6272000.las"
-LARGE_EXPECTED_METRICS = {
-    "exact": {
-        "groups_count": 1493,
-        "group_no_buildings": 0.149,
-        "group_building": 0.847,
-    },
-    "min": {
-        "p_auto": 0.94,
-        "recall": 0.99,
-        "precision": 0.94,
-    },
-}
-
-# Relative tolerance when comparing metrics to their expected value for large LAS.
-# i.e. resulting metrics are >= (1-tolerance) * expected metrics for performance indicators.
-RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS = 0.05
-
-
-def test_BVOptimization_on_subset(hydra_cfg):
-    with tempfile.TemporaryDirectory() as td:
-        # Optimization output (thresholds and prepared/updated LASfiles) saved to td
-        hydra_cfg.building_validation.optimization.paths.results_output_dir = td
-
-        # We isolate the input file in a subdir, and prepare it for optimization
-        input_las_dir = osp.join(td, "inputs/")
-        hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
-        os.makedirs(input_las_dir, exist_ok=False)
-        src_las_copy_path = osp.join(input_las_dir, "copy.las")
-        shutil.copy(LAS_SUBSET_FILE, src_las_copy_path)
-
-        # Check that a full optimization run can pass successfully
-        bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
-            hydra_cfg.building_validation.optimization
-        )
-        bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
-            hydra_cfg.bd_uni_connection_params
-        )
-        bvo.bv.bd_uni_connection_params = bd_uni_connection_params
-        bvo.run()
-
-        # Assert that a prepared and an updated file are generated in the temporary dir
-        # in subfolders.
-        assert os.path.isfile(osp.join(td, "prepared", osp.basename(src_las_copy_path)))
-        updated_las_path = osp.join(td, "updated", osp.basename(src_las_copy_path))
-        assert os.path.isfile(updated_las_path)
-
-        # Check the output of the evaluate method. Note that it uses the
-        # prepared data and the threshold from previous run
-        metrics_dict = bvo.evaluate()
-        print(metrics_dict)
-        # Assert inclusion
-        assert SUBSET_EXPECTED_METRICS["exact"].items() <= metrics_dict.items()
-        # Assert <= with a relative tolerance
-        for k, v in SUBSET_EXPECTED_METRICS["min"].items():
-            v <= metrics_dict[k]
-        # Update classification dimension and check if the codes are the expected ones.
-        bvo.bv.use_final_classification_codes = True
-        bvo.update()
-        assert os.path.isfile(updated_las_path)
-        arr, _ = pdal_read_las_array(updated_las_path, hydra_cfg.data_format.epsg)
-        # Check that we have either 1/2 (ground/unclassified), or one of
-        # the final classification code of the module.
-        final_codes = hydra_cfg.data_format.codes.building.final
-        expected_codes = {
-            1,
-            2,
-            final_codes.building,
-            final_codes.not_building,
-            final_codes.unsure,
-        }
-        actual_codes = {*np.unique(arr["Classification"])}
-        assert actual_codes.issubset(expected_codes)
-
-
-@pytest.mark.slow()
-def test_BVOptimization_on_large_file(hydra_cfg):
-
-    if not os.path.isfile(LAS_LARGE_FILE):
-        pytest.xfail(reason=f"File {LAS_LARGE_FILE} is not present in environment.")
-
-    with tempfile.TemporaryDirectory() as td:
-        # Optimization output (thresholds and prepared/updated LASfiles) saved to td
-        hydra_cfg.building_validation.optimization.paths.results_output_dir = td
-
-        # We isolate the input file in a subdir, and prepare it for optimization
-        input_las_dir = osp.join(td, "inputs/")
-        hydra_cfg.building_validation.optimization.paths.input_las_dir = input_las_dir
-        os.makedirs(input_las_dir, exist_ok=False)
-        src_las_copy_path = osp.join(input_las_dir, "copy.las")
-        shutil.copy(LAS_LARGE_FILE, src_las_copy_path)
-
-        # Check that a full optimization run can pass successfully
-        bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
-            hydra_cfg.building_validation.optimization
-        )
-
-        bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
-            hydra_cfg.bd_uni_connection_params
-        )
-        bvo.bv.bd_uni_connection_params = bd_uni_connection_params
-
-        bvo.prepare()
-        metrics_dict = bvo.evaluate()
-        print(metrics_dict)
-
-        exact_expected_val = LARGE_EXPECTED_METRICS["exact"]
-        for k in exact_expected_val:
-            assert (
-                pytest.approx(exact_expected_val[k], RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS)
-                == metrics_dict[k]
-            )
-        min_expected_val = LARGE_EXPECTED_METRICS["min"]
-        for k in min_expected_val:
-            assert (
-                (1 - RELATIVE_MIN_TOLERANCE_OF_EXPECTED_METRICS) * min_expected_val[k]
-            ) <= metrics_dict[k]
-
-
-# All expected metrics for reference:
-"""
-    groups_count=1493
-    group_unsure=0.00402
-    group_no_buildings=0.149
-    group_building=0.847
-    p_auto=0.889
-    p_unsure=0.111
-    p_refute=0.0924
-    p_confirm=0.797
-    a_refute=0.899
-    a_confirm=0.976
-    precision=0.98
-    recall=0.99
-    Confusion Matrix
-    [[   2    1    3]
-     [  74  124   25]
-     [  89   13 1162]]
-    Confusion Matrix (normalized)
-    [[0.333 0.167 0.5  ]
-     [0.332 0.556 0.112]
-     [0.07  0.01  0.919]]
-"""
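For reviewers who want to exercise the new serialization helpers directly, a minimal round-trip sketch (it assumes the `lidar_prod` package is installed and that the `thresholds` dataclass ships with the `dump`/`load` methods added above; the values are illustrative, not tuned results):

```python
from lidar_prod.tasks.building_validation import thresholds

# Build a thresholds object, save it as YAML, and reload it.
th = thresholds(
    min_confidence_confirmation=0.6,
    min_frac_confirmation=0.5,
    min_frac_confirmation_factor_if_bd_uni_overlay=0.8,
    min_uni_db_overlay_frac=0.25,
    min_confidence_refutation=0.7,
    min_frac_refutation=0.5,
    min_entropy_uncertainty=0.9,
    min_frac_entropy_uncertain=0.4,
)
th.dump("optimized_thresholds.yaml")  # plain yaml.safe_dump of the dataclass fields
assert thresholds.load("optimized_thresholds.yaml") == th  # dataclass equality
```

Unlike the previous pickled object, the resulting file is human-readable and can be tweaked by hand before being passed back via `building_validation.optimization.paths.building_validation_thresholds`.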