From 7291b483e990ac5f72c042bd454ba44f11ddcc70 Mon Sep 17 00:00:00 2001 From: "Marek (on kosmos)" Date: Wed, 17 Apr 2024 12:56:29 +0200 Subject: [PATCH 1/5] fixes arguments and also copies the folder for mrxs files --- ahcore/cli/data.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ahcore/cli/data.py b/ahcore/cli/data.py index 9a7d111..7acdb3f 100644 --- a/ahcore/cli/data.py +++ b/ahcore/cli/data.py @@ -1,7 +1,5 @@ """Module to write copy manifests files over to SCRATCH directory""" - from __future__ import annotations - import argparse import hashlib import os @@ -44,9 +42,19 @@ def copy_data(args: argparse.Namespace) -> None: for patient in all_records: for image in patient.images: image_fn = image.filename + get_from = base_dir / image_fn write_to = Path(target_dir) / dataset_name / image_fn + accompanying_folder_write_to, accompanying_folder_get_from = None, None + if get_from.suffix == ".mrxs": + accompanying_folder_get_from = get_from.parent / get_from.stem + if not accompanying_folder_get_from.is_dir(): + raise ValueError( + f"Image {image_fn} does not have an accompanying folder, which is expected for mrxs images" + ) + accompanying_folder_write_to = write_to.parent / write_to.stem + write_to.parent.mkdir(parents=True, exist_ok=True) if write_to.exists(): # compute the hash of previous and new file @@ -62,6 +70,9 @@ def copy_data(args: argparse.Namespace) -> None: # Copy file from get_from to write_to shutil.copy(get_from, write_to) + if accompanying_folder_get_from is not None: + shutil.copytree(accompanying_folder_get_from, accompanying_folder_write_to, dirs_exist_ok=True) + total_size += accompanying_folder_get_from.stat().st_size progress.update(task, advance=1) progress.console.log("Total data size copied: {:.2f} GB".format(total_size / 1024**3)) @@ -85,7 +96,7 @@ def register_parser( _parser.add_argument( "manifest_uri", type=str, - help="URI that refers to the sqlalchemy supported database path.", + help="URI that refers to the sqlalchemy supported database path. If using an sqlite database this looks like 'sqlite:///your_database_path'.", ) _parser.add_argument( "manifest_name", From 1f1c7280b0005384d586733c949d5843e9787eed Mon Sep 17 00:00:00 2001 From: "Marek (on hp-zbook)" Date: Thu, 8 Aug 2024 10:57:11 +0200 Subject: [PATCH 2/5] Added H5Slide reader and fixed reshaping in backend --- ahcore/backends.py | 38 +++++++++++++++++++++++++++++++++++++- ahcore/readers.py | 14 ++++++++++---- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/ahcore/backends.py b/ahcore/backends.py index 94e66d9..cb719b2 100644 --- a/ahcore/backends.py +++ b/ahcore/backends.py @@ -4,7 +4,7 @@ from dlup.backends.common import AbstractSlideBackend from dlup.types import PathLike -from ahcore.readers import StitchingMode, ZarrFileImageReader +from ahcore.readers import StitchingMode, ZarrFileImageReader, H5FileImageReader class ZarrSlide(AbstractSlideBackend): @@ -42,3 +42,39 @@ def read_region(self, coordinates: tuple[int, int], level: int, size: tuple[int, def close(self): self._reader.close() + +class H5Slide(AbstractSlideBackend): + def __init__(self, filename: PathLike, stitching_mode: StitchingMode | str = StitchingMode.CROP) -> None: + super().__init__(filename) + self._reader: H5FileImageReader = H5FileImageReader(filename, stitching_mode=stitching_mode) + self._spacings = [(self._reader.mpp, self._reader.mpp)] + + @property + def size(self): + return self._reader.size + + @property + def level_dimensions(self) -> tuple[tuple[int, int], ...]: + return (self._reader.size,) + + @property + def level_downsamples(self) -> tuple[float, ...]: + return (1.0,) + + @property + def vendor(self) -> str: + return "H5FileImageReader" + + @property + def properties(self) -> dict[str, Any]: + return self._reader.metadata + + @property + def magnification(self): + return None + + def read_region(self, coordinates: tuple[int, int], level: int, size: tuple[int, int]) -> pyvips.Image: + return self._reader.read_region(coordinates, level, size) + + def close(self): + self._reader.close() \ No newline at end of file diff --git a/ahcore/readers.py b/ahcore/readers.py index 9c3b583..e61965d 100644 --- a/ahcore/readers.py +++ b/ahcore/readers.py @@ -156,11 +156,16 @@ def metadata(self) -> dict[str, Any]: assert self._metadata return self._metadata - def _decompress_data(self, tile: GenericNumberArray) -> GenericNumberArray: + def _decompress_and_reshape_data(self, tile: GenericNumberArray) -> GenericNumberArray: if self._is_binary: with PIL.Image.open(io.BytesIO(tile)) as img: - return np.array(img).transpose(2, 0, 1) + return np.array(img).transpose(2, 0, 1) # fixme: this also shouldn't work because the thing is flattened and doesn't have 3 dimensions else: + # If handling features, we need to expand dimensions to match the expected shape. + if tile.ndim == 1: # fixme: is this the correct location for this + if not self._tile_size == [1, 1]: + raise NotImplementedError(f"Tile is single dimensional and {self._tile_size=} should be [1, 1], other cases have not been considered and cause unwanted behaviour.") + return tile.reshape(self._num_channels, *self._tile_size) return tile def read_region(self, location: tuple[int, int], level: int, size: tuple[int, int]) -> pyvips.Image: @@ -201,7 +206,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in total_rows = math.ceil((self._size[1] - self._tile_overlap[1]) / self._stride[1]) total_cols = math.ceil((self._size[0] - self._tile_overlap[0]) / self._stride[0]) - assert total_rows * total_cols == num_tiles + assert total_rows * total_cols == num_tiles # Equality only holds if features where created without mask x, y = location w, h = size @@ -230,7 +235,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in tile = ( self._empty_tile() if tile_index_in_image_dataset == -1 - else self._decompress_data(image_dataset[tile_index_in_image_dataset]) + else self._decompress_and_reshape_data(image_dataset[tile_index_in_image_dataset]) ) start_y = i * self._stride[1] - y end_y = start_y + self._tile_size[1] @@ -242,6 +247,7 @@ def read_region(self, location: tuple[int, int], level: int, size: tuple[int, in img_start_x = max(0, start_x) img_end_x = min(w, end_x) + if self._stitching_mode == StitchingMode.CROP: crop_start_y = img_start_y - start_y crop_end_y = img_end_y - start_y From 5eb07b3e32a3df3d16f2207b47e6c479af1fa59c Mon Sep 17 00:00:00 2001 From: "Marek (on hp-zbook)" Date: Thu, 8 Aug 2024 11:50:26 +0200 Subject: [PATCH 3/5] added database models for features --- ahcore/utils/database_models.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/ahcore/utils/database_models.py b/ahcore/utils/database_models.py index 943e449..a7501bf 100644 --- a/ahcore/utils/database_models.py +++ b/ahcore/utils/database_models.py @@ -72,6 +72,7 @@ class Image(Base): annotations: Mapped[List["ImageAnnotations"]] = relationship("ImageAnnotations", back_populates="image") labels: Mapped[List["ImageLabels"]] = relationship("ImageLabels", back_populates="image") caches: Mapped[List["ImageCache"]] = relationship("ImageCache", back_populates="image") + features: Mapped[List["ImageFeature"]] = relationship("ImageFeature", back_populates="image") class ImageCache(Base): @@ -115,6 +116,46 @@ class CacheDescription(Base): cache: Mapped["ImageCache"] = relationship("ImageCache", back_populates="description") +class ImageFeature(Base): + """Image feature table.""" + + __tablename__ = "image_feature" + id = Column(Integer, primary_key=True) + # pylint: disable=E1102 + created = Column(DateTime(timezone=True), default=func.now()) + last_updated = Column(DateTime(timezone=True), default=func.now(), onupdate=func.now()) + filename = Column(String, unique=True, nullable=False) + reader = Column(String) + num_tiles = Column(Integer) + image_id = Column(Integer, ForeignKey("image.id"), nullable=False) + + image: Mapped["Image"] = relationship("Image", back_populates="features") + description: Mapped["FeatureDescription"] = relationship("FeatureDescription", back_populates="image_feature") + +class FeatureDescription(Base): + """Feature description table.""" + + __tablename__ = "feature_description" + + id = Column(Integer, primary_key=True) + # pylint: disable=E1102 + created = Column(DateTime(timezone=True), default=func.now()) + last_updated = Column(DateTime(timezone=True), default=func.now(), onupdate=func.now()) + + mpp = Column(Float) + tile_size_width = Column(Integer) + tile_size_height = Column(Integer) + tile_overlap_width = Column(Integer) + tile_overlap_height = Column(Integer) + description = Column(String) + + version = Column(String, unique=True, nullable=False) # use this to select which features we want to use + + model_name = Column(String) + model_path = Column(String) + feature_dimension = Column(Integer) + image_transforms_description = Column(String) # it would be nice to have a way to track which transforms the feature extractors used, but maybe this is not the best way to do it + class Mask(Base): """Mask table.""" From 117b66e2fc274c883237d6943a4ae11db9f6a429 Mon Sep 17 00:00:00 2001 From: "Marek (on hp-zbook)" Date: Thu, 8 Aug 2024 14:10:06 +0200 Subject: [PATCH 4/5] added loading features as a dataset + necessary utils --- ahcore/utils/data.py | 5 +-- ahcore/utils/manifest.py | 77 +++++++++++++++++++++++++++++++++++----- ahcore/writers.py | 3 +- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/ahcore/utils/data.py b/ahcore/utils/data.py index 39a5fa9..8adac60 100644 --- a/ahcore/utils/data.py +++ b/ahcore/utils/data.py @@ -58,9 +58,10 @@ class DataDescription(BaseModel): manifest_database_uri: str manifest_name: str split_version: str + feature_version: Optional[str] = None annotations_dir: Path - training_grid: GridDescription - inference_grid: GridDescription + training_grid: Optional[GridDescription] = None + inference_grid: Optional[GridDescription] = None index_map: Optional[Dict[str, int]] remap_labels: Optional[Dict[str, str]] = None use_class_weights: Optional[bool] = False diff --git a/ahcore/utils/manifest.py b/ahcore/utils/manifest.py index 7a83ab9..01ee2a4 100644 --- a/ahcore/utils/manifest.py +++ b/ahcore/utils/manifest.py @@ -34,6 +34,7 @@ Patient, Split, SplitDefinitions, + ImageFeature, ) from ahcore.utils.io import get_enum_key_from_value, get_logger from ahcore.utils.rois import compute_rois @@ -144,6 +145,28 @@ def get_labels_from_record(record: Image | Patient) -> list[tuple[str, str]] | N _labels = [(str(label.key), str(label.value)) for label in record.labels] if record.labels else None return _labels +def get_relevant_feature_info_from_record(record: ImageFeature, data_description: DataDescription) -> tuple[Path, PositiveFloat, tuple[PositiveInt, PositiveInt], tuple[PositiveInt, PositiveInt], TilingMode, ImageBackend, PositiveFloat]: + """Get the features from a record of type Image. + + Parameters + ---------- + record : Type[Image] + The record containing the features. + + Returns + ------- + tuple[Path, PositiveFloat, tuple[PositiveInt, PositiveInt], tuple[PositiveInt, PositiveInt], TilingMode, ImageBackend, PositiveFloat] + The features of the image. + """ + image_path = data_description.data_dir / record.filename + mpp = record.mpp + tile_size = (record.num_tiles, 1) # this would load all the features in one go --> can be extended to only load relevant tile level features + tile_overlap = (0, 0) + tile_mode = TilingMode.C + backend = ImageBackend[str(record.reader)] + overwrite_mpp = record.mpp + return image_path, mpp, tile_size, tile_overlap, tile_mode, backend, overwrite_mpp + def _get_rois(mask: WsiAnnotations | None, data_description: DataDescription, stage: str) -> Optional[Rois]: if (mask is None) or (stage != "fit") or (not data_description.convert_mask_to_rois): @@ -300,6 +323,28 @@ def get_image_metadata_by_id(self, image_id: int) -> ImageMetadata: assert image is not None # mypy return fetch_image_metadata(image) + def get_image_features_by_image_and_feature_version(self, image_id: int, feature_version: str) -> ImageFeature: + """ + Fetch the features for an image based on its ID and feature version. + + Parameters + ---------- + image_id : int + The ID of the image. + feature_version : str + The version of the features. + + Returns + ------- + ImageFeature + The features of the image. + """ + image_feature = self._session.query(ImageFeature).filter_by(image_id=image_id, version=feature_version).first() + self._ensure_record(image_feature, f"No features found for image ID {image_id} and feature version {feature_version}") + assert image_feature is not None + # todo: make sure that this only allows to run one ImageFeature, I think it should be good bc of the unique constraint + return image_feature + def __enter__(self) -> "DataManager": return self @@ -333,19 +378,23 @@ def datasets_from_data_description( assert isinstance(stage, str), "Stage should be a string." if stage == "fit": - grid_description = data_description.training_grid + grid_description = data_description.training_grid else: - grid_description = data_description.inference_grid + grid_description = data_description.inference_grid patients = db_manager.get_records_by_split( manifest_name=data_description.manifest_name, split_version=data_description.split_version, split_category=stage, ) + + use_features = data_description.feature_version is not None + for patient in patients: patient_labels = get_labels_from_record(patient) for image in patient.images: + mask, annotations = get_mask_and_annotations_from_record(annotations_root, image) assert isinstance(mask, WsiAnnotations) or (mask is None) image_labels = get_labels_from_record(image) @@ -353,12 +402,22 @@ def datasets_from_data_description( rois = _get_rois(mask, data_description, stage) mask_threshold = 0.0 if stage != "fit" else data_description.mask_threshold + if use_features: + image_feature = db_manager.get_image_features_by_image_and_feature_version(image.id, data_description.feature_version) + image_path, mpp, tile_size, tile_overlap, tile_mode, backend, overwrite_mpp = get_relevant_feature_info_from_record(image_feature, data_description) + else: + image_path = image_root / image.filename + tile_size = grid_description.tile_size + tile_overlap = grid_description.tile_overlap + backend = ImageBackend[str(image.reader)] + mpp = grid_description.mpp + overwrite_mpp = image.mpp + dataset = TiledWsiDataset.from_standard_tiling( - path=image_root / image.filename, - mpp=grid_description.mpp, - tile_size=grid_description.tile_size, - tile_overlap=grid_description.tile_overlap, - tile_mode=TilingMode.overflow, + path=image_path, + mpp=mpp, + tile_size=tile_size, + tile_overlap=tile_overlap, grid_order=GridOrder.C, crop=False, mask=mask, @@ -368,8 +427,8 @@ def datasets_from_data_description( annotations=annotations if stage != "predict" else None, labels=labels, # type: ignore transform=transform, - backend=ImageBackend[str(image.reader)], - overwrite_mpp=(image.mpp, image.mpp), + backend=backend, + overwrite_mpp=(overwrite_mpp, overwrite_mpp), limit_bounds=True, apply_color_profile=data_description.apply_color_profile, internal_handler="vips", diff --git a/ahcore/writers.py b/ahcore/writers.py index 1a59997..19fa909 100644 --- a/ahcore/writers.py +++ b/ahcore/writers.py @@ -87,6 +87,7 @@ def __init__( precision: InferencePrecision | None = None, grid: Grid | None = None, ) -> None: + # todo: better documentation for this, can basically set everything it is almost independent from slide image self._grid = grid self._filename: Path = filename self._size: tuple[int, int] = size @@ -481,7 +482,7 @@ def insert_data(self, batch: GenericNumberArray) -> None: raise ValueError(f"Batch should have a single element when writing h5. Got batch shape {batch.shape}.") batch_size = batch.shape[0] self._data[self._current_index : self._current_index + batch_size] = ( - batch.flatten() if self._is_compressed_image else batch + batch.flatten() if self._is_compressed_image else batch # fixme: flatten shouldn't work here ) def create_dataset( From 7cfbe85af98eb180ae7e5b44ff15628b58d4f580 Mon Sep 17 00:00:00 2001 From: "Marek (on hp-zbook)" Date: Thu, 8 Aug 2024 15:22:25 +0200 Subject: [PATCH 5/5] add option to copy other data such as mask, annotations and features --- ahcore/cli/data.py | 108 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/ahcore/cli/data.py b/ahcore/cli/data.py index 7acdb3f..9ab85bb 100644 --- a/ahcore/cli/data.py +++ b/ahcore/cli/data.py @@ -1,4 +1,5 @@ """Module to write copy manifests files over to SCRATCH directory""" + from __future__ import annotations import argparse import hashlib @@ -44,7 +45,7 @@ def copy_data(args: argparse.Namespace) -> None: image_fn = image.filename get_from = base_dir / image_fn - write_to = Path(target_dir) / dataset_name / image_fn + write_to = Path(target_dir) / dataset_name / "images" / image_fn accompanying_folder_write_to, accompanying_folder_get_from = None, None if get_from.suffix == ".mrxs": @@ -62,7 +63,7 @@ def copy_data(args: argparse.Namespace) -> None: new_hash = _quick_hash(get_from) if old_hash == new_hash: # Skip if they are the same - progress.console.log("Skipping file as it already exists: {}".format(image_fn)) + progress.console.log("Skipping (image) file as it already exists: {}".format(image_fn)) progress.update(task, advance=1) continue @@ -75,6 +76,80 @@ def copy_data(args: argparse.Namespace) -> None: total_size += accompanying_folder_get_from.stat().st_size progress.update(task, advance=1) + # copy mask and annotations + if args.mask_dir is not None or args.annotations_dir is not None: + for mask in image.masks: + mask_fn = mask.filename + get_from = args.mask_dir / mask_fn + write_to = Path(target_dir) / dataset_name / "masks" / mask_fn + write_to.parent.mkdir(parents=True, exist_ok=True) + if write_to.exists(): + # compute the hash of previous and new file + old_hash = _quick_hash(write_to) + new_hash = _quick_hash(get_from) + if old_hash == new_hash: + # Skip if they are the same + progress.console.log( + "Skipping (mask) file as it already exists: {}".format(mask_fn) + ) + progress.update(task, advance=1) + continue + + total_size += get_from.stat().st_size + shutil.copy(get_from, write_to) + progress.update(task, advance=1) + + for annotation in image.annotations: + annotation_fn = annotation.filename + get_from = args.annotations_dir / annotation_fn + write_to = Path(target_dir) / dataset_name / "annotations" / annotation_fn + write_to.parent.mkdir(parents=True, exist_ok=True) + if write_to.exists(): + # compute the hash of previous and new file + old_hash = _quick_hash(write_to) + new_hash = _quick_hash(get_from) + if old_hash == new_hash: + # Skip if they are the same + progress.console.log( + "Skipping (annotation) file as it already exists: {}".format(annotation_fn) + ) + progress.update(task, advance=1) + continue + + total_size += get_from.stat().st_size + shutil.copy(get_from, write_to) + progress.update(task, advance=1) + + # copy features + if args.features_dir is not None: + if args.feature_version is not None: + feature = dm.get_image_features_by_image_and_feature_version( + image.id, args.feature_version + ) # there should only be one + feature_fn = feature.filename + get_from = args.features_dir / feature_fn + write_to = Path(target_dir) / dataset_name / "features" / feature_fn + write_to.parent.mkdir(parents=True, exist_ok=True) + if write_to.exists(): + # compute the hash of previous and new file + old_hash = _quick_hash(write_to) + new_hash = _quick_hash(get_from) + if old_hash == new_hash: + # Skip if they are the same + progress.console.log( + "Skipping (feature) file as it already exists: {}".format(feature_fn) + ) + progress.update(task, advance=1) + continue + + total_size += get_from.stat().st_size + shutil.copy(get_from, write_to) + progress.update(task, advance=1) + else: + raise ValueError( + "Feature version is not provided, but features directory is provided. Please provide both to copy features." + ) + progress.console.log("Total data size copied: {:.2f} GB".format(total_size / 1024**3)) @@ -118,4 +193,33 @@ def register_parser( type=str, help="Name of the dataset to copy the data to. The data will be copied over to $SCRATCH / DATASET_NAME", ) + + _parser.add_argument( + "--mask_dir", + type=dir_path(require_writable=False), + help="Directory to which the masks paths defined in the manifest are relative to.", + default=None, + ) + + _parser.add_argument( + "--annotations_dir", + type=dir_path(require_writable=False), + help="Directory to which the annotations paths defined in the manifest are relative to.", + default=None, + ) + + _parser.add_argument( + "--features_dir", + type=dir_path(require_writable=False), + help="Directory to which the features paths defined in the manifest are relative to.", + default=None, + ) + + _parser.add_argument( + "--feature_version", + type=dir_path(require_writable=False), + help="Version of the features to copy.", + default=None, + ) + _parser.set_defaults(subcommand=copy_data)