Add PANDASmall dataset #664

Merged · 12 commits · Oct 8, 2024
133 changes: 133 additions & 0 deletions configs/vision/pathology/offline/classification/panda_small.yaml
@@ -0,0 +1,133 @@
---
trainer:
  class_path: eva.Trainer
  init_args:
    n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/panda}
    max_epochs: &MAX_EPOCHS ${oc.env:MAX_EPOCHS, 49}
    callbacks:
      - class_path: eva.callbacks.ConfigurationLogger
      - class_path: lightning.pytorch.callbacks.TQDMProgressBar
        init_args:
          refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
      - class_path: lightning.pytorch.callbacks.LearningRateMonitor
        init_args:
          logging_interval: epoch
      - class_path: lightning.pytorch.callbacks.ModelCheckpoint
        init_args:
          filename: best
          save_last: true
          save_top_k: 1
          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
      - class_path: lightning.pytorch.callbacks.EarlyStopping
        init_args:
          min_delta: 0
          patience: ${oc.env:PATIENCE, 8}
          monitor: *MONITOR_METRIC
          mode: *MONITOR_METRIC_MODE
      - class_path: eva.callbacks.ClassificationEmbeddingsWriter
        init_args:
          output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings/${oc.env:MODEL_NAME, dino_vits16}/panda}
          dataloader_idx_map:
            0: train
            1: val
            2: test
          metadata_keys: ["wsi_id"]
          backbone:
            class_path: eva.vision.models.ModelFromRegistry
            init_args:
              model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
              model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
          overwrite: false
    logger:
      - class_path: lightning.pytorch.loggers.TensorBoardLogger
        init_args:
          save_dir: *OUTPUT_ROOT
          name: ""
model:
  class_path: eva.HeadModule
  init_args:
    head:
      class_path: eva.vision.models.networks.ABMIL
      init_args:
        input_size: ${oc.env:IN_FEATURES, 384}
        output_size: &NUM_CLASSES 6
        projected_input_size: 128
    criterion: torch.nn.CrossEntropyLoss
    optimizer:
      class_path: torch.optim.AdamW
      init_args:
        lr: ${oc.env:LR_VALUE, 0.001}
        betas: [0.9, 0.999]
    lr_scheduler:
      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
      init_args:
        T_max: *MAX_EPOCHS
        eta_min: 0.0
    metrics:
      common:
        - class_path: eva.metrics.AverageLoss
        - class_path: eva.metrics.MulticlassClassificationMetrics
          init_args:
            num_classes: *NUM_CLASSES
data:
  class_path: eva.DataModule
  init_args:
    datasets:
      train:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args: &DATASET_ARGS
          root: *DATASET_EMBEDDINGS_ROOT
          manifest_file: manifest.csv
          split: train
          embeddings_transforms:
            class_path: eva.core.data.transforms.Pad2DTensor
            init_args:
              pad_size: &N_PATCHES 200
      val:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: val
      test:
        class_path: eva.datasets.MultiEmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: test
      predict:
        - class_path: eva.vision.datasets.PANDASmall
          init_args: &PREDICT_DATASET_ARGS
            root: ${oc.env:DATA_ROOT, ./data/panda/prostate-cancer-grade-assessment}
            sampler:
              class_path: eva.vision.data.wsi.patching.samplers.ForegroundGridSampler
              init_args:
                max_samples: *N_PATCHES
            width: 224
            height: 224
            target_mpp: 0.5
            split: train
            image_transforms:
              class_path: eva.vision.data.transforms.common.ResizeAndCrop
              init_args:
                size: ${oc.env:RESIZE_DIM, 224}
                mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
                std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
        - class_path: eva.vision.datasets.PANDASmall
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: val
        - class_path: eva.vision.datasets.PANDASmall
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: test
    dataloaders:
      train:
        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 32}
        shuffle: true
      val:
        batch_size: *BATCH_SIZE
      test:
        batch_size: *BATCH_SIZE
      predict:
        batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
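
Reading the config end to end: each slide is sampled into at most 200 patches (N_PATCHES), embedded by the frozen backbone into 384-dimensional vectors (IN_FEATURES), zero-padded to a fixed length by Pad2DTensor, and pooled by the ABMIL head into logits for the six ISUP grades. The snippet below is only a minimal attention-MIL stand-in to illustrate that shape contract; it is not eva's ABMIL implementation, which may handle the padded patches differently.

```python
import torch
from torch import nn


class ToyABMIL(nn.Module):
    """Illustrative attention-MIL head; a stand-in, not eva.vision.models.networks.ABMIL."""

    def __init__(self, input_size: int = 384, output_size: int = 6, projected_input_size: int = 128):
        super().__init__()
        self.project = nn.Linear(input_size, projected_input_size)
        self.attention = nn.Linear(projected_input_size, 1)
        self.classifier = nn.Linear(projected_input_size, output_size)

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        # embeddings: (batch, n_patches, input_size), zero-padded to a fixed patch count
        projected = torch.tanh(self.project(embeddings))            # (B, 200, 128)
        weights = torch.softmax(self.attention(projected), dim=1)   # (B, 200, 1)
        pooled = (weights * projected).sum(dim=1)                   # (B, 128)
        return self.classifier(pooled)                              # (B, 6)


logits = ToyABMIL()(torch.randn(32, 200, 384))  # BATCH_SIZE x N_PATCHES x IN_FEATURES
assert logits.shape == (32, 6)
```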
4 changes: 2 additions & 2 deletions docs/user-guide/advanced/replicate_evaluations.md
@@ -144,8 +144,8 @@ were released on [HuggingFace](https://huggingface.co/bioptimus/H-optimus-0).

```
MODEL_NAME=pathology/bioptimus_h_optimus_0 \
-NORMALIZE_MEAN=[0.707223, 0.578729, 0.703617] \
-NORMALIZE_STD=[0.211883, 0.230117, 0.177517] \
+NORMALIZE_MEAN=[0.707223,0.578729,0.703617] \
+NORMALIZE_STD=[0.211883,0.230117,0.177517] \
IN_FEATURES=1024 \
eva predict_fit --config configs/vision/pathology/offline/<task>.yaml
```
9 changes: 5 additions & 4 deletions src/eva/core/data/splitting/random.py
@@ -24,12 +24,13 @@ def random_split(
    Returns:
        The indices of the train, validation, and test sets as lists.
    """
-    if train_ratio + val_ratio + (test_ratio or 0) != 1:
-        raise ValueError("The sum of the ratios must be equal to 1.")
+    total_ratio = train_ratio + val_ratio + test_ratio
+    if total_ratio > 1.0:
+        raise ValueError("The sum of the ratios must be lower or equal to 1.")

    random_generator = np.random.default_rng(seed)
-    n_samples = len(samples)
-    indices = random_generator.permutation(n_samples)
+    n_samples = int(total_ratio * len(samples))
+    indices = random_generator.permutation(len(samples))[:n_samples]

    n_train = int(np.floor(train_ratio * n_samples))
    n_val = n_samples - n_train if test_ratio == 0.0 else int(np.floor(val_ratio * n_samples)) or 1
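
The effect of this change: the ratios no longer have to sum to exactly 1. Any total below 1 now selects a random subsample of the data, which is what the PANDASmall splits (0.1/0.05/0.05) rely on, while totals above 1 still raise. A minimal usage sketch, with the import path taken from the tests further down:

```python
from eva.core.data import splitting

samples = list(range(1000))

# Ratios summing to less than 1 are now accepted and yield a subsample of the data.
train_idx, val_idx, test_idx = splitting.random_split(samples, 0.1, 0.05, 0.05, seed=42)

# Ratios summing to more than 1 still raise a ValueError:
# splitting.random_split(samples, 0.6, 0.7, 0.0)
```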
18 changes: 12 additions & 6 deletions src/eva/core/data/splitting/stratified.py
@@ -28,29 +28,35 @@ def stratified_split(
    """
    if len(samples) != len(targets):
        raise ValueError("The number of samples and targets must be equal.")
-    if train_ratio + val_ratio + (test_ratio or 0) != 1:
-        raise ValueError("The sum of the ratios must be equal to 1.")
+    if train_ratio + val_ratio + (test_ratio or 0) > 1.0:
+        raise ValueError("The sum of the ratios must be lower or equal to 1.")

-    np.random.seed(seed)
+    use_all_samples = train_ratio + val_ratio + test_ratio == 1
+    random_generator = np.random.default_rng(seed)
    unique_classes, y_indices = np.unique(targets, return_inverse=True)
    n_classes = unique_classes.shape[0]

    train_indices, val_indices, test_indices = [], [], []

    for c in range(n_classes):
        class_indices = np.where(y_indices == c)[0]
-        np.random.shuffle(class_indices)
+        random_generator.shuffle(class_indices)

        n_train = int(np.floor(train_ratio * len(class_indices))) or 1
        n_val = (
            len(class_indices) - n_train
-            if test_ratio == 0.0
+            if test_ratio == 0.0 and use_all_samples
            else int(np.floor(val_ratio * len(class_indices))) or 1
        )

        train_indices.extend(class_indices[:n_train])
        val_indices.extend(class_indices[n_train : n_train + n_val])
        if test_ratio > 0.0:
-            test_indices.extend(class_indices[n_train + n_val :])
+            n_test = (
+                len(class_indices) - n_train - n_val
+                if use_all_samples
+                else int(np.floor(test_ratio * len(class_indices))) or 1
+            )
+            test_indices.extend(class_indices[n_train + n_val : n_train + n_val + n_test])

    return train_indices, val_indices, test_indices or None
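
Same relaxation for the stratified variant: the split is computed per class, so a sub-unit ratio total yields a class-balanced subsample. The sketch below assumes stratified_split is exported from eva.core.data.splitting alongside random_split and that its signature mirrors the parameters visible above; treat it as illustrative.

```python
from collections import Counter

from eva.core.data import splitting

samples = list(range(1000))
targets = [i % 4 for i in samples]  # four balanced classes

train_idx, val_idx, test_idx = splitting.stratified_split(
    samples, targets, train_ratio=0.1, val_ratio=0.05, test_ratio=0.05, seed=42
)

# Each split draws from every class, so the class balance is preserved
# even though only a fraction of the samples is used.
print(Counter(targets[i] for i in train_idx))
```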
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
@@ -6,6 +6,7 @@
    MHIST,
    PANDA,
    Camelyon16,
+    PANDASmall,
    PatchCamelyon,
    WsiClassificationDataset,
)
@@ -28,6 +29,7 @@
    "CRC",
    "MHIST",
    "PANDA",
+    "PANDASmall",
    "Camelyon16",
    "PatchCamelyon",
    "WsiClassificationDataset",
3 changes: 2 additions & 1 deletion src/eva/vision/data/datasets/classification/__init__.py
@@ -4,7 +4,7 @@
from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.mhist import MHIST
-from eva.vision.data.datasets.classification.panda import PANDA
+from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset
@@ -15,5 +15,6 @@
    "PatchCamelyon",
    "WsiClassificationDataset",
    "PANDA",
+    "PANDASmall",
    "Camelyon16",
]
13 changes: 13 additions & 0 deletions src/eva/vision/data/datasets/classification/panda.py
@@ -182,3 +182,16 @@ def _get_target_from_path(self, file_path: str) -> int:

    def _get_id_from_path(self, file_path: str) -> str:
        return os.path.basename(file_path).replace(".tiff", "")


+class PANDASmall(PANDA):
+    """Small version of the PANDA dataset for quicker benchmarking."""

+    _train_split_ratio: float = 0.1
+    """Train split ratio."""

+    _val_split_ratio: float = 0.05
+    """Validation split ratio."""

+    _test_split_ratio: float = 0.05
+    """Test split ratio."""
10 changes: 2 additions & 8 deletions src/eva/vision/data/wsi/patching/samplers/_utils.py
@@ -1,14 +1,8 @@
-import random
from typing import Tuple

import numpy as np


-def set_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)


def get_grid_coords_and_indices(
    layer_shape: Tuple[int, int],
    width: int,
@@ -33,8 +27,8 @@ def get_grid_coords_and_indices(

    indices = list(range(len(x_y)))
    if shuffle:
-        set_seed(seed)
-        np.random.shuffle(indices)
+        random_generator = np.random.default_rng(seed)
+        random_generator.shuffle(indices)
    return x_y, indices


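The rationale for dropping set_seed here (and for the analogous change in stratified.py) is that np.random.seed mutates numpy's global RNG state, so unrelated code could perturb the patch order; a local Generator keeps the shuffle reproducible in isolation. A numpy-only illustration:

```python
import numpy as np

indices = list(range(10))

a, b = list(indices), list(indices)
np.random.default_rng(seed=42).shuffle(a)
np.random.default_rng(seed=42).shuffle(b)
assert a == b  # same seed -> same order, without touching np.random's global state
```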
6 changes: 4 additions & 2 deletions src/eva/vision/data/wsi/patching/samplers/random.py
@@ -18,6 +18,7 @@ def __init__(self, n_samples: int = 1, seed: int = 42):
        """Initializes the sampler."""
        self.seed = seed
        self.n_samples = n_samples
+        self.random_generator = random.Random(seed)  # nosec

    def sample(
        self,
@@ -33,9 +34,10 @@ def sample(
            layer_shape: The shape of the layer.
        """
        _utils.validate_dimensions(width, height, layer_shape)
-        _utils.set_seed(self.seed)

        x_max, y_max = layer_shape[0], layer_shape[1]
        for _ in range(self.n_samples):
-            x, y = random.randint(0, x_max - width), random.randint(0, y_max - height)  # nosec
+            x, y = self.random_generator.randint(0, x_max - width), self.random_generator.randint(
+                0, y_max - height
+            )
            yield x, y
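
A usage sketch for the updated sampler. The constructor follows the __init__ shown above; the import path (mirroring ForegroundGridSampler in the config) and the sample() keyword names are inferred from the surrounding code, so treat them as assumptions.

```python
from eva.vision.data.wsi.patching import samplers

sampler = samplers.RandomSampler(n_samples=4, seed=42)
coords = list(sampler.sample(width=224, height=224, layer_shape=(100_000, 80_000)))

# With the generator created once in __init__, re-instantiating with the same
# seed reproduces these coordinates, whereas the removed per-call set_seed()
# made every sample() call on one instance return identical patches.
```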
27 changes: 23 additions & 4 deletions tests/eva/core/data/splitting/test_random.py
@@ -1,5 +1,7 @@
"""Tests for the random split function."""

from typing import List

import pytest

from eva.core.data import splitting
Expand Down Expand Up @@ -32,11 +34,11 @@ def test_split_ratios(n_samples: int, train_ratio: float, val_ratio: float, test
assert len(train_indices) + len(val_indices) + len(test_indices or []) == n_samples


@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.3, 0.0), (0.6, 0.4, 0.3)])
@pytest.mark.parametrize("train_ratio, val_ratio, test_ratio", [(0.6, 0.7, 0.0), (0.6, 0.4, 0.3)])
def test_invalid_ratio_sums(train_ratio: float, val_ratio: float, test_ratio: float):
"""Tests if the function raises an error when the ratios do not sum to 1."""
samples = list(range(100))
expected_error = "The sum of the ratios must be equal to 1."
expected_error = "The sum of the ratios must be lower or equal to 1"
with pytest.raises(ValueError, match=expected_error):
splitting.random_split(samples, train_ratio, val_ratio, test_ratio)

Expand All @@ -53,8 +55,20 @@ def test_different_seeds_produce_different_outputs(seed1, seed2):
assert test1 != test2, "Different seeds should produce different test indices"


@pytest.mark.parametrize("seed", [42, 123, 999])
def test_same_seed_produces_same_outputs(seed):
@pytest.mark.parametrize(
"seed, train_expected_indices, val_expected_indices, test_expected_indices",
[
(42, [59, 21, 56, 18], [69, 15, 48, 55], [49, 6, 90, 11]),
(123, [21, 71, 92, 23], [89, 14, 64, 4], [45, 75, 62, 6]),
(999, [47, 42, 57, 50], [41, 3, 81, 61], [45, 6, 56, 67]),
],
)
def test_same_seed_produces_same_outputs(
seed: int,
train_expected_indices: List[int],
val_expected_indices: List[int],
test_expected_indices: List[int],
):
"""Tests if the same seed produces the same train, validation, and test indices."""
samples = list(range(100))
train1, val1, test1 = splitting.random_split(samples, 0.6, 0.2, 0.2, seed=seed)
Expand All @@ -63,6 +77,11 @@ def test_same_seed_produces_same_outputs(seed):
assert train1 == train2, "Same seed should produce the same train indices"
assert val1 == val2, "Same seed should produce the same validation indices"
assert test1 == test2, "Same seed should produce the same test indices"
assert isinstance(test1, list)

assert train1[: len(train_expected_indices)] == train_expected_indices, "Unexpected indices"
assert val1[: len(val_expected_indices)] == val_expected_indices, "Unexpected indices"
assert test1[: len(test_expected_indices)] == test_expected_indices, "Unexpected indices"


def test_no_test_set():