From 2d4a5bd010f0d65c2708c496321754a6114a00a3 Mon Sep 17 00:00:00 2001 From: LVeefkind Date: Fri, 25 Oct 2024 16:17:38 +0200 Subject: [PATCH] Added confusion matrix plotting --- neural_networks/__init__.py | 37 +++++---- neural_networks/parameters.txt | 15 ++++ neural_networks/plots/confusion_matrix.py | 95 +++++++++++++++++++++++ neural_networks/requirements.txt | 9 ++- neural_networks/train_nn.job | 21 ++++- neural_networks/train_nn.py | 62 ++++++++++++--- 6 files changed, 210 insertions(+), 29 deletions(-) create mode 100644 neural_networks/plots/confusion_matrix.py diff --git a/neural_networks/__init__.py b/neural_networks/__init__.py index c8e1eef7..9704ca81 100644 --- a/neural_networks/__init__.py +++ b/neural_networks/__init__.py @@ -9,8 +9,12 @@ import __main__ from astropy.io import fits -from train_nn import ImagenetTransferLearning, load_checkpoint # noqa -from pre_processing_for_ml import normalize_fits +from .train_nn import ( + ImagenetTransferLearning, + load_checkpoint, + normalize_inputs, +) # noqa +from .pre_processing_for_ml import normalize_fits setattr(__main__, "ImagenetTransferLearning", ImagenetTransferLearning) @@ -28,7 +32,7 @@ def __init__( model_name: str = None, device: str = None, variational_dropout: int = 0, - **kwargs + **kwargs, ): super().__init__(model_name, device) @@ -47,10 +51,10 @@ def load_checkpoint(self, path) -> torch.nn.Module: ( model, _, - args, + self.args, ) = load_checkpoint(path, self.device).values() - self.resize = args["resize"] - self.lift = args["lift"] + self.resize = self.args["resize"] + self.lift = self.args["lift"] return model @functools.lru_cache(maxsize=1) @@ -58,19 +62,26 @@ def prepare_data(self, input_path: str) -> torch.Tensor: input_data: torch.Tensor = torch.from_numpy(process_fits(input_path)) input_data = input_data.to(self.dtype) input_data = input_data.swapdims(0, 2).unsqueeze(0) + return self.prepare_batch(input_data) + + def prepare_batch(self, batch: torch.Tensor, mean=None, std=None) -> 
torch.Tensor: + batch = batch.to(self.dtype).to(self.device) if self.resize != 0: - input_data = interpolate( - input_data, size=self.resize, mode="bilinear", align_corners=False + batch = interpolate( + batch, size=self.resize, mode="bilinear", align_corners=False ) - input_data = input_data.to(self.device) - return input_data + if mean is None: + mean = self.mean + if std is None: + std = self.std + batch = normalize_inputs(batch, mean, std, normalize=1) + return batch @torch.no_grad() def predict(self, data: torch.Tensor): with torch.autocast(dtype=self.dtype, device_type=self.device): if self.variational_dropout > 0: self.model.train() - # self.model.classifier.train() predictions = torch.concat( [ @@ -80,8 +91,8 @@ def predict(self, data: torch.Tensor): dim=1, ) - mean = predictions.mean() - std = predictions.std() + mean = predictions.mean(dim=1) + std = predictions.std(dim=1) print(mean, std) return mean, std diff --git a/neural_networks/parameters.txt b/neural_networks/parameters.txt index 3dda20e2..b093a18c 100644 --- a/neural_networks/parameters.txt +++ b/neural_networks/parameters.txt @@ -58,6 +58,21 @@ efficientnet_v2_l 1e-05 1 0.1 32 0.2 1 0 16 16 0 efficientnet_v2_l 1e-05 1 0.1 32 0.1 1 0 16 16 0 efficientnet_v2_l 1e-05 1 0.1 32 0.1 0 0 16 16 0 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 conv 0 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 conv 0 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 conv 1 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 conv 1 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 stack 0 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 stack 0 +efficientnet_v2_l 1e-04 1 0.1 32 0.2 0 0 16 16 0 stack 0 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 stack 1 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 stack 1 +efficientnet_v2_l 1e-04 1 0.1 32 0.2 0 0 16 16 0 stack 1 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 1 16 16 560 conv 0 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 1 16 16 560 conv 0 
+dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 1 16 16 560 conv 1 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 1 16 16 560 conv 1 + diff --git a/neural_networks/plots/confusion_matrix.py b/neural_networks/plots/confusion_matrix.py new file mode 100644 index 00000000..db142361 --- /dev/null +++ b/neural_networks/plots/confusion_matrix.py @@ -0,0 +1,95 @@ +from cortexchange.architecture import get_architecture, Architecture +from pathlib import Path +import sys +import os + +SCRIPT_DIR = Path(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.dirname(SCRIPT_DIR)) +from train_nn import MultiEpochsDataLoader +from pre_processing_for_ml import FitsDataset +import matplotlib.pyplot as plt +import numpy as np +import torch +from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay + + +def load_model(architecture_name, model_name): + StopPredictor: type(Architecture) = get_architecture(architecture_name) + predictor = StopPredictor(device="cuda", model_name=model_name) + return predictor + + +def get_dataloader(data_root, mode, batch_size): + num_workers = min(12, len(os.sched_getaffinity(0))) + + prefetch_factor, persistent_workers = ( + (2, True) if num_workers > 0 else (None, False) + ) + + return MultiEpochsDataLoader( + dataset=FitsDataset( + data_root, + mode=mode, + ), + batch_size=batch_size, + num_workers=num_workers, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers, + pin_memory=True, + shuffle=False, + drop_last=False, + ) + + +def get_statistics(data_root, mode): + return FitsDataset( + data_root, + mode=mode, + ).compute_statistics(1) + + +@torch.no_grad() +def get_confusion_matrix( + predictor, dataloader, mean, std, thresholds=[0.2, 0.3, 0.4, 0.5] +): + confusion_matrices = np.zeros((len(thresholds), 2, 2)) + thresholds = torch.tensor(thresholds) + for img, label in dataloader: + data = predictor.prepare_batch(img, mean=mean, std=std) + pred = torch.sigmoid(predictor.model(data)).to("cpu") + preds_thres = pred >= 
thresholds + for i, _ in enumerate(thresholds): + confusion_matrices[i] += confusion_matrix( + label, preds_thres[:, i], labels=[0, 1] + ) + + for i, conf_matrix in enumerate(confusion_matrices): + + disp = ConfusionMatrixDisplay( + # Normalization + conf_matrix / np.sum(conf_matrix, axis=1, keepdims=True), + display_labels=["continue", "stop"], + ) + disp.plot() + + plt.savefig(f"confusion_thres_{thresholds[i]:.3f}.png") + + +if __name__ == "__main__": + model_name = "surf/dinov2_09814" + architecture_name = "surf/TransferLearning" + predictor = load_model(architecture_name, model_name) + if hasattr(predictor, "args") and "dataset_mean" in predictor.args: + mean, std = predictor.args["dataset_mean"], predictor.args["dataset_std"] + else: + mean, std = get_statistics( + "/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/", + mode="train", + ) + + dataloader = get_dataloader( + "/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/", + mode="val", + batch_size=32, + ) + get_confusion_matrix(predictor, dataloader, mean, std) diff --git a/neural_networks/requirements.txt b/neural_networks/requirements.txt index 1c8be69b..ce717149 100644 --- a/neural_networks/requirements.txt +++ b/neural_networks/requirements.txt @@ -1,9 +1,14 @@ matplotlib -torch +torch>=2.1.2 torchvision torcheval tqdm matplotlib joblib -astropy +astropy>6.0.0 +xformers +tensorboard +dino-finetune @ git+https://github.com/sara-nl/dinov2-finetune.git +scikit-learn + diff --git a/neural_networks/train_nn.job b/neural_networks/train_nn.job index 2a2c8574..f41dab2d 100644 --- a/neural_networks/train_nn.job +++ b/neural_networks/train_nn.job @@ -22,7 +22,7 @@ SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:=1} PARAMS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $PARAM_FILE) # Parse the parameters -read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha 
resize <<< $PARAMS +read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize lift flip_augmentations <<< $PARAMS if [ "$use_lora" -eq 1 ]; then LORA_ARG="--use_lora" @@ -36,7 +36,24 @@ else STOCHASTIC_SMOOTHING="" fi +if [ "$flip_augmentations" -eq 1 ]; then + FLIP_AUGMENTATIONS="--flip_augmentations" +else + FLIP_AUGMENTATIONS="" +fi + +# bash has no float math: let awk scale lr and the 4e-05 threshold by 1e6 (rounded) so they can be compared as integers +scaled_lr=$(echo "$lr" | awk '{printf("%d", $1 * 1000000 + 0.5)}') +scaled_threshold=$(echo "4e-05" | awk '{printf("%d", $1 * 1000000 + 0.5)}') + +if [ "$scaled_lr" -le "$scaled_threshold" ]; then + EPOCHS="250" +else + EPOCHS="120" +fi + DATA_TRAINDATA_PATH="/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/" # Execute your Python script with the given parameters -python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d +echo $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS +python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py index 79e7ad5d..027ae589 100644 --- a/neural_networks/train_nn.py +++ b/neural_networks/train_nn.py @@ -13,7 +13,6 @@ from torchvision import
models from torchvision.transforms import v2 from tqdm import tqdm -import joblib import numpy as np import random @@ -134,8 +133,8 @@ def normalize_inputs(inputs, means, stds, normalize=1): @torch.no_grad() -def augmentation(inputs): - inputs = get_transforms()(inputs) +def augmentation(inputs, flip_augmentations=False): + inputs = get_transforms(flip_augmentations=flip_augmentations)(inputs) inputs = inputs + 0.01 * torch.randn_like(inputs) return inputs @@ -369,9 +368,11 @@ def main( stochastic_smoothing: bool, lift: str, use_lora: bool, - rank=16, - alpha=16, - log_path="runs", + rank: int = 16, + alpha: float = 16, + log_path: Path = "runs", + epochs: int = 120, + flip_augmentations: bool = False, ): torch.set_float32_matmul_precision("high") torch.backends.cudnn.benchmark = True @@ -404,6 +405,8 @@ def main( resize=resize, rank=rank, alpha=alpha, + lift=lift, + flip_augmentations=flip_augmentations, ) writer = get_tensorboard_logger(logging_dir) @@ -461,6 +464,7 @@ def main( stochastic=stochastic_smoothing, smoothing_factor=label_smoothing, ), + augmentation_fn=partial(augmentation, flip_augmentations=flip_augmentations), ) val_step_f = partial(val_step_f, val_dataloader=val_dataloader) @@ -483,6 +487,9 @@ def main( "lr": lr, "dropout_p": dropout_p, "model_name": model_name, + "flip_augmentations": flip_augmentations, + "dataset_mean": mean, + "dataset_std": std, }, ) @@ -491,7 +498,7 @@ def main( best_results = {} - n_epochs = 120 + n_epochs = epochs for epoch in range(n_epochs): global_step = train_step_f(global_step=global_step, model=model) @@ -591,6 +598,7 @@ def train_step( logging_interval, metrics_logger, smoothing_fn, + augmentation_fn, ): # print("training") model.train() @@ -602,7 +610,7 @@ def train_step( data, labels = prepare_data_f(data, labels) smoothed_label = smoothing_fn(labels) - data = augmentation(data) + data = augmentation_fn(data) optimizer.zero_grad(set_to_none=True) with torch.autocast("cuda", dtype=torch.bfloat16): @@ -668,11 
+676,29 @@ def __iter__(self): yield from iter(self.sampler) +import torchvision.transforms.functional as TF +from torchvision.transforms.functional import InterpolationMode + + +class Rotate90Transform: + def __init__(self, angles=[0, 90, 180, 270]): + self.angles = angles + + def __call__(self, x): + angle = np.random.choice(self.angles) + return v2.functional.rotate(x, int(angle), InterpolationMode.BILINEAR) + + @lru_cache(maxsize=1) -def get_transforms(): +def get_transforms(flip_augmentations=False): + return v2.Compose( [ - v2.RandomVerticalFlip(p=0.5), + ( + Rotate90Transform() + if not flip_augmentations + else v2.RandomVerticalFlip(p=0.5) + ), v2.RandomHorizontalFlip(p=0.5), ] ) @@ -698,7 +724,7 @@ def save_checkpoint(logging_dir, model, optimizer, global_step, **kwargs): def load_checkpoint(ckpt_path, device="gpu"): if os.path.isfile(ckpt_path): - ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device) + ckpt_dict = torch.load(ckpt_path, weights_only=False) else: files = os.listdir(ckpt_path) possible_checkpoints = list(filter(lambda x: x.endswith(".pth"), files)) @@ -707,7 +733,7 @@ def load_checkpoint(ckpt_path, device="gpu"): f"Too many checkpoint files in the given checkpoint directory. Please specify the model you want to load directly." ) ckpt_path = f"{ckpt_path}/{possible_checkpoints[0]}" - ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device) + ckpt_dict = torch.load(ckpt_path, weights_only=False) # strip 'model_' from the name model_name = ckpt_dict["args"]["model_name"] @@ -782,6 +808,12 @@ def get_argparser(): default=0, help="size to resize to. Will be set to 512 for ViT.", ) + parser.add_argument( + "--epochs", + type=int, + default=120, + help="number of epochs", + ) parser.add_argument("--use_compile", action="store_true") parser.add_argument( "--profile", @@ -832,6 +864,12 @@ def get_argparser(): help="LoRA alpha scaling. 
Defaults to rank value if not set", ) + parser.add_argument( + "--flip_augmentations", + action="store_true", + help="Uses double flip augmentations instead of rotate + flip", + ) + parser.add_argument("--log_path", type=str, default="runs") return parser.parse_args()