Merge branch 'master' of github.com:jurjen93/lofar_helpers
jurjen93 committed Oct 29, 2024
2 parents 92b1092 + 15e3734 commit 5e7b459
Showing 6 changed files with 322 additions and 29 deletions.
37 changes: 24 additions & 13 deletions neural_networks/__init__.py
@@ -9,8 +9,12 @@
import __main__
from astropy.io import fits

-from train_nn import ImagenetTransferLearning, load_checkpoint  # noqa
-from pre_processing_for_ml import normalize_fits
+from .train_nn import (
+    ImagenetTransferLearning,
+    load_checkpoint,
+    normalize_inputs,
+)  # noqa
+from .pre_processing_for_ml import normalize_fits

setattr(__main__, "ImagenetTransferLearning", ImagenetTransferLearning)

@@ -28,7 +32,7 @@ def __init__(
model_name: str = None,
device: str = None,
variational_dropout: int = 0,
-        **kwargs
+        **kwargs,
):
super().__init__(model_name, device)

@@ -47,30 +51,37 @@ def load_checkpoint(self, path) -> torch.nn.Module:
(
model,
_,
-            args,
+            self.args,
        ) = load_checkpoint(path, self.device).values()
-        self.resize = args["resize"]
-        self.lift = args["lift"]
+        self.resize = self.args["resize"]
+        self.lift = self.args["lift"]
return model

@functools.lru_cache(maxsize=1)
def prepare_data(self, input_path: str) -> torch.Tensor:
input_data: torch.Tensor = torch.from_numpy(process_fits(input_path))
input_data = input_data.to(self.dtype)
input_data = input_data.swapdims(0, 2).unsqueeze(0)
+        return self.prepare_batch(input_data)
+
+    def prepare_batch(self, batch: torch.Tensor, mean=None, std=None) -> torch.Tensor:
+        batch = batch.to(self.dtype).to(self.device)
        if self.resize != 0:
-            input_data = interpolate(
-                input_data, size=self.resize, mode="bilinear", align_corners=False
+            batch = interpolate(
+                batch, size=self.resize, mode="bilinear", align_corners=False
            )
-        input_data = input_data.to(self.device)
-        return input_data
+        if mean is None:
+            mean = self.mean
+        if std is None:
+            std = self.std
+        batch = normalize_inputs(batch, mean, std, normalize=1)
+        return batch

@torch.no_grad()
def predict(self, data: torch.Tensor):
with torch.autocast(dtype=self.dtype, device_type=self.device):
if self.variational_dropout > 0:
self.model.train()
# self.model.classifier.train()

predictions = torch.concat(
[
@@ -80,8 +91,8 @@ def predict(self, data: torch.Tensor):
dim=1,
)

-        mean = predictions.mean()
-        std = predictions.std()
+        mean = predictions.mean(dim=1)
+        std = predictions.std(dim=1)

print(mean, std)
return mean, std
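For context, a minimal usage sketch of the updated predictor API, assuming the cortexchange names used by the new confusion-matrix script below (the FITS path and the variational_dropout value are placeholders, not part of this commit):

from cortexchange.architecture import get_architecture

# Architecture/model names as used in neural_networks/plots/confusion_matrix.py.
StopPredictor = get_architecture("surf/TransferLearning")
predictor = StopPredictor(
    device="cuda",
    model_name="surf/dinov2_09739_rotations",
    variational_dropout=5,  # >0 keeps dropout active during predict()
)

# prepare_data() now ends by calling prepare_batch(), which resizes
# (when configured) and normalizes with the checkpoint's dataset statistics.
batch = predictor.prepare_data("example.fits")  # hypothetical path

# mean/std are now reduced over the dropout samples per input (dim=1),
# rather than collapsed over the whole batch.
mean, std = predictor.predict(batch)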
15 changes: 15 additions & 0 deletions neural_networks/parameters.txt
@@ -58,6 +58,21 @@ efficientnet_v2_l 1e-05 1 0.1 32 0.2 1 0 16 16 0
efficientnet_v2_l 1e-05 1 0.1 32 0.1 1 0 16 16 0
efficientnet_v2_l 1e-05 1 0.1 32 0.1 0 0 16 16 0

dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 conv 0
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 conv 0
dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 conv 1
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 conv 1
dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 stack 0
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 stack 0
efficientnet_v2_l 1e-04 1 0.1 32 0.2 0 0 16 16 0 stack 0
dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 stack 1
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 stack 1
efficientnet_v2_l 1e-04 1 0.1 32 0.2 0 0 16 16 0 stack 1
dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 1 16 16 560 conv 0
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 1 16 16 560 conv 0
dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 1 16 16 560 conv 1
dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 1 16 16 560 conv 1
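For reference, a small sketch of how one of the new rows maps to named hyperparameters; the field order is taken from the updated `read` line in train_nn.job below, and the two trailing columns are the new `lift` and `flip_augmentations` fields:

# Field order assumed from train_nn.job:
# read model lr normalize dropout_p batch_size label_smoothing
#      stochastic_smoothing use_lora rank alpha resize lift flip_augmentations
row = "dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 conv 0"
fields = [
    "model", "lr", "normalize", "dropout_p", "batch_size",
    "label_smoothing", "stochastic_smoothing", "use_lora",
    "rank", "alpha", "resize", "lift", "flip_augmentations",
]
params = dict(zip(fields, row.split()))
print(params["lift"], params["flip_augmentations"])  # -> conv 0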




207 changes: 207 additions & 0 deletions neural_networks/plots/confusion_matrix.py
@@ -0,0 +1,207 @@
from cortexchange.architecture import get_architecture, Architecture
from pathlib import Path
import sys
import os

SCRIPT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.dirname(SCRIPT_DIR))
from pre_processing_for_ml import normalize_fits
import matplotlib.pyplot as plt
import numpy as np
import torch
from functools import lru_cache
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from astropy.io import fits


class RawFitsDataset(Dataset):
def __init__(self, root_dir, mode="train"):
"""
Args:
root_dir (string): Directory with good/bad folders in it.
"""

modes = ("train", "val")
assert mode in modes

classes = {"stop": 0, "continue": 1}

root_dir = Path(root_dir)
assert root_dir.exists(), f"'{root_dir}' doesn't exist!"

ext = ".fits"
glob_ext = "*" + ext

self.root_dir = root_dir

for folder in (
root_dir / (cls + ("" if mode == "train" else "_val")) for cls in classes
):
        assert (
            folder.exists()
        ), f"class folder doesn't exist, got: '{str(folder.resolve())}'"
assert (
len(list(folder.glob(glob_ext))) > 0
), f"no '{ext}' files were found in '{str(folder.resolve())}'"

# Yes this code is way overengineered. Yes I also derive pleasure from writing it :) - RJS
#
# Actual documentation:
# You want all 'self.x' variables to be non-python objects such as numpy arrays,
# otherwise you get memory leaks in the PyTorch dataloader
self.data_paths, self.labels = map(
np.asarray,
list(
zip(
*(
(str(file), val)
for cls, val in classes.items()
for file in (
root_dir / (cls + ("" if mode == "train" else "_val"))
).glob(glob_ext)
)
)
),
)

assert len(self.data_paths) > 0
        self.sources = ", ".join(
            # removesuffix drops the extension; str.strip would remove any of
            # the characters '.fits' from both ends of the name
            sorted(str(elem).split("/")[-1].removesuffix(ext) for elem in self.data_paths)
        )
self.mode = mode
_, counts = np.unique(self.labels, return_counts=True)
self.label_ratio = counts[0] / counts[1]
# print(f'{mode}: using the following sources: {sources}')

@staticmethod
def transform_data(image_data):
"""
Transform data for preprocessing
"""

# FIXME: this should really be a parameter
image_data = torch.from_numpy(image_data).to(torch.bfloat16)
image_data = torch.movedim(image_data, -1, 0)

return image_data

@lru_cache(maxsize=1)
def __len__(self):
return len(self.data_paths)

def __getitem__(self, idx):

fits_path = self.data_paths[idx]
label = self.labels[idx]

image_data = process_fits(fits_path)
# there is always only one array

# Pre-processing
image_data = self.transform_data(image_data)

return image_data, label


def load_model(architecture_name, model_name, device="cpu"):
    StopPredictor: type[Architecture] = get_architecture(architecture_name)
predictor = StopPredictor(device=device, model_name=model_name)
return predictor


@torch.no_grad()
def get_confusion_matrix(predictor, dataloader, mean, std, thresholds):
confusion_matrices = np.zeros((len(thresholds), 2, 2))
thresholds = torch.tensor(thresholds)
    for img, label in dataloader:
        data = predictor.prepare_batch(img, mean=mean, std=std)
        pred = torch.sigmoid(predictor.model(data)).to("cpu")
        # (batch, 1) >= (n_thresholds,) broadcasts to one column per threshold
        preds_thres = pred >= thresholds
for i, _ in enumerate(thresholds):
confusion_matrices[i] += confusion_matrix(
label, preds_thres[:, i], labels=[0, 1]
)

return confusion_matrices


def plot_conf_matrices(savedir, confusion_matrices, thresholds):
    # use the savedir argument; don't overwrite it from the global model_name
    os.makedirs(savedir, exist_ok=True)
for i, conf_matrix in enumerate(confusion_matrices):

disp = ConfusionMatrixDisplay(
# Normalization
conf_matrix / np.sum(conf_matrix, axis=1, keepdims=True),
display_labels=["stop", "continue"],
)
# print(conf_matrix)
disp.plot()

plt.savefig(f"{savedir}/confusion_thres_{thresholds[i]:.3f}.png")


def process_fits(fits_path):
with fits.open(fits_path) as hdul:
image_data = hdul[0].data

return normalize_fits(image_data)


def get_dataloader(data_root, mode="val", batch_size=32):
    dataset = RawFitsDataset(data_root, mode=mode)
num_workers = min(12, len(os.sched_getaffinity(0)))

prefetch_factor, persistent_workers = (
(2, True) if num_workers > 0 else (None, False)
)
dataloader = DataLoader(
dataset,
        batch_size=batch_size,
shuffle=True,
num_workers=num_workers,
persistent_workers=persistent_workers,
prefetch_factor=prefetch_factor,
drop_last=False,
)

return dataloader


if __name__ == "__main__":
# Latest model
model_name = "surf/dinov2_09739_rotations"
TESTING = True
architecture_name = "surf/TransferLearning"
# Set Device here
DEVICE = "cuda"
# Thresholds to consider for classification
thresholds = [0.2, 0.3, 0.4, 0.5]
# Change to directory of files. Should have subfolders 'continue_val' and 'stop_val'
data_root = "/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data"
# Uses cached confusion matrix for testing the plotting functionalities
if model_name == "surf/dinov2_09739_rotations" and TESTING:
confusion_matrices = np.asarray(
[
[[149, 56], [2, 116]],
[[178, 27], [4, 114]],
[[190, 15], [6, 112]],
[[191, 14], [7, 111]],
]
)
else:

dataloader = get_dataloader(data_root, mode="val")

predictor = load_model(architecture_name, model_name, device=DEVICE)

mean, std = predictor.args["dataset_mean"], predictor.args["dataset_std"]

confusion_matrices = get_confusion_matrix(
predictor, dataloader, mean, std, thresholds
)

print(confusion_matrices)

plot_conf_matrices(model_name.split("/")[-1], confusion_matrices, thresholds)
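As a sanity check on the cached matrices above, a short worked example (sklearn convention: rows are true labels, 0 = stop, 1 = continue; numbers are the threshold-0.5 entry):

import numpy as np

cm = np.array([[191, 14], [7, 111]])  # threshold 0.5, cached above
tn, fp, fn, tp = cm.ravel()

precision = tp / (tp + fp)  # 111 / 125 = 0.888
recall = tp / (tp + fn)     # 111 / 118 ≈ 0.941

# plot_conf_matrices() row-normalizes, so each displayed cell is a
# per-true-class rate, e.g. 191 / 205 ≈ 0.932 of true "stop" kept.
print(precision, recall)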
9 changes: 7 additions & 2 deletions neural_networks/requirements.txt
@@ -1,9 +1,14 @@
-matplotlib
-torch
+torch>=2.1.2
torchvision
torcheval
tqdm
+matplotlib
joblib
-astropy
+astropy>6.0.0
xformers
tensorboard
+dino-finetune @ git+https://github.com/sara-nl/dinov2-finetune.git
+scikit-learn


21 changes: 19 additions & 2 deletions neural_networks/train_nn.job
@@ -22,7 +22,7 @@ SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:=1}
PARAMS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $PARAM_FILE)

# Parse the parameters
-read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize <<< $PARAMS
+read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize lift flip_augmentations <<< $PARAMS

if [ "$use_lora" -eq 1 ]; then
LORA_ARG="--use_lora"
@@ -36,7 +36,24 @@ else
STOCHASTIC_SMOOTHING=""
fi

if [ "$flip_augmentations" -eq 1 ]; then
FLIP_AUGMENTATIONS="--flip_augmentations"
else
FLIP_AUGMENTATIONS=""
fi

# Scale up by 1e6 to convert to integers for comparison
scaled_lr=$(echo "$lr * 1000000" | awk '{printf("%d", $1)}')
scaled_threshold=$(echo "4e-05 * 1000000" | awk '{printf("%d", $1)}')

if [ "$scaled_lr" -le "$scaled_threshold" ]; then
EPOCHS="250"
else
EPOCHS="120"
fi

DATA_TRAINDATA_PATH="/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/"

# Execute your Python script with the given parameters
-python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d
+echo $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS
+python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS
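The awk block above encodes a simple schedule rule; a Python sketch of the same logic (bash lacks float comparison, hence the 1e6 integer scaling):

def epochs_for(lr: float, threshold: float = 4e-05) -> int:
    # Mirror the awk logic: scale both sides to integers before comparing.
    return 250 if int(lr * 1_000_000) <= int(threshold * 1_000_000) else 120

assert epochs_for(1e-05) == 250  # small learning rates get a longer schedule
assert epochs_for(1e-04) == 120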
