From 4a126962d87ff76b0e4a2bdea79caa77498be17e Mon Sep 17 00:00:00 2001 From: LVeefkind Date: Mon, 21 Oct 2024 14:47:30 +0200 Subject: [PATCH 1/3] added __init__.py for cortexchange. Changed job script --- neural_networks/__init__.py | 80 ++++++++++++++++++++++++++++++++++ neural_networks/parameters.txt | 4 +- neural_networks/train_nn.job | 8 ++-- neural_networks/train_nn.py | 41 ++++++++++++++--- 4 files changed, 121 insertions(+), 12 deletions(-) create mode 100644 neural_networks/__init__.py diff --git a/neural_networks/__init__.py b/neural_networks/__init__.py new file mode 100644 index 0000000..b857ec6 --- /dev/null +++ b/neural_networks/__init__.py @@ -0,0 +1,80 @@ +import argparse +import functools + +import torch + +from cortexchange.architecture import Architecture +import __main__ +from astropy.io import fits + +from .train_nn import ImagenetTransferLearning, load_checkpoint # noqa +from .pre_processing_for_ml import normalize_fits + +setattr(__main__, "ImagenetTransferLearning", ImagenetTransferLearning) + + +def process_fits(fits_path): + with fits.open(fits_path) as hdul: + image_data = hdul[0].data + + return normalize_fits(image_data) + + +class TransferLearning(Architecture): + def __init__( + self, + model_name: str = None, + device: str = None, + variational_dropout: int = 0, + **kwargs + ): + super().__init__(model_name, device) + + self.dtype = torch.float32 + + self.model = self.model.to(self.dtype) + self.model.eval() + + assert variational_dropout >= 0 + self.variational_dropout = variational_dropout + + def load_checkpoint(self, path) -> torch.nn.Module: + model, _, _, resize = load_checkpoint(path, self.device).values() + return model + + @functools.lru_cache(maxsize=1) + def prepare_data(self, input_path: str) -> torch.Tensor: + input_data: torch.Tensor = torch.from_numpy(process_fits(input_path)) + input_data = input_data.to(self.dtype) + input_data = input_data.swapdims(0, 2).unsqueeze(0) + return input_data + + @torch.no_grad() + def predict(self, data: torch.Tensor): + with torch.autocast(dtype=self.dtype, device_type=self.device): + if self.variational_dropout > 0: + self.model.feature_extractor.eval() + self.model.classifier.train() + + predictions = torch.concat( + [ + torch.sigmoid(self.model(data)).clone() + for _ in range(self.variational_dropout) + ], + dim=1, + ) + + mean = predictions.mean() + std = predictions.std() + + print(mean, std) + return mean, std + + @staticmethod + def add_argparse_args(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--variational_dropout", + type=int, + default=None, + help="Optional: Amount of times to run the model to obtain a variational estimate of the stdev", + ) diff --git a/neural_networks/parameters.txt b/neural_networks/parameters.txt index 4686a18..e5404be 100644 --- a/neural_networks/parameters.txt +++ b/neural_networks/parameters.txt @@ -35,7 +35,9 @@ dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.1 0 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 0 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 3 1 560 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 16 16 560 + +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 + dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 16 16 784 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 16 16 560 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 16 16 784 diff --git a/neural_networks/train_nn.job b/neural_networks/train_nn.job index 4b5bda2..2a2c857 100644 --- a/neural_networks/train_nn.job +++ b/neural_networks/train_nn.job @@ -11,7 +11,7 @@ cd ~/projects/lofar_helpers/neural_networks module load 2023 
-source venv/bin/activate +source ../../lofar_venv/bin/activate # Read the parameter file PARAM_FILE=parameters.txt @@ -22,7 +22,7 @@ SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:=1} PARAMS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $PARAM_FILE) # Parse the parameters -read model lr normalize dropout_p batch_size use_lora label_smoothing stochastic_smoothing rank alpha resize <<< $PARAMS +read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize <<< $PARAMS if [ "$use_lora" -eq 1 ]; then LORA_ARG="--use_lora" @@ -36,7 +36,7 @@ else STOCHASTIC_SMOOTHING="" fi -DATA_TRAINDATA_PATH="public.spider.surfsara.nl/project/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/" +DATA_TRAINDATA_PATH="/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/" # Execute your Python script with the given parameters -python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING +python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py index 70278a6..0a7321f 100644 --- a/neural_networks/train_nn.py +++ b/neural_networks/train_nn.py @@ -456,7 +456,11 @@ def main( train_dataloader=train_dataloader, optimizer=optimizer, logging_interval=logging_interval, - smoothing_fn=partial(label_smoother, stochastic=stochastic_smoothing, smoothing_factor=label_smoothing), + smoothing_fn=partial( + label_smoother, + stochastic=stochastic_smoothing, + smoothing_factor=label_smoothing, + ), ) val_step_f = partial(val_step_f, val_dataloader=val_dataloader) @@ -467,6 +471,16 @@ def main( optimizer=optimizer, normalize=normalize, batch_size=batch_size, + use_compile=use_compile, + label_smoothing=label_smoothing, + stochastic_smoothing=stochastic_smoothing, + lift=lift, + use_lora=use_lora, + rank=rank, + alpha=alpha, + resize=resize, + lr=lr, + dropout_p=dropout_p, ) best_val_loss = torch.inf @@ -555,13 +569,16 @@ def val_step(model, val_dataloader, global_step, metrics_logger, prepare_data_f) return mean_loss, logits, targets -def label_smoother(labels: torch.tensor, smoothing_factor: float = 0.1, stochastic: bool = True): +def label_smoother( + labels: torch.tensor, smoothing_factor: float = 0.1, stochastic: bool = True +): smoothing_factor = smoothing_factor - ( - torch.rand_like(labels) * smoothing_factor * stochastic + torch.rand_like(labels) * smoothing_factor * stochastic ) smoothed_label = (1 - smoothing_factor) * labels + 0.5 * smoothing_factor return smoothed_label + def train_step( model, optimizer, @@ -676,13 +693,23 @@ def save_checkpoint(logging_dir, model, optimizer, global_step, **kwargs): ) -def load_checkpoint(ckpt_path): - - ckpt_dict = torch.load(ckpt_path, weights_only=False) +def load_checkpoint(ckpt_path, device="gpu"): + if os.path.isfile(ckpt_path): + ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device) + else: + files = os.listdir(ckpt_path) + possible_checkpoints = list(filter(lambda x: x.endswith(".pth"), files)) + if len(possible_checkpoints) != 1: + raise ValueError( + f"Too many 
checkpoint files in the given checkpoint directory. Please specify the model you want to load directly." + ) + ckpt_path = f"{ckpt_path}/{possible_checkpoints[0]}" + ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device) # ugh, this is so ugly, something something hindsight something something 20-20 # FIXME: probably should do a pattern match, but this works for now kwargs = str(Path(ckpt_path).parent).split("/")[-1].split("__") + print(ckpt_dict.keys()) # strip 'model_' from the name model_name = kwargs[1][6:] @@ -698,7 +725,7 @@ def load_checkpoint(ckpt_path): optim = ckpt_dict.get("optimizer", torch.optim.AdamW)( lr=lr, params=model.classifier.parameters() ).load_state_dict(ckpt_dict["optimizer_state_dict"]) - except e: + except Exception as e: print(f"Could not load optim due to {e}; skipping.") optim = None From 95340f78fd83cca40a3ea5b31aca44d9bf52a15c Mon Sep 17 00:00:00 2001 From: LVeefkind Date: Mon, 21 Oct 2024 16:55:01 +0200 Subject: [PATCH 2/3] changes to model checkpointing/loading --- neural_networks/__init__.py | 32 ++++++++++++++++++------- neural_networks/train_nn.py | 47 +++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/neural_networks/__init__.py b/neural_networks/__init__.py index b857ec6..c8e1eef 100644 --- a/neural_networks/__init__.py +++ b/neural_networks/__init__.py @@ -2,13 +2,15 @@ import functools import torch +from torch.nn.functional import interpolate +import os from cortexchange.architecture import Architecture import __main__ from astropy.io import fits -from .train_nn import ImagenetTransferLearning, load_checkpoint # noqa -from .pre_processing_for_ml import normalize_fits +from train_nn import ImagenetTransferLearning, load_checkpoint # noqa +from pre_processing_for_ml import normalize_fits setattr(__main__, "ImagenetTransferLearning", ImagenetTransferLearning) @@ -30,7 +32,7 @@ def __init__( ): super().__init__(model_name, device) - self.dtype = torch.float32 + self.dtype = torch.bfloat16 self.model = self.model.to(self.dtype) self.model.eval() @@ -39,7 +41,16 @@ def __init__( self.variational_dropout = variational_dropout def load_checkpoint(self, path) -> torch.nn.Module: - model, _, _, resize = load_checkpoint(path, self.device).values() + # To avoid errors on CPU + if "gpu" not in self.device and self.device != "cuda": + os.environ["XFORMERS_DISABLED"] = "1" + ( + model, + _, + args, + ) = load_checkpoint(path, self.device).values() + self.resize = args["resize"] + self.lift = args["lift"] return model @functools.lru_cache(maxsize=1) @@ -47,19 +58,24 @@ def prepare_data(self, input_path: str) -> torch.Tensor: input_data: torch.Tensor = torch.from_numpy(process_fits(input_path)) input_data = input_data.to(self.dtype) input_data = input_data.swapdims(0, 2).unsqueeze(0) + if self.resize != 0: + input_data = interpolate( + input_data, size=self.resize, mode="bilinear", align_corners=False + ) + input_data = input_data.to(self.device) return input_data @torch.no_grad() def predict(self, data: torch.Tensor): with torch.autocast(dtype=self.dtype, device_type=self.device): if self.variational_dropout > 0: - self.model.feature_extractor.eval() - self.model.classifier.train() + self.model.train() + # self.model.classifier.train() predictions = torch.concat( [ torch.sigmoid(self.model(data)).clone() - for _ in range(self.variational_dropout) + for _ in range(max(self.variational_dropout, 1)) ], dim=1, ) @@ -75,6 +91,6 @@ def add_argparse_args(parser: argparse.ArgumentParser) -> None: 
parser.add_argument( "--variational_dropout", type=int, - default=None, + default=0, help="Optional: Amount of times to run the model to obtain a variational estimate of the stdev", ) diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py index 0a7321f..cb90f45 100644 --- a/neural_networks/train_nn.py +++ b/neural_networks/train_nn.py @@ -18,8 +18,8 @@ import numpy as np import random -from pre_processing_for_ml import FitsDataset -from dino_model import DINOV2FeatureExtractor +from .pre_processing_for_ml import FitsDataset +from .dino_model import DINOV2FeatureExtractor PROFILE = False SEED = None @@ -469,18 +469,21 @@ def main( logging_dir=logging_dir, model=model, optimizer=optimizer, - normalize=normalize, - batch_size=batch_size, - use_compile=use_compile, - label_smoothing=label_smoothing, - stochastic_smoothing=stochastic_smoothing, - lift=lift, - use_lora=use_lora, - rank=rank, - alpha=alpha, - resize=resize, - lr=lr, - dropout_p=dropout_p, + args={ + "normalize": normalize, + "batch_size": batch_size, + "use_compile": use_compile, + "label_smoothing": label_smoothing, + "stochastic_smoothing": stochastic_smoothing, + "lift": lift, + "use_lora": use_lora, + "rank": rank, + "alpha": alpha, + "resize": resize, + "lr": lr, + "dropout_p": dropout_p, + "model_name": model_name, + }, ) best_val_loss = torch.inf @@ -706,18 +709,12 @@ def load_checkpoint(ckpt_path, device="gpu"): ckpt_path = f"{ckpt_path}/{possible_checkpoints[0]}" ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device) - # ugh, this is so ugly, something something hindsight something something 20-20 - # FIXME: probably should do a pattern match, but this works for now - kwargs = str(Path(ckpt_path).parent).split("/")[-1].split("__") - print(ckpt_dict.keys()) - # strip 'model_' from the name - model_name = kwargs[1][6:] - lr = float(kwargs[2].split("_")[-1]) - normalize = int(kwargs[3].split("_")[-1]) - dropout_p = float(kwargs[4].split("_")[-1]) + model_name = ckpt_dict["args"]["model_name"] + lr = ckpt_dict["args"]["lr"] + dropout_p = ckpt_dict["args"]["dropout_p"] - model = ckpt_dict["model"](model_name=model_name, dropout_p=dropout_p) + model = ckpt_dict["model"](model_name=model_name, dropout_p=dropout_p).to(device) model.load_state_dict(ckpt_dict["model_state_dict"]) try: @@ -729,7 +726,7 @@ def load_checkpoint(ckpt_path, device="gpu"): print(f"Could not load optim due to {e}; skipping.") optim = None - return {"model": model, "optim": optim, "normalize": normalize} + return {"model": model, "optim": optim, "args": ckpt_dict["args"]} def get_argparser(): From 2890fe01d9d4f7a05835226130bfcbb73b3ae21b Mon Sep 17 00:00:00 2001 From: LVeefkind Date: Mon, 21 Oct 2024 17:07:17 +0200 Subject: [PATCH 3/3] fix import error --- neural_networks/parameters.txt | 98 ++++++++++++++++++++-------------- neural_networks/train_nn.py | 4 +- 2 files changed, 59 insertions(+), 43 deletions(-) diff --git a/neural_networks/parameters.txt b/neural_networks/parameters.txt index e5404be..3dda20e 100644 --- a/neural_networks/parameters.txt +++ b/neural_networks/parameters.txt @@ -1,46 +1,62 @@ -efficientnet_v2_l 1e-04 0 0.25 32 0 0 0 -efficientnet_v2_l 1e-04 1 0.25 32 0 0 0 -efficientnet_v2_l 1e-04 2 0.25 32 0 0 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 0 0 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 0 0 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 1 0 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0 0 -efficientnet_v2_l 1e-04 0 0.25 32 0 0.1 1 -efficientnet_v2_l 1e-04 1 0.25 32 0 0.1 1 -efficientnet_v2_l 1e-04 2 0.25 32 0 0.1 1 
-dinov2_vitl14_reg 1e-04 0 0.25 32 0 0.1 1 -dinov2_vitl14_reg 1e-04 1 0.25 32 0 0.1 1 -dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.1 1 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 -efficientnet_v2_l 1e-04 0 0.25 32 0 0.1 1 -efficientnet_v2_l 1e-04 1 0.25 32 0 0.1 1 -efficientnet_v2_l 1e-04 2 0.25 32 0 0.1 1 -dinov2_vitl14_reg 1e-04 0 0.25 32 0 0.1 1 -dinov2_vitl14_reg 1e-04 1 0.25 32 0 0.1 1 -dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.1 1 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 -efficientnet_v2_l 1e-04 0 0.25 32 0 0.2 0 -efficientnet_v2_l 1e-04 1 0.25 32 0 0.2 0 -efficientnet_v2_l 1e-04 2 0.25 32 0 0.2 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 0 0.2 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 0 0.2 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.2 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 0 -efficientnet_v2_l 1e-04 0 0.25 32 0 0.1 0 -efficientnet_v2_l 1e-04 1 0.25 32 0 0.1 0 -efficientnet_v2_l 1e-04 2 0.25 32 0 0.1 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 0 0.1 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 0 0.1 0 -dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.1 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 0 - -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 3 1 560 - dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.25 32 0.2 0 1 16 16 560 + +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 1e-04 1 0.1 32 0.2 0 1 16 16 560 + +dinov2_vitl14_reg 1e-05 1 0.25 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.25 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.25 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.25 32 0.2 0 1 16 16 560 + +dinov2_vitl14_reg 1e-05 1 0.1 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.1 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.1 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 1e-05 1 0.1 32 0.2 0 1 16 16 560 + +dinov2_vitl14_reg 5e-05 1 0.25 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.25 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.25 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.25 32 0.2 0 1 16 16 560 + +dinov2_vitl14_reg 5e-05 1 0.1 32 0.1 0 0 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.1 32 0.1 1 1 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.1 32 0.2 1 0 16 16 560 +dinov2_vitl14_reg 5e-05 1 0.1 32 0.2 0 1 16 16 560 + +efficientnet_v2_l 1e-04 1 0.25 32 0.2 0 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.25 32 0.2 1 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.25 32 0.1 1 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.25 32 0.1 0 0 16 16 0 + +efficientnet_v2_l 1e-04 1 0.1 32 0.2 0 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.1 32 0.2 1 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.1 32 0.1 1 0 16 16 0 +efficientnet_v2_l 1e-04 1 0.1 32 0.1 0 0 16 16 0 + +efficientnet_v2_l 5e-05 1 0.25 32 0.2 0 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.25 32 0.2 1 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.25 32 0.1 1 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.25 32 0.1 0 0 16 16 0 + +efficientnet_v2_l 5e-05 1 0.1 32 0.2 0 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.1 32 0.2 1 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.1 32 0.1 1 0 16 16 0 +efficientnet_v2_l 5e-05 1 0.1 32 0.1 0 0 16 16 0 + +efficientnet_v2_l 1e-05 1 0.25 32 0.2 0 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.25 32 0.2 1 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.25 32 0.1 1 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.25 32 0.1 0 0 16 16 0 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 16 16 784 -dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 16 16 560 -dinov2_vitl14_reg 1e-04 1 
0.25 32 1 0.2 1 16 16 784 +efficientnet_v2_l 1e-05 1 0.1 32 0.2 0 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.1 32 0.2 1 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.1 32 0.1 1 0 16 16 0 +efficientnet_v2_l 1e-05 1 0.1 32 0.1 0 0 16 16 0 diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py index cb90f45..79e7ad5 100644 --- a/neural_networks/train_nn.py +++ b/neural_networks/train_nn.py @@ -18,8 +18,8 @@ import numpy as np import random -from .pre_processing_for_ml import FitsDataset -from .dino_model import DINOV2FeatureExtractor +from pre_processing_for_ml import FitsDataset +from dino_model import DINOV2FeatureExtractor PROFILE = False SEED = None
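
For illustration (not part of the patch itself): a minimal sketch of how the new cortexchange TransferLearning wrapper introduced in this series could be driven for inference. It assumes that cortexchange's Architecture base class resolves model_name to a local checkpoint and invokes the overridden load_checkpoint() during construction (that behaviour lives in cortexchange, not in this diff), and that the neural_networks package is importable as installed. The model name and FITS path are placeholders.

    import argparse

    from neural_networks import TransferLearning

    # Reuse the architecture's own CLI arguments (adds --variational_dropout).
    parser = argparse.ArgumentParser()
    TransferLearning.add_argparse_args(parser)
    args = parser.parse_args(["--variational_dropout", "16"])

    # Placeholder model name and device; cortexchange is assumed to map the
    # name to a checkpoint directory containing a single .pth file, which the
    # overridden load_checkpoint() then loads onto the requested device.
    predictor = TransferLearning(
        model_name="surf/dinov2_transfer_learning",
        device="cuda",
        variational_dropout=args.variational_dropout,
    )

    # prepare_data() normalizes the FITS image, casts it to the model dtype,
    # resizes it if the checkpoint was trained with --resize, and moves it
    # to the device.
    data = predictor.prepare_data("path/to/observation.fits")

    # predict() runs the classifier repeatedly with dropout enabled and
    # returns the mean sigmoid output and its Monte Carlo standard deviation.
    mean, std = predictor.predict(data)
    print(f"prediction: {mean.item():.3f} +/- {std.item():.3f}")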