From 4a126962d87ff76b0e4a2bdea79caa77498be17e Mon Sep 17 00:00:00 2001
From: LVeefkind
Date: Mon, 21 Oct 2024 14:47:30 +0200
Subject: [PATCH] added __init__.py for cortexchange. Changed job script

---
 neural_networks/__init__.py    | 80 ++++++++++++++++++++++++++++++++++
 neural_networks/parameters.txt |  4 +-
 neural_networks/train_nn.job   |  8 ++--
 neural_networks/train_nn.py    | 41 ++++++++++++++---
 4 files changed, 121 insertions(+), 12 deletions(-)
 create mode 100644 neural_networks/__init__.py

diff --git a/neural_networks/__init__.py b/neural_networks/__init__.py
new file mode 100644
index 0000000..b857ec6
--- /dev/null
+++ b/neural_networks/__init__.py
@@ -0,0 +1,80 @@
+import argparse
+import functools
+
+import torch
+
+from cortexchange.architecture import Architecture
+import __main__
+from astropy.io import fits
+
+from .train_nn import ImagenetTransferLearning, load_checkpoint  # noqa
+from .pre_processing_for_ml import normalize_fits
+
+setattr(__main__, "ImagenetTransferLearning", ImagenetTransferLearning)
+
+
+def process_fits(fits_path):
+    with fits.open(fits_path) as hdul:
+        image_data = hdul[0].data
+
+    return normalize_fits(image_data)
+
+
+class TransferLearning(Architecture):
+    def __init__(
+        self,
+        model_name: str = None,
+        device: str = None,
+        variational_dropout: int = 0,
+        **kwargs
+    ):
+        super().__init__(model_name, device)
+
+        self.dtype = torch.float32
+
+        self.model = self.model.to(self.dtype)
+        self.model.eval()
+
+        assert variational_dropout >= 0
+        self.variational_dropout = variational_dropout
+
+    def load_checkpoint(self, path) -> torch.nn.Module:
+        model, _, _, resize = load_checkpoint(path, self.device).values()
+        return model
+
+    @functools.lru_cache(maxsize=1)
+    def prepare_data(self, input_path: str) -> torch.Tensor:
+        input_data: torch.Tensor = torch.from_numpy(process_fits(input_path))
+        input_data = input_data.to(self.dtype)
+        input_data = input_data.swapdims(0, 2).unsqueeze(0)
+        return input_data
+
+    @torch.no_grad()
+    def predict(self, data: torch.Tensor):
+        with torch.autocast(dtype=self.dtype, device_type=self.device):
+            if self.variational_dropout > 0:
+                self.model.feature_extractor.eval()
+                self.model.classifier.train()
+
+            predictions = torch.concat(
+                [
+                    torch.sigmoid(self.model(data)).clone()
+                    for _ in range(self.variational_dropout)
+                ],
+                dim=1,
+            )
+
+            mean = predictions.mean()
+            std = predictions.std()
+
+            print(mean, std)
+            return mean, std
+
+    @staticmethod
+    def add_argparse_args(parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--variational_dropout",
+            type=int,
+            default=None,
+            help="Optional: number of times to run the model to obtain a variational estimate of the stdev",
+        )
diff --git a/neural_networks/parameters.txt b/neural_networks/parameters.txt
index 4686a18..e5404be 100644
--- a/neural_networks/parameters.txt
+++ b/neural_networks/parameters.txt
@@ -35,7 +35,9 @@
 dinov2_vitl14_reg 1e-04 0 0.25 32 1 0.1 0
 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 0
 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 3 1 560
-dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 16 16 560
+
+dinov2_vitl14_reg 1e-04 1 0.25 32 0.1 0 0 16 16 560
+
 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.1 1 16 16 784
 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 16 16 560
 dinov2_vitl14_reg 1e-04 1 0.25 32 1 0.2 1 16 16 784
diff --git a/neural_networks/train_nn.job b/neural_networks/train_nn.job
index 4b5bda2..2a2c857 100644
--- a/neural_networks/train_nn.job
+++ b/neural_networks/train_nn.job
@@ -11,7 +11,7 @@ cd ~/projects/lofar_helpers/neural_networks
 
 module load 2023
 
-source venv/bin/activate
+source ../../lofar_venv/bin/activate
 
 # Read the parameter file
 PARAM_FILE=parameters.txt
@@ -22,7 +22,7 @@ SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:=1}
 PARAMS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $PARAM_FILE)
 
 # Parse the parameters
-read model lr normalize dropout_p batch_size use_lora label_smoothing stochastic_smoothing rank alpha resize <<< $PARAMS
+read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize <<< $PARAMS
 
 if [ "$use_lora" -eq 1 ]; then
     LORA_ARG="--use_lora"
@@ -36,7 +36,7 @@ else
     STOCHASTIC_SMOOTHING=""
 fi
 
-DATA_TRAINDATA_PATH="public.spider.surfsara.nl/project/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/"
+DATA_TRAINDATA_PATH="/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/"
 
 # Execute your Python script with the given parameters
-python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING
+python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_2 --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d
diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py
index 70278a6..0a7321f 100644
--- a/neural_networks/train_nn.py
+++ b/neural_networks/train_nn.py
@@ -456,7 +456,11 @@ def main(
         train_dataloader=train_dataloader,
         optimizer=optimizer,
         logging_interval=logging_interval,
-        smoothing_fn=partial(label_smoother, stochastic=stochastic_smoothing, smoothing_factor=label_smoothing),
+        smoothing_fn=partial(
+            label_smoother,
+            stochastic=stochastic_smoothing,
+            smoothing_factor=label_smoothing,
+        ),
     )
 
     val_step_f = partial(val_step_f, val_dataloader=val_dataloader)
@@ -467,6 +471,16 @@
         optimizer=optimizer,
         normalize=normalize,
         batch_size=batch_size,
+        use_compile=use_compile,
+        label_smoothing=label_smoothing,
+        stochastic_smoothing=stochastic_smoothing,
+        lift=lift,
+        use_lora=use_lora,
+        rank=rank,
+        alpha=alpha,
+        resize=resize,
+        lr=lr,
+        dropout_p=dropout_p,
     )
 
     best_val_loss = torch.inf
@@ -555,13 +569,16 @@ def val_step(model, val_dataloader, global_step, metrics_logger, prepare_data_f)
     return mean_loss, logits, targets
 
 
-def label_smoother(labels: torch.tensor, smoothing_factor: float = 0.1, stochastic: bool = True):
+def label_smoother(
+    labels: torch.tensor, smoothing_factor: float = 0.1, stochastic: bool = True
+):
     smoothing_factor = smoothing_factor - (
-            torch.rand_like(labels) * smoothing_factor * stochastic
+        torch.rand_like(labels) * smoothing_factor * stochastic
     )
     smoothed_label = (1 - smoothing_factor) * labels + 0.5 * smoothing_factor
     return smoothed_label
 
+
 def train_step(
     model,
     optimizer,
@@ -676,13 +693,23 @@ def save_checkpoint(logging_dir, model, optimizer, global_step, **kwargs):
     )
 
 
-def load_checkpoint(ckpt_path):
-
-    ckpt_dict = torch.load(ckpt_path, weights_only=False)
+def load_checkpoint(ckpt_path, device="cuda"):
+    if os.path.isfile(ckpt_path):
+        ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device)
+    else:
+        files = os.listdir(ckpt_path)
+        possible_checkpoints = list(filter(lambda x: x.endswith(".pth"), files))
+        if len(possible_checkpoints) != 1:
+            raise ValueError(
+                f"Expected exactly one .pth checkpoint file in {ckpt_path}, found {len(possible_checkpoints)}. Please specify the checkpoint you want to load directly."
+            )
+        ckpt_path = f"{ckpt_path}/{possible_checkpoints[0]}"
+        ckpt_dict = torch.load(ckpt_path, weights_only=False, map_location=device)
 
     # ugh, this is so ugly, something something hindsight something something 20-20
     # FIXME: probably should do a pattern match, but this works for now
     kwargs = str(Path(ckpt_path).parent).split("/")[-1].split("__")
+    print(ckpt_dict.keys())
 
     # strip 'model_' from the name
     model_name = kwargs[1][6:]
@@ -698,7 +725,7 @@ def load_checkpoint(ckpt_path):
         optim = ckpt_dict.get("optimizer", torch.optim.AdamW)(
             lr=lr, params=model.classifier.parameters()
         ).load_state_dict(ckpt_dict["optimizer_state_dict"])
-    except e:
+    except Exception as e:
         print(f"Could not load optim due to {e}; skipping.")
         optim = None
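
Usage sketch (not part of the patch): the snippet below shows how the TransferLearning architecture added in neural_networks/__init__.py might be exercised end to end. It is a minimal sketch under assumptions: that the cortexchange Architecture base class resolves model_name to a local checkpoint directory and calls load_checkpoint(), and that the caller moves the prepared tensor to the model's device. The checkpoint directory name and FITS filename are placeholders, not values from the patch.

    # usage_sketch.py -- hypothetical example; paths and checkpoint layout are placeholders
    import torch

    from neural_networks import TransferLearning

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Assumed: the Architecture base class finds the single .pth file under this
    # directory and loads it via load_checkpoint().
    model = TransferLearning(
        model_name="grid_search_2/version_0__model_dinov2_vitl14_reg",  # placeholder run directory
        device=device,
        variational_dropout=16,  # >0 enables Monte Carlo dropout at inference time
    )

    # prepare_data() normalizes the FITS image and returns a (1, C, H, W) float tensor.
    data = model.prepare_data("example_source.fits")  # placeholder input file

    # predict() returns the mean and standard deviation over the dropout samples.
    mean, std = model.predict(data.to(device))
    print(f"selection score: {mean:.3f} +/- {std:.3f}")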