From a71f66ff0e6421921cefbab7e01a452108352a3b Mon Sep 17 00:00:00 2001 From: LVeefkind Date: Wed, 20 Nov 2024 14:04:27 +0100 Subject: [PATCH] Add option to finetune pos embedding, slightly more general way to save model params for loading in cortexchange. Skip non 2k images in data processing --- neural_networks/README.md | 6 ++ neural_networks/__init__.py | 4 +- neural_networks/parameters.txt | 55 ++++++++++++++++ neural_networks/pre_processing_for_ml.py | 27 +++++--- .../test_scripts/test_cortexchange.py | 24 +++++++ neural_networks/train_nn.py | 55 +++++++++++++--- neural_networks/train_nn_lora.job | 63 +++++++++++++++++++ 7 files changed, 215 insertions(+), 19 deletions(-) create mode 100644 neural_networks/test_scripts/test_cortexchange.py create mode 100644 neural_networks/train_nn_lora.job diff --git a/neural_networks/README.md b/neural_networks/README.md index 5d4e0d5c..880b062b 100644 --- a/neural_networks/README.md +++ b/neural_networks/README.md @@ -23,8 +23,14 @@ python pre_processing_for_ml.py ``` ## (Optional) Copy files to /dev/shm for fast dataloading +Copy data ```shell find -type f -name "*.npz" | xargs -n 1 -P 8 -i rsync -R {} /dev/shm +``` +Copy cached dataset statistics +```shell +find -type d -name "_cache" | xargs -n 1 -P 8 -I {} rsync -aR {}/ /dev/shm + ``` ## Run neural network training diff --git a/neural_networks/__init__.py b/neural_networks/__init__.py index 42f110b9..14ed6ea0 100644 --- a/neural_networks/__init__.py +++ b/neural_networks/__init__.py @@ -71,9 +71,9 @@ def prepare_batch(self, batch: torch.Tensor, mean=None, std=None) -> torch.Tenso batch, size=self.resize, mode="bilinear", align_corners=False ) if mean is None: - mean = self.mean + mean = self.args["dataset_mean"] if std is None: - std = self.std + std = self.args["dataset_std"] batch = normalize_inputs(batch, mean, std, normalize=1) return batch diff --git a/neural_networks/parameters.txt b/neural_networks/parameters.txt index 43693e44..e7ab7652 100644 --- a/neural_networks/parameters.txt +++ b/neural_networks/parameters.txt @@ -142,3 +142,58 @@ efficientnet_v2_l 5e-05 1 0.2 32 0.3 0 0 16 16 0 stack 0 efficientnet_v2_l 5e-05 1 0.25 32 0.1 0 0 16 16 0 stack 0 efficientnet_v2_l 5e-05 1 0.25 32 0.2 0 0 16 16 0 stack 0 efficientnet_v2_l 5e-05 1 0.25 32 0.3 0 0 16 16 0 stack 0 + + +dinov2_vitl14_reg 0.0001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitl14_reg 0.0001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitl14_reg 0.0001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitl14_reg 0.0001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitl14_reg 0.0001 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vits14_reg 0.0001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vits14_reg 0.0001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vits14_reg 0.0001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vits14_reg 0.0001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vits14_reg 0.0001 1 0.1 32 0.1 0 1 256 256 560 stack 0 + +dinov2_vitl14_reg 0.00001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitl14_reg 0.00001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitl14_reg 0.00001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitl14_reg 0.00001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitl14_reg 0.00001 1 0.1 32 0.1 0 1 256 256 560 stack 0 
+dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vits14_reg 0.00001 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vits14_reg 0.00001 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vits14_reg 0.00001 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vits14_reg 0.00001 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vits14_reg 0.00001 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vitl14_reg 0.00005 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitl14_reg 0.00005 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitl14_reg 0.00005 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitl14_reg 0.00005 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitl14_reg 0.00005 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 1 256 256 560 stack 0 +dinov2_vits14_reg 0.00005 1 0.1 32 0.1 0 1 16 16 560 stack 0 +dinov2_vits14_reg 0.00005 1 0.1 32 0.1 0 1 32 32 560 stack 0 +dinov2_vits14_reg 0.00005 1 0.1 32 0.1 0 1 64 64 560 stack 0 +dinov2_vits14_reg 0.00005 1 0.1 32 0.1 0 1 128 128 560 stack 0 +dinov2_vits14_reg 0.00005 1 0.1 32 0.1 0 1 256 256 560 stack 0 + +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 32 32 784 stack 0 +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 1 64 64 784 stack 0 + +dinov2_vitb14_reg 0.00001 1 0.1 32 0.1 0 0 32 32 784 stack 0 +dinov2_vitb14_reg 0.00005 1 0.1 32 0.1 0 0 32 32 784 stack 0 +dinov2_vitb14_reg 0.0001 1 0.1 32 0.1 0 0 32 32 784 stack 0 \ No newline at end of file diff --git a/neural_networks/pre_processing_for_ml.py b/neural_networks/pre_processing_for_ml.py index a6e20452..fa24f226 100644 --- a/neural_networks/pre_processing_for_ml.py +++ b/neural_networks/pre_processing_for_ml.py @@ -92,12 +92,15 @@ def transform_data(root_dir, classes=("continue", "stop"), modes=("", "_val")): def process_fits(fits_path): with fits.open(fits_path) as hdul: image_data = hdul[0].data - # assert image_data.shape[2] == 2048, (image_data.shape, image_data.shape[2], fits_path) - transformed = normalize_fits(image_data) + if image_data.shape[2] == 2048: + transformed = normalize_fits(image_data) - np.savez_compressed( - fits_path.with_suffix(".npz"), transformed.astype(np.float32) - ) + np.savez_compressed( + fits_path.with_suffix(".npz"), transformed.astype(np.float32) + ) + else: + print(f"Skipping {fits_path}. 
Improper image size {image_data.shape}") + # print(fits_path) root_dir = Path(root_dir) assert root_dir.exists() @@ -174,6 +177,13 @@ def compute_statistics(self, normalize): self.mean, self.std = cached_compute(self, normalize) return self.mean, self.std + # For caching + def __reduce__(self): + return ( + self.__class__, + (self.labels, self.mode, self.label_ratio, self.sources), + ) + @staticmethod def _compute_statistics(loader, normalize, verbose=True): if verbose: @@ -218,7 +228,6 @@ def __getitem__(self, idx): npy_path = self.data_paths[idx] label = self.labels[idx] - image_data = np.load(npy_path)["arr_0"] # there is always only one array # Pre-processing @@ -288,7 +297,7 @@ def make_histogram(root_dir): # make_histogram(root) # dataset.compute_statistics(normalize=1) - # dataset = FitsDataset(root, mode='train', normalize=1) + # dataset = FitsDataset(root, mode="train") # sources = dataset.sources # hash(dataset) # print(sources) @@ -298,8 +307,8 @@ def make_histogram(root_dir): # plt.savefig('test.png') # for img, label in dataset: - # print(img.shape) - # exit() + # if img.shape[2] != 2048: + # exit() # images = np.concatenate([image.flatten() for image, label in Idat]) # print("creating hist") diff --git a/neural_networks/test_scripts/test_cortexchange.py b/neural_networks/test_scripts/test_cortexchange.py new file mode 100644 index 00000000..233509a0 --- /dev/null +++ b/neural_networks/test_scripts/test_cortexchange.py @@ -0,0 +1,24 @@ +from cortexchange.wdclient import init_downloader + +init_downloader( + url="https://researchdrive.surfsara.nl/public.php/webdav/", + login="JeofximLVcr8Ttm", + password="?CortexAdminTest1?", + cache="/home/larsve/.cache/cortexchange", + # cache=".cache/cortexchange", +) + +from cortexchange.architecture import get_architecture, Architecture + +TransferLearning: type(Architecture) = get_architecture("surf/TransferLearning") +model = TransferLearning(device="cpu", model_name="surf/dinov2_october_09902_lora") + +# torch_tensor = model.prepare_data( +# "ILTJ160454.72+555949.7_selfcal/selfcal_007-MFS-image.fits" +# ) +torch_tensor = model.prepare_data( + "/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/stop/ILTJ142906.77+334820.3_image_009-MFS-image.fits" +) +print(torch_tensor.shape) +result = model.predict(torch_tensor) +print(result) diff --git a/neural_networks/train_nn.py b/neural_networks/train_nn.py index ae866f94..e00c0421 100644 --- a/neural_networks/train_nn.py +++ b/neural_networks/train_nn.py @@ -151,8 +151,9 @@ def __init__( use_compile: bool = True, lift: str = "stack", use_lora: bool = False, - alpha=16, - rank=16, + alpha: float = 16.0, + rank: int = 16, + tune_pos_embed: bool = False, ): super().__init__() @@ -167,6 +168,9 @@ def __init__( "use_compile": use_compile, "lift": lift, "use_lora": use_lora, + "rank": rank, + "alpha": alpha, + "tune_pos_embed": tune_pos_embed, } if lift == "stack": @@ -242,6 +246,7 @@ def eval(self): self.dino.eval() else: self.dino.decoder.eval() + self.dino.encoder.pos_embed.requires_grad = False else: self.classifier.eval() @@ -253,12 +258,14 @@ def train(self): self.dino.train() else: self.dino.decoder.train() + # Finetune learnable pos_embedding + self.dino.encoder.pos_embed.requires_grad = self.kwargs["tune_pos_embed"] else: self.classifier.train() def get_dataloaders(dataset_root, batch_size): - num_workers = min(12, len(os.sched_getaffinity(0))) + num_workers = min(18, len(os.sched_getaffinity(0))) prefetch_factor, 
persistent_workers = ( (2, True) if num_workers > 0 else (None, False) @@ -382,6 +389,7 @@ def main( log_path: Path = "runs", epochs: int = 120, flip_augmentations: bool = False, + tune_pos_embed: bool = False, ): torch.set_float32_matmul_precision("high") torch.backends.cudnn.benchmark = True @@ -430,6 +438,7 @@ def main( use_lora=use_lora, alpha=alpha, rank=rank, + tune_pos_embed=tune_pos_embed, ) # noinspection PyArgumentList @@ -500,6 +509,16 @@ def main( "dataset_mean": mean, "dataset_std": std, }, + model_args={ + "model_name": model_name, + "use_compile": use_compile, + "lift": lift, + "use_lora": use_lora, + "rank": rank, + "alpha": alpha, + "dropout_p": dropout_p, + "tune_pos_embed": tune_pos_embed, + }, ) best_val_loss = torch.inf @@ -742,12 +761,26 @@ def load_checkpoint(ckpt_path, device="cuda"): # strip 'model_' from the name model_name = ckpt_dict["args"]["model_name"] - lr = ckpt_dict["args"]["lr"] - dropout_p = ckpt_dict["args"]["dropout_p"] - - model = ckpt_dict["model"](model_name=model_name, dropout_p=dropout_p).to(device) + if "model_args" in ckpt_dict["args"]: + model = ckpt_dict["model"](**ckpt_dict["model_args"]).to(device) + else: + dropout_p = ckpt_dict["args"]["dropout_p"] + use_lora = ckpt_dict["args"]["use_lora"] + rank = ckpt_dict["args"]["rank"] + alpha = ckpt_dict["args"]["alpha"] + lift = ckpt_dict["args"]["lift"] + model_name = ckpt_dict["args"]["model_name"] + + model = ckpt_dict["model"]( + model_name=model_name, + dropout_p=dropout_p, + use_lora=use_lora, + lift=lift, + alpha=alpha, + rank=rank, + ).to(device) model.load_state_dict(ckpt_dict["model_state_dict"]) - + lr = ckpt_dict["args"]["lr"] try: # FIXME: add optim class and args to state dict optim = ckpt_dict.get("optimizer", torch.optim.AdamW)( @@ -860,6 +893,12 @@ def get_argparser(): help="Whether to use LoRA if applicable.", ) + parser.add_argument( + "--tune_pos_embed", + action="store_true", + help="Whether to fine-tune the positional embedding if applicable", + ) + parser.add_argument("--rank", type=int, default=16, help="rank of LoRA") parser.add_argument( diff --git a/neural_networks/train_nn_lora.job b/neural_networks/train_nn_lora.job new file mode 100644 index 00000000..ed9d71c0 --- /dev/null +++ b/neural_networks/train_nn_lora.job @@ -0,0 +1,63 @@ +#!/bin/bash +#SBATCH --job-name=cortex_grid_search +#SBATCH -p gpu +#SBATCH -t 08:00:00 +#SBATCH --gpus 1 +#SBATCH --output=out/multi_cortex%A_%a.out + +set -e + +cd ~/projects/lofar_helpers/neural_networks + + +module load 2023 +source ../../lofar_venv/bin/activate + +# Read the parameter file +PARAM_FILE=parameters.txt + +# Set default value for SLURM_ARRAY_TASK_ID +SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID:=1} +# Extract the specific line corresponding to the SLURM_ARRAY_TASK_ID +PARAMS=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $PARAM_FILE) + +# Parse the parameters +read model lr normalize dropout_p batch_size label_smoothing stochastic_smoothing use_lora rank alpha resize lift flip_augmentations <<< $PARAMS + +if [ "$use_lora" -eq 1 ]; then + LORA_ARG="--use_lora" +else + LORA_ARG="" +fi + +if [ "$stochastic_smoothing" -eq 1 ]; then + STOCHASTIC_SMOOTHING="--stochastic_smoothing" +else + STOCHASTIC_SMOOTHING="" +fi + +if [ "$flip_augmentations" -eq 1 ]; then + FLIP_AUGMENTATIONS="--flip_augmentations" +else + FLIP_AUGMENTATIONS="" +fi + +# Scale up by 1e6 to convert to integers for comparison +scaled_lr=$(echo "$lr * 1000000" | awk '{printf("%d", $1)}') +scaled_threshold=$(echo "4e-05 * 1000000" | awk '{printf("%d", $1)}') + +if [ "$scaled_lr" 
-le "$scaled_threshold" ]; then + EPOCHS="250" +else + EPOCHS="120" +fi + +DATA_INPUT_PATH="/scratch-shared/CORTEX/" +# find $DATA_INPUT_PATH -name '*npz' | xargs -n 1 -P 18 -i rsync -R {} '/dev/shm/' + +DATA_TRAINDATA_PATH="/scratch-shared/CORTEX/public.spider.surfsara.nl/lofarvwf/jdejong/CORTEX/calibrator_selection_robertjan/cnn_data/" + + +# Execute your Python script with the given parameters +echo $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_lora --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS +python train_nn.py $DATA_TRAINDATA_PATH --model $model --lr $lr --normalize $normalize --dropout_p $dropout_p --batch_size $batch_size --log_path grid_search_lora --label_smoothing $label_smoothing --rank $rank --resize $resize --alpha $alpha $LORA_ARG $STOCHASTIC_SMOOTHING -d --epochs $EPOCHS --lift $lift $FLIP_AUGMENTATIONS