From fdfe47d0c5d168e8d137dde055e3b090797e4888 Mon Sep 17 00:00:00 2001 From: Arne Nix Date: Thu, 30 Jun 2022 10:24:38 +0200 Subject: [PATCH] New Singularity def --- .gitignore | 1 + Singularity.v0.4.def | 2 +- nntransfer/trainer/trainer.py | 10 +++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index b66d494..9a1d5fe 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,4 @@ ENV/ #misc .DS_Store .history.* +*.sif diff --git a/Singularity.v0.4.def b/Singularity.v0.4.def index d5d616c..a2a2cc0 100644 --- a/Singularity.v0.4.def +++ b/Singularity.v0.4.def @@ -94,6 +94,7 @@ From: ubuntu:21.10 jupyterlab \ ipykernel \ opencv-python \ + ffcv \ datajoint==0.12.7 conda run -n ffcv pip install -e /src/bias_transfer @@ -101,7 +102,6 @@ From: ubuntu:21.10 conda run -n ffcv pip install -e /src/nnfabrik conda run -n ffcv pip install -e /src/neuralpredictors conda run -n ffcv pip install -e /src/pytorch_warmup - conda run -n ffcv pip install -e /src/ffcv conda run -n ffcv python -m ipykernel install --user --name=ffcv diff --git a/nntransfer/trainer/trainer.py b/nntransfer/trainer/trainer.py index 9714d39..faedafe 100644 --- a/nntransfer/trainer/trainer.py +++ b/nntransfer/trainer/trainer.py @@ -57,6 +57,7 @@ def __init__(self, dataloaders, model, seed, uid, cb, **kwargs): self.seed = seed self.data_loaders = dataloaders + print(self.data_loaders["train"]) self.task_keys = dataloaders["train"].keys() self.optimizer, self.stop_closure, self.criterion = self.get_training_controls() self.lr_scheduler = self.prepare_lr_schedule() @@ -234,8 +235,15 @@ def main_loop( outputs, loss, targets = module.post_forward( outputs, loss, targets, **shared_memory ) - + if outputs.isinf().any(): + print(outputs) + raise ValueError() + if outputs.isnan().any(): + print(outputs) + raise ValueError() loss = self.compute_loss(mode, task_key, loss, outputs, targets) + if loss.isnan(): + raise ValueError() if not self.config.show_epoch_progress or not mode not in ( "Validation",