diff --git a/submission_runner.py b/submission_runner.py index 6aff6ded5..fbb2065b1 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -193,10 +193,9 @@ def train_once( save_checkpoints: Optional[bool] = True ) -> Tuple[spec.Timing, Dict[str, Any]]: data_rng, opt_init_rng, model_init_rng, rng = prng.split(rng, 4) - torch.cuda.memory._record_memory_history(enabled=True) - if torch.cuda.is_initialized(): - torch._C._cuda_attach_out_of_memory_observer(oom_observer) + # if torch.cuda.is_initialized(): + # torch._C._cuda_attach_out_of_memory_observer(oom_observer) # Workload setup. logging.info('Initializing dataset.') @@ -471,6 +470,8 @@ def score_submission_on_workload(workload: spec.Workload, log_dir: Optional[str] = None, save_checkpoints: Optional[bool] = True, rng_seed: Optional[int] = None): + torch.cuda.memory._record_memory_history(enabled=True) + # Expand paths because '~' may not be recognized data_dir = os.path.expanduser(data_dir) if imagenet_v2_data_dir: