Skip to content

Commit

Permalink
oom observer
Browse files (browse the repository at this point in the history)
  • Loading branch information
priyakasimbeg committed Sep 22, 2023
1 parent e9992a8 commit 471dcab
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion submission_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def oom_observer(device, alloc, device_alloc, device_free):
snapshot = torch.cuda.memory._snapshot()
dump(snapshot, open('oom_snapshot.pickle', 'wb'))

torch._C._cuda_attach_out_of_memory_observer(oom_observer)

def _reset_cuda_mem():
if FLAGS.framework == 'pytorch' and torch.cuda.is_available():
Expand Down Expand Up @@ -194,6 +193,9 @@ def train_once(
) -> Tuple[spec.Timing, Dict[str, Any]]:
data_rng, opt_init_rng, model_init_rng, rng = prng.split(rng, 4)

if torch.cuda.is_initialized():
torch._C._cuda_attach_out_of_memory_observer(oom_observer)

# Workload setup.
logging.info('Initializing dataset.')
with profiler.profile('Initializing dataset'):
Expand Down

0 comments on commit 471dcab

Please sign in to comment.