From 81b45ebd868b8dfb1ce9a74266553e8286bba862 Mon Sep 17 00:00:00 2001 From: fw Date: Mon, 2 Sep 2024 09:19:34 +0000 Subject: [PATCH] . --- realhf/base/constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/realhf/base/constants.py b/realhf/base/constants.py index c2739c5b..7beea2cd 100755 --- a/realhf/base/constants.py +++ b/realhf/base/constants.py @@ -6,6 +6,7 @@ import pathlib from collections import defaultdict from typing import * +import datetime import numpy as np @@ -56,7 +57,7 @@ def get_tensor(self, tensor_shape, dtype, name, force_zero: bool = False): # 30 minutes. Transferring super-large batches via NCCL bcast # for the first time may consumer over 600 secs, which is the # pytorch's default. Increase this value to 30 minutes. -NCCL_DEFAULT_TIMEOUT = 1800 +NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=1800) # constants in experiment instance scope MODEL_SAVE_ROOT = f"{cluster_spec.fileroot}/checkpoints/{getpass.getuser()}"