diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
index 1808a1b220e4..bb2b002b1a17 100644
--- a/.github/workflows/cpu-torch-latest.yml
+++ b/.github/workflows/cpu-torch-latest.yml
@@ -19,7 +19,7 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index d2554b7c0038..e33da160aaf3 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -18,7 +18,7 @@ jobs:
 
   # formatting and basic install on cpu-only machine
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v4
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index 8a60823050ab..aab9d28b769c 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -46,7 +46,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout bdf36dc
+          # git checkout bdf36dc
           git rev-parse --short HEAD
           pip install .
 
diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
index 6afc11fddaab..a506bb27fda4 100644
--- a/.github/workflows/nv-pre-compile-ops.yml
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -21,7 +21,7 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     container:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2f571a14b228..02881ef12f39 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   deploy:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     environment: release-env
 
     steps:
diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py
index 4fb107b0fa63..6da48000dafa 100644
--- a/accelerator/xpu_accelerator.py
+++ b/accelerator/xpu_accelerator.py
@@ -26,7 +26,11 @@ def is_synchronized_device(self):
         return False
 
     def use_host_timers(self):
-        return self.is_synchronized_device()
+        # Workaround: XPU events will be consolidated in IPEX 2.5
+        if ipex.__version__ < '2.5':
+            return True
+        else:
+            return self.is_synchronized_device()
 
     def resolves_data_dependency(self):
         return self.is_synchronized_device()
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 9e84121d50fa..c627846b743c 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -213,6 +213,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
             self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint
             if self.grid.get_global_rank() == 0:
                 logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint')
+        if self.module.activation_checkpoint_interval > 0:
+            self.module._precompute_checkpointable_values()
 
         self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline
 
diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 8036faef72ee..3c25cbee66ec 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -196,6 +196,16 @@ def __init__(self,
         #newseed = get_accelerator().initial_seed() + self._grid.get_stage_id()
         #ds_utils.set_random_seed(newseed)
 
+        self.activation_checkpoint_interval = activation_checkpoint_interval
+
+        self.activation_checkpoint_func = activation_checkpoint_func
+
+        # storage for precomputed checkpointable results
+        self.is_checkpointable_results = []
+        self.is_checkpointable_results_interval = None
+
+        # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint``
+
         #with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]):
         self._build()
         self.to(get_accelerator().device_name(self.local_rank))
@@ -203,10 +213,15 @@ def __init__(self,
         self.tied_comms = self._index_tied_modules()
         self._synchronize_tied_weights()
 
-        self.activation_checkpoint_interval = activation_checkpoint_interval
-
-        self.activation_checkpoint_func = activation_checkpoint_func
-        # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint``
+    def _precompute_checkpointable_values(self):
+        if self.activation_checkpoint_interval > 0 and self.is_checkpointable_results_interval != self.activation_checkpoint_interval:
+            num_layers = len(self.forward_funcs)
+            self.interval_was_zero = False
+            for start_idx in range(0, num_layers, self.activation_checkpoint_interval):
+                end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers)
+                funcs = self.forward_funcs[start_idx:end_idx]
+                self.is_checkpointable_results.append(self._is_checkpointable(funcs))
+            self.is_checkpointable_results_interval = self.activation_checkpoint_interval
 
     def _build(self):
         specs = self._layer_specs
@@ -352,7 +367,9 @@ def exec_func(*inputs):
         else:
             num_layers = len(self.forward_funcs)
             x = forward_input
-            for start_idx in range(0, num_layers, self.activation_checkpoint_interval):
+            for start_idx, is_checkpointable_result in \
+                zip(range(0, num_layers, self.activation_checkpoint_interval), self.is_checkpointable_results):
+
                 end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers)
 
                 funcs = self.forward_funcs[start_idx:end_idx]
@@ -361,7 +378,7 @@ def exec_func(*inputs):
                 if not isinstance(x, tuple):
                     x = (x, )
 
-                if self._is_checkpointable(funcs):
+                if is_checkpointable_result:
                     x = self.activation_checkpoint_func(exec_range_func(start_idx, end_idx), *x)
                 else:
                     x = exec_range_func(start_idx, end_idx)(*x)
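
Context for the accelerator/xpu_accelerator.py change: use_host_timers() is now gated on the installed IPEX version, forcing host-side timers until XPU events are consolidated in 2.5. The sketch below illustrates the same version-gated fallback as a standalone function; the function signature, the use of packaging.version for the comparison, and the example version strings are illustrative assumptions, not part of the patch (which compares ipex.__version__ to '2.5' directly as strings).

# Hypothetical, self-contained sketch of the version-gated fallback above.
# Assumption: packaging.version is used here for a robust comparison; the
# actual patch compares the raw version strings.
from packaging.version import Version


def use_host_timers(ipex_version: str, is_synchronized_device: bool = False) -> bool:
    # Strip any local build suffix (e.g. "+xpu") before parsing.
    base_version = ipex_version.split("+")[0]
    # Workaround: before IPEX 2.5, always fall back to host-side timers.
    if Version(base_version) < Version("2.5"):
        return True
    return is_synchronized_device


# Example: an IPEX 2.3 build forces host timers, a 2.5 build defers to the
# device's synchronization behavior.
assert use_host_timers("2.3.110+xpu") is True
assert use_host_timers("2.5.0") is False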
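
Context for the pipe/module.py and pipe/engine.py changes: the per-chunk _is_checkpointable() results are now computed once up front, and the forward loop zips the chunk boundaries with the cached booleans instead of re-evaluating the check on every pass. The sketch below mirrors that caching pattern with a hypothetical TinyPipeline class; the class, its has_params attribute, and the lambda layers are illustrative assumptions, not DeepSpeed's PipelineModule API.

# Minimal, self-contained sketch of the precompute-and-cache pattern;
# TinyPipeline and has_params are hypothetical stand-ins, not DeepSpeed classes.
from typing import Callable, List


class TinyPipeline:

    def __init__(self, forward_funcs: List[Callable], activation_checkpoint_interval: int):
        self.forward_funcs = forward_funcs
        self.activation_checkpoint_interval = activation_checkpoint_interval
        # storage for precomputed checkpointable results
        self.is_checkpointable_results = []
        self.is_checkpointable_results_interval = None

    def _is_checkpointable(self, funcs) -> bool:
        # Stand-in predicate; the real check inspects the layers in the chunk.
        return any(getattr(f, "has_params", False) for f in funcs)

    def _precompute_checkpointable_values(self):
        interval = self.activation_checkpoint_interval
        # Recompute only if the interval changed since the last precompute.
        if interval > 0 and self.is_checkpointable_results_interval != interval:
            self.is_checkpointable_results = []
            num_layers = len(self.forward_funcs)
            for start_idx in range(0, num_layers, interval):
                end_idx = min(start_idx + interval, num_layers)
                funcs = self.forward_funcs[start_idx:end_idx]
                self.is_checkpointable_results.append(self._is_checkpointable(funcs))
            self.is_checkpointable_results_interval = interval

    def forward(self, x):
        interval = self.activation_checkpoint_interval
        num_layers = len(self.forward_funcs)
        # Zip chunk boundaries with the cached booleans, as in the patched loop.
        for start_idx, checkpointable in zip(range(0, num_layers, interval),
                                             self.is_checkpointable_results):
            end_idx = min(start_idx + interval, num_layers)
            # The real engine wraps the chunk in activation_checkpoint_func when
            # `checkpointable` is True; this sketch just runs the layers either way.
            for func in self.forward_funcs[start_idx:end_idx]:
                x = func(x)
        return x


# Usage: precompute once (the engine does this during initialization), then
# reuse the cached results on every forward pass.
layers = [(lambda x: x + 1) for _ in range(8)]
pipe = TinyPipeline(layers, activation_checkpoint_interval=4)
pipe._precompute_checkpointable_values()
print(pipe.is_checkpointable_results)  # [False, False] for these param-less lambdas
print(pipe.forward(0))  # 8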