From 45ba093de87fe1ebb6dba2bdd10b6cfae2ecd56d Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Thu, 21 Sep 2023 14:33:37 -0700
Subject: [PATCH 1/6] destroy function for zero stage 1 and 2

---
 deepspeed/runtime/engine.py             |  3 +++
 deepspeed/runtime/zero/stage_1_and_2.py | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 22d7c882eb1a..d4dc93e5f426 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -362,6 +362,9 @@ def destroy(self):
         if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
             self.optimizer.destroy()

+    def __del__(self):
+        self.destroy()
+
     def _get_model_parameters(self):
         if self.autotuning_profile_model_info():
             self.autotuning_model_info = {}
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 3c1efaad27a0..e26a4abba858 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -519,6 +519,9 @@ def __init__(self,
         self._enable_universal_checkpoint()
         self._param_slice_mappings = self._create_param_mapping()

+    def destroy(self):
+        self._uncreate_param_mapping()
+
     def _enable_universal_checkpoint(self):
         for lp_param_group in self.bit16_groups:
             enable_universal_checkpoint(param_list=lp_param_group)
@@ -535,6 +538,14 @@ def _create_param_mapping(self):

         return param_mapping

+    def _uncreate_param_mapping(self):
+        param_mapping = []
+        for i, _ in enumerate(self.optimizer.param_groups):
+            param_mapping_per_group = OrderedDict()
+            for lp in self.bit16_groups[i]:
+                lp._hp_mapping = None
+        return
+
     def _link_all_hp_params(self):
         dp_world_size = dist.get_world_size(group=self.dp_process_group)
         if self.cpu_offload:

From 90cc1bd9a0dc312f92010b4ade081ca56ea6550e Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Thu, 21 Sep 2023 14:38:43 -0700
Subject: [PATCH 2/6] remove unnecessary lines

---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index e26a4abba858..d141d25da02e 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -539,9 +539,7 @@ def _create_param_mapping(self):
         return param_mapping

     def _uncreate_param_mapping(self):
-        param_mapping = []
         for i, _ in enumerate(self.optimizer.param_groups):
-            param_mapping_per_group = OrderedDict()
             for lp in self.bit16_groups[i]:
                 lp._hp_mapping = None
         return

From 5d5e43e93993f757dd4a1c784f54bbb40bfd0533 Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Thu, 21 Sep 2023 16:08:27 -0700
Subject: [PATCH 3/6] destroy for zero3

---
 deepspeed/runtime/zero/stage3.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 9509b5a692ca..55d13c03608f 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -377,9 +377,12 @@ def __init__(

     def destroy(self):
         self.parameter_offload.destroy()
-        del self.__ipg_bucket_flat_buffer
+        attributes = [attr for attr in vars(self) if not callable(getattr(self, attr))]
+        for attr in attributes:
+            delattr(self,attr)

     def initialize_ds_offload(
+        self,
         module,
         timers,
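PATCHES 1-3 establish the teardown chain: DeepSpeedEngine.destroy() delegates to the optimizer's destroy() when one exists, a __del__ hook triggers it from garbage collection, and the ZeRO-3 destroy() drops every non-callable instance attribute. A minimal standalone sketch of the guarded delegation (toy classes for illustration, not DeepSpeed code):

    # Toy sketch of the engine.py pattern above; names are illustrative only.
    class ToyOptimizer:
        def destroy(self):
            print("optimizer state released")

    class ToyEngine:
        def __init__(self, optimizer=None):
            self.optimizer = optimizer

        def destroy(self):
            # Only delegate when the wrapped optimizer implements destroy(),
            # mirroring the hasattr guard in engine.py above.
            if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
                self.optimizer.destroy()

        def __del__(self):
            # PATCH 1 also runs teardown from garbage collection; __del__ can
            # fire during interpreter shutdown, when module globals may
            # already be cleared. PATCH 4 below removes this hook again.
            self.destroy()

    engine = ToyEngine(ToyOptimizer())
    engine.destroy()  # prints "optimizer state released"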
From 08114139c2fe513c71e41936835754ced715c80c Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Fri, 22 Sep 2023 13:35:00 -0700
Subject: [PATCH 4/6] addressing comments

---
 deepspeed/runtime/engine.py                |  3 ---
 deepspeed/runtime/zero/stage3.py           |  1 -
 deepspeed/runtime/zero/stage_1_and_2.py    | 12 ++++--------
 deepspeed/utils/__init__.py                |  2 +-
 deepspeed/utils/mixed_precision_linkage.py |  9 +++++++++
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index d4dc93e5f426..22d7c882eb1a 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -362,9 +362,6 @@ def destroy(self):
         if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
             self.optimizer.destroy()

-    def __del__(self):
-        self.destroy()
-
     def _get_model_parameters(self):
         if self.autotuning_profile_model_info():
             self.autotuning_model_info = {}
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 55d13c03608f..256bce1c0266 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -382,7 +382,6 @@ def destroy(self):
             delattr(self,attr)

     def initialize_ds_offload(
-        self,
         module,
         timers,
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index d141d25da02e..0561fefb11b0 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -28,7 +28,8 @@
 from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT,
                                             SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD,
                                             ZERO_STAGE, PARAM_SLICE_MAPPINGS)
-from deepspeed.utils import link_hp_params
+
+from deepspeed.utils import link_hp_params, unlink_hp_mapping
 from deepspeed.checkpoint import enable_universal_checkpoint

 # Toggle this to true to enable correctness test
@@ -520,7 +521,8 @@ def __init__(self,
         self._param_slice_mappings = self._create_param_mapping()

     def destroy(self):
-        self._uncreate_param_mapping()
+        for i, _ in enumerate(self.optimizer.param_groups):
+            unlink_hp_mapping(self.bit16_groups[i])

     def _enable_universal_checkpoint(self):
         for lp_param_group in self.bit16_groups:
@@ -538,12 +540,6 @@ def _create_param_mapping(self):

         return param_mapping

-    def _uncreate_param_mapping(self):
-        for i, _ in enumerate(self.optimizer.param_groups):
-            for lp in self.bit16_groups[i]:
-                lp._hp_mapping = None
-        return
-
     def _link_all_hp_params(self):
         dp_world_size = dist.get_world_size(group=self.dp_process_group)
         if self.cpu_offload:
diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py
index b6668b5ff5ce..e6989866e3b1 100644
--- a/deepspeed/utils/__init__.py
+++ b/deepspeed/utils/__init__.py
@@ -14,6 +14,6 @@
 from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state
 from .tensor_fragment import set_full_hp_param
 from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state
-from .mixed_precision_linkage import link_hp_params
+from .mixed_precision_linkage import link_hp_params, unlink_hp_mapping
 from deepspeed.runtime.dataloader import RepeatingLoader
 from .numa import get_numactl_cmd
diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py
index b1afa8f00aa3..d2641de34b4b 100644
--- a/deepspeed/utils/mixed_precision_linkage.py
+++ b/deepspeed/utils/mixed_precision_linkage.py
@@ -7,6 +7,11 @@
 from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping
 from deepspeed.utils import set_full_hp_param

+def _uncreate_param_mapping(self):
+    for i, _ in enumerate(self.optimizer.param_groups):
+        for lp in self.bit16_groups[i]:
+            lp._hp_mapping = None
+    return

 def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict,
                    use_offload, param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group):
@@ -17,6 +22,10 @@ def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_grad
                              offload_gradient_dict, use_offload, param_group_index, partition_start,
                              partition_size, partition_optimizer_state)

+def unlink_hp_mapping(lp_param_list):
+    for lp in lp_param_list:
+        lp._hp_mapping = None
+    return

 def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group):
     current_offset = 0
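PATCH 4 replaces the optimizer-coupled _uncreate_param_mapping() with a free function, unlink_hp_mapping(), that takes a plain parameter list and severs each low-precision parameter's link into the flat fp32 partition. A self-contained sketch of the effect, with SimpleNamespace standing in for torch parameters (an assumption for illustration):

    from types import SimpleNamespace

    # Mirrors the helper added in mixed_precision_linkage.py above.
    def unlink_hp_mapping(lp_param_list):
        for lp in lp_param_list:
            lp._hp_mapping = None  # drop the reference into the fp32 fragment

    lp_params = [SimpleNamespace(_hp_mapping=object()) for _ in range(4)]
    unlink_hp_mapping(lp_params)
    assert all(lp._hp_mapping is None for lp in lp_params)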
From de5ac7669e2f5a35eae58fdf94166255e78368d8 Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Fri, 22 Sep 2023 13:35:49 -0700
Subject: [PATCH 5/6] removing extra func

---
 deepspeed/utils/mixed_precision_linkage.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py
index d2641de34b4b..8390db7ea9bf 100644
--- a/deepspeed/utils/mixed_precision_linkage.py
+++ b/deepspeed/utils/mixed_precision_linkage.py
@@ -7,12 +7,6 @@
 from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping
 from deepspeed.utils import set_full_hp_param

-def _uncreate_param_mapping(self):
-    for i, _ in enumerate(self.optimizer.param_groups):
-        for lp in self.bit16_groups[i]:
-            lp._hp_mapping = None
-    return
-
 def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload,
                    param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group):
     local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group)
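PATCH 6 below factors the attribute wipe from PATCH 3 into a reusable del_obj_attrs() in deepspeed/runtime/utils.py. A standalone sketch of the reflection pattern it relies on (toy class, not DeepSpeed code):

    # vars(obj) lists only instance attributes, so methods defined on the
    # class survive; the callable() check additionally skips any attribute
    # that happens to hold a callable.
    def del_obj_attrs(obj):
        attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))]
        for attr in attributes:
            delattr(obj, attr)

    class ToyOptimizer:
        def __init__(self):
            self.flat_buffer = bytearray(1 << 20)  # stands in for a large tensor
            self.step_count = 0

    opt = ToyOptimizer()
    del_obj_attrs(opt)
    assert not hasattr(opt, 'flat_buffer')  # buffer is now collectable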
From 0d376cba5bbf2924a7f13a381b9e49c5c781f66b Mon Sep 17 00:00:00 2001
From: Joe Mayer
Date: Fri, 29 Sep 2023 13:33:37 -0700
Subject: [PATCH 6/6] changing names for comments

---
 deepspeed/runtime/utils.py                 | 4 ++++
 deepspeed/runtime/zero/stage3.py           | 6 ++----
 deepspeed/runtime/zero/stage_1_and_2.py    | 7 ++++---
 deepspeed/utils/__init__.py                | 2 +-
 deepspeed/utils/mixed_precision_linkage.py | 2 +-
 5 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 7024b93d6820..ebab04c5413c 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -73,6 +73,10 @@ def ensure_directory_exists(filename):
     dirname = os.path.dirname(filename)
     os.makedirs(dirname, exist_ok=True)

+def del_obj_attrs(obj):
+    attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))]
+    for attr in attributes:
+        delattr(obj,attr)

 def set_random_seed(seed):
     """Set the random seed for common PRNGs used during training: random, numpy, and torch.
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 256bce1c0266..39161a440292 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -15,7 +15,7 @@
 from deepspeed.utils import logger
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
 from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce
-from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter
+from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, del_obj_attrs
 from deepspeed.runtime.zero.partition_parameters import *
 from deepspeed.runtime.zero.config import ZeroStageEnum
 from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -377,9 +377,7 @@ def __init__(

     def destroy(self):
         self.parameter_offload.destroy()
-        attributes = [attr for attr in vars(self) if not callable(getattr(self, attr))]
-        for attr in attributes:
-            delattr(self,attr)
+        del_obj_attrs(self)

     def initialize_ds_offload(
         self,
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 0561fefb11b0..3a5ec264b64d 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -12,7 +12,7 @@
 from deepspeed.runtime import ZeROOptimizer
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
-from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage,
+from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, del_obj_attrs,
                                      inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups)
 from deepspeed.runtime.zero.config import ZeroStageEnum
@@ -29,7 +29,7 @@
                                             SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD,
                                             ZERO_STAGE, PARAM_SLICE_MAPPINGS)

-from deepspeed.utils import link_hp_params, unlink_hp_mapping
+from deepspeed.utils import link_hp_params, unlink_hp_params
 from deepspeed.checkpoint import enable_universal_checkpoint

 # Toggle this to true to enable correctness test
@@ -522,7 +522,8 @@ def __init__(self,

     def destroy(self):
         for i, _ in enumerate(self.optimizer.param_groups):
-            unlink_hp_mapping(self.bit16_groups[i])
+            unlink_hp_params(self.bit16_groups[i])
+        del_obj_attrs(self)

     def _enable_universal_checkpoint(self):
         for lp_param_group in self.bit16_groups:
diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py
index e6989866e3b1..80301a9fd6df 100644
--- a/deepspeed/utils/__init__.py
+++ b/deepspeed/utils/__init__.py
@@ -14,6 +14,6 @@
 from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state
 from .tensor_fragment import set_full_hp_param
 from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state
-from .mixed_precision_linkage import link_hp_params, unlink_hp_mapping
+from .mixed_precision_linkage import link_hp_params, unlink_hp_params
 from deepspeed.runtime.dataloader import RepeatingLoader
 from .numa import get_numactl_cmd
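With the renames above, the ZeRO-1/2 destroy() performs a two-step teardown: first unlink every bit16 group's high-precision mapping (while self.bit16_groups still exists), then wipe the wrapper's own attributes. A self-contained mirror of that ordering with toy objects (illustrative names, not DeepSpeed's real fields):

    from types import SimpleNamespace

    def unlink_hp_params(lp_param_list):
        for lp in lp_param_list:
            lp._hp_mapping = None

    def del_obj_attrs(obj):
        attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))]
        for attr in attributes:
            delattr(obj, attr)

    class ToyZeroOptimizer:
        def __init__(self):
            self.bit16_groups = [[SimpleNamespace(_hp_mapping=object())] for _ in range(2)]
            self.optimizer = SimpleNamespace(param_groups=[{}, {}])

        def destroy(self):
            # Unlink before del_obj_attrs(self): the second step deletes
            # self.bit16_groups itself, so the order matters.
            for i, _ in enumerate(self.optimizer.param_groups):
                unlink_hp_params(self.bit16_groups[i])
            del_obj_attrs(self)

    ToyZeroOptimizer().destroy()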
diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py
index 8390db7ea9bf..f42719c86335 100644
--- a/deepspeed/utils/mixed_precision_linkage.py
+++ b/deepspeed/utils/mixed_precision_linkage.py
@@ -16,7 +16,7 @@ def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_grad
                              offload_gradient_dict, use_offload, param_group_index, partition_start,
                              partition_size, partition_optimizer_state)

-def unlink_hp_mapping(lp_param_list):
+def unlink_hp_params(lp_param_list):
     for lp in lp_param_list:
         lp._hp_mapping = None
     return
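End-to-end, the series lets a training script release optimizer state eagerly instead of waiting for garbage collection. A hedged usage sketch; the model, config values, and single-process launch are assumptions for illustration, not part of the patches:

    import torch
    import deepspeed

    model = torch.nn.Linear(8, 2)
    ds_config = {
        "train_batch_size": 4,
        "zero_optimization": {"stage": 2},
        "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    }
    # Assumes a CUDA device and a process started via the deepspeed launcher.
    engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                                   model_parameters=model.parameters(),
                                                   config=ds_config)
    x = torch.randn(4, 8).to(engine.device)
    loss = engine(x).sum()
    engine.backward(loss)
    engine.step()

    engine.destroy()  # unlinks lp->hp mappings and frees optimizer attributes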