Destroy ZeRO #4383

Open · wants to merge 8 commits into base: master
4 changes: 4 additions & 0 deletions deepspeed/runtime/utils.py
@@ -73,6 +73,10 @@ def ensure_directory_exists(filename):
     dirname = os.path.dirname(filename)
     os.makedirs(dirname, exist_ok=True)

+def del_obj_attrs(obj):
+    attributes = [attr for attr in vars(obj) if not callable(getattr(obj, attr))]
+    for attr in attributes:
+        delattr(obj, attr)

 def set_random_seed(seed):
     """Set the random seed for common PRNGs used during training: random, numpy, and torch.
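The new helper drops every non-callable attribute from an object's instance dictionary, releasing the (often large) tensors those attributes reference while leaving methods, which live on the class, intact. A minimal sketch of the effect, using a hypothetical Holder class rather than a real ZeRO optimizer:

import torch
from deepspeed.runtime.utils import del_obj_attrs  # helper added by this PR

class Holder:
    def __init__(self):
        self.buffer = torch.empty(1024 * 1024)  # large tensor kept alive by the instance

    def work(self):
        return self.buffer.sum()

h = Holder()
del_obj_attrs(h)   # removes 'buffer'; 'work' is a class attribute and is untouched
print(vars(h))     # {} -> the tensor can now be garbage collected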
4 changes: 2 additions & 2 deletions deepspeed/runtime/zero/stage3.py
@@ -15,7 +15,7 @@
 from deepspeed.utils import logger
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
 from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce
-from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter
+from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, del_obj_attrs
 from deepspeed.runtime.zero.partition_parameters import *
 from deepspeed.runtime.zero.config import ZeroStageEnum
 from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -377,7 +377,7 @@ def __init__(

     def destroy(self):
         self.parameter_offload.destroy()
-        del self.__ipg_bucket_flat_buffer
+        del_obj_attrs(self)

     def initialize_ds_offload(
         self,
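With this change, destroy() on the ZeRO-3 optimizer clears every attribute instead of only the flat IPG bucket buffer, so the partitioned parameters, gradient buffers, and optimizer state it holds can be reclaimed. A hedged sketch of how a caller might use it to free GPU memory between runs in one process; the model and config below are placeholders (not part of this PR) and assume a GPU plus the deepspeed launcher:

import torch
import deepspeed

model = torch.nn.Linear(1024, 1024)
ds_config = {
    "train_batch_size": 8,
    "fp16": {"enabled": True},
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "zero_optimization": {"stage": 3},
}

engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               model_parameters=model.parameters(),
                                               config=ds_config)

# ... training ...

optimizer.destroy()        # drop the ZeRO optimizer's attributes so their memory can be reclaimed
torch.cuda.empty_cache()   # return the freed blocks to the CUDA allocator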
9 changes: 7 additions & 2 deletions deepspeed/runtime/zero/stage_1_and_2.py
@@ -12,7 +12,7 @@

 from deepspeed.runtime import ZeROOptimizer
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
-from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage,
+from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, del_obj_attrs,
                                      inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups)

 from deepspeed.runtime.zero.config import ZeroStageEnum
@@ -28,7 +28,7 @@
 from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT,
                                             SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE,
                                             BASE_OPTIMIZER_STATE_STEP, CLIP_GRAD, ZERO_STAGE, PARAM_SLICE_MAPPINGS)
-from deepspeed.utils import link_hp_params
+from deepspeed.utils import link_hp_params, unlink_hp_params
 from deepspeed.checkpoint import enable_universal_checkpoint

 # Toggle this to true to enable correctness test
@@ -519,6 +519,11 @@ def __init__(self,
         self._enable_universal_checkpoint()
         self._param_slice_mappings = self._create_param_mapping()

+    def destroy(self):
+        for i, _ in enumerate(self.optimizer.param_groups):
+            unlink_hp_params(self.bit16_groups[i])
+        del_obj_attrs(self)
+
     def _enable_universal_checkpoint(self):
         for lp_param_group in self.bit16_groups:
             enable_universal_checkpoint(param_list=lp_param_group)
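For ZeRO stage 1/2 the teardown order matters: every fp16/bf16 parameter holds an _hp_mapping pointing into the flat fp32 partitions, so destroy() first clears those links (unlink_hp_params) and only then deletes the optimizer's own attributes; otherwise the model parameters would keep the fp32 buffers alive. A small, hedged illustration of that reference chain, not DeepSpeed code:

import gc
import torch

lp_param = torch.nn.Parameter(torch.zeros(4, dtype=torch.float16))
flat_fp32_partition = torch.zeros(4, dtype=torch.float32)
lp_param._hp_mapping = {"hp_fragment": flat_fp32_partition}  # stand-in for the real fragment mapping

del flat_fp32_partition        # still referenced through lp_param._hp_mapping
lp_param._hp_mapping = None    # what unlink_hp_params() does for each lp param
gc.collect()                   # nothing references the fp32 partition any more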
2 changes: 1 addition & 1 deletion deepspeed/utils/__init__.py
@@ -14,6 +14,6 @@
 from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state
 from .tensor_fragment import set_full_hp_param
 from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state
-from .mixed_precision_linkage import link_hp_params
+from .mixed_precision_linkage import link_hp_params, unlink_hp_params
 from deepspeed.runtime.dataloader import RepeatingLoader
 from .numa import get_numactl_cmd
5 changes: 4 additions & 1 deletion deepspeed/utils/mixed_precision_linkage.py
@@ -7,7 +7,6 @@
 from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping
 from deepspeed.utils import set_full_hp_param

-
 def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload,
                    param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group):
     local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group)
@@ -17,6 +16,10 @@ def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_grad
                                                        offload_gradient_dict, use_offload, param_group_index,
                                                        partition_start, partition_size, partition_optimizer_state)

+def unlink_hp_params(lp_param_list):
+    for lp in lp_param_list:
+        lp._hp_mapping = None
+    return

 def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group):
     current_offset = 0
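Because unlink_hp_params is also re-exported from deepspeed.utils (see the __init__.py change above), it can be called directly on any list of linked parameters. A minimal, hedged sketch; the mapping object attached here is only a stand-in for what link_hp_params would normally create:

import torch
from deepspeed.utils import unlink_hp_params  # available after this PR

lp_params = [torch.nn.Parameter(torch.zeros(8, dtype=torch.float16)) for _ in range(3)]
for p in lp_params:
    p._hp_mapping = object()   # placeholder for the real fragment mapping

unlink_hp_params(lp_params)
assert all(p._hp_mapping is None for p in lp_params)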