Skip to content

Commit

Permalink
Cleanup required_torch_version code and references. (microsoft#5370)
Browse files Browse the repository at this point in the history
- Move `required_torch_version` check from deepspeed.runtime.utils to
deepspeed.utils.torch (newly created).
- Remove unused duplicate definition from `tests/unit/util.py`.
- Update all references to this function.
- Switch checks in `deepspeed/runtime/pipe/p2p.py` to use this function.
- Switch checks in `deepspeed/comm/torch.py` to use this function.

---------

Co-authored-by: Lev Kurilenko <[email protected]>
  • Loading branch information
2 people authored and rraminen committed May 9, 2024
1 parent f373bf6 commit a5d0aaa
Show file tree
Hide file tree
Showing 30 changed files with 56 additions and 85 deletions.
28 changes: 4 additions & 24 deletions deepspeed/comm/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .backend import *
from .comm import *
from ..runtime import compiler
from deepspeed.utils.torch import required_torch_version
import os

DS_COMM_ALL_GATHER_OFF = False
Expand All @@ -18,40 +19,19 @@
DS_COMM_REDUCE_OFF = False


def is_torch_ver_eq_2_0():
TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2])
if TORCH_MAJOR == 2 and TORCH_MINOR == 0:
return True
return False


def is_torch_ver_ge_2_1():
TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2])
if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1:
return True
return False


def torch_ver_ge_1_13():
TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2])
if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13:
return True
return False


def has_coalescing_manager():
    """Tell whether ``torch.distributed.distributed_c10d._coalescing_manager`` exists.

    Returns:
        bool: ``True`` only when ``torch.distributed`` has a
        ``distributed_c10d`` attribute and that attribute in turn has
        ``_coalescing_manager``.
    """
    # Guard first: without distributed_c10d there is nothing further to probe.
    if not hasattr(torch.distributed, 'distributed_c10d'):
        return False
    return hasattr(torch.distributed.distributed_c10d, '_coalescing_manager')


def has_all_reduce_coalesced():
return hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13()
return hasattr(torch.distributed, "all_reduce_coalesced") and required_torch_version(min_version=1.13)


def get_coalescing_manager(group, device, reqs, async_op):
if is_torch_ver_eq_2_0():
if required_torch_version(min_version=2.0, max_version=2.0):
return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs)
elif is_torch_ver_ge_2_1():
elif required_torch_version(min_version=2.1):
return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op)
else:
return torch.distributed.distributed_c10d._coalescing_manager(group, reqs)
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/elasticity/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# DeepSpeed Team

from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version


def is_torch_elastic_compatible():
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/comm/nccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np

from deepspeed.runtime.compression.cupy import CupyBackend
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator


Expand Down
3 changes: 2 additions & 1 deletion deepspeed/runtime/fp16/fused_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, required_torch_version, get_norm_with_moe_layers
from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers
from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE
from deepspeed.utils import logger, log_dist
from deepspeed.utils.torch import required_torch_version
from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD
from deepspeed.accelerator import get_accelerator
from deepspeed.moe.utils import is_moe_param_group
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/fp16/onebit/adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch
import numpy as np
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed import comm as dist


Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/fp16/onebit/lamb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch
import numpy as np
from deepspeed import comm as dist
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from deepspeed.accelerator import get_accelerator

Expand Down
2 changes: 1 addition & 1 deletion deepspeed/runtime/fp16/onebit/zoadam.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch
import numpy as np
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed import comm as dist


Expand Down
3 changes: 2 additions & 1 deletion deepspeed/runtime/fp16/unfused_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
from torch._utils import _flatten_dense_tensors

from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm, required_torch_version
from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm
from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE
from deepspeed.utils import logger
from deepspeed.utils.torch import required_torch_version
from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT
from deepspeed.accelerator import get_accelerator
from deepspeed import comm as dist
Expand Down
8 changes: 2 additions & 6 deletions deepspeed/runtime/pipe/p2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
import torch
from deepspeed import comm as dist

# To query whether we have send/recv support
from packaging.version import Version
from deepspeed.git_version_info import torch_info
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator

_groups = None
Expand All @@ -21,9 +19,7 @@


def can_send_recv() -> bool:
torch_version = Version(torch_info['version'])
sendrecv_min = Version('1.8')
return torch_version >= sendrecv_min
return required_torch_version(min_version=1.8)


#initializes adjacent process groups
Expand Down
15 changes: 0 additions & 15 deletions deepspeed/runtime/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import psutil
import gc
from math import sqrt
from packaging import version as pkg_version

import torch
from deepspeed import comm as dist
Expand Down Expand Up @@ -1036,20 +1035,6 @@ def get_inactive_params(param_list):
param.ds_status == ZeroParamStatus.NOT_AVAILABLE)]


def required_torch_version(min_version=None, max_version=None):
assert min_version or max_version, "Must provide a min_version or max_version argument"

torch_version = pkg_version.parse(torch.__version__)

if min_version and pkg_version.parse(str(min_version)) > torch_version:
return False

if max_version and pkg_version.parse(str(max_version)) < torch_version:
return False

return True


def get_norm_with_moe_layers(non_expert_norm, mpu, expert_tensors, norm_type=2):
""" Compute the global norm with MoE experts
Expand Down
22 changes: 22 additions & 0 deletions deepspeed/utils/torch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from packaging import version as pkg_version

import torch


def required_torch_version(min_version=None, max_version=None):
    """Check whether the installed torch version lies within the given bounds.

    Args:
        min_version: Lowest acceptable version (inclusive), or ``None`` for no
            lower bound.
        max_version: Highest acceptable version (inclusive), or ``None`` for no
            upper bound. The bound covers the whole release series it names:
            ``max_version=2.0`` accepts ``2.0.1`` and ``2.0.1+cu118`` but
            rejects ``2.1.0``; ``max_version="2.0.0"`` still rejects ``2.0.1``.

    Both bounds are converted with ``str()`` before parsing, so prefer passing
    strings: a float literal such as ``1.10`` is parsed as ``1.1``.

    Returns:
        bool: ``True`` if ``torch.__version__`` satisfies every bound provided.

    Raises:
        AssertionError: If neither ``min_version`` nor ``max_version`` is given.
    """
    assert min_version or max_version, "Must provide a min_version or max_version argument"

    torch_version = pkg_version.parse(torch.__version__)

    if min_version and pkg_version.parse(str(min_version)) > torch_version:
        return False

    if max_version:
        upper = pkg_version.parse(str(max_version))
        if upper < torch_version:
            # A strict comparison ranks patch releases and local build tags
            # ("2.0.1", "2.0.0+cu118") above "2.0", which would make
            # max_version=2.0 reject every real 2.0.x install. Accept those by
            # comparing only as many release components as the bound spells out.
            depth = len(upper.release)
            same_series = torch_version.release[:depth] == upper.release
            # A pre-release bound (e.g. "2.0.0rc1") keeps strict ordering so it
            # does not silently admit the final release.
            if upper.is_prerelease or not same_series:
                return False

    return True
2 changes: 1 addition & 1 deletion tests/unit/alexnet_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import deepspeed
import deepspeed.comm as dist
import deepspeed.runtime.utils as ds_utils
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/checkpoint/test_mics_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import deepspeed

from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from unit.common import DistributedTest
from unit.simple_model import *
from unit.checkpoint.common import *
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/checkpoint/test_moe_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# DeepSpeed Team

from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

from unit.common import DistributedTest
from unit.simple_model import *
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/checkpoint/test_universal_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from types import SimpleNamespace
from torch.utils._pytree import tree_map

from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO
from deepspeed.checkpoint.ds_to_universal import main as convert_to_universal

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/checkpoint/test_zero_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

from unit.common import DistributedTest, DistributedFixture
from unit.simple_model import *
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/compression/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress
from deepspeed.compression.helper import convert_conv1d_to_linear
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from unit.common import DistributedTest

pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization
from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer
from deepspeed.inference.quantization.layers import QuantizedLinear
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from transformers.models.opt.modeling_opt import OPTDecoderLayer
from transformers import AutoConfig, OPTConfig, AutoModel
import pytest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from deepspeed.accelerator import get_accelerator
from unit.common import DistributedTest, DistributedFixture
from unit.megatron_model import get_gpt2_model, get_megatron_version
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13),
reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe
from deepspeed.utils import RepeatingLoader
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13),
reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13')
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/moe/test_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from deepspeed import get_accelerator
from deepspeed.moe.sharded_moe import top1gating
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version


@pytest.mark.parametrize("zero_stage", [0, 1, 2])
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/moe/test_moe_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import deepspeed
import pytest
from unit.common import DistributedTest
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.moe.layer import MoE


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/profiling/flops_profiler/test_flops_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from deepspeed.profiling.flops_profiler import get_model_profile
from unit.simple_model import SimpleModel, random_dataloader
from unit.common import DistributedTest
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator

if torch.half not in get_accelerator().supported_dtypes():
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/runtime/compile/test_compile_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

from unit.common import DistributedTest

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/runtime/compile/test_compile_zero.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch

from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator

from unit.runtime.compile.util import compare_loss
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/runtime/compile/test_load_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from unit.simple_model import SimpleModel
import deepspeed
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version

from unit.common import DistributedTest

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/runtime/half_precision/onebit/test_onebit.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from unit.common import DistributedTest
from unit.simple_model import SimpleModel, random_dataloader
from unit.alexnet_model import AlexNetPipe, train_cifar
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator

PipeTopo = PipeDataParallelTopology
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/runtime/half_precision/test_fp16.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from deepspeed.ops.adam import FusedAdam
from unit.common import DistributedTest
from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader
from deepspeed.runtime.utils import required_torch_version
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/runtime/test_ds_initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from deepspeed.ops.adam import FusedAdam
from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR
from deepspeed.runtime.config import ADAM_OPTIMIZER
from deepspeed.runtime.utils import see_memory_usage, required_torch_version
from deepspeed.runtime.utils import see_memory_usage
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator


Expand Down
15 changes: 0 additions & 15 deletions tests/unit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import torch
from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported
from deepspeed.git_version_info import torch_info
from packaging import version as pkg_version


def skip_on_arch(min_arch=7):
Expand Down Expand Up @@ -62,20 +61,6 @@ def bf16_required_version_check(accelerator_check=True):
return False


def required_torch_version(min_version=None, max_version=None):
assert min_version or max_version, "Must provide a min_version or max_version argument"

torch_version = pkg_version.parse(torch.__version__)

if min_version and pkg_version.parse(str(min_version)) > torch_version:
return False

if max_version and pkg_version.parse(str(max_version)) < torch_version:
return False

return True


def required_amp_check():
from importlib.util import find_spec
if find_spec('apex') is None:
Expand Down

0 comments on commit a5d0aaa

Please sign in to comment.