{ai,lib}[GCCcore/12.2.0,foss/2022b] PyTorch v2.1.2, NCCL v2.18.3 w/ CUDA 12.0.0 #20520
Draft · wants to merge 4 commits into develop
easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.2.0-CUDA-12.0.0.eb
@@ -0,0 +1,31 @@
name = 'NCCL'
version = '2.18.3'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '12.2.0'}

github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
patches = ['NCCL-2.16.2_fix-cpuid.patch']
checksums = [
('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9',
'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96'),
{'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'},
]

builddependencies = [('binutils', '2.39')]

dependencies = [
('CUDA', '12.0.0', '', SYSTEM),
('UCX-CUDA', '1.13.1', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
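
Once this NCCL module and the PyTorch module below are installed, a quick way to confirm that PyTorch actually picked up this NCCL and CUDA stack is to query the linked versions from Python. A minimal sketch, assuming the PyTorch module is loaded on a node with at least one GPU visible:

# Check that PyTorch reports the NCCL/CUDA versions this PR builds against.
import torch

print("CUDA version:", torch.version.cuda)                     # expect 12.0
print("cuDNN version:", torch.backends.cudnn.version())
print("NCCL available:", torch.distributed.is_nccl_available())
print("NCCL version:", torch.cuda.nccl.version())              # expect (2, 18, 3)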
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022b-CUDA-12.0.0.eb
@@ -0,0 +1,246 @@
name = 'PyTorch'
version = '2.1.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch',
'PyTorch-2.1.0_fix-validationError-output-test.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
'PyTorch-2.1.0_skip-test_wrap_bad.patch',
'PyTorch-2.1.2_add-cuda-skip-markers.patch',
'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
'PyTorch-2.1.2_fix-device-mesh-check.patch',
'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch',
'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch',
'PyTorch-2.1.2_fix-test_cuda-non-x86.patch',
'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
'PyTorch-2.1.2_fix-test_memory_profiler.patch',
'PyTorch-2.1.2_fix-test_parallelize_api.patch',
'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch',
'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch',
'PyTorch-2.1.2_relax-cuda-tolerances.patch',
'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch',
'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
]
checksums = [
{'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
'1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
{'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
{'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch':
'84bb51a719abc677031a7a3dfe4382ff098b0cbd8b39b8bed2a7fa03f80ac1e9'},
{'PyTorch-2.1.0_fix-validationError-output-test.patch':
'7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch':
'0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
'6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
'5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
{'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
{'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
{'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
{'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
{'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch':
'f583532c59f35f36998851957d501b3ac8c883884efd61bbaa308db55cb6bdcd'},
{'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch':
'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'},
{'PyTorch-2.1.2_fix-test_cuda-non-x86.patch': '1ed76fcc87e6c50606ac286487292a3d534707068c94af74c3a5de8153fa2c2c'},
{'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
{'PyTorch-2.1.2_fix-test_memory_profiler.patch':
'30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'},
{'PyTorch-2.1.2_fix-test_parallelize_api.patch':
'f8387a1693af344099c806981ca38df1306d7f4847d7d44713306338384b1cfd'},
{'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch':
'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch':
'90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'},
{'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch':
'07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'},
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
{'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
'6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
{'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch':
'943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'},
{'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch':
'7f5befddcb006b6ab5377de6ee3c29df375c5f8ef5e42b998d35113585b983f3'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.24.3'),
('hypothesis', '6.68.2'),
# For tests
('pytest-flakefinder', '1.1.0'),
('pytest-rerunfailures', '12.0'),
('pytest-shard', '0.1.2'),
]

dependencies = [
('CUDA', '12.0.0', '', SYSTEM),
('cuDNN', '8.8.0.121', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.7.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.18.3', '-CUDA-%(cudaver)s'),
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.10.8'),
('protobuf', '23.0'),
('protobuf-python', '4.23.0'),
('pybind11', '2.10.3'),
('SciPy-bundle', '2023.02'),
('PyYAML', '6.0'),
('MPFR', '4.2.0'),
('GMP', '6.2.1'),
('numactl', '2.0.16'),
('FFmpeg', '5.1.2'),
('Pillow', '9.4.0'),
('expecttest', '0.1.3'),
('networkx', '3.0'),
('sympy', '1.12'),
('Z3', '4.12.2', '-Python-%(pyver)s'),
]

use_pip = True
buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
# Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184
'distributed/tensor/parallel/test_tp_random_state',
# failures on OmniPath systems, which don't support some optional InfiniBand features
# See https://github.com/pytorch/tensorpipe/issues/413
'distributed/pipeline/sync/skip/test_gpipe',
'distributed/pipeline/sync/skip/test_leak',
'distributed/pipeline/sync/test_bugs',
'distributed/pipeline/sync/test_inplace',
'distributed/pipeline/sync/test_pipe',
'distributed/pipeline/sync/test_transparency',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 10

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/122318 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
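
The Ninja runtime dependency ("Required for JIT compilation of C++ extensions") and the PyTorch-check-cpp-extension.py test exist because PyTorch compiles user C++ extensions on the fly. A rough, self-contained sketch of the kind of check such a test might perform — the extension and function names here are illustrative, not taken from the PR:

import torch
from torch.utils.cpp_extension import load_inline

# JIT-compile a trivial inline C++ extension; this exercises the compiler
# toolchain and Ninja the same way real user extensions would.
cpp_source = "torch::Tensor add_one(torch::Tensor x) { return x + 1; }"

ext = load_inline(
    name="check_ext",        # illustrative module name
    cpp_sources=cpp_source,
    functions=["add_one"],   # auto-generate a Python binding for add_one
)
assert torch.equal(ext.add_one(torch.zeros(3)), torch.ones(3))
print("inline C++ extension compiled and ran correctly")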
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch
@@ -0,0 +1,30 @@
This test seems to expect at most 4 GPUs.
Especially when the number of GPUs is not a power of 2 (e.g. 6), test_fsdp_tp_integration_tensor_parallel_size_2_cpu_offload_CPUOffload(offload_params=False) fails:

torch.testing._internal.common_distributed: [ERROR] File "/dev/shm//pytorch/test/distributed/fsdp/test_fsdp_tp_integration.py", line 157, in _sync_tp_grads
torch.testing._internal.common_distributed: [ERROR] per_param_masks = unsharded_zeros.split(splits)
torch.testing._internal.common_distributed: [ERROR] File "/tmp/easybuild-install/lib/python3.10/site-packages/torch/_tensor.py", line 864, in split
torch.testing._internal.common_distributed: [ERROR] return torch._VF.split_with_sizes(self, split_size, dim)
torch.testing._internal.common_distributed: [ERROR] RuntimeError: split_with_sizes expects split_sizes to sum exactly to 105 (input tensor's size at dimension 0), but got split_sizes=[20, 4, 16, 4, 48, 12]

See https://github.com/pytorch/pytorch/issues/141237

Limiting to 4 GPUs seems to work.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
index bc7a4aef4a3..61eb13162f2 100644
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
@@ -71,6 +71,10 @@ class SimpleModel(torch.nn.Module):


class TestTPFSDPIntegration(FSDPTest):
+ @property
+ def world_size(self):
+ return min(4, super().world_size)
+
def _get_params_and_sharding_info(
self,
model: SimpleModel,
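
The underlying failure is easy to reproduce in isolation: Tensor.split with a list of sizes dispatches to split_with_sizes, which requires the sizes to sum exactly to the length of the split dimension. The sizes from the error above sum to 104, not 105:

import torch

t = torch.zeros(105)
try:
    # 20 + 4 + 16 + 4 + 48 + 12 = 104 != 105, same RuntimeError as in the test
    t.split([20, 4, 16, 4, 48, 12])
except RuntimeError as err:
    print(err)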