diff --git a/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.2.0-CUDA-12.0.0.eb b/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.2.0-CUDA-12.0.0.eb new file mode 100644 index 00000000000..a25e7862106 --- /dev/null +++ b/easybuild/easyconfigs/n/NCCL/NCCL-2.18.3-GCCcore-12.2.0-CUDA-12.0.0.eb @@ -0,0 +1,31 @@ +name = 'NCCL' +version = '2.18.3' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '12.2.0'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_SOURCE] +sources = ['v%(version)s-1.tar.gz'] +patches = ['NCCL-2.16.2_fix-cpuid.patch'] +checksums = [ + ('6477d83c9edbb34a0ebce6d751a1b32962bc6415d75d04972b676c6894ceaef9', + 'b4f5d7d9eea2c12e32e7a06fe138b2cfc75969c6d5c473aa6f819a792db2fc96'), + {'NCCL-2.16.2_fix-cpuid.patch': '0459ecadcd32b2a7a000a2ce4f675afba908b2c0afabafde585330ff4f83e277'}, +] + +builddependencies = [('binutils', '2.39')] + +dependencies = [ + ('CUDA', '12.0.0', '', SYSTEM), + ('UCX-CUDA', '1.13.1', versionsuffix), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022b-CUDA-12.0.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022b-CUDA-12.0.0.eb new file mode 100644 index 00000000000..9cbcda474f7 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2022b-CUDA-12.0.0.eb @@ -0,0 +1,246 @@ +name = 'PyTorch' +version = '2.1.2' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. 
+PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'foss', 'version': '2022b'} + +source_urls = [GITHUB_RELEASE] +sources = ['%(namelower)s-v%(version)s.tar.gz'] +patches = [ + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-1.11.1_skip-test_init_from_local_shards.patch', + 'PyTorch-1.12.1_add-hypothesis-suppression.patch', + 'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch', + 'PyTorch-1.12.1_fix-TestTorch.test_to.patch', + 'PyTorch-1.12.1_skip-test_round_robin.patch', + 'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch', + 'PyTorch-1.13.1_fix-protobuf-dependency.patch', + 'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch', + 'PyTorch-1.13.1_skip-failing-singular-grad-test.patch', + 'PyTorch-1.13.1_skip-tests-without-fbgemm.patch', + 'PyTorch-2.0.1_avoid-test_quantization-failures.patch', + 'PyTorch-2.0.1_fix-skip-decorators.patch', + 'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch', + 'PyTorch-2.0.1_fix-vsx-loadu.patch', + 'PyTorch-2.0.1_no-cuda-stubs-rpath.patch', + 'PyTorch-2.0.1_skip-failing-gradtest.patch', + 'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch', + 'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch', + 'PyTorch-2.1.0_disable-gcc12-warning.patch', + 'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch', + 'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch', + 'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch', + 'PyTorch-2.1.0_fix-validationError-output-test.patch', + 'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch', + 'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch', + 'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch', + 'PyTorch-2.1.0_remove-test-requiring-online-access.patch', + 'PyTorch-2.1.0_skip-diff-test-on-ppc.patch', + 'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch', + 'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch', + 'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch', + 'PyTorch-2.1.0_skip-test_wrap_bad.patch', + 'PyTorch-2.1.2_add-cuda-skip-markers.patch', + 'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch', + 'PyTorch-2.1.2_fix-device-mesh-check.patch', + 'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch', + 'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch', + 'PyTorch-2.1.2_fix-test_cuda-non-x86.patch', + 'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch', + 'PyTorch-2.1.2_fix-test_memory_profiler.patch', + 'PyTorch-2.1.2_fix-test_parallelize_api.patch', + 'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch', + 'PyTorch-2.1.2_fix-vsx-vector-abs.patch', + 'PyTorch-2.1.2_fix-vsx-vector-div.patch', + 'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch', + 'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch', + 'PyTorch-2.1.2_relax-cuda-tolerances.patch', + 'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch', + 'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch', + 'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch', + 'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch', + 'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch', + 'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch', +] +checksums = [ + {'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'}, + {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, + {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch': + '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'}, + 
{'PyTorch-1.12.1_add-hypothesis-suppression.patch': + 'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, + {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch': + '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'}, + {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'}, + {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'}, + {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch': + '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'}, + {'PyTorch-1.13.1_fix-protobuf-dependency.patch': + '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'}, + {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch': + 'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'}, + {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch': + '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'}, + {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch': + '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'}, + {'PyTorch-2.0.1_avoid-test_quantization-failures.patch': + '02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'}, + {'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'}, + {'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch': + '1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'}, + {'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'}, + {'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'}, + {'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'}, + {'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch': + '7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'}, + {'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch': + '166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'}, + {'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'}, + {'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch': + 'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'}, + {'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch': + 'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'}, + {'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch': + '84bb51a719abc677031a7a3dfe4382ff098b0cbd8b39b8bed2a7fa03f80ac1e9'}, + {'PyTorch-2.1.0_fix-validationError-output-test.patch': + '7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'}, + {'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch': + '3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'}, + {'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch': + 'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'}, + {'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch': + '0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'}, + {'PyTorch-2.1.0_remove-test-requiring-online-access.patch': + '35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'}, + {'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'}, + {'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch': + '6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'}, + 
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch': + '5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'}, + {'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch': + '5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'}, + {'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'}, + {'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'}, + {'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch': + 'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'}, + {'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'}, + {'PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch': + 'f583532c59f35f36998851957d501b3ac8c883884efd61bbaa308db55cb6bdcd'}, + {'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch': + 'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'}, + {'PyTorch-2.1.2_fix-test_cuda-non-x86.patch': '1ed76fcc87e6c50606ac286487292a3d534707068c94af74c3a5de8153fa2c2c'}, + {'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch': + 'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'}, + {'PyTorch-2.1.2_fix-test_memory_profiler.patch': + '30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'}, + {'PyTorch-2.1.2_fix-test_parallelize_api.patch': + 'f8387a1693af344099c806981ca38df1306d7f4847d7d44713306338384b1cfd'}, + {'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch': + 'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'}, + {'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'}, + {'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'}, + {'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch': + '90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'}, + {'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch': + '07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'}, + {'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'}, + {'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch': + 'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'}, + {'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch': + '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'}, + {'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch': + '6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'}, + {'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch': + '943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'}, + {'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch': + '7f5befddcb006b6ab5377de6ee3c29df375c5f8ef5e42b998d35113585b983f3'}, + {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch': + 'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'}, +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.24.3'), + ('hypothesis', '6.68.2'), + # For tests + ('pytest-flakefinder', '1.1.0'), + ('pytest-rerunfailures', '12.0'), + ('pytest-shard', '0.1.2'), +] + +dependencies = [ + ('CUDA', '12.0.0', '', SYSTEM), + ('cuDNN', '8.8.0.121', '-CUDA-%(cudaver)s', SYSTEM), + ('magma', '2.7.1', '-CUDA-%(cudaver)s'), + ('NCCL', '2.18.3', '-CUDA-%(cudaver)s'), + ('Ninja', '1.11.1'), # Required for JIT compilation of C++ 
extensions + ('Python', '3.10.8'), + ('protobuf', '23.0'), + ('protobuf-python', '4.23.0'), + ('pybind11', '2.10.3'), + ('SciPy-bundle', '2023.02'), + ('PyYAML', '6.0'), + ('MPFR', '4.2.0'), + ('GMP', '6.2.1'), + ('numactl', '2.0.16'), + ('FFmpeg', '5.1.2'), + ('Pillow', '9.4.0'), + ('expecttest', '0.1.3'), + ('networkx', '3.0'), + ('sympy', '1.12'), + ('Z3', '4.12.2', '-Python-%(pyver)s'), +] + +use_pip = True +buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step + +excluded_tests = { + '': [ + # This test seems to take too long, on NVIDIA Ampere at least. + 'distributed/test_distributed_spawn', + # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375 + 'distributions/test_constraints', + # no xdoctest + 'doctests', + # failing on broadwell + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'test_native_mha', + # intermittent failures on various systems + # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 + 'distributed/rpc/test_tensorpipe_agent', + # Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184 + 'distributed/tensor/parallel/test_tp_random_state', + # failures on OmniPath systems, which don't support some optional InfiniBand features + # See https://github.com/pytorch/tensorpipe/issues/413 + 'distributed/pipeline/sync/skip/test_gpipe', + 'distributed/pipeline/sync/skip/test_leak', + 'distributed/pipeline/sync/test_bugs', + 'distributed/pipeline/sync/test_inplace', + 'distributed/pipeline/sync/test_pipe', + 'distributed/pipeline/sync/test_transparency', + ] +} + +runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' + +# In particular, test_quantization has a few corner cases that are triggered by the random input values; +# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030 +# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294 +# So allow a low number of tests to fail, as the tests "usually" succeed +max_failed_tests = 10 + +# The readelf sanity check command can be taken out once the TestRPATH test from +# https://github.com/pytorch/pytorch/pull/122318 is accepted, since it is then checked as part of the PyTorch test suite +local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT +sanity_check_commands = [ + "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, +] + +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch new file mode 100644 index 00000000000..5a54cac4d0b --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-fsdp-tp-integration-test.patch @@ -0,0 +1,30 @@ +This test seems to expect at most 4 GPUs. +Especially when the number of GPUs is not a power of 2 (e.g.
6), test_fsdp_tp_integration_tensor_parallel_size_2_cpu_offload_CPUOffload(offload_params=False) fails: + +torch.testing._internal.common_distributed: [ERROR] File "/dev/shm//pytorch/test/distributed/fsdp/test_fsdp_tp_integration.py", line 157, in _sync_tp_grads +torch.testing._internal.common_distributed: [ERROR] per_param_masks = unsharded_zeros.split(splits) +torch.testing._internal.common_distributed: [ERROR] File "/tmp/easybuild-install/lib/python3.10/site-packages/torch/_tensor.py", line 864, in split +torch.testing._internal.common_distributed: [ERROR] return torch._VF.split_with_sizes(self, split_size, dim) +torch.testing._internal.common_distributed: [ERROR] RuntimeError: split_with_sizes expects split_sizes to sum exactly to 105 (input tensor's size at dimension 0), but got split_sizes=[20, 4, 16, 4, 48, 12] + +See https://github.com/pytorch/pytorch/issues/141237 + +Limiting the world size to 4 GPUs seems to work. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py +index bc7a4aef4a3..61eb13162f2 100644 +--- a/test/distributed/fsdp/test_fsdp_tp_integration.py ++++ b/test/distributed/fsdp/test_fsdp_tp_integration.py +@@ -71,6 +71,10 @@ class SimpleModel(torch.nn.Module): + + + class TestTPFSDPIntegration(FSDPTest): ++ @property ++ def world_size(self): ++ return min(4, super().world_size) ++ + def _get_params_and_sharding_info( + self, + model: SimpleModel, diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch new file mode 100644 index 00000000000..6ed59612f40 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_cuda-non-x86.patch @@ -0,0 +1,65 @@ +test_cuda fails on non-x86 machines because the tested feature is not available there. +> RuntimeError: record_context_cpp is not support on non-linux non-x86_64 platforms + +Skip the tests on unsupported platforms.
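+For reference, a minimal sketch of the kind of platform check this relies on (an illustrative assumption: the real IS_X86 constant lives in torch.testing._internal.common_utils and its exact definition may differ): + + import platform + + # Hypothetical stand-in for the IS_X86 flag used by the added skip conditions: + # True on typical x86 machines, False on e.g. POWER or ARM. + IS_X86 = platform.machine() in ("x86_64", "i686") +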
+ +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_cuda.py b/test/test_cuda.py +index e81c9365139..79b438060fe 100644 +--- a/test/test_cuda.py ++++ b/test/test_cuda.py +@@ -28,7 +28,7 @@ from torch.utils.checkpoint import checkpoint_sequential + from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_WINDOWS, \ + slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_CUDA, TEST_CUDA_GRAPH, TEST_WITH_ROCM, TEST_NUMPY, \ +- get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson, NoTest, IS_LINUX ++ get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson, NoTest, IS_LINUX, IS_X86 + from torch.testing._internal.common_cuda import TEST_CUDNN, TEST_MULTIGPU, _create_scaling_case, _create_scaling_models_optimizers + from torch.testing._internal.autocast_test_lists import AutocastTestLists + from torch.utils.viz._cycles import observe_tensor_cycles +@@ -3386,7 +3386,7 @@ class TestCudaMallocAsync(TestCase): + finally: + torch.cuda.memory._record_memory_history(None) + +- @unittest.skipIf(not IS_LINUX, "linux only cpp unwinding") ++ @unittest.skipIf(not IS_LINUX or not IS_X86, "linux only cpp unwinding") + def test_direct_traceback(self): + from torch._C._profiler import gather_traceback, symbolize_tracebacks + c = gather_traceback(True, True, True) +@@ -3396,7 +3396,7 @@ class TestCudaMallocAsync(TestCase): + self.assertTrue("unwind" in r) + + @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync") +- @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only") ++ @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only") + def test_memory_snapshot_with_cpp(self): + try: + torch.cuda.memory.empty_cache() +@@ -3432,7 +3432,7 @@ class TestCudaMallocAsync(TestCase): + self.assertTrue('category' in plot) + + @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync") +- @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only") ++ @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only") + def test_cycles(self): + fired = False + +@@ -3469,7 +3469,7 @@ class TestCudaMallocAsync(TestCase): + disarm() + + @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync") +- @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only") ++ @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only") + def test_memory_plots(self): + for context, stacks in (("all", "all" if IS_LINUX else "python"), ("all", "python"), (None, "python")): + try: +@@ -3497,7 +3497,7 @@ class TestCudaMallocAsync(TestCase): + torch.cuda.memory._record_memory_history(None) + + @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync") +- @unittest.skipIf(not IS_LINUX, "cpp contexts are linux only") ++ @unittest.skipIf(not IS_LINUX or not IS_X86, "cpp contexts are linux only") + def test_memory_plots_free_stack(self): + for context in ["alloc", "all", "state"]: + try: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch new file mode 100644 index 00000000000..540885db01d --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-test_parallelize_api.patch @@ -0,0 +1,24 @@ +The test_linear_row_wise_parallel subtest fails when run 
on e.g. 6 GPUs with + +> RuntimeError: a and b must have same reduction dim, but got [9, 18] X [16, 10]. +> RuntimeError: a and b must have same reduction dim, but got [9, 6] X [16, 10]. + +The reason is that the test suite expects at most 4 GPUs. +See https://github.com/pytorch/pytorch/issues/141335 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py +index b5eaf6eaf78..901c7131cfe 100644 +--- a/test/distributed/tensor/parallel/test_parallelize_api.py ++++ b/test/distributed/tensor/parallel/test_parallelize_api.py +@@ -28,8 +28,7 @@ from torch.testing._internal.distributed._tensor.common_dtensor import ( + class TensorParallelAPITests(DTensorTestBase): + @property + def world_size(self): +- gpu_num = torch.cuda.device_count() +- return gpu_num if gpu_num % 2 == 0 and gpu_num > 4 else 4 ++ return 4 + + @with_comms + def test_create_1d_device_mesh(self): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch new file mode 100644 index 00000000000..3735fd1a0b9 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch @@ -0,0 +1,25 @@ +This test has ~400 sub-tests marked as xfail, i.e. expected to fail. +A comment states +> these sometimes pass and sometimes fail + +That doesn't match the definition of "xfail"/"unittest.expectedFailure": only some are really +"expected failures" which have to fail (like operators that are not defined for the tested type), while others pass intermittently. + +To reduce the flakiness and verbosity of this test suite (every xfailed test prints an error and a rather large backtrace), +just skip those tests by setting the last element in each tuple of the list to `False`, +see the definition of `fail` vs `xfail` in that file. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py +index b7d453e56be..15478840ae5 100644 +--- a/test/distributed/_tensor/test_dtensor_ops.py ++++ b/test/distributed/_tensor/test_dtensor_ops.py +@@ -541,6 +541,7 @@ dtensor_fails = { + skip("squeeze"), + } + ++dtensor_fails = [x[:-1] + (False,) for x in dtensor_fails] + + # Add a list of ops that are currently failing BW pass + skip_bw = [