From b0167a68f25a80eb61ec805e45396176ecb1f51f Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Fri, 3 Jan 2025 12:11:23 -0800 Subject: [PATCH] Update gcc version for FBGEMM install in CI (#2654) Summary: TorchRec CI currently is failing with issues on incompatible GLIBCXX version. The cause is that FBGEMM now requires g++ 11.1+ for building binaries that reference GLIBCXX_3.4.29 (as of https://github.com/pytorch/pytorch/pull/141035) As recommended in https://github.com/pytorch/FBGEMM/blob/main/.github/scripts/utils_build.bash and https://github.com/pytorch/FBGEMM/issues/3423, install GCC using conda to control glibcxx version being used. Differential Revision: D67607624 --- .github/workflows/docs.yml | 24 ++++++++++++++++++++++-- .github/workflows/release_build.yml | 26 +++++++++++++++++++++++--- .github/workflows/unittest_ci_cpu.yml | 27 ++++++++++++++++++++++++--- torchrec/distributed/utils.py | 8 ++++++-- 4 files changed, 75 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 51217790e..7e50fbafb 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -33,7 +33,8 @@ jobs: - name: Setup conda run: | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda + bash ~/miniconda.sh -b -p $HOME/miniconda -u + conda update -n base -c defaults -y conda - name: setup Path run: | echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH @@ -51,7 +52,26 @@ jobs: - name: Install gcc shell: bash run: | - sudo apt-get install build-essential + conda install -n build_binary -c conda-forge -y gxx_linux-64=11.4.0 sysroot_linux-64=2.17 + echo "[INSTALL] Setting the C/C++ compiler symlinks ..." + cc_path=$(conda run -n build_binary printenv CC) + cxx_path=$(conda run -n build_binary printenv CXX) + ln -sf "${cc_path}" "$(dirname "$cc_path")/cc" + ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++" + + conda_prefix=$(conda run -n build_binary printenv CONDA_PREFIX) + echo "[TEST] Enumerating libstdc++.so files ..." + all_libcxx_libs=$(find "${conda_prefix}/lib" -type f -name 'libstdc++.so*' -print | sort) + for f in $all_libcxx_libs; do + echo "$f"; + objdump -TC "$f" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + done + + echo "[TEST] Appending the Conda-installed libstdc++ to LD_PRELOAD ..." + conda env config vars set -n build_binary LD_PRELOAD="${all_libcxx_libs[0]}" - name: setup Path run: | echo /usr/local/bin >> $GITHUB_PATH diff --git a/.github/workflows/release_build.yml b/.github/workflows/release_build.yml index 1ea837d4b..f988b3462 100644 --- a/.github/workflows/release_build.yml +++ b/.github/workflows/release_build.yml @@ -48,6 +48,7 @@ jobs: run: | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh bash ~/miniconda.sh -b -p $HOME/miniconda -u + conda update -n base -c defaults -y conda - name: setup Path run: | echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH @@ -62,9 +63,28 @@ jobs: - name: check python version run: | conda run -n build_binary python --version - - name: Install C/C++ compilers - run: | - sudo yum install -y gcc gcc-c++ + - name: Install gcc + run: | + conda install -n build_binary -c conda-forge -y gxx_linux-64=11.4.0 sysroot_linux-64=2.17 + echo "[INSTALL] Setting the C/C++ compiler symlinks ..." + cc_path=$(conda run -n build_binary printenv CC) + cxx_path=$(conda run -n build_binary printenv CXX) + ln -sf "${cc_path}" "$(dirname "$cc_path")/cc" + ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++" + + conda_prefix=$(conda run -n build_binary printenv CONDA_PREFIX) + echo "[TEST] Enumerating libstdc++.so files ..." + all_libcxx_libs=$(find "${conda_prefix}/lib" -type f -name 'libstdc++.so*' -print | sort) + for f in $all_libcxx_libs; do + echo "$f"; + objdump -TC "$f" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + done + + echo "[TEST] Appending the Conda-installed libstdc++ to LD_PRELOAD ..." + conda env config vars set -n build_binary LD_PRELOAD="${all_libcxx_libs[0]}" - name: Install PyTorch and CUDA shell: bash run: | diff --git a/.github/workflows/unittest_ci_cpu.yml b/.github/workflows/unittest_ci_cpu.yml index 1efe64178..80bd6a0b1 100644 --- a/.github/workflows/unittest_ci_cpu.yml +++ b/.github/workflows/unittest_ci_cpu.yml @@ -45,6 +45,30 @@ jobs: conda info python --version conda run -n build_binary python --version + + echo "[INSTALL] Installing gcc..." + conda install -n build_binary -c conda-forge -y gxx_linux-64=11.4.0 sysroot_linux-64=2.17 + + echo "[INSTALL] Setting the C/C++ compiler symlinks ..." + cc_path=$(conda run -n build_binary printenv CC) + cxx_path=$(conda run -n build_binary printenv CXX) + ln -sf "${cc_path}" "$(dirname "$cc_path")/cc" + ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++" + ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++" + + conda_prefix=$(conda run -n build_binary printenv CONDA_PREFIX) + echo "[INSTALL] Enumerating libstdc++.so files ..." + all_libcxx_libs=$(find "${conda_prefix}/lib" -type f -name 'libstdc++.so*' -print | sort) + for f in $all_libcxx_libs; do + echo "$f"; + objdump -TC "$f" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + done + + echo "[INSTALL] Appending the Conda-installed libstdc++ to LD_PRELOAD ..." + current_value=$(conda run -n build_binary printenv LD_PRELOAD) + conda run -n build_binary \ pip install torch --index-url https://download.pytorch.org/whl/nightly/cpu conda run -n build_binary \ @@ -73,9 +97,6 @@ jobs: python -m pytest torchrec -v -s -W ignore::pytest.PytestCollectionWarning --continue-on-collection-errors \ --ignore-glob=**/test_utils/ echo "Starting C++ Tests" - conda install -n build_binary -y gxx_linux-64 - conda run -n build_binary \ - x86_64-conda-linux-gnu-g++ --version conda install -n build_binary -c anaconda redis -y conda run -n build_binary redis-server --daemonize yes mkdir cpp-build diff --git a/torchrec/distributed/utils.py b/torchrec/distributed/utils.py index 8a3db1209..830fef412 100644 --- a/torchrec/distributed/utils.py +++ b/torchrec/distributed/utils.py @@ -525,7 +525,9 @@ def create_global_tensor_shape_stride_from_metadata( """ size = None if parameter_sharding.sharding_type == ShardingType.COLUMN_WISE.value: - row_dim = parameter_sharding.sharding_spec.shards[0].shard_sizes[0] # pyre-ignore[16] + row_dim = parameter_sharding.sharding_spec.shards[0].shard_sizes[ + 0 + ] # pyre-ignore[16] col_dim = 0 for shard in parameter_sharding.sharding_spec.shards: col_dim += shard.shard_sizes[1] @@ -551,4 +553,6 @@ def create_global_tensor_shape_stride_from_metadata( for _ in range(devices_per_node): row_dim += parameter_sharding.sharding_spec.shards[0].shard_sizes[0] size = torch.Size([row_dim, col_dim]) - return size, (size[1], 1) if size else (torch.Size([0, 0]), (0, 1)) # pyre-ignore[7] + return size, ( + (size[1], 1) if size else (torch.Size([0, 0]), (0, 1)) + ) # pyre-ignore[7]