diff --git a/dockerfiles/README.md b/dockerfiles/README.md index 4758e76..9b6e153 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -49,10 +49,27 @@ This script can be used as reference to build docker images for Gaudi. make build ``` + #### Build triton vllm backend (default OS - ubuntu22.04): + ``` + cd triton_vllm_backend + make build BUILD_OS=ubuntu22.04 + ``` + 3. Build command variables #### Optional Parameters * BUILD_OS - set the OS to build (default ubuntu22.04) * BUILD_DIR - the folder where the build be executed from (default dockerbuild in image folder) * VERBOSE - set to TRUE to echo the commands (default FALSE) - * DOCKER_CACHE - set to TRUE to use cache for building docker image (default FALSE) \ No newline at end of file + * DOCKER_CACHE - set to TRUE to use cache for building docker image (default FALSE) + +4. Instructions for the triton-vllm-backend server + + * Run the backend container as described in [habana docs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Triton_Inference.html?highlight=triton%20inference#run-the-backend-container) + * Start the Triton server + ```bash + tritonserver --model-repository samples/model_repository + ``` + The current samples/model_repository/vllm_model contains a Llama 2 7B 1x configuration. Sample model files for Llama 2 7B/70B and Qwen2-7B are also provided under the samples/model_repository/test_models folder. To use them, copy the corresponding model.json and config.pbtxt into the vllm_model folder structure (see the sketch below). + * To test with a client, follow the instructions [here](https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#sending-your-first-inference) + diff --git a/dockerfiles/base/Dockerfile.amzn2 b/dockerfiles/base/Dockerfile.amzn2 index f091ab3..dfc548d 100644 --- a/dockerfiles/base/Dockerfile.amzn2 +++ b/dockerfiles/base/Dockerfile.amzn2 @@ -24,8 +24,10 @@ RUN amazon-linux-extras enable python3.8 && \ wget \ lsof \ tar \ - mesa-libGL && \ - yum clean all && rm -rf /var/cache/yum + mesa-libGL \ + sox-devel && \ + yum clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* # Install jemalloc-3.6.0-1.el7.x86_64 package with required /lib64/libjemalloc.so.1 lib need for topologies RUN yum install -y https://archives.fedoraproject.org/pub/archive/epel/7/x86_64/Packages/e/epel-release-7-14.noarch.rpm && \ @@ -39,7 +41,7 @@ RUN yum install -y sudo system-lsb-core cmake COPY install_efa.sh .
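Referring back to step 4 of the README hunk above: a minimal sketch of how the sample model files could be wired up and exercised, assuming the layout of the upstream triton vllm_backend sample (config.pbtxt at the model root, model.json inside the 1/ version directory) and a hypothetical qwen2-7b subfolder name under test_models; verify the actual paths in samples/model_repository before copying.

```bash
# Sketch only: the test_models subfolder name and the 1/ version directory are
# assumptions taken from the upstream triton vllm_backend sample layout.
cd samples/model_repository
cp test_models/qwen2-7b/config.pbtxt vllm_model/config.pbtxt
cp test_models/qwen2-7b/model.json   vllm_model/1/model.json
tritonserver --model-repository "$(pwd)" &    # serve the copied model
# Then test with a client, mirroring the linked "Sending your first inference" example:
curl -X POST localhost:8000/v2/models/vllm_model/generate \
  -d '{"text_input": "What is Habana Gaudi?", "parameters": {"stream": false, "temperature": 0}}'
```

The copied model.json carries the vLLM engine arguments that the backend's model.py (added later in this diff) loads at initialization, so it is the place to point at a different model or tokenizer.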
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh -ENV LIBFABRIC_VERSION="1.20.0" +ENV LIBFABRIC_VERSION="1.22.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" ENV MPI_ROOT=/opt/amazon/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH @@ -55,7 +57,7 @@ ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana AWS Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/AmazonLinux2/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo RUN yum makecache && \ yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \ @@ -74,7 +76,6 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && sed -i 's/[ #]\(.*ForwardAgent \).*/ \1yes/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - ssh-keygen -A && \ mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc # There is no need to store pip installation files inside docker image @@ -94,7 +95,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi cd / && \ rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main -RUN python3 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6 index ecefcd2..2e836c7 100644 --- a/dockerfiles/base/Dockerfile.rhel8.6 +++ b/dockerfiles/base/Dockerfile.rhel8.6 @@ -38,6 +38,8 @@ RUN dnf install -y \ llvm \ lsof \ python38-devel \ + bzip2 \ + bzip2-devel \ openssh-clients \ libjpeg-devel \ openssh-server \ @@ -50,7 +52,8 @@ RUN dnf install -y \ # update pkgs (except OS version) for resolving potentials CVEs dnf versionlock add redhat-release* && \ dnf update -y && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* # CVE-2023-47038 RHSA-2024:3128 RUN dnf module reset perl -y && \ @@ -67,20 +70,22 @@ RUN echo "[appstream]" > /etc/yum.repos.d/CentOS-Linux-AppStream.repo && \ COPY install_efa.sh . 
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh +ENV OPENMPI_VERSION=4.1.6 ENV LIBFABRIC_VERSION="1.20.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" -ENV MPI_ROOT=/opt/amazon/openmpi +ENV MPI_ROOT=/opt/habanalabs/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH ENV OPAL_PREFIX=${MPI_ROOT} ENV MPICC=${MPI_ROOT}/bin/mpicc ENV RDMAV_FORK_SAFE=1 -ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_USE_DEVICE_RDMA=0 +ENV OMPI_MCA_btl=^openib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \ echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \ @@ -109,6 +114,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} +RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \ + make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} + RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ unzip /tmp/main.zip -d /tmp && \ cd /tmp/hccl_ofi_wrapper-main && \ @@ -117,7 +128,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main ENV PYTHON_VERSION=3.8 -RUN python3.8 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3.8 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 && \ alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ @@ -131,7 +142,6 @@ RUN mkdir -p /var/run/sshd && \ sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - ssh-keygen -A && \ mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so diff --git a/dockerfiles/base/Dockerfile.rhel9.2 b/dockerfiles/base/Dockerfile.rhel9.2 index 0fad818..4900b21 100644 --- a/dockerfiles/base/Dockerfile.rhel9.2 +++ b/dockerfiles/base/Dockerfile.rhel9.2 @@ -53,6 +53,7 @@ RUN dnf install -y \ wget \ git \ libffi-devel \ + bzip2 \ bzip2-devel \ zlib-devel \ mesa-libGL \ @@ -61,7 +62,8 @@ RUN dnf install -y \ # update pkgs (except OS version) for resolving potentials CVEs dnf versionlock add redhat-release* && \ dnf 
update -y && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* ENV PYTHON_VERSION=3.10 COPY install-python310.sh . @@ -71,29 +73,31 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH COPY install_efa.sh . RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh +ENV OPENMPI_VERSION=4.1.6 ENV LIBFABRIC_VERSION="1.20.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" -ENV MPI_ROOT=/opt/amazon/openmpi +ENV MPI_ROOT=/opt/habanalabs/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH ENV OPAL_PREFIX=${MPI_ROOT} ENV MPICC=${MPI_ROOT}/bin/mpicc ENV RDMAV_FORK_SAFE=1 -ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_USE_DEVICE_RDMA=0 +ENV OMPI_MCA_btl=^openib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo # for Habana GPG key with SHA-1 signature RUN update-crypto-policies --set DEFAULT:SHA1 RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ - habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ - habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ - habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ + habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ + habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ + habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \ dnf clean all && rm -rf /var/cache/yum @@ -111,6 +115,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} +RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \ + make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} + RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ unzip /tmp/main.zip -d /tmp && \ cd /tmp/hccl_ofi_wrapper-main && \ @@ -118,7 +128,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi cd / && \ rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main -RUN python3.10 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3.10 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 RUN ln -s /usr/bin/python3 /usr/bin/python @@ -130,7 +140,6 @@ RUN mkdir -p /var/run/sshd && \ sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> 
/etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - ssh-keygen -A && \ mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so diff --git a/dockerfiles/base/Dockerfile.rhel9.4 b/dockerfiles/base/Dockerfile.rhel9.4 index 53b62c7..a00aa3a 100644 --- a/dockerfiles/base/Dockerfile.rhel9.4 +++ b/dockerfiles/base/Dockerfile.rhel9.4 @@ -51,14 +51,15 @@ RUN dnf install -y \ lsof \ python3-devel \ openssh-clients \ - openssl-1:3.0.7-27.el9 \ - openssl-devel-1:3.0.7-27.el9 \ + openssl-1:3.0.7-28.el9_4 \ + openssl-devel-1:3.0.7-28.el9_4 \ libjpeg-devel \ openssh-server \ lsb_release \ wget \ git \ libffi-devel \ + bzip2 \ bzip2-devel \ zlib-devel \ mesa-libGL \ @@ -66,13 +67,14 @@ RUN dnf install -y \ python3.11 \ python3.11-pip \ python3.11-devel \ + python3.11-rpm \ ffmpeg-free \ - perl-Net-SSLeay-1.92-2.el9 \ python3-dnf-plugin-versionlock && \ # update pkgs (except OS version) for resolving potentials CVEs - dnf versionlock add redhat-release* openssl* perl-Net-SSLeay && \ + dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \ dnf update -y && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 && \ @@ -84,29 +86,31 @@ RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ COPY install_efa.sh . RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh -ENV LIBFABRIC_VERSION="1.20.0" +ENV OPENMPI_VERSION=4.1.6 +ENV LIBFABRIC_VERSION="1.22.0" ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" -ENV MPI_ROOT=/opt/amazon/openmpi +ENV MPI_ROOT=/opt/habanalabs/openmpi ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH ENV OPAL_PREFIX=${MPI_ROOT} ENV MPICC=${MPI_ROOT}/bin/mpicc ENV RDMAV_FORK_SAFE=1 -ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV FI_EFA_USE_DEVICE_RDMA=0 +ENV OMPI_MCA_btl=^openib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo # for Habana GPG key with SHA-1 signature RUN update-crypto-policies --set DEFAULT:SHA1 RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".el9 \ - habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ - habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ - habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ + habanalabs-thunk-"$VERSION"-"$REVISION".el9 \ + habanalabs-firmware-tools-"$VERSION"-"$REVISION".el9 \ + habanalabs-graph-"$VERSION"-"$REVISION".el9 && \ rm -f /etc/yum.repos.d/habanalabs.repo && rm -f /etc/yum.repos.d/habana.repo && rm -rf /tmp/* && \ dnf clean all && rm -rf /var/cache/yum @@ -124,6 +128,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o 
./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} +RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \ + make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION} + RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ unzip /tmp/main.zip -d /tmp && \ cd /tmp/hccl_ofi_wrapper-main && \ @@ -131,7 +141,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi cd / && \ rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main -RUN python3.11 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 RUN ln -s /usr/bin/python3 /usr/bin/python @@ -143,7 +153,6 @@ RUN mkdir -p /var/run/sshd && \ sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - ssh-keygen -A && \ mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so diff --git a/dockerfiles/base/Dockerfile.suse15.5 b/dockerfiles/base/Dockerfile.suse15.5 new file mode 100644 index 0000000..f53bd3c --- /dev/null +++ b/dockerfiles/base/Dockerfile.suse15.5 @@ -0,0 +1,121 @@ +# Copyright (c) 2024 Habana Labs, Ltd. +# +# SPDX-License-Identifier: Apache-2.0 +# +# HabanaLabs Dockerfile base installer layer for SUSE 15.5 +FROM registry.suse.com/suse/sle15:15.5.36.11.33 +ARG ARTIFACTORY_URL +ARG VERSION +ARG REVISION + +# for RHEL certification +LABEL vendor="Habanalabs Ltd." +LABEL release="${VERSION}-${REVISION}" + +COPY LICENSE /licenses/ + +RUN zypper addrepo -f http://download.opensuse.org/distribution/leap/15.5/repo/oss/ OpenSUSI && \ + echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo && \ + echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSI.repo + +RUN zypper addrepo -f http://download.opensuse.org/source/distribution/leap/15.5/repo/oss/ OpenSUSISrc && \ + echo "gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo && \ + echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/OpenSUSISrc.repo + +RUN zypper install -y --allow-downgrade \ + clang \ + cmake \ + ffmpeg \ + gcc \ + gcc-c++ \ + git \ + glibc-devel \ + iproute \ + jemalloc \ + lbzip2 \ + libarchive-devel \ + libffi-devel \ + libjpeg-devel \ + libksba \ + linux-glibc-devel \ + llvm \ + lsof \ + Mesa-libGL-devel \ + Mesa-libGL1 \ + openssh-clients \ + openssh-server \ + openssl openssl-devel \ + python311 \ + python311-devel \ + python311-pip \ + unzip \ + wget \ + zlib-devel && \ + rm -f /etc/ssh/ssh_host_*_key* + +RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \ + alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 && \ + alternatives --set python3 /usr/bin/python3.11 && \ + alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.11 1 && \ + alternatives --set pip3 /usr/bin/pip3.11 + +COPY install_efa.sh . 
+RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh + +ENV LIBFABRIC_VERSION="1.22.0" +ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" +ENV MPI_ROOT=/opt/amazon/openmpi +ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH +ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH +ENV OPAL_PREFIX=${MPI_ROOT} +ENV MPICC=${MPI_ROOT}/bin/mpicc +ENV RDMAV_FORK_SAFE=1 +ENV FI_EFA_USE_DEVICE_RDMA=1 + +RUN echo "[habanalabs]" > /etc/zypp/repos.d/habanalabs.repo && \ + echo "name=Habana SUSE Linux repo" >> /etc/zypp/repos.d/habanalabs.repo && \ + echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5" >> /etc/zypp/repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/sles/15/15.5/repodata/repomd.xml.key" >> /etc/zypp/repos.d/habanalabs.repo && \ + echo "gpgcheck=1" >> /etc/zypp/repos.d/habanalabs.repo + +RUN zypper --gpg-auto-import-keys install -y habanalabs-rdma-core-"$VERSION"-"$REVISION" \ + habanalabs-thunk-"$VERSION"-"$REVISION" \ + habanalabs-firmware-tools-"$VERSION"-"$REVISION" \ + habanalabs-graph-"$VERSION"-"$REVISION" && \ + rm -f /etc/zypp/repos.d/habanalabs.repo + +ENV PIP_NO_CACHE_DIR=on +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 +ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src +ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib + +RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/ && tar xf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/libfabric-${LIBFABRIC_VERSION} && \ + ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ + make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} + +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ + make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ + cd / && \ + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main + +RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 + + +RUN python3.11 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" + +# SSH configuration necessary to support mpi-operator v2 +RUN mkdir -p /var/run/sshd && \ + sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc + +ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +ENV HABANA_LOGS=/var/log/habana_logs/ +ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins diff --git a/dockerfiles/base/Dockerfile.tencentos3.1 b/dockerfiles/base/Dockerfile.tencentos3.1 index 2cf70aa..c5a28a7 100644 --- a/dockerfiles/base/Dockerfile.tencentos3.1 +++ b/dockerfiles/base/Dockerfile.tencentos3.1 @@ -42,7 +42,8 @@ RUN dnf install -y \ iproute \ python3-dnf-plugin-versionlock && \ dnf versionlock add redhat-release* && \ - dnf clean all && rm -rf /var/cache/yum + dnf clean all && rm -rf /var/cache/yum && \ + rm -f /etc/ssh/ssh_host_*_key* COPY
install-python310.sh . RUN ./install-python310.sh tencentos3.1 && rm install-python310.sh @@ -65,7 +66,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/tencentos/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \ habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \ @@ -95,7 +96,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi cd / && \ rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main -RUN python3 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" @@ -105,7 +106,6 @@ RUN mkdir -p /var/run/sshd && \ sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ - ssh-keygen -A && \ mkdir -p /var/run/sshd && echo "/usr/sbin/sshd -p 3022" | tee -a ~/.bashrc ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so diff --git a/dockerfiles/base/Dockerfile.ubuntu22.04 b/dockerfiles/base/Dockerfile.ubuntu22.04 index 4a29c98..b322cbd 100644 --- a/dockerfiles/base/Dockerfile.ubuntu22.04 +++ b/dockerfiles/base/Dockerfile.ubuntu22.04 @@ -48,13 +48,14 @@ RUN apt-get update && \ libkrb5-3 \ libgnutls30 \ wget && \ - apt-get autoremove && apt-get clean + apt-get autoremove && apt-get clean && \ + rm -f /etc/ssh/ssh_host_*_key* # There is no need to store pip installation files inside docker image ENV PIP_NO_CACHE_DIR=on ENV PIP_DISABLE_PIP_VERSION_CHECK=1 -RUN python3 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4 +RUN python3 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0 COPY install_efa.sh . 
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh @@ -76,11 +77,14 @@ RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --d chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ apt-get update && \ + cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ + sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ habanalabs-thunk="$VERSION"-"$REVISION" \ habanalabs-firmware-tools="$VERSION"-"$REVISION" \ habanalabs-graph="$VERSION"-"$REVISION" && \ apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ + mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ diff --git a/dockerfiles/base/Dockerfile.ubuntu24.04 b/dockerfiles/base/Dockerfile.ubuntu24.04 new file mode 100644 index 0000000..7f47c08 --- /dev/null +++ b/dockerfiles/base/Dockerfile.ubuntu24.04 @@ -0,0 +1,114 @@ +# Copyright (c) 2024 HabanaLabs, Ltd. +# +# SPDX-License-Identifier: Apache-2.0 +# +# HabanaLabs Dockerfile base installer layer for Ubuntu 24.04 +FROM ubuntu:noble +ARG ARTIFACTORY_URL +ARG VERSION +ARG REVISION + +ENV DEBIAN_FRONTEND=noninteractive +ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +ENV HABANA_LOGS=/var/log/habana_logs/ +ENV OS_NUMBER=2404 +ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + apt-transport-https \ + apt-utils \ + bc \ + build-essential \ + ca-certificates \ + dkms \ + ethtool \ + gcc \ + git \ + gnupg \ + gpg-agent \ + graphviz \ + libgl1 \ + libgoogle-glog0v6t64 \ + libjemalloc2 \ + libpq-dev \ + lsof \ + make \ + openssh-client \ + openssh-server \ + protobuf-compiler \ + python3 \ + python3-dev \ + python3-pip \ + python3-tk \ + python3-venv \ + unzip \ + vim \ + libkrb5-3 \ + libgnutls30 \ + wget && \ + apt-get autoremove && apt-get clean && \ + rm -f /etc/ssh/ssh_host_*_key* + +# There is no need to store pip installation files inside docker image +ENV PIP_NO_CACHE_DIR=on +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 + +RUN python3 -m pip install pip==24.0 setuptools==75.1.0 wheel==0.42.0 --break-system-packages + +COPY install_efa.sh . 
+RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh + +ENV LIBFABRIC_VERSION="1.20.0" +ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}" +ENV MPI_ROOT=/opt/amazon/openmpi +ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH +ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH +ENV OPAL_PREFIX=${MPI_ROOT} +ENV MPICC=${MPI_ROOT}/bin/mpicc +ENV RDMAV_FORK_SAFE=1 +ENV FI_EFA_USE_DEVICE_RDMA=1 +ENV RDMA_CORE_ROOT=/opt/habanalabs/rdma-core/src +ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib + +RUN wget -O- https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public | gpg --dearmor -o /usr/share/keyrings/habana-artifactory.gpg && \ + chown root:root /usr/share/keyrings/habana-artifactory.gpg && \ + chmod 644 /usr/share/keyrings/habana-artifactory.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/habana-artifactory.gpg] https://${ARTIFACTORY_URL}/artifactory/debian noble main" | tee -a /etc/apt/sources.list && \ + apt-get update && \ + cp /etc/dpkg/dpkg.cfg.d/excludes /etc/dpkg/dpkg.cfg.d/excludes.bak && \ + sed -i '/path-exclude=\/usr\/share\/doc/d' /etc/dpkg/dpkg.cfg.d/excludes && \ + apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ + habanalabs-thunk="$VERSION"-"$REVISION" \ + habanalabs-firmware-tools="$VERSION"-"$REVISION" \ + habanalabs-graph="$VERSION"-"$REVISION" && \ + apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ + mv -f /etc/dpkg/dpkg.cfg.d/excludes.bak /etc/dpkg/dpkg.cfg.d/excludes && \ + sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list + +RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/ofiwg/libfabric/releases/download/v${LIBFABRIC_VERSION}/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/ && tar xf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 && \ + cd /tmp/libfabric-${LIBFABRIC_VERSION} && \ + ./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \ + make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION} + +RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \ + unzip /tmp/main.zip -d /tmp && \ + cd /tmp/hccl_ofi_wrapper-main && \ + make && cp -f libhccl_ofi_wrapper.so /usr/lib/habanalabs/libhccl_ofi_wrapper.so && \ + cd / && \ + rm -rf /tmp/main.zip /tmp/hccl_ofi_wrapper-main + +RUN python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --break-system-packages + +# SSH configuration necessary to support mpi-operator v2 +RUN mkdir -p /var/run/sshd && \ + sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + sed -i 's/#\(ForwardAgent \).*/\1yes/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + echo "/etc/init.d/ssh start \"-p 3022\"" >> ~/.bashrc && \ + sed -i '/[ -z "$PS1" ] && return/s/^/#/g' ~/.bashrc + +RUN mv /usr/lib/python3.12/EXTERNALLY-MANAGED /usr/lib/python3.12/EXTERNALLY-MANAGED.old \ No newline at end of file diff --git a/dockerfiles/base/install_efa.sh b/dockerfiles/base/install_efa.sh index bb6f680..e651dff 100755 --- a/dockerfiles/base/install_efa.sh +++ b/dockerfiles/base/install_efa.sh @@ -1,22 +1,38 @@ #!/bin/bash -ex -DEFAULT_EFA_INSTALLER_VER=1.29.0 +DEFAULT_EFA_INSTALLER_VER=1.34.0 efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER} 
tmp_dir=$(mktemp -d) wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-$efa_installer_version.tar.gz -P $tmp_dir tar -xf $tmp_dir/aws-efa-installer-$efa_installer_version.tar.gz -C $tmp_dir +RUN_EFA_INSTALLER="./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify" pushd $tmp_dir/aws-efa-installer -case $(. /etc/os-release ; echo -n $ID) in +. /etc/os-release +case $ID in rhel) # we cannot install dkms packages on RHEL images due to OCP rules - rm -f RPMS/RHEL8/x86_64/dkms*.rpm + find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \; + find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \; + case $VERSION_ID in + 8*) + dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm + ;; + 9*) + dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm + ;; + *) + echo "Unsupported RHEL version: $VERSION_ID" + exit 1 + ;; + esac + RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'" ;; tencentos) - dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-46.0-1.el8.x86_64.rpm RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-utils-46.0-1.el8.x86_64.rpm + dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch ;; esac -./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify +eval $RUN_EFA_INSTALLER popd rm -rf $tmp_dir diff --git a/dockerfiles/base/tencentos_efa_patch.txt b/dockerfiles/base/tencentos_efa_patch.txt index 9b08bed..cfd050d 100644 --- a/dockerfiles/base/tencentos_efa_patch.txt +++ b/dockerfiles/base/tencentos_efa_patch.txt @@ -1,5 +1,5 @@ diff --git a/common.sh b/common.sh -index cae76fc..afe440a 100755 +index 3c3a0e4..b463f42 100755 --- a/common.sh +++ b/common.sh @@ -50,6 +50,15 @@ has_substring() { @@ -18,70 +18,88 @@ index cae76fc..afe440a 100755 is_amazon_linux_2() { . 
/etc/os-release if [ "$NAME" = "Amazon Linux" ] && [ "$VERSION_ID" = "2" ]; then -@@ -183,7 +192,7 @@ is_suse_15() { +@@ -164,7 +173,7 @@ is_suse_15() { } install_cmd() { -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then if [ $1 == "localinstall" ]; then shift - if is_centos_8; then -@@ -207,7 +216,7 @@ install_cmd() { + yum -y localinstall $@ +@@ -181,7 +190,7 @@ install_cmd() { fi } search_cmd() { -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then yum list installed $@ elif is_suse_15; then zypper search --installed-only --match-exact $@ -@@ -219,7 +228,7 @@ search_cmd() { - fi +@@ -194,7 +203,7 @@ search_cmd() { } remove_cmd() { -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_centos_8 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then - yum -y remove $@ - elif is_suse_15; then - zypper remove -y $@ + # we don't remove the dependencies of the efa packages as it may have reverse dependencies on other system packages +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then + rpm --erase --nodeps $@ + elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then + # purge is identical to remove except that packages are removed and purged +@@ -207,7 +216,7 @@ remove_cmd() { + } + # Get the list of file installed by the package name + query_file_list_cmd() { +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then + rpm -ql $@ + elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then + dpkg -L $@ +@@ -220,7 +229,7 @@ query_file_list_cmd() { + # reverse dependencies (some other installed packages depend on them) + # this command will return non-zero + remove_dryrun_cmd() { +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || 
is_suse_15; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_suse_15 || is_tencentos_3; then + rpm --erase --test $@ + elif is_debian_10 || is_debian_11 || is_ubuntu_2004 || is_ubuntu_2204 || is_ubuntu_2404; then + dpkg -r --dry-run $@ diff --git a/efa_installer.sh b/efa_installer.sh -index 35a3628..5e94a21 100755 +index 544673f..faf3369 100755 --- a/efa_installer.sh +++ b/efa_installer.sh -@@ -49,7 +49,7 @@ EOF +@@ -97,7 +97,7 @@ select_mpi() { } detect_os() { -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then PACKAGE_TYPE="rpm" KERNEL_SEARCH_STRING=kernel INSTALL_ARGS="--setopt=skip_missing_names_on_install=False" -@@ -170,7 +170,7 @@ setup_install_package_paths() { - if is_centos_7 || is_rhel_7; then - base_dir="RPMS/CENT7/${arch}" - debug_dir="RPMS/CENT7/${arch}/debug" -- elif is_rhel_8 || is_rockylinux_8; then -+ elif is_rhel_8 || is_rockylinux_8 || is_tencentos_3; then +@@ -209,7 +209,7 @@ setup_install_package_paths() { + local kmod_path + + if [ "${PACKAGE_TYPE}" = "rpm" ]; then +- if is_rhel_8 || is_rockylinux_8; then ++ if is_rhel_8 || is_rockylinux_8|| is_tencentos_3; then base_dir="RPMS/ROCKYLINUX8/${arch}" debug_dir="RPMS/ROCKYLINUX8/${arch}/debug" elif is_rockylinux_9 || is_rhel_9; then -@@ -390,7 +390,7 @@ install_apt_package() { +@@ -465,7 +465,7 @@ install_apt_package() { install_dependencies() { local packages -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then packages="pciutils rpmdevtools" if [ ${SKIP_KMOD} -eq 0 ]; then for kernel in ${INSTALLED_KERNELS[@]}; do -@@ -642,7 +642,7 @@ uninstall_efa() { +@@ -785,7 +785,7 @@ uninstall_efa() { uninstall_old_efa_packages() { # Uninstall 'openmpi' and 'libfabric' if packaged by AWS. -- if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then -+ if is_amazon_linux_2 || is_amazon_linux_2023 || is_centos_7 || is_rhel_7 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 | is_tencentos_3; then +- if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9; then ++ if is_amazon_linux_2 || is_amazon_linux_2023 || is_rhel_8 || is_rockylinux_8 || is_rockylinux_9 || is_rhel_9 || is_tencentos_3; then for pkg in openmpi libfabric libfabric-debuginfo; do rpm -ql $pkg | grep -q /opt/amazon if [ $? 
-eq 0 ]; then diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk index 4404cd5..6e29640 100644 --- a/dockerfiles/common.mk +++ b/dockerfiles/common.mk @@ -5,9 +5,9 @@ BUILD_OS ?= ubuntu22.04 BUILD_DIR ?= $(CURDIR)/dockerbuild REPO_SERVER ?= vault.habana.ai -PT_VERSION ?= 2.3.1 -RELEASE_VERSION ?= 1.17.1 -RELEASE_BUILD_ID ?= 40 +PT_VERSION ?= 2.4.0 +RELEASE_VERSION ?= 1.18.0 +RELEASE_BUILD_ID ?= 524 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) diff --git a/dockerfiles/pytorch/Dockerfile.rhel9.4 b/dockerfiles/pytorch/Dockerfile.rhel9.4 index 4e65878..d09fafe 100644 --- a/dockerfiles/pytorch/Dockerfile.rhel9.4 +++ b/dockerfiles/pytorch/Dockerfile.rhel9.4 @@ -26,7 +26,7 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo RUN dnf install --allowerasing -y \ - curl \ + curl-7.76.1-29.el9_4.1 \ cairo-devel \ numactl-devel \ iproute \ @@ -38,10 +38,19 @@ RUN dnf install --allowerasing -y \ gperftools-devel && \ dnf clean all && rm -rf /var/cache/yum -RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \ - dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \ +RUN echo "[oneAPI]" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \ + echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \ + echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \ + echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo + +RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \ dnf clean all && rm -rf /var/cache/yum +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH} + COPY install_packages.sh . RUN ./install_packages.sh && rm -f install_packages.sh && \ diff --git a/dockerfiles/pytorch/Dockerfile.suse15.5 b/dockerfiles/pytorch/Dockerfile.suse15.5 new file mode 100644 index 0000000..8fe9f54 --- /dev/null +++ b/dockerfiles/pytorch/Dockerfile.suse15.5 @@ -0,0 +1,47 @@ +# Copyright (c) 2024 HabanaLabs, Ltd. +# +# SPDX-License-Identifier: Apache-2.0 +# +# HabanaLabs Dockerfile PyTorch installer layer for SUSE 15.5 +ARG BASE_NAME +ARG VERSION +ARG REVISION +FROM ${BASE_NAME}:${VERSION}-${REVISION} +ARG PT_VERSION +ARG VERSION +ARG REVISION +ARG BASE_NAME +ARG ARTIFACTORY_URL + +# for RHEL certification +LABEL name="PyTorch Installer" +LABEL summary="Habanalabs PyTorch installer layer for SUSE 15.5" +LABEL description="Image with pre installed Habanalabs packages for PyTorch" + +ENV PYTHONPATH=/root:/usr/lib/habanalabs/ + +RUN zypper install -y --allow-downgrade \ + cairo-devel \ + numactl \ + lapack-devel \ + numactl \ + gperftools-devel + +RUN zypper addrepo -f https://yum.repos.intel.com/oneapi oneAPI && \ + echo "gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo && \ + echo "repo_gpgcheck=0" >> /etc/zypp/repos.d/oneAPI.repo + +RUN zypper install -y intel-oneapi-mkl-2021.1.1 intel-oneapi-mkl-devel-2021.1.1 + + +COPY install_packages.sh .
+ +RUN ./install_packages.sh && rm -f install_packages.sh && \ + /sbin/ldconfig && echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc + +# Set LD_PRELOAD after all required installations to +# avoid warnings during docker creation +ENV LD_PRELOAD=/usr/lib64/libtcmalloc.so.4 +ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 + +RUN rm -rf /tmp/* \ No newline at end of file diff --git a/dockerfiles/pytorch/install_packages.sh b/dockerfiles/pytorch/install_packages.sh index a2cf8dd..396ab29 100755 --- a/dockerfiles/pytorch/install_packages.sh +++ b/dockerfiles/pytorch/install_packages.sh @@ -4,6 +4,9 @@ set -ex pt_package_name="pytorch_modules-v${PT_VERSION}_${VERSION}_${REVISION}.tgz" os_string="ubuntu${OS_NUMBER}" case "${BASE_NAME}" in + *sles15.5* | *suse15.5*) + os_string="suse155" + ;; *rhel9.2*) os_string="rhel92" ;; diff --git a/dockerfiles/triton/Dockerfile b/dockerfiles/triton/Dockerfile index 8da6a12..d8ac15a 100644 --- a/dockerfiles/triton/Dockerfile +++ b/dockerfiles/triton/Dockerfile @@ -52,8 +52,8 @@ RUN apt-get update && apt-get install -y \ libgoogle-perftools-dev && \ apt-get clean && rm -rf /var/lib/apt/lists/* -RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ - python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ +RUN python3 -m pip install pip==24.2 --disable-pip-version-check && \ + python3 -m pip install setuptools==75.1.0 --disable-pip-version-check && \ python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ diff --git a/dockerfiles/triton_vllm_backend/Dockerfile b/dockerfiles/triton_vllm_backend/Dockerfile new file mode 100644 index 0000000..6011cf2 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/Dockerfile @@ -0,0 +1,79 @@ +# Copyright (c) 2023 HabanaLabs, Ltd. +# +# SPDX-License-Identifier: Apache-2.0 +# +# HabanaLabs Dockerfile triton installer layer for Ubuntu 22.04 +FROM nvcr.io/nvidia/tritonserver:24.06-py3 +ARG ARTIFACTORY_URL +ARG PT_VERSION +ARG VERSION +ARG REVISION +ARG HABANA_PIP_VERSION="22.3" +ARG PT_BUILD_REPO=gaudi-pt-modules +ARG PT_PACKAGE_NAME="pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz" +ARG PT_ARTIFACT_PATH="https://"${ARTIFACTORY_URL}"/artifactory/${PT_BUILD_REPO}/"${VERSION}"/"${REVISION}"/pytorch/ubuntu2204" +ENV DEBIAN_FRONTEND=noninteractive +ENV GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +ENV HABANA_LOGS=/var/log/habana_logs/ +ENV HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins +ENV PIP_NO_CACHE_DIR=on +ENV PIP_DEFAULT_TIMEOUT=1000 +ENV MPI_ROOT=/opt/hpcx/ompi +ENV LD_LIBRARY_PATH=${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH +ENV PATH=${MPI_ROOT}/bin:$PATH +ENV OPAL_PREFIX=${MPI_ROOT} +ENV MPICC=${MPI_ROOT}/bin/mpicc +ENV RDMAV_FORK_SAFE=1 +ENV PYTHONPATH=/root:/usr/lib/habanalabs/ + +ADD model.py . 
+RUN echo "deb https://${ARTIFACTORY_URL}/artifactory/debian jammy main" | tee -a /etc/apt/sources.list && \ + wget "https://${ARTIFACTORY_URL}/artifactory/api/gpg/key/public" && \ + apt-key add public && rm public && apt-get update && \ + apt-get install -y habanalabs-rdma-core="$VERSION"-"$REVISION" \ + habanalabs-thunk="$VERSION"-"$REVISION" \ + habanalabs-firmware-tools="$VERSION"-"$REVISION" \ + habanalabs-graph="$VERSION"-"$REVISION" && \ + apt-get autoremove --yes && apt-get clean && rm -rf /var/lib/apt/lists/* && \ + sed --in-place "/$ARTIFACTORY_URL/d" /etc/apt/sources.list + +RUN apt-get update && apt-get install -y \ + libjemalloc2 \ + libcairo2-dev \ + libglib2.0-dev \ + libhdf5-dev \ + libnuma-dev \ + libpcre2-dev \ + libjpeg-dev \ + liblapack-dev \ + libopenblas-dev \ + numactl \ + libgoogle-perftools-dev && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install pip==23.3.1 --disable-pip-version-check && \ + python3 -m pip install setuptools==67.3.3 --disable-pip-version-check && \ + python3 -m pip install habana_media_loader=="${VERSION}"."${REVISION}" --disable-pip-version-check + +RUN ln -s /usr/bin/python3.10 /usr/bin/python && wget --no-verbose "${PT_ARTIFACT_PATH}/${PT_PACKAGE_NAME}" && \ + mkdir -p /root/habanalabs/pytorch_temp && \ + tar -xf pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz -C /root/habanalabs/pytorch_temp/. && \ + python3 -m pip install pip=="${HABANA_PIP_VERSION}" && \ + pip install mpi4py==3.1.4 --disable-pip-version-check && \ + #pip install $(grep -ivE "#|lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt | grep .) --no-warn-script-location --disable-pip-version-check && \ + pip install /root/habanalabs/pytorch_temp/*.whl --disable-pip-version-check && \ + pip install $(grep "lightning" /root/habanalabs/pytorch_temp/requirements-pytorch.txt) --disable-pip-version-check && \ + echo "source /etc/profile.d/habanalabs.sh" >> ~/.bashrc && \ + pip uninstall -y pillow && \ + pip uninstall -y pillow-simd && \ + pip install pillow-simd==7.0.0.post3 --disable-pip-version-check && \ + rm -rf /root/habanalabs pytorch_modules-v"${PT_VERSION}"_"${VERSION}"_"${REVISION}".tgz /tmp/* +#RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@v0.4.2-Gaudi-1.16.0 +RUN python3 -m pip install --no-cache-dir git+https://github.com/HabanaAI/vllm-fork.git@275e3250ba6ed8cc13b2d6e4928db73df420e64b + +RUN mkdir -p /opt/tritonserver/backends/vllm +COPY model.py /opt/tritonserver/backends/vllm/ + +ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libtcmalloc.so.4 +ENV TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=7516192768 diff --git a/dockerfiles/triton_vllm_backend/Makefile b/dockerfiles/triton_vllm_backend/Makefile new file mode 100644 index 0000000..d749807 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/Makefile @@ -0,0 +1,15 @@ + +include ../common.mk + +IMAGE_NAME = triton-installer-$(PT_VERSION)-${BUILD_OS} +DOCKER_BUILD_ARGS := $(DOCKER_BUILD_ARGS) --build-arg PT_VERSION=$(PT_VERSION) + +init: +ifneq ($(BUILD_OS), ubuntu22.04) + $(error triton is only supported on ubuntu22.04) +endif + $(HIDE)mkdir -p $(BUILD_DIR) + $(HIDE)cp $(CURDIR)/Dockerfile $(BUILD_DIR)/Dockerfile + $(HIDE)cp $(CURDIR)/model.py $(BUILD_DIR)/model.py + +build: init diff --git a/dockerfiles/triton_vllm_backend/model.py b/dockerfiles/triton_vllm_backend/model.py new file mode 100755 index 0000000..450b35c --- /dev/null +++ b/dockerfiles/triton_vllm_backend/model.py @@ -0,0 +1,520 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio +import json +import os +import threading +from typing import Dict, List + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid +from copy import deepcopy +from transformers import AutoTokenizer +_VLLM_ENGINE_ARGS_FILENAME = "model.json" +_MULTI_LORA_ARGS_FILENAME = "multi_lora.json" +#from https://github.com/triton-inference-server/vllm_backend/commit/18a96e365caa2032eb900ac116753e1384c624c8 +# add chat-template for qwen2 + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + inputs = [ + {"name": "text_input", "data_type": "TYPE_STRING", "dims": [1]}, + { + "name": "stream", + "data_type": "TYPE_BOOL", + "dims": [1], + "optional": True, + }, + { + "name": "sampling_parameters", + "data_type": "TYPE_STRING", + "dims": [1], + "optional": True, + }, + { + "name": "exclude_input_in_output", + "data_type": "TYPE_BOOL", + "dims": [1], + "optional": True, + }, + ] + outputs = [{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}] + + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + # Add only missing inputs and output to the model configuration. + for input in inputs: + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + # We need to use decoupled transaction policy for saturating + # vLLM engine for max throughtput. 
+ # TODO [DLIS:5233]: Allow asynchronous execution to lift this + # restriction for cases there is exactly a single response to + # a single request. + auto_complete_model_config.set_model_transaction_policy(dict(decoupled=True)) + + # Disabling batching in Triton, let vLLM handle the batching on its own. + auto_complete_model_config.set_max_batch_size(0) + + return auto_complete_model_config + + def initialize(self, args): + self.args = args + self.logger = pb_utils.Logger + self.model_config = json.loads(args["model_config"]) + output_config = pb_utils.get_output_config_by_name( + self.model_config, "text_output" + ) + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + # Prepare vLLM engine + self.init_engine() + + # Counter to keep track of ongoing request counts + self.ongoing_request_count = 0 + + # Starting asyncio event loop to process the received requests asynchronously. + self._loop = asyncio.get_event_loop() + self._loop_thread = threading.Thread( + target=self.engine_loop, args=(self._loop,) + ) + self._shutdown_event = asyncio.Event() + self._loop_thread.start() + + def init_engine(self): + # Currently, Triton needs to use decoupled policy for asynchronously + # forwarding requests to vLLM engine, so assert it. + self.using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + self.model_config + ) + assert ( + self.using_decoupled + ), "vLLM Triton backend must be configured to use decoupled model transaction policy" + + engine_args_filepath = os.path.join( + pb_utils.get_model_dir(), _VLLM_ENGINE_ARGS_FILENAME + ) + assert os.path.isfile( + engine_args_filepath + ), f"'{_VLLM_ENGINE_ARGS_FILENAME}' containing vllm engine args must be provided in '{pb_utils.get_model_dir()}'" + with open(engine_args_filepath) as file: + self.vllm_engine_config = json.load(file) + self.tokenizer = AutoTokenizer.from_pretrained(self.vllm_engine_config["tokenizer"], resume_download=True) + # Validate device and multi-processing settings are currently set based on model/configs. + self.validate_device_config() + + self.chat_template = self.vllm_engine_config.get("chat_template", None) and self.tokenizer.chat_template + self.vllm_engine_config.pop("chat_template", None) + # Check for LoRA config and set it up if enabled + self.setup_lora() + + + # Create an AsyncLLMEngine from the config from JSON + self.llm_engine = AsyncLLMEngine.from_engine_args( + AsyncEngineArgs(**self.vllm_engine_config) + ) + + def setup_lora(self): + self.enable_lora = False + + if ( + "enable_lora" in self.vllm_engine_config.keys() + and self.vllm_engine_config["enable_lora"].lower() == "true" + ): + # create Triton LoRA weights repository + multi_lora_args_filepath = os.path.join( + pb_utils.get_model_dir(), _MULTI_LORA_ARGS_FILENAME + ) + try: + with open(multi_lora_args_filepath) as lora_file: + lora_repository: Dict[str, str] = json.load(lora_file) + self.lora_repository = lora_repository + self.supported_loras: List[str] = list(self.lora_repository.keys()) + self.supported_loras_len = len(self.supported_loras) + self.enable_lora = True + except FileNotFoundError: + raise FileNotFoundError( + f"Triton backend cannot find {multi_lora_args_filepath}." + ) + + def validate_device_config(self): + triton_kind = self.args["model_instance_kind"] + triton_device_id = int(self.args["model_instance_device_id"]) + triton_instance = f"{self.args['model_name']}_{triton_device_id}" + + # Triton's current definition of KIND_GPU makes assumptions that + # models only use a single GPU. 
For multi-GPU models, the recommendation + # is to specify KIND_MODEL to acknowledge that the model will take control + # of the devices made available to it. + # NOTE: Consider other parameters that would indicate multi-GPU in the future. + tp_size = int(self.vllm_engine_config.get("tensor_parallel_size", 1)) + if tp_size > 1 and triton_kind == "GPU": + raise ValueError( + "KIND_GPU is currently for single-GPU models, please specify KIND_MODEL " + "in the model's config.pbtxt for multi-GPU models" + ) + + # If KIND_GPU is specified, specify the device ID assigned by Triton to ensure that + # multiple model instances do not oversubscribe the same default device. + if triton_kind == "GPU" and triton_device_id >= 0: + self.logger.log_info( + f"Detected KIND_GPU model instance, explicitly setting GPU device={triton_device_id} for {triton_instance}" + ) + # vLLM doesn't currently (v0.4.2) expose device selection in the APIs + torch.cuda.set_device(triton_device_id) + + def create_task(self, coro): + """ + Creates a task on the engine's event loop which is running on a separate thread. + """ + assert ( + self._shutdown_event.is_set() is False + ), "Cannot create tasks after shutdown has been requested" + + return asyncio.run_coroutine_threadsafe(coro, self._loop) + + def engine_loop(self, loop): + """ + Runs the engine's event loop on a separate thread. + """ + asyncio.set_event_loop(loop) + self._loop.run_until_complete(self.await_shutdown()) + + async def await_shutdown(self): + """ + Primary coroutine running on the engine event loop. This coroutine is responsible for + keeping the engine alive until a shutdown is requested. + """ + # first await the shutdown signal + while self._shutdown_event.is_set() is False: + await asyncio.sleep(5) + + # Wait for the ongoing_requests + while self.ongoing_request_count > 0: + self.logger.log_info( + "[vllm] Awaiting remaining {} requests".format( + self.ongoing_request_count + ) + ) + await asyncio.sleep(5) + + for task in asyncio.all_tasks(loop=self._loop): + if task is not asyncio.current_task(): + task.cancel() + + self.logger.log_info("[vllm] Shutdown complete") + + def get_sampling_params_dict(self, params_json): + """ + This functions parses the dictionary values into their + expected format. + """ + + params_dict = json.loads(params_json) + + # Special parsing for the supported sampling parameters + bool_keys = ["ignore_eos", "skip_special_tokens", "use_beam_search"] + for k in bool_keys: + if k in params_dict: + params_dict[k] = bool(params_dict[k]) + + float_keys = [ + "frequency_penalty", + "length_penalty", + "presence_penalty", + "temperature", + "top_p", + ] + for k in float_keys: + if k in params_dict: + params_dict[k] = float(params_dict[k]) + + int_keys = ["best_of", "max_tokens", "min_tokens", "n", "top_k"] + for k in int_keys: + if k in params_dict: + params_dict[k] = int(params_dict[k]) + + return params_dict + + def create_response(self, vllm_output, prepend_input): + """ + Parses the output from the vLLM engine into Triton + response. 
+ """ + prompt = "" + if prepend_input: + prompt = vllm_output.prompt + if prompt: + text_outputs = [ + (prompt + output.text).encode("utf-8") for output in vllm_output.outputs + ] + else: + text_outputs = [ + output.text.encode("utf-8") for output in vllm_output.outputs + ] + + triton_output_tensor = pb_utils.Tensor( + "text_output", np.asarray(text_outputs, dtype=self.output_dtype) + ) + + return pb_utils.InferenceResponse(output_tensors=[triton_output_tensor]) + + def create_stream_response(self, vllm_output, previous_outputs_lengths): + """ + Parses the output from the vLLM engine, extracts only newly generated + text and packs it into Triton response. + """ + if previous_outputs_lengths is None: + return self.create_response(vllm_output, prepend_input=False) + + text_outputs = [ + (output.text[prev_output_length:]).encode("utf-8") + for output, prev_output_length in zip( + vllm_output.outputs, previous_outputs_lengths + ) + ] + triton_output_tensor = pb_utils.Tensor( + "text_output", np.asarray(text_outputs, dtype=self.output_dtype) + ) + return pb_utils.InferenceResponse(output_tensors=[triton_output_tensor]) + + async def generate(self, request): + """ + Forwards single request to LLM engine and returns responses. + """ + response_sender = request.get_response_sender() + self.ongoing_request_count += 1 + try: + request_id = random_uuid() + prompt = pb_utils.get_input_tensor_by_name( + request, "text_input" + ).as_numpy()[0] + if isinstance(prompt, bytes): + prompt = prompt.decode("utf-8") + stream = pb_utils.get_input_tensor_by_name(request, "stream") + if stream: + stream = stream.as_numpy()[0] + else: + stream = False + prepend_input = pb_utils.get_input_tensor_by_name( + request, "exclude_input_in_output" + ) + if prepend_input: + # When `exclude_input_in_output` is False, we want to prepend + # input prompt to output, thus prepend_input should be True, + # and vice versa. + prepend_input = not prepend_input.as_numpy()[0] + elif prepend_input is None and stream: + prepend_input = False + else: + prepend_input = True + + if prepend_input and stream: + raise ValueError( + "When streaming, `exclude_input_in_output` = False is not allowed." + ) + + # Request parameters are not yet supported via + # BLS. 
Provide an optional mechanism to receive serialized + # parameters as an input tensor until support is added + + parameters_input_tensor = pb_utils.get_input_tensor_by_name( + request, "sampling_parameters" + ) + if parameters_input_tensor: + parameters = parameters_input_tensor.as_numpy()[0].decode("utf-8") + else: + parameters = request.parameters() + + sampling_params_dict = self.get_sampling_params_dict(parameters) + lora_name = sampling_params_dict.pop("lora_name", None) + sampling_params = SamplingParams(**sampling_params_dict) + last_output = None + prev_outputs = None + lora_request = None + if lora_name is not None: + lora_id = str(self.supported_loras.index(lora_name) + 1) + lora_int_id = int(lora_id) + lora_local_path = self.lora_repository[lora_name] + lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path) + if self.chat_template: + message = self.build_message(prompt) + message_template = self.tokenizer.apply_chat_template( + message, + tokenize=False, + add_generation_prompt=True + ) + + model_inputs = self.tokenizer(message_template).input_ids + inputs = {} + inputs["prompt_token_ids"] = model_inputs + else: + inputs = prompt + async for output in self.llm_engine.generate( + inputs, sampling_params, request_id, lora_request=lora_request, + ): + if response_sender.is_cancelled(): + self.logger.log_info("[vllm] Cancelling the request") + await self.llm_engine.abort(request_id) + self.logger.log_info("[vllm] Successfully cancelled the request") + break + + if stream: + prev_outputs_lengths = None + if prev_outputs is not None: + prev_outputs_lengths = [ + len(prev_output.text) + for prev_output in prev_outputs.outputs + ] + if output.finished: + response_sender.send( + self.create_stream_response(output, prev_outputs_lengths), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + else: + response_sender.send( + self.create_stream_response(output, prev_outputs_lengths) + ) + prev_outputs = output + + last_output = output + + if not stream: + response_sender.send( + self.create_response(last_output, prepend_input), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + + except Exception as e: + self.logger.log_info(f"[vllm] Error generating stream: {e}") + error = pb_utils.TritonError(f"Error generating stream: {e}") + triton_output_tensor = pb_utils.Tensor( + "text_output", np.asarray(["N/A"], dtype=self.output_dtype) + ) + response = pb_utils.InferenceResponse( + output_tensors=[triton_output_tensor], error=error + ) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + raise e + finally: + self.ongoing_request_count -= 1 + + def verify_loras(self, request): + # We will check if the requested lora exists here, if not we will send a + # response with `LoRA not found` information. In this way we may avoid + # further processing. + verified_request = None + lora_error = None + lora_name = None + parameters_input_tensor = pb_utils.get_input_tensor_by_name( + request, "sampling_parameters" + ) + if parameters_input_tensor: + parameters = parameters_input_tensor.as_numpy()[0].decode("utf-8") + sampling_params_dict = self.get_sampling_params_dict(parameters) + lora_name = sampling_params_dict.pop("lora_name", None) + + if lora_name is not None: + if not self.enable_lora: + lora_error = pb_utils.TritonError("LoRA feature is not enabled.") + self.logger.log_info( + "[vllm] LoRA is not enabled, please restart the backend with LoRA enabled." 
+ ) + elif lora_name not in self.supported_loras: + lora_error = pb_utils.TritonError( + f"LoRA {lora_name} is not supported, we currently support {self.supported_loras}" + ) + self.logger.log_info(f"[vllm] LoRA {lora_name} not found.") + + if lora_error is not None: + output_tensor = pb_utils.Tensor( + "text_output", + np.asarray(["[Error] Unsupported LoRA."], dtype=self.output_dtype), + ) + response = pb_utils.InferenceResponse( + output_tensors=[output_tensor], error=lora_error + ) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + verified_request = request + return verified_request + + def execute(self, requests): + """ + Triton core issues requests to the backend via this method. + + When this method returns, new requests can be issued to the backend. Blocking + this function would prevent the backend from pulling additional requests from + Triton into the vLLM engine. This can be done if the kv cache within vLLM engine + is too loaded. + We are pushing all the requests on vllm and let it handle the full traffic. + """ + for request in requests: + request = self.verify_loras(request) + if request is not None: + self.create_task(self.generate(request)) + return None + + def finalize(self): + """ + Triton virtual method; called when the model is unloaded. + """ + self.logger.log_info("[vllm] Issuing finalize to vllm backend") + self._shutdown_event.set() + if self._loop_thread is not None: + self._loop_thread.join() + self._loop_thread = None + + def build_message(self, prompt: str, history: List[Dict] = None): + history = deepcopy(history) + if len(history or []) == 0: + history = [{"role": "system", "content": "You are a helpful assistant."}] + history.append({"role": "user", "content": prompt}) + return history diff --git a/dockerfiles/triton_vllm_backend/samples/client.py b/dockerfiles/triton_vllm_backend/samples/client.py new file mode 100755 index 0000000..390a365 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/client.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import asyncio +import json +import sys + +import numpy as np +import tritonclient.grpc.aio as grpcclient +from tritonclient.utils import * + + +class LLMClient: + def __init__(self, flags: argparse.Namespace): + self._client = grpcclient.InferenceServerClient( + url=flags.url, verbose=flags.verbose + ) + self._flags = flags + self._loop = asyncio.get_event_loop() + self._results_dict = {} + + async def async_request_iterator( + self, prompts, sampling_parameters, exclude_input_in_output + ): + try: + for iter in range(self._flags.iterations): + for i, prompt in enumerate(prompts): + prompt_id = self._flags.offset + (len(prompts) * iter) + i + self._results_dict[str(prompt_id)] = [] + yield self.create_request( + prompt, + self._flags.streaming_mode, + prompt_id, + sampling_parameters, + exclude_input_in_output, + ) + except Exception as error: + print(f"Caught an error in the request iterator: {error}") + + async def stream_infer(self, prompts, sampling_parameters, exclude_input_in_output): + try: + # Start streaming + response_iterator = self._client.stream_infer( + inputs_iterator=self.async_request_iterator( + prompts, sampling_parameters, exclude_input_in_output + ), + stream_timeout=self._flags.stream_timeout, + ) + async for response in response_iterator: + yield response + except InferenceServerException as error: + print(error) + sys.exit(1) + + async def process_stream( + self, prompts, sampling_parameters, exclude_input_in_output + ): + # Clear results in between process_stream calls + self.results_dict = [] + success = True + # Read response from the stream + async for response in self.stream_infer( + prompts, sampling_parameters, exclude_input_in_output + ): + result, error = response + if error: + print(f"Encountered error while processing: {error}") + success = False + else: + output = result.as_numpy("text_output") + for i in output: + self._results_dict[result.get_response().id].append(i) + return success + + async def run(self): + # Sampling parameters for text generation + # including `temperature`, `top_p`, top_k`, `max_tokens`, `early_stopping`. 
+ # Full list available at: + # https://github.com/vllmproject/vllm/blob/5255d99dc595f9ae7647842242d6542aa4145a4f/vllm/sampling_params.py#L23 + sampling_parameters = { + "temperature": "0.1", + "top_p": "0.95", + "max_tokens": "100", + } + exclude_input_in_output = self._flags.exclude_inputs_in_outputs + if self._flags.lora_name is not None: + sampling_parameters["lora_name"] = self._flags.lora_name + with open(self._flags.input_prompts, "r") as file: + print(f"Loading inputs from `{self._flags.input_prompts}`...") + prompts = file.readlines() + + success = await self.process_stream( + prompts, sampling_parameters, exclude_input_in_output + ) + + with open(self._flags.results_file, "w") as file: + for id in self._results_dict.keys(): + for result in self._results_dict[id]: + file.write(result.decode("utf-8")) + + file.write("\n") + file.write("\n=========\n\n") + print(f"Storing results into `{self._flags.results_file}`...") + + if self._flags.verbose: + with open(self._flags.results_file, "r") as file: + print(f"\nContents of `{self._flags.results_file}` ===>") + print(file.read()) + if success: + print("PASS: vLLM example") + else: + print("FAIL: vLLM example") + + def run_async(self): + self._loop.run_until_complete(self.run()) + + def create_request( + self, + prompt, + stream, + request_id, + sampling_parameters, + exclude_input_in_output, + send_parameters_as_tensor=True, + ): + inputs = [] + prompt_data = np.array([prompt.encode("utf-8")], dtype=np.object_) + try: + inputs.append(grpcclient.InferInput("text_input", [1], "BYTES")) + inputs[-1].set_data_from_numpy(prompt_data) + except Exception as error: + print(f"Encountered an error during request creation: {error}") + + stream_data = np.array([stream], dtype=bool) + inputs.append(grpcclient.InferInput("stream", [1], "BOOL")) + inputs[-1].set_data_from_numpy(stream_data) + + # Request parameters are not yet supported via BLS. Provide an + # optional mechanism to send serialized parameters as an input + # tensor until support is added + + if send_parameters_as_tensor: + sampling_parameters_data = np.array( + [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_ + ) + inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES")) + inputs[-1].set_data_from_numpy(sampling_parameters_data) + + inputs.append(grpcclient.InferInput("exclude_input_in_output", [1], "BOOL")) + inputs[-1].set_data_from_numpy(np.array([exclude_input_in_output], dtype=bool)) + + # Add requested outputs + outputs = [] + outputs.append(grpcclient.InferRequestedOutput("text_output")) + + # Issue the asynchronous sequence inference. + return { + "model_name": self._flags.model, + "inputs": inputs, + "outputs": outputs, + "request_id": str(request_id), + "parameters": sampling_parameters, + } + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--model", + type=str, + required=False, + default="vllm_model", + help="Model name", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8001", + help="Inference server URL and its gRPC port. Default is localhost:8001.", + ) + parser.add_argument( + "-t", + "--stream-timeout", + type=float, + required=False, + default=None, + help="Stream timeout in seconds. 
Default is None.", + ) + parser.add_argument( + "--offset", + type=int, + required=False, + default=0, + help="Add offset to request IDs used", + ) + parser.add_argument( + "--input-prompts", + type=str, + required=False, + default="prompts.txt", + help="Text file with input prompts", + ) + parser.add_argument( + "--results-file", + type=str, + required=False, + default="results.txt", + help="The file with output results", + ) + parser.add_argument( + "--iterations", + type=int, + required=False, + default=1, + help="Number of iterations through the prompts file", + ) + parser.add_argument( + "-s", + "--streaming-mode", + action="store_true", + required=False, + default=False, + help="Enable streaming mode", + ) + parser.add_argument( + "--exclude-inputs-in-outputs", + action="store_true", + required=False, + default=False, + help="Exclude prompt from outputs", + ) + parser.add_argument( + "-l", + "--lora-name", + type=str, + required=False, + default=None, + help="The querying LoRA name", + ) + FLAGS = parser.parse_args() + + client = LLMClient(FLAGS) + client.run_async() diff --git a/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json b/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json new file mode 100755 index 0000000..f801b3b --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/1/model.json @@ -0,0 +1,12 @@ +{ + "model":"meta-llama/Llama-2-7b-hf", + "tokenizer":"meta-llama/Llama-2-7b-hf", + "disable_log_requests": "false", + "gpu_memory_utilization": 0.5, + "enforce_eager": "true", + "max_num_seqs": 512, + "swap_space": 16, + "dtype": "bfloat16", + "tensor_parallel_size": 1, + "max_num_batched_tokens": 8192 +} diff --git a/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt b/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt new file mode 100644 index 0000000..48871c6 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/model_repository/vllm_model/config.pbtxt @@ -0,0 +1,36 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Note: You do not need to change any fields in this configuration. + +backend: "vllm" +# The usage of device is deferred to the vLLM engine +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/dockerfiles/triton_vllm_backend/samples/prompts.txt b/dockerfiles/triton_vllm_backend/samples/prompts.txt new file mode 100644 index 0000000..133800e --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/prompts.txt @@ -0,0 +1,4 @@ +Hello, my name is +The most dangerous animal is +The capital of France is +The future of AI is diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json b/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json new file mode 100644 index 0000000..f576654 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/1/model.json @@ -0,0 +1,12 @@ +{ + "model":"meta-llama/Llama-2-70b-hf", + "tokenizer":"meta-llama/Llama-2-70b-hf", + "disable_log_requests": "false", + "gpu_memory_utilization": 0.5, + "enforce_eager": "true", + "max_num_seqs": 512, + "swap_space": 16, + "dtype": "bfloat16", + "tensor_parallel_size": 8, + "max_num_batched_tokens": 8192 +} diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt b/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt new file mode 100644 index 0000000..48871c6 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/llama70b_8x/config.pbtxt @@ -0,0 +1,36 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Note: You do not need to change any fields in this configuration. + +backend: "vllm" +# The usage of device is deferred to the vLLM engine +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json b/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json new file mode 100755 index 0000000..f801b3b --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/1/model.json @@ -0,0 +1,12 @@ +{ + "model":"meta-llama/Llama-2-7b-hf", + "tokenizer":"meta-llama/Llama-2-7b-hf", + "disable_log_requests": "false", + "gpu_memory_utilization": 0.5, + "enforce_eager": "true", + "max_num_seqs": 512, + "swap_space": 16, + "dtype": "bfloat16", + "tensor_parallel_size": 1, + "max_num_batched_tokens": 8192 +} diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt b/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt new file mode 100644 index 0000000..48871c6 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/llama7b_1x/config.pbtxt @@ -0,0 +1,36 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Note: You do not need to change any fields in this configuration. 
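Tying the sample `model.json` files to the backend: `init_engine` in model.py expands the JSON (minus the backend-only `chat_template` key) directly into vLLM's `AsyncEngineArgs`. A rough sketch, assuming vLLM is installed and the path points at one of the sample files in this patch:

```python
import json

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Path is illustrative; every sample model.json in this patch has the same shape.
with open("samples/model_repository/test_models/llama7b_1x/1/model.json") as f:
    engine_config = json.load(f)

engine_config.pop("chat_template", None)  # consumed by model.py, not by vLLM
llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_config))
```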
+ +backend: "vllm" +# The usage of device is deferred to the vLLM engine +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json b/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json new file mode 100644 index 0000000..ef8a958 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/1/model.json @@ -0,0 +1,13 @@ +{ + "model":"Qwen/Qwen2-7B-Instruct", + "tokenizer":"Qwen/Qwen2-7B-Instruct", + "disable_log_requests": "false", + "gpu_memory_utilization": 0.5, + "enforce_eager": "true", + "max_num_seqs": 512, + "swap_space": 16, + "dtype": "bfloat16", + "tensor_parallel_size": 1, + "max_num_batched_tokens": 131072, + "chat_template": "true" +} diff --git a/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt b/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt new file mode 100644 index 0000000..48871c6 --- /dev/null +++ b/dockerfiles/triton_vllm_backend/samples/test_models/qwen_7b_chat/config.pbtxt @@ -0,0 +1,36 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Note: You do not need to change any fields in this configuration. 
+ +backend: "vllm" +# The usage of device is deferred to the vLLM engine +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/utils/intel_gaudi_health_screen/IGNodes.py b/utils/intel_gaudi_health_screen/IGNodes.py index be2a1f2..6fdbab8 100644 --- a/utils/intel_gaudi_health_screen/IGNodes.py +++ b/utils/intel_gaudi_health_screen/IGNodes.py @@ -114,21 +114,25 @@ def __init__(self, name="", health_report=HealthReport(), num_checks_link_state= def scan_cards(self): self.logger.info(f"Scanning cards info on Node: {self.name}") - cmd = "hl-smi -Q index,module_id,bus_id,memory.used,temperature.aip -f csv,noheader" + cmd = "hl-smi -Q index,module_id,bus_id,memory.used,temperature.aip,name -f csv,noheader" output = run_cmd(cmd) reader = csv.reader(output.split('\n'), delimiter=',') for row in reader: if len(row) == 0: continue + elif len(row) < 6: + _logger.error("hl-smi output is not correct: Recieved output: {row}") + continue i = row[0] module_id = row[1].strip() pci_address = row[2] memory_used = int(row[3].split()[0]) temperature_C = int(row[4].split()[0]) + system_name = row[5] - card = IGCard(index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) + card = IGCard(system_name=system_name, index=i, module_id=module_id, pci_address=pci_address, memory_used=memory_used, temperature=temperature_C, logger=self.logger) self.cards[i] = card self.cards = dict(sorted(self.cards.items())) @@ -190,7 +194,8 @@ def write_json(self, cards): class IGCard(): - def __init__(self, index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): + def __init__(self, system_name="", index=-1, module_id=-1, pci_address="", memory_used=-1, framework="pytorch", temperature=-1, logger=None): + self.system_name = system_name self.node_id = "" self.logger = logger self.index = index @@ -286,14 +291,22 @@ def check_device_acquire_fail(self): return self.device_acquire_fail def check_temperature_state(self): - max_good_temperature = 83 - base_temperature = 25 - max_delta = 25 + if self.system_name == "HL-325": + # Gaudi-3 System + max_good_temperature = 200 + base_temperature = 45 + max_delta = 80 + else: + # Gaudi-2 System + max_good_temperature = 83 + base_temperature = 25 + max_delta = 25 + if self.temperature_C >= max_good_temperature: self.temperature_state_C = "CRITICAL" self.is_infected = True - elif self.temperature_C - base_temperature >= max_delta: + elif abs(self.temperature_C - base_temperature) >= max_delta: self.temperature_state_C = "WARN" self.is_infected = True else: diff --git a/utils/intel_gaudi_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md index 7d67984..f0a537c 100644 --- a/utils/intel_gaudi_health_screen/README.md +++ b/utils/intel_gaudi_health_screen/README.md @@ -1,4 +1,4 @@ -# Intel Gaudi Health Screen 2.2.0 +# Intel Gaudi Health Screen 2.2.2 A large scale Intel Gaudi cluster contains a lot of moving parts. To ensure distributed training proceeds smoothly, it is recommended to check the cluster network health. Troubleshooting issues on a large cluster can be a tedious act. To simplify the debugging process the @@ -127,16 +127,16 @@ been tested, such as having missing cards, it is occupied by another session, or ## Setup IGHS is compatible with python3 default packages and does not require additional packages -to be installed +to be installed. 
-If your setup envionrment requires custom configruation, update the yaml files located in the templates folder.
+If your setup environment requires custom configuration, update the yaml files located in the templates folder.
 
 If running on bare metal system, then install `pdsh` to your system.
 
-Update [config.yaml](config.yaml) to match your system envionrment
+Update [config.yaml](config.yaml) to match your system environment.
 
 ``` yaml
-# Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal).
+# Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal).
 system-info:
   type: "k8s"
   # Namespace is only required for k8s settings
@@ -149,7 +149,7 @@ system-info:
   tcp-interface: "10.3.124.0/24"
 
 # Image to run Intel Gaudi Health Screen
-image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
 
 # Node Label used to identify a Intel Gaudi Node
 gaudi-node-label: "habana.ai/gaudi:NoSchedule"
@@ -220,7 +220,7 @@ IGHS can alternatively be run through below script:
 To run on bare-metal systems update the [config.yaml](config.yaml) to use bare-metal configuration.
 
 ``` yaml
-# Sets IGHS to screen for K8s or Bare Metal Envionrment (k8s, bare-metal).
+# Sets IGHS to screen for K8s or Bare Metal Environment (k8s, bare-metal).
 system-info:
   type: "bare-metal"
   # Namespace is only required for k8s settings
@@ -233,7 +233,7 @@ system-info:
   tcp-interface: "10.3.124.0/24"
 
 # Image to run Intel Gaudi Health Screen
-image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
 
 # Node Label used to identify a Intel Gaudi Node
 gaudi-node-label: "habana.ai/gaudi:NoSchedule"
@@ -260,11 +260,13 @@ Before running the screening test, you need to generate the ssh key used for pas
 
 ``` bash
 # Keys to setup initial bare-metal passwordless ssh connection between systems
+mkdir -p ssh;
 ssh-keygen -t rsa -f ssh/ighs_rsa;
 chmod 600 ssh/ighs_rsa;
 chmod 644 ssh/ighs_rsa.pub;
 
 # Keys to setup containers passwordless ssh connection
+mkdir -p template/bare-metal/ssh;
 ssh-keygen -t rsa -f template/bare-metal/ssh/id_rsa;
 chmod 600 template/bare-metal/ssh/id_rsa;
 chmod 644 template/bare-metal/ssh/id_rsa.pub;
diff --git a/utils/intel_gaudi_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml
index b9c3ae0..f3aef5b 100644
--- a/utils/intel_gaudi_health_screen/config.yaml
+++ b/utils/intel_gaudi_health_screen/config.yaml
@@ -12,7 +12,7 @@ system-info:
   tcp-interface: "10.3.124.0/24"
 
 # Image to run Intel Gaudi Health Screen
-image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"
+image: "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest"
 
 # Node Label used to identify a Intel Gaudi Node
 gaudi-node-label: "habana.ai/gaudi:NoSchedule"
@@ -32,4 +32,4 @@ level-2:
   run: true
   timeout_s: 130
   # Number of times to check Network connections between nodes
-  num-rounds: 5
\ No newline at end of file
+  num-rounds: 5
diff --git a/utils/intel_gaudi_health_screen/hccl_demo_helper.py b/utils/intel_gaudi_health_screen/hccl_demo_helper.py
index d0d492c..107d8ba 100644
--- a/utils/intel_gaudi_health_screen/hccl_demo_helper.py
+++ b/utils/intel_gaudi_health_screen/hccl_demo_helper.py
@@ -35,10 +35,11 @@ def find_groups(healthy_nodes, watch_nodes, groups_tracker):
max_num_groups = num_nodes // 2 max_combinations = (math.factorial(num_nodes)) / (math.factorial(num_nodes-2) * 2) max_attempts = 10 + groups_tracker = set(groups_tracker) if num_nodes == 1: - _logger.warn(f"Need more than 1 Node to test pair all_reduce") - return False + _logger.warning(f"Need more than 1 Node to test pair all_reduce") + return node_groups, list(groups_tracker) while len(node_groups) < max_num_groups and found_unique: i = 0 @@ -49,27 +50,27 @@ def find_groups(healthy_nodes, watch_nodes, groups_tracker): break node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i) - - if node_group[0] == node_group[1]: - _logger.info(f"Found duplicate nodes in node_group {node_group}. Exiting group id search") + i += 1 + if len(node_group) < 2 or node_group[0] == node_group[1]: + _logger.info(f"Found invalid node_group {node_group}. Exiting group id search") found_unique = False break while group_id in groups_tracker: - if i > max_attempts: - _logger.warn(f"Max attempt {max_attempts} reached for finding unique pair combination.") + if i >= max_attempts: + _logger.warning(f"Max attempt {max_attempts} reached for finding unique pair combination.") found_unique = False break node_group, group_id, (h_i, w_i) = find_group_id(healthy_nodes, watch_nodes, h_i, w_i) - if group_id == "" and node_group[0] == node_group[1]: + i += 1 + if len(node_group) < 2 or node_group[0] == node_group[1]: + _logger.info(f"Internal while Found invalid node_group {node_group}. Exiting group id search") found_unique = False break - i += 1 - if found_unique: - groups_tracker.append(group_id) + groups_tracker.add(group_id) node_groups.append(node_group) for n in node_group: @@ -81,7 +82,7 @@ def find_groups(healthy_nodes, watch_nodes, groups_tracker): if len(watch_nodes) == 0: break - return node_groups, groups_tracker + return node_groups, list(groups_tracker) def find_group_id(healthy_nodes, watch_nodes, h_i=0, w_i=0): """ Finds a group of nodes and combines to form a group id @@ -111,10 +112,10 @@ def find_group_id(healthy_nodes, watch_nodes, h_i=0, w_i=0): node_group.append(healthy_nodes[h_i]) h_i += 1 - if h_i > len(healthy_nodes): + if h_i >= len(healthy_nodes): random.shuffle(healthy_nodes) h_i = 0 - if w_i > len(watch_nodes): + if w_i >= len(watch_nodes): random.shuffle(watch_nodes) w_i = 0 diff --git a/utils/intel_gaudi_health_screen/screen.py b/utils/intel_gaudi_health_screen/screen.py index 3f644e1..18fd25d 100644 --- a/utils/intel_gaudi_health_screen/screen.py +++ b/utils/intel_gaudi_health_screen/screen.py @@ -14,7 +14,7 @@ import argparse import logging -from utilities import download_repos, clear_ighs_pods, create_logger, get_logging_level +from utilities import download_repos, create_logger, get_logging_level from hccl_demo_helper import hccl_demo_check from system_utils import KubeUtils, BareMetalUtils @@ -77,11 +77,11 @@ def monitor_ighs_status(system_mode, level, nodes, timeout_s=240, round=0): if level == 1: nodes.healthy_nodes = set(healthy_nodes) - _logger.info(f"Infected {len(infected_nodes)} Node: {infected_nodes}") + _logger.info(f"Detected {len(detected_nodes)} Node: {detected_nodes}") + _logger.info(f" Healthy {len(healthy_nodes)} Node: {healthy_nodes}") + _logger.info(f" Infected {len(infected_nodes)} Node: {infected_nodes}") _logger.info(f"Missing {len(missing_nodes)} Node: {missing_nodes}") _logger.info(f"Unverified {len(watch_nodes)} Node: {watch_nodes}") - _logger.info(f"Healthy {len(healthy_nodes)} Node: {healthy_nodes}") - _logger.info(f"Detected 
{len(detected_nodes)} Node: {detected_nodes}") return healthy_nodes, infected_nodes, missing_nodes @@ -111,18 +111,21 @@ def main(args): with open(args.config, 'r') as f: config_data = yaml.safe_load(f) + hostfile = "" + if "hostfile" in config_data["system-info"]: + hostfile = config_data["system-info"]["hostfile"] + log_level = get_logging_level(config_data["log-level"]) _logger, _ = create_logger(logger_name="health_screener", logger_file_name="screener", f_path=args.logs_dir, level=log_level) if config_data["system-info"]["type"] == "k8s": system_mode = KubeUtils(image=config_data["image"], - hostfile=config_data["system-info"]["hostfile"], + hostfile=hostfile, namespace=config_data["system-info"]["namespace"], log_dir=args.logs_dir) elif config_data["system-info"]["type"] == "bare-metal": - system_mode = BareMetalUtils(image=config_data["image"], - hostfile=config_data["system-info"]["hostfile"], + hostfile=hostfile, ssh_path=config_data["system-info"]["ssh-path"], tcp_interface=config_data["system-info"]["tcp-interface"], log_dir=args.logs_dir) @@ -145,7 +148,6 @@ def main(args): intel_gaudi_nodes = IGNodes(health_report=health_report) intel_gaudi_nodes.all_nodes = system_mode.collect_nodes(gaudi_node_label=config_data["gaudi-node-label"]) - intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes) healthy_nodes, infected_nodes, missing_nodes = list(), list(), list() occupied_nodes, missing_cards_nodes, misc_nodes = list(), list(), list() @@ -162,7 +164,7 @@ def main(args): level=1, nodes=intel_gaudi_nodes, timeout_s=config_data["level-1"]["timeout_s"]) - occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes) + occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) system_mode.clear_ighs_pods() summary = { @@ -184,7 +186,8 @@ def main(args): os.makedirs(f"{health_report.f_dir}/L2") intel_gaudi_nodes.healthy_nodes = set() - intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes) + intel_gaudi_nodes.watch_nodes = set(intel_gaudi_nodes.all_nodes).difference(set(missing_nodes)) + intel_gaudi_nodes.missing_nodes = set(missing_nodes) for i in range(config_data["level-2"]["num-rounds"]): nodes_initialized = system_mode.initialize_node_jobs(level=2, @@ -200,7 +203,7 @@ def main(args): nodes=intel_gaudi_nodes, timeout_s=config_data["level-2"]["timeout_s"], round=i) - occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_unhealthy_nodes(infected_nodes, missing_nodes) + occupied_nodes, missing_cards_nodes, misc_nodes = system_mode.diagnose_missing_nodes(missing_nodes) system_mode.clear_ighs_pods(job_type="mpijobs") if len(intel_gaudi_nodes.watch_nodes) == 0: diff --git a/utils/intel_gaudi_health_screen/system_utils.py b/utils/intel_gaudi_health_screen/system_utils.py index 99d0671..76551eb 100644 --- a/utils/intel_gaudi_health_screen/system_utils.py +++ b/utils/intel_gaudi_health_screen/system_utils.py @@ -71,7 +71,7 @@ def collect_nodes(self, gaudi_node_label): output = run_cmd(cmd) all_nodes = output.strip().split() - _logger.info(f"Collected Nodes: {all_nodes}") + _logger.info(f"Collected {len(all_nodes)} k8s Nodes: {all_nodes}") return all_nodes @@ -109,7 +109,7 @@ def initialize_node_jobs(self, level, job_path = f"{job_base_path}/L2/r{round}" if len(node_groups) == 0 : - _logger.warn(f"No Node Groups to test found during initialization") + _logger.warning(f"No Node Groups to test found during initialization") return nodes_initialized @@ -266,53 
+266,56 @@ def check_screen_complete(self, current_run_status, health_report, level, round= pods = output.split("\n") for p in pods: - p_name, status, state = p.split() - if status == "Succeeded": - cmd = f"kubectl logs -n {self.namespace} {p_name}" - output = run_cmd(cmd).strip().split("\n") - - start_analyze = False - for l in output: - if "START of Node Report" in l: - start_analyze = True - continue - elif "END of Node Report" in l: - start_analyze = False - continue - - #### analyze output - if start_analyze: - # Ignore Logger output level - bracket_index = l.index("{") - node_status_txt = l[bracket_index:] - status_dict = json.loads(node_status_txt) - - if not p_name in current_run_status: - with open(f"{log_dir}/{p_name}.json", 'w', encoding ='utf8') as f: - json.dump(status_dict, f, indent=4) - with open(f"{log_dir}/{p_name}.log", 'w', encoding ='utf8') as f: - f.write('\n'.join(output)) - - if level == 1: - health_report.write_rows(data=status_dict["cards"], level=level) - current_run_status[p_name] = True - elif level == 2: - health_report.write_rows(data=[status_dict], level=level) - current_run_status[p_name] = (True, status_dict["num_nodes"]) - elif state == "CrashLoopBackOff" and level==2 or (final_check and "launcher" in p_name and status=="Running"): - cmd = f"kubectl logs -n {self.namespace} {p_name}" - output = run_cmd(cmd).strip().split("\n") - - hccL_results = hccl_demo_check(job_id=p_name, health_report=health_report, hccl_log=output, write=False) - - if not p_name in current_run_status: - with open(f"{log_dir}/{p_name}.json", 'w', encoding ='utf8') as f: - json.dump(hccL_results, f, indent=4) - with open(f"{log_dir}/{p_name}.log", 'w', encoding ='utf8') as f: - f.write('\n'.join(output)) - - health_report.write_rows(data=[hccL_results], level=level) - current_run_status[p_name] = (True, hccL_results["num_nodes"]) + try: + p_name, status, state = p.split() + if status == "Succeeded": + cmd = f"kubectl logs -n {self.namespace} {p_name}" + output = run_cmd(cmd).strip().split("\n") + + start_analyze = False + for l in output: + if "START of Node Report" in l: + start_analyze = True + continue + elif "END of Node Report" in l: + start_analyze = False + continue + + #### analyze output + if start_analyze: + # Ignore Logger output level + bracket_index = l.index("{") + node_status_txt = l[bracket_index:] + status_dict = json.loads(node_status_txt) + + if not p_name in current_run_status: + with open(f"{log_dir}/{p_name}.json", 'w', encoding ='utf8') as f: + json.dump(status_dict, f, indent=4) + with open(f"{log_dir}/{p_name}.log", 'w', encoding ='utf8') as f: + f.write('\n'.join(output)) + + if level == 1: + health_report.write_rows(data=status_dict["cards"], level=level) + current_run_status[p_name] = True + elif level == 2: + health_report.write_rows(data=[status_dict], level=level) + current_run_status[p_name] = (True, status_dict["num_nodes"]) + elif state == "CrashLoopBackOff" and level==2 or (final_check and "launcher" in p_name and status=="Running"): + cmd = f"kubectl logs -n {self.namespace} {p_name}" + output = run_cmd(cmd).strip().split("\n") + + hccL_results = hccl_demo_check(job_id=p_name, health_report=health_report, hccl_log=output, write=False) + + if not p_name in current_run_status: + with open(f"{log_dir}/{p_name}.json", 'w', encoding ='utf8') as f: + json.dump(hccL_results, f, indent=4) + with open(f"{log_dir}/{p_name}.log", 'w', encoding ='utf8') as f: + f.write('\n'.join(output)) + + health_report.write_rows(data=[hccL_results], level=level) + 
current_run_status[p_name] = (True, hccL_results["num_nodes"]) + except ValueError: + _logger.error(f"Not able to retrieve Running Pods. Expected to recieve list of pods but got output: {pods}") if level == 1: num_nodes = len(current_run_status) @@ -325,10 +328,11 @@ def check_screen_complete(self, current_run_status, health_report, level, round= return num_nodes - def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes): + def diagnose_missing_nodes(self, missing_nodes): in_use_set = set() missing_cards_set = set() misc_set = set() + _logger.info(f"Diagnose {len(missing_nodes)} missing_nodes:") for n in missing_nodes: cmd = f"kubectl describe nodes -n {self.namespace} {n}" @@ -359,11 +363,11 @@ def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes): misc_list = sorted(list(set(missing_nodes).difference(in_use_set).difference(missing_cards_set))) if(len(in_use_list)): - _logger.info(f"{len(in_use_list)} Occupied Nodes: {in_use_list}") + _logger.info(f" {len(in_use_list)} Occupied Nodes: {in_use_list}") if(len(missing_cards_list)): - _logger.info(f"{len(missing_cards_list)} Nodes w/ missing cards: {missing_cards_list}") + _logger.info(f" {len(missing_cards_list)} Nodes w/ missing cards: {missing_cards_list}") if(len(misc_list)): - _logger.info(f"{len(misc_list)} Unaccounted Nodes: {misc_list}") + _logger.info(f" {len(misc_list)} Untested Nodes: {misc_list}") return in_use_list, missing_cards_list, misc_list @@ -426,7 +430,7 @@ def initialize_system(self): def collect_nodes(self, gaudi_node_label=""): - _logger.info(f"Collected Nodes: {self.hosts}") + _logger.info(f"Collected {len(self.hosts)} Nodes: {self.hosts}") return self.hosts @@ -460,7 +464,7 @@ def initialize_node_jobs(self, level, nodes.worker_nodes = list() if len(node_groups) == 0: - _logger.warn(f"No Node Groups to test found during initialization") + _logger.warning(f"No Node Groups to test found during initialization") return nodes_initialized self.update_yaml_job(source_file="config.yaml", out_dir="tmp", out_file="config.yaml", yaml_type="config") @@ -618,63 +622,66 @@ def check_screen_complete(self, current_run_status, health_report, level, round= pods = output.split("\n") for p in pods: - if ":" not in p: - continue - - colon_index = p.index(":") - name = p[:colon_index] - data_txt = p[colon_index+1:] - - data = json.loads(data_txt) - - if data["State"] == "exited": - cmd = f"ssh {name} {check_log_cmd}" - output = run_cmd(cmd).strip().split("\n") - - start_analyze = False - for l in output: - if "START of Node Report" in l: - start_analyze = True - continue - elif "END of Node Report" in l: - start_analyze = False - continue - - #### analyze output - if start_analyze: - # Ignore Logger output level - bracket_index = l.index("{") - node_status_txt = l[bracket_index:] - status_dict = json.loads(node_status_txt) - - if not name in current_run_status: - if level == 1: - health_report.write_rows(data=status_dict["cards"], level=level) - current_run_status[name] = True - elif level == 2: - health_report.write_rows(data=[status_dict], level=level) - current_run_status[name] = (True, status_dict["num_nodes"]) - name = f"ighs-hccl-r{status_dict['round']}-{status_dict['group_id']}" - - with open(f"{log_dir}/{name}.json", 'w', encoding ='utf8') as f: - json.dump(status_dict, f, indent=4) - with open(f"{log_dir}/{name}.log", 'w', encoding ='utf8') as f: - f.write('\n'.join(output)) - elif level==2 and final_check: - cmd = f"ssh {name} {check_log_cmd}" - output = run_cmd(cmd).strip().split("\n") - - if not name in 
current_run_status: - hccL_results = hccl_demo_check(job_id=name, health_report=health_report, hccl_log=output, write=False) - f_name = f"ighs-hccl-r{hccL_results['round']}-{hccL_results['group_id']}" - - with open(f"{log_dir}/{f_name}.json", 'w', encoding ='utf8') as f: - json.dump(hccL_results, f, indent=4) - with open(f"{log_dir}/{f_name}.log", 'w', encoding ='utf8') as f: - f.write('\n'.join(output)) - - health_report.write_rows(data=[hccL_results], level=level) - current_run_status[name] = (True, hccL_results["num_nodes"]) + try: + if ":" not in p: + continue + + colon_index = p.index(":") + name = p[:colon_index] + data_txt = p[colon_index+1:] + + data = json.loads(data_txt) + + if data["State"] == "exited": + cmd = f"ssh {name} {check_log_cmd}" + output = run_cmd(cmd).strip().split("\n") + + start_analyze = False + for l in output: + if "START of Node Report" in l: + start_analyze = True + continue + elif "END of Node Report" in l: + start_analyze = False + continue + + #### analyze output + if start_analyze: + # Ignore Logger output level + bracket_index = l.index("{") + node_status_txt = l[bracket_index:] + status_dict = json.loads(node_status_txt) + + if not name in current_run_status: + if level == 1: + health_report.write_rows(data=status_dict["cards"], level=level) + current_run_status[name] = True + elif level == 2: + health_report.write_rows(data=[status_dict], level=level) + current_run_status[name] = (True, status_dict["num_nodes"]) + name = f"ighs-hccl-r{status_dict['round']}-{status_dict['group_id']}" + + with open(f"{log_dir}/{name}.json", 'w', encoding ='utf8') as f: + json.dump(status_dict, f, indent=4) + with open(f"{log_dir}/{name}.log", 'w', encoding ='utf8') as f: + f.write('\n'.join(output)) + elif level==2 and final_check: + cmd = f"ssh {name} {check_log_cmd}" + output = run_cmd(cmd).strip().split("\n") + + if not name in current_run_status: + hccL_results = hccl_demo_check(job_id=name, health_report=health_report, hccl_log=output, write=False) + f_name = f"ighs-hccl-r{hccL_results['round']}-{hccL_results['group_id']}" + + with open(f"{log_dir}/{f_name}.json", 'w', encoding ='utf8') as f: + json.dump(hccL_results, f, indent=4) + with open(f"{log_dir}/{f_name}.log", 'w', encoding ='utf8') as f: + f.write('\n'.join(output)) + + health_report.write_rows(data=[hccL_results], level=level) + current_run_status[name] = (True, hccL_results["num_nodes"]) + except: + _logger.error(f"Not able to retrieve Running Pods. 
Expected to recieve list of pods but got output: {pods}") if level == 1: num_nodes = len(current_run_status) @@ -687,5 +694,5 @@ def check_screen_complete(self, current_run_status, health_report, level, round= return num_nodes - def diagnose_unhealthy_nodes(self, infected_nodes, missing_nodes): - pass + def diagnose_missing_nodes(self, missing_nodes): + return [],[],[] diff --git a/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile b/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile index e57131c..9a0d218 100644 --- a/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile +++ b/utils/intel_gaudi_health_screen/template/bare-metal/dockerfile @@ -3,6 +3,7 @@ FROM ${BASE_IMAGE} RUN mkdir ~/.ssh && \ cd ~/.ssh && \ +ssh-keygen -A && \ sed -i 's/#Port 22/Port 3122/g' /etc/ssh/sshd_config && \ sed -i 's/# Port 22/ Port 3122/g' /etc/ssh/ssh_config && \ sed -i 's/3022/3122/g' ~/.bashrc && \ @@ -11,6 +12,7 @@ echo "ForwardAgent yes" >> ~/.ssh/config && \ echo "StrictHostKeyChecking no" >> ~/.ssh/config && \ echo "UserKnownHostsFile /dev/null" >> ~/.ssh/config && \ echo "LogLevel ERROR" >> ~/.ssh/config && \ +service ssh start && \ chmod 600 ~/.ssh/config diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml index 2dab422..d1f6941 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L1.yaml @@ -36,6 +36,9 @@ spec: command: ["/bin/bash", "-c"] args: - >- + ssh-keygen -A; + service ssh start; + while [ ! -d /workdir/intel_gaudi_health_screen ]; do sleep 2s; done; diff --git a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml index 6319743..04c50c0 100644 --- a/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml +++ b/utils/intel_gaudi_health_screen/template/k8s/intel-gaudi-health-screen-L2_hccl-demo.yaml @@ -48,8 +48,10 @@ spec: command: ["/bin/bash", "-c"] args: - >- - set -eo pipefail; + set -eo pipefail; echo "Target Nodes: $TARGET_NODES"; + ssh-keygen -A; + service ssh start; while [ ! -d /workdir/intel_gaudi_health_screen ]; do sleep 2s; @@ -155,5 +157,6 @@ spec: args: - >- printenv | grep "MY" >> /etc/environment; + ssh-keygen -A; service ssh start; sleep 365d; diff --git a/utils/intel_gaudi_health_screen/utilities.py b/utils/intel_gaudi_health_screen/utilities.py index 47f5458..cfcd893 100644 --- a/utils/intel_gaudi_health_screen/utilities.py +++ b/utils/intel_gaudi_health_screen/utilities.py @@ -91,7 +91,7 @@ def run_cmd(cmd, timeout_s=1_800, verbose=False): if (verbose): _logger.debug(f"Running cmd: {cmd}") - _logger.info(result.stdout) + _logger.debug(result.stdout) return result.stdout @@ -161,41 +161,3 @@ def clear_job(job): _logger.info(f"Attempt {attempts} Pods are still up. Will wait 10 seconds to check again") time.sleep(10) - - -def clear_ighs_pods(job_type="jobs"): - """ Clear Pods with label=ighs,ighs-hccl - - Args: - job_type (str, optional): Type of Job to delete. Options: [jobs, mpijobs]. Defaults to "jobs". 
- """ - _logger.info(f"Checking for existing IGHS Pods ({job_type})") - - metadata_app = "ighs" if (job_type == "jobs") else "ighs-hccl" - - cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" - output = run_cmd(cmd).strip() - - if len(output) > 0: - _logger.info(f"Found existing IGHS Pods ({job_type}). Will delete.") - - cmd = f"kubectl get {job_type} -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" - output = run_cmd(cmd).strip() - jobs = output.split() - - _logger.info(f"Deleting jobs {jobs}") - for job in jobs: - cmd = f"kubectl delete {job_type} -n default {job}" - output = run_cmd(cmd) - - cmd = f"kubectl get pods -n default -l app={metadata_app} -o=custom-columns='NAME:.metadata.name' --no-headers" - max_attempt = 15 - for attempts in range(max_attempt): - output = run_cmd(cmd).strip() - - if(len(output) == 0): - break - - _logger.info(f"Attempt {attempts}: Pods are still up. Will wait 10 seconds to check again") - time.sleep(10) - diff --git a/utils/intel_gaudi_health_screen/version.txt b/utils/intel_gaudi_health_screen/version.txt index e3a4f19..7e541ae 100644 --- a/utils/intel_gaudi_health_screen/version.txt +++ b/utils/intel_gaudi_health_screen/version.txt @@ -1 +1 @@ -2.2.0 \ No newline at end of file +2.2.2 \ No newline at end of file