diff --git a/dockerfiles/base/Dockerfile.amzn2 b/dockerfiles/base/Dockerfile.amzn2 index 87a873b..f091ab3 100644 --- a/dockerfiles/base/Dockerfile.amzn2 +++ b/dockerfiles/base/Dockerfile.amzn2 @@ -55,7 +55,7 @@ ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana AWS Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/AmazonLinux2/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo RUN yum makecache && \ yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \ diff --git a/dockerfiles/base/Dockerfile.rhel8.6 b/dockerfiles/base/Dockerfile.rhel8.6 index eab74b1..ecefcd2 100644 --- a/dockerfiles/base/Dockerfile.rhel8.6 +++ b/dockerfiles/base/Dockerfile.rhel8.6 @@ -80,7 +80,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \ echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \ diff --git a/dockerfiles/base/Dockerfile.rhel9.2 b/dockerfiles/base/Dockerfile.rhel9.2 index 84954ca..0fad818 100644 --- a/dockerfiles/base/Dockerfile.rhel9.2 +++ b/dockerfiles/base/Dockerfile.rhel9.2 @@ -84,7 +84,7 @@ ENV 
FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo # for Habana GPG key with SHA-1 signature diff --git a/dockerfiles/base/Dockerfile.rhel9.4 b/dockerfiles/base/Dockerfile.rhel9.4 index 70d2d9d..53b62c7 100644 --- a/dockerfiles/base/Dockerfile.rhel9.4 +++ b/dockerfiles/base/Dockerfile.rhel9.4 @@ -97,7 +97,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \ + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \ echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo # for Habana GPG key with SHA-1 signature diff --git a/dockerfiles/base/Dockerfile.tencentos3.1 b/dockerfiles/base/Dockerfile.tencentos3.1 index eb089d5..2cf70aa 100644 --- a/dockerfiles/base/Dockerfile.tencentos3.1 +++ b/dockerfiles/base/Dockerfile.tencentos3.1 @@ -65,7 +65,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \ echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \ echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> 
/etc/yum.repos.d/habanalabs.repo && \ - echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo + echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/tencentos/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \ habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \ diff --git a/dockerfiles/common.mk b/dockerfiles/common.mk index abcc906..4404cd5 100644 --- a/dockerfiles/common.mk +++ b/dockerfiles/common.mk @@ -6,8 +6,8 @@ BUILD_DIR ?= $(CURDIR)/dockerbuild REPO_SERVER ?= vault.habana.ai PT_VERSION ?= 2.3.1 -RELEASE_VERSION ?= 1.17.0 -RELEASE_BUILD_ID ?= 495 +RELEASE_VERSION ?= 1.17.1 +RELEASE_BUILD_ID ?= 40 BASE_IMAGE_URL ?= base-installer-$(BUILD_OS) IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID) diff --git a/utils/README.md b/utils/README.md index b05dbab..f992817 100644 --- a/utils/README.md +++ b/utils/README.md @@ -25,7 +25,7 @@ This folder contains some Intel Gaudi utility scripts that users can access as r ## manage_network_ifs -Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh). +Moved to habanalabs-qual (for example, /opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh or /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh). This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces. 
@@ -38,8 +38,10 @@ options: --up toggle up all Intel Gaudi network interfaces --down toggle down all Intel Gaudi network interfaces --status print status of all Intel Gaudi network interfaces - --set-ip set IP for all internal Intel Gaudi network interfaces - --unset-ip unset IP from all internal Intel Gaudi network interfaces + --set-pfc set PFC (enabled=0,1,2,3) + --unset-pfc unset PFC (enabled=none) + --check-pfc dump PFC configuration + --no-progbar do not show progress bar -v, --verbose print more logs -h, --help print this help @@ -56,7 +58,6 @@ Use the following command to bring all Intel Gaudi network interfaces online: ``` sudo manage_network_ifs.sh --up ``` -Once all the Intel Gaudi interfaces are toggled up, IPs will be set by default. Please refer [Set Ip](#set-ip) for more detail. To unset IPs, run this script with '--unset-ip' ### Down Use the following command to bring all Intel Gaudi network interfaces offline: @@ -69,18 +70,24 @@ Print the current operational state of all Intel Gaudi network interfaces such a ``` sudo manage_network_ifs.sh --status ``` -### Set IP +### Set PFC -Use the following command to assign a default IP for all Intel Gaudi network interfaces: +Use the following command to set PFC for all Intel Gaudi network interfaces: ``` -sudo manage_network_ifs.sh --set-ip +sudo manage_network_ifs.sh --set-pfc ``` -Note: Default IPs are 192.168.100.1, 192.168.100.2, 192.168.100.3 and so on -### Unset IP +### Unset PFC -Remove IP from all available Intel Gaudi network interfaces by the following command: +Use the following command to unset PFC for all Intel Gaudi network interfaces: ``` -sudo manage_network_ifs.sh --unset-ip +sudo manage_network_ifs.sh --unset-pfc +``` + +### Check current PFC configuration + +Use the following command to check current PFC status for all Intel Gaudi network interfaces: +``` +sudo manage_network_ifs.sh --check-pfc ``` ## check_framework_env @@ -141,4 +148,4 @@ IGHS can alternatively be run through below 
script: # Creates IGHS Report and screens clusters for any infected nodes. # Will check Level 1 and 2 by default ./run_ighs.sh -``` \ No newline at end of file +``` diff --git a/utils/intel_gaudi_health_screen/README.md b/utils/intel_gaudi_health_screen/README.md index af4ad4e..7d67984 100644 --- a/utils/intel_gaudi_health_screen/README.md +++ b/utils/intel_gaudi_health_screen/README.md @@ -149,10 +149,10 @@ system-info: tcp-interface: "10.3.124.0/24" # Image to run Intel Gaudi Health Screen -image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest" +image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest" # Node Label used to identify a Intel Gaudi Node -gaudi-node-label: "ighs_label=gaudi" +gaudi-node-label: "habana.ai/gaudi:NoSchedule" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG" @@ -236,7 +236,7 @@ system-info: image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest" # Node Label used to identify a Intel Gaudi Node -gaudi-node-label: "brightcomputing.com/node-category=gaudi" +gaudi-node-label: "habana.ai/gaudi:NoSchedule" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG" diff --git a/utils/intel_gaudi_health_screen/config.yaml b/utils/intel_gaudi_health_screen/config.yaml index cb88c01..b9c3ae0 100644 --- a/utils/intel_gaudi_health_screen/config.yaml +++ b/utils/intel_gaudi_health_screen/config.yaml @@ -15,7 +15,7 @@ system-info: image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest" # Node Label used to identify a Intel Gaudi Node -gaudi-node-label: "ighs_label=gaudi" +gaudi-node-label: "habana.ai/gaudi:NoSchedule" # Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL) log-level: "DEBUG"