Skip to content

Commit

Permalink
SynapseAi 1.17.1 release
Browse files Browse the repository at this point in the history
 * Update dockerfiles with 1.17.1 content
  • Loading branch information
igor999999 committed Aug 23, 2024
1 parent fbba3e5 commit 7e23f64
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 23 deletions.
2 changes: 1 addition & 1 deletion dockerfiles/base/Dockerfile.amzn2
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ ENV RDMA_CORE_LIB=${RDMA_CORE_ROOT}/build/lib
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana AWS Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/AmazonLinux2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/AmazonLinux2/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo

RUN yum makecache && \
yum install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".amzn2 && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/base/Dockerfile.rhel8.6
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana RH8 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/8/8.6/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo

RUN echo "[powertools]" > /etc/yum.repos.d/powertools.repo && \
echo "name=powertools" >> /etc/yum.repos.d/powertools.repo && \
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/base/Dockerfile.rhel9.2
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo

# for Habana GPG key with SHA-1 signature
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/base/Dockerfile.rhel9.4
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.4/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo

# for Habana GPG key with SHA-1 signature
Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/base/Dockerfile.tencentos3.1
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ ENV FI_EFA_USE_DEVICE_RDMA=1
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
echo "name=Habana TC31 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1" >> /etc/yum.repos.d/habanalabs.repo && \
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/tencentos/3/3.1/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/tencentos/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo

RUN dnf install -y habanalabs-rdma-core-"$VERSION"-"$REVISION".tl3 \
habanalabs-thunk-"$VERSION"-"$REVISION".tl3 \
Expand Down
4 changes: 2 additions & 2 deletions dockerfiles/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ BUILD_DIR ?= $(CURDIR)/dockerbuild

REPO_SERVER ?= vault.habana.ai
PT_VERSION ?= 2.3.1
RELEASE_VERSION ?= 1.17.0
RELEASE_BUILD_ID ?= 495
RELEASE_VERSION ?= 1.17.1
RELEASE_BUILD_ID ?= 40

BASE_IMAGE_URL ?= base-installer-$(BUILD_OS)
IMAGE_URL = $(IMAGE_NAME):$(RELEASE_VERSION)-$(RELEASE_BUILD_ID)
Expand Down
31 changes: 19 additions & 12 deletions utils/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ This folder contains some Intel Gaudi utility scripts that users can access as r

## manage_network_ifs

Moved to habanalabs-qual Example: (/opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh).
Moved to habanalabs-qual. Example locations: /opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh or /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh.

This script can be used as reference to bring up, take down, set IPs, unset IPs and check for status of the Intel Gaudi network interfaces.

Expand All @@ -38,8 +38,10 @@ options:
--up toggle up all Intel Gaudi network interfaces
--down toggle down all Intel Gaudi network interfaces
--status print status of all Intel Gaudi network interfaces
--set-ip set IP for all internal Intel Gaudi network interfaces
--unset-ip unset IP from all internal Intel Gaudi network interfaces
--set-pfc set PFC (enabled=0,1,2,3)
--unset-pfc unset PFC (enabled=none)
--check-pfc dump PFC configuration
--no-progbar do not show progress bar
-v, --verbose print more logs
-h, --help print this help
Expand All @@ -56,7 +58,6 @@ Use the following command to bring all Intel Gaudi network interfaces online:
```
sudo manage_network_ifs.sh --up
```
Once all the Intel Gaudi interfaces are toggled up, IPs will be set by default. Please refer to [Set IP](#set-ip) for more details. To unset IPs, run this script with '--unset-ip'.
### Down

Use the following command to bring all Intel Gaudi network interfaces offline:
Expand All @@ -69,18 +70,24 @@ Print the current operational state of all Intel Gaudi network interfaces such a
```
sudo manage_network_ifs.sh --status
```
### Set IP
### Set PFC

Use the following command to assign a default IP for all Intel Gaudi network interfaces:
Use the following command to set PFC for all Intel Gaudi network interfaces:
```
sudo manage_network_ifs.sh --set-ip
sudo manage_network_ifs.sh --set-pfc
```
Note: Default IPs are 192.168.100.1, 192.168.100.2, 192.168.100.3 and so on
### Unset IP
### Unset PFC

Remove IP from all available Intel Gaudi network interfaces by the following command:
Use the following command to unset PFC for all Intel Gaudi network interfaces:
```
sudo manage_network_ifs.sh --unset-ip
sudo manage_network_ifs.sh --unset-pfc
```

### Check current PFC configuration

Use the following command to check current PFC status for all Intel Gaudi network interfaces:
```
sudo manage_network_ifs.sh --check-pfc
```

## check_framework_env
Expand Down Expand Up @@ -141,4 +148,4 @@ IGHS can alternatively be run through below script:
# Creates IGHS Report and screens clusters for any infected nodes.
# Will check Level 1 and 2 by default
./run_ighs.sh
```
```
6 changes: 3 additions & 3 deletions utils/intel_gaudi_health_screen/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,10 @@ system-info:
tcp-interface: "10.3.124.0/24"

# Image to run Intel Gaudi Health Screen
image: "vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest"
image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"

# Node label used to identify an Intel Gaudi node
gaudi-node-label: "ighs_label=gaudi"
gaudi-node-label: "habana.ai/gaudi:NoSchedule"

# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
Expand Down Expand Up @@ -236,7 +236,7 @@ system-info:
image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"

# Node label used to identify an Intel Gaudi node
gaudi-node-label: "brightcomputing.com/node-category=gaudi"
gaudi-node-label: "habana.ai/gaudi:NoSchedule"

# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
Expand Down
2 changes: 1 addition & 1 deletion utils/intel_gaudi_health_screen/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ system-info:
image: "vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest"

# Node label used to identify an Intel Gaudi node
gaudi-node-label: "ighs_label=gaudi"
gaudi-node-label: "habana.ai/gaudi:NoSchedule"

# Controls granularity of Logs (INFO, DEBUG, WARN, ERROR, CRITICAL)
log-level: "DEBUG"
Expand Down

0 comments on commit 7e23f64

Please sign in to comment.