From 3bd7b8aeaf9eaa30552cda9f4d7eee7b26e55a80 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 7 Nov 2023 22:16:31 -0800 Subject: [PATCH] Update pax and t5x build workflows and source to allow TE to be configurable --- .github/container/Dockerfile.pax.amd64 | 4 ++- .github/container/Dockerfile.pax.arm64 | 3 ++ .github/workflows/_build_pax.yaml | 12 ++++++++ .github/workflows/nightly-pax-build.yaml | 36 ++++++++++++++++++++++-- .github/workflows/nightly-t5x-build.yaml | 36 ++++++++++++++++++++++-- 5 files changed, 84 insertions(+), 7 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index e8ae18291..f7b4bee55 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -13,12 +13,14 @@ ADD install-te.sh /usr/local/bin ENV NVTE_FRAMEWORK=jax ARG REPO_PAXML=https://github.com/google/paxml.git ARG REPO_PRAXIS=https://github.com/google/praxis.git +ARG REPO_TE=https://github.com/NVIDIA/TransformerEngine.git ARG REF_PAXML=main ARG REF_PRAXIS=main +ARG REF_TE=main RUN <<"EOF" bash -ex install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} install-flax.sh --defer -install-te.sh --defer +install-te.sh --defer --from ${REPO_TE} --ref ${REF_TE} if [[ -f /opt/requirements-defer.txt ]]; then # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 5d55bf2a5..5725ff73a 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -60,6 +60,9 @@ ADD test-pax.sh /usr/local/bin # TODO: Utilize these build-args and use them when installing pax # ARG REPO_PAXML=https://github.com/google/paxml.git # ARG REPO_PRAXIS=https://github.com/google/praxis.git +# ARG REPO_TE=https://github.com/NVIDIA/TransformerEngine.git # ARG REF_PAXML=main # ARG REF_PRAXIS=main +# ARG REF_TE=main # install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} +# install-te.sh --defer --from ${REPO_TE} --ref ${REF_TE} \ No newline at end of file diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 84012afa8..01f7019ad 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -23,6 +23,11 @@ on: description: URL of Praxis repository to check out required: false default: "https://github.com/google/praxis.git" + REPO_TE: + type: string + description: URL of TE repository to check out + required: false + default: "https://github.com/NVIDIA/TransformerEngine.git" REF_PAXML: type: string description: Git commit, tag, or branch for Paxml @@ -33,6 +38,11 @@ on: description: Git commit, tag, or branch for Praxis required: false default: main + REF_TE: + type: string + description: Git commit, tag, or branch for TE + required: false + default: main outputs: DOCKER_TAGS: description: "Tags of the image built" @@ -101,8 +111,10 @@ jobs: BUILD_DATE=${{ inputs.BUILD_DATE }} REPO_PAXML=${{ inputs.REPO_PAXML }} REPO_PRAXIS=${{ inputs.REPO_PRAXIS }} + REPO_TE=${{ inputs.REPO_TE }} REF_PAXML=${{ inputs.REF_PAXML }} REF_PRAXIS=${{ inputs.REF_PRAXIS }} + REF_TE=${{ inputs.REF_TE }} # Temporary workaround until the following issues are solved: # https://github.com/orgs/community/discussions/17245 diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 64728265a..ad155e425 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -8,12 +8,26 @@ on: branches: [main] workflow_dispatch: inputs: + REPO_TE: + type: string + description: URL of TE repository to check out + required: false + default: "https://github.com/NVIDIA/TransformerEngine.git" + REF_TE: + type: string + description: Git commit, tag, or branch for TE + required: false + default: main PUBLISH: type: boolean description: Publish nightly images and update the 'latest' tag? default: false required: false +env: + DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git + DEFAULT_REF_TE: main + permissions: contents: read # to fetch code actions: write # to cancel previous workflows @@ -25,14 +39,28 @@ jobs: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} + REPO_TE: ${{ steps.meta.outputs.REPO_TE }} + REF_TE: ${{ steps.meta.outputs.REF_TE }} steps: - - name: Set build date - id: date + - name: Set build metadata + id: meta shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + if [[ -z "${{ inputs.REPO_TE }}" ]]; then + REPO_TE=${{ env.DEFAULT_REPO_TE }} + else + REPO_TE=${{ inputs.REPO_TE }} + fi + echo "REPO_TE=$REPO_TE" >> $GITHUB_OUTPUT + if [[ -z "${{ inputs.REF_TE }}" ]]; then + REF_TE=${{ env.DEFAULT_REF_TE }} + else + REF_TE=${{ inputs.REF_TE }} + fi + echo "REF_TE=$REF_TE" >> $GITHUB_OUTPUT build: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' @@ -40,6 +68,8 @@ jobs: uses: ./.github/workflows/_build_pax.yaml with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} + REF_TE: ${{ needs.metadata.outputs.REF_TE }} secrets: inherit publish: diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 089f94069..2a9cdb697 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -8,12 +8,26 @@ on: branches: [main] workflow_dispatch: inputs: + REPO_TE: + type: string + description: URL of TE repository to check out + required: false + default: "https://github.com/NVIDIA/TransformerEngine.git" + REF_TE: + type: string + description: Git commit, tag, or branch for TE + required: false + default: main PUBLISH: type: boolean description: Publish dated images and update the 'latest' tag? default: false required: false +env: + DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git + DEFAULT_REF_TE: main + permissions: contents: read # to fetch code actions: write # to cancel previous workflows @@ -25,14 +39,28 @@ jobs: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} + REPO_TE: ${{ steps.meta.outputs.REPO_TE }} + REF_TE: ${{ steps.meta.outputs.REF_TE }} steps: - - name: Set build date - id: date + - name: Set build metadata + id: meta shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + if [[ -z "${{ inputs.REPO_TE }}" ]]; then + REPO_TE=${{ env.DEFAULT_REPO_TE }} + else + REPO_TE=${{ inputs.REPO_TE }} + fi + echo "REPO_TE=$REPO_TE" >> $GITHUB_OUTPUT + if [[ -z "${{ inputs.REF_TE }}" ]]; then + REF_TE=${{ env.DEFAULT_REF_TE }} + else + REF_TE=${{ inputs.REF_TE }} + fi + echo "REF_TE=$REF_TE" >> $GITHUB_OUTPUT build: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' @@ -40,6 +68,8 @@ jobs: uses: ./.github/workflows/_build_t5x.yaml with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} + REF_TE: ${{ needs.metadata.outputs.REF_TE }} secrets: inherit publish: