-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduces manifest.yaml that is the "last working world state" #405
Changes from 27 commits
fa48558
2ccf1a9
404d629
4404710
4951c31
7a852f8
6206f4c
5688cac
83c07fd
27036f7
2ef9313
ecf9a6a
c5f0b87
645d4af
e780f9b
53ee07e
f2f13f5
48a3a09
e8b857b
68b001a
7281f59
2c54e8d
2e9fccd
a554f01
723f91d
c0b683a
0828455
73df444
811c9d2
aa7cea8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,10 @@ | ||
ARG BASE_IMAGE=ghcr.io/nvidia/jax-toolbox:base | ||
ARG REPO_JAX="https://github.com/google/jax.git" | ||
ARG REPO_XLA="https://github.com/openxla/xla.git" | ||
ARG REPO_FLAX="https://github.com/google/flax.git" | ||
ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" | ||
ARG REF_JAX=main | ||
ARG REF_XLA=main | ||
ARG REF_FLAX=main | ||
ARG REF_TE=main | ||
ARG SRC_MANIFEST_FILE=manifest.yaml | ||
ARG DEST_MANIFEST_DIR=/opt/manifest.d | ||
ARG SRC_PATH_JAX=/opt/jax | ||
ARG SRC_PATH_XLA=/opt/xla-source | ||
ARG SRC_PATH_XLA=/opt/xla | ||
ARG SRC_PATH_FLAX=/opt/flax | ||
ARG SRC_PATH_TE=/opt/transformer-engine-source | ||
ARG SRC_PATH_TE=/opt/transformer-engine | ||
ARG GIT_USER_NAME="JAX Toolbox" | ||
ARG [email protected] | ||
|
||
|
@@ -22,22 +16,28 @@ ARG BUILD_DATE | |
############################################################################### | ||
|
||
FROM ${BASE_IMAGE} as builder | ||
ARG REPO_JAX | ||
ARG REPO_XLA | ||
ARG REF_JAX | ||
ARG REF_XLA | ||
ARG SRC_MANIFEST_FILE | ||
ARG DEST_MANIFEST_DIR | ||
ARG SRC_PATH_JAX | ||
ARG SRC_PATH_XLA | ||
ARG BAZEL_CACHE | ||
ARG GIT_USER_NAME | ||
ARG GIT_USER_EMAIL | ||
|
||
RUN git clone "${REPO_JAX}" "${SRC_PATH_JAX}" && cd "${SRC_PATH_JAX}" && git checkout ${REF_JAX} | ||
# Set the manifest env vars | ||
ENV MANIFEST_FILE=${DEST_MANIFEST_DIR}/${SRC_MANIFEST_FILE} | ||
|
||
ADD --chmod=777 create-distribution.sh ${DEST_MANIFEST_DIR}/ | ||
COPY ${SRC_MANIFEST_FILE} ${DEST_MANIFEST_DIR}/${SRC_MANIFEST_FILE} | ||
terrykong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
COPY patches/ ${DEST_MANIFEST_DIR}/patches/ | ||
|
||
RUN get-source.sh -l jax -m ${MANIFEST_FILE} | ||
RUN --mount=type=ssh \ | ||
--mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \ | ||
git clone "${REPO_XLA}" "${SRC_PATH_XLA}" && cd "${SRC_PATH_XLA}" && git checkout ${REF_XLA} | ||
get-source.sh -l xla -m ${MANIFEST_FILE} | ||
|
||
ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ | ||
# TODO: move this patch into the manifest | ||
ADD xla-arm64-neon.patch /opt | ||
RUN build-jax.sh \ | ||
--bazel-cache ${BAZEL_CACHE} \ | ||
|
@@ -53,46 +53,53 @@ RUN build-jax.sh \ | |
|
||
ARG BASE_IMAGE | ||
FROM ${BASE_IMAGE} as mealkit | ||
ARG SRC_MANIFEST_FILE | ||
ARG DEST_MANIFEST_DIR | ||
ARG SRC_PATH_JAX | ||
ARG SRC_PATH_XLA | ||
ARG SRC_PATH_TE | ||
ARG BUILD_DATE | ||
|
||
ENV BUILD_DATE=${BUILD_DATE} | ||
# The following environment variables tune performance | ||
ENV XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false" | ||
ENV CUDA_DEVICE_MAX_CONNECTIONS=1 | ||
ENV NCCL_IB_SL=1 | ||
ENV NCCL_NVLS_ENABLE=0 | ||
ENV CUDA_MODULE_LOADING=EAGER | ||
# Set the manifest env vars | ||
ENV MANIFEST_FILE=${DEST_MANIFEST_DIR}/${SRC_MANIFEST_FILE} | ||
|
||
ADD --chmod=777 create-distribution.sh ${DEST_MANIFEST_DIR}/ | ||
COPY ${SRC_MANIFEST_FILE} ${DEST_MANIFEST_DIR}/${SRC_MANIFEST_FILE} | ||
terrykong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
COPY patches/ ${DEST_MANIFEST_DIR}/patches/ | ||
|
||
COPY --from=builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} | ||
COPY --from=builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} | ||
ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ | ||
|
||
COPY --from=builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} | ||
COPY --from=builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} | ||
ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ | ||
|
||
RUN mkdir -p /opt/pip-tools.d | ||
RUN <<"EOF" bash -ex | ||
echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax | ||
echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax | ||
echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/requirements-jax.in | ||
echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/requirements-jax.in | ||
EOF | ||
|
||
## Flax | ||
ARG REPO_FLAX | ||
ARG REF_FLAX | ||
ARG SRC_PATH_FLAX | ||
RUN get-source.sh -f ${REPO_FLAX} -r ${REF_FLAX} -d ${SRC_PATH_FLAX} -m /opt/pip-tools.d/manifest.flax | ||
RUN get-source.sh -l flax -m ${MANIFEST_FILE} -o /opt/pip-tools.d/requirements-flax.in | ||
|
||
## Transformer engine: check out source and build wheel | ||
ARG REPO_TE | ||
ARG REF_TE | ||
ARG SRC_PATH_TE | ||
ENV NVTE_FRAMEWORK=jax | ||
ENV SRC_PATH_TE=${SRC_PATH_TE} | ||
RUN <<"EOF" bash -ex | ||
set -o pipefail | ||
RUN <<"EOF" bash -ex -o pipefail | ||
pip install ninja && rm -rf ~/.cache/pip | ||
get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} | ||
get-source.sh -l transformer-engine -m ${MANIFEST_FILE} | ||
pushd ${SRC_PATH_TE} | ||
python setup.py bdist_wheel && rm -rf build | ||
echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te | ||
echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/requirements-te.in | ||
EOF | ||
|
||
# TODO: properly configure entrypoint | ||
|
@@ -104,3 +111,4 @@ EOF | |
FROM mealkit as final | ||
|
||
RUN pip-finalize.sh | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,6 @@ | ||
# syntax=docker/dockerfile:1-labs | ||
|
||
ARG BASE_IMAGE=ghcr.io/nvidia/jax:mealkit | ||
ARG REPO_PAXML=https://github.com/google/paxml.git | ||
ARG REPO_PRAXIS=https://github.com/google/praxis.git | ||
ARG REF_PAXML=main | ||
ARG REF_PRAXIS=main | ||
ARG SRC_PATH_PAXML=/opt/paxml | ||
ARG SRC_PATH_PRAXIS=/opt/praxis | ||
|
||
|
@@ -13,21 +9,17 @@ ARG SRC_PATH_PRAXIS=/opt/praxis | |
############################################################################### | ||
|
||
FROM ${BASE_IMAGE} as mealkit | ||
ARG REPO_PAXML | ||
ARG REPO_PRAXIS | ||
ARG REF_PAXML | ||
ARG REF_PRAXIS | ||
ARG SRC_PATH_PAXML | ||
ARG SRC_PATH_PRAXIS | ||
|
||
# update TE manifest file to install the [test] extras | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why here but not in Dockerfile.jax? TE is a part of JAX image now, so I would assume someone wants to test TE in JAX container. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't mind. Any reason you kept it separate @yhtang ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So I took a look at it, and it appears we could merge the two without any issue since there's just a Any objections? @DwarKapex @yhtang Actually, another issue I found looking at this is that Dockerfile.pax.arm64 doesn't include this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. As I mentioned during the CI sync, IIRC the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can address this as part of #338. |
||
RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/manifest.te | ||
RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/requirements-te.in | ||
|
||
RUN <<"EOF" bash -ex | ||
get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} | ||
get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} | ||
echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax | ||
echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax | ||
get-source.sh -l paxml -m ${MANIFEST_FILE} | ||
get-source.sh -l praxis -m ${MANIFEST_FILE} | ||
echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in | ||
echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in | ||
|
||
for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do | ||
pushd ${src} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,10 @@ | ||
# syntax=docker/dockerfile:1-labs | ||
|
||
ARG BASE_IMAGE=ghcr.io/nvidia/jax:mealkit | ||
ARG REPO_PAXML=https://github.com/google/paxml.git | ||
ARG REPO_PRAXIS=https://github.com/google/praxis.git | ||
ARG REF_PAXML=main | ||
ARG REF_PRAXIS=main | ||
ARG SRC_PATH_PAXML=/opt/paxml | ||
ARG SRC_PATH_PRAXIS=/opt/praxis | ||
ARG SRC_PATH_TFTEXT=/opt/tensorflow-text | ||
ARG SRC_PATH_LINGVO=/opt/lingvo | ||
|
||
############################################################################### | ||
## build tensorflow-text and lingvo, which do not have working arm64 pip wheels | ||
|
@@ -24,39 +22,35 @@ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazeli | |
#------------------------------------------------------------------------------ | ||
|
||
FROM wheel-builder as tftext-builder | ||
|
||
RUN <<"EOT" bash -exu | ||
set -o pipefail | ||
ARG SRC_PATH_TFTEXT | ||
RUN <<"EOF" bash -exu -o pipefail | ||
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 | ||
git clone http://github.com/tensorflow/text.git /opt/tensorflow-text | ||
cd /opt/tensorflow-text | ||
git checkout v2.13.0 | ||
get-source.sh -l tensorflow-text -m ${MANIFEST_FILE} | ||
cd ${SRC_PATH_TFTEXT} | ||
./oss_scripts/run_build.sh | ||
EOT | ||
EOF | ||
|
||
#------------------------------------------------------------------------------ | ||
# build lingvo | ||
#------------------------------------------------------------------------------ | ||
|
||
FROM wheel-builder as lingvo-builder | ||
ARG REPO_LINGVO=https://github.com/tensorflow/lingvo.git | ||
ARG REF_LINGVO=master | ||
ARG SRC_PATH_LINGVO=/opt/lingvo | ||
ARG SRC_PATH_TFTEXT | ||
ARG SRC_PATH_LINGVO | ||
|
||
COPY --from=tftext-builder /opt/tensorflow-text/tensorflow_text*.whl /opt/ | ||
COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ | ||
|
||
RUN get-source.sh -f ${REPO_LINGVO} -r ${REF_LINGVO} -d ${SRC_PATH_LINGVO} | ||
RUN get-source.sh -l lingvo -m ${MANIFEST_FILE} | ||
|
||
# build lingvo | ||
RUN <<"EOT" bash -exu | ||
set -o pipefail | ||
RUN <<"EOF" bash -exu -o pipefail | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean we don't have to rename, but I feel that it's more clear since a question already came up about its significance. This way the identifier is also self-documenting |
||
|
||
pushd ${SRC_PATH_LINGVO} | ||
git fetch origin pull/329/head:pr329 | ||
git cherry-pick --allow-empty pr329 | ||
|
||
# Disable 2 flaky tests here | ||
terrykong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
patch -p1 <<"EOF" | ||
patch -p1 <<"EOFINNER" | ||
diff --git a/pip_package/build.sh b/pip_package/build.sh | ||
index ef62c432e..659e78956 100755 | ||
--- a/pip_package/build.sh | ||
|
@@ -70,7 +64,7 @@ index ef62c432e..659e78956 100755 | |
fi | ||
|
||
DST_DIR="/tmp/lingvo/dist" | ||
EOF | ||
EOFINNER | ||
|
||
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl | ||
sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt | ||
|
@@ -82,38 +76,35 @@ pip install -r docker/dev.requirements.txt | |
# running the tests entirely by uncommentin the following line. | ||
# SKIP_TEST=1 | ||
PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh | ||
EOT | ||
EOF | ||
|
||
############################################################################### | ||
## Pax for AArch64 | ||
############################################################################### | ||
|
||
ARG BASE_IMAGE | ||
FROM ${BASE_IMAGE} as mealkit | ||
ARG REPO_PAXML | ||
ARG REPO_PRAXIS | ||
ARG REF_PAXML | ||
ARG REF_PRAXIS | ||
ARG SRC_PATH_PAXML | ||
ARG SRC_PATH_PRAXIS | ||
ARG SRC_PATH_TFTEXT | ||
|
||
COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ | ||
RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/manifest.pax | ||
RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in | ||
|
||
COPY --from=tftext-builder /opt/tensorflow-text/tensorflow_text*.whl /opt/ | ||
RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/manifest.pax | ||
COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/ | ||
RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-paxml.in | ||
|
||
# paxml + praxis | ||
RUN <<"EOT" bash -ex | ||
echo "tensorflow==2.13.0" >> /opt/pip-tools.d/manifest.pax | ||
echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/manifest.pax | ||
echo "chex==0.1.7" >> /opt/pip-tools.d/manifest.pax | ||
echo "auditwheel" >> /opt/pip-tools.d/manifest.pax | ||
RUN <<"EOF" bash -ex | ||
echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in | ||
echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in | ||
echo "chex==0.1.7" >> /opt/pip-tools.d/requirements-paxml.in | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do I see
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't author this dockerfile so I defer to @joker-eph regarding W.r.t There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can rename the nested one to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I wrote it, that was likely because it was required at the time to make the container work on ARM64. Ultimately line 27 and here are trying to get to a different state (earlier is just the requirement for building PAXML I believe?). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Regarding EOT vs EOF, EOT is what Docker's official documentation used for demoing heredoc RUN. But ultimately this choice is arbitrary and up to us to reach a consensus. |
||
echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in | ||
|
||
get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} | ||
get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} | ||
echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax | ||
echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax | ||
get-source.sh -l paxml -m ${MANIFEST_FILE} | ||
get-source.sh -l praxis -m ${MANIFEST_FILE} | ||
echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/requirements-paxml.in | ||
echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/requirements-paxml.in | ||
|
||
for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do | ||
pushd ${src} | ||
|
@@ -139,7 +130,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do | |
fi | ||
popd | ||
done | ||
EOT | ||
EOF | ||
|
||
ADD test-pax.sh /usr/local/bin | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[question] Will it be better to install it thru
apt install
:?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess it would be better to have it be managed by
apt
, then the package can be inspected bydpkg
. FWIW, I had to installapt-get install software-properties-common
to getadd-apt-respository
which is 68.4MB according toapt
. Is that okay with you?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't have a guideline yet regarding third-party PPAs. Let's stick with this simple wget for the time being.